diff options
Diffstat (limited to 'fs')
427 files changed, 11335 insertions, 11206 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 7af425f53be..8482f2d1160 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -156,7 +156,7 @@ int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid) return -EOPNOTSUPP; acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS); if (acl) { - retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (retval) return retval; set_cached_acl(inode, ACL_TYPE_ACCESS, acl); @@ -200,7 +200,7 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep, if (acl) { if (S_ISDIR(mode)) *dpacl = posix_acl_dup(acl); - retval = posix_acl_create(&acl, GFP_NOFS, &mode); + retval = __posix_acl_create(&acl, GFP_NOFS, &mode); if (retval < 0) return retval; if (retval > 0) diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 2b7a032c37b..a69260f2755 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -239,13 +239,12 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode) void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) { struct v9fs_inode *v9inode = V9FS_I(inode); - struct p9_fid *fid; if (!v9inode->fscache) return; spin_lock(&v9inode->fscache_lock); - fid = filp->private_data; + if ((filp->f_flags & O_ACCMODE) != O_RDONLY) v9fs_cache_inode_flush_cookie(inode); else diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 08f2e1e9a7e..14da82564f4 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -56,7 +56,7 @@ enum { /* Options that take no arguments */ Opt_nodevmap, /* Cache options */ - Opt_cache_loose, Opt_fscache, + Opt_cache_loose, Opt_fscache, Opt_mmap, /* Access options */ Opt_access, Opt_posixacl, /* Error token */ @@ -74,6 +74,7 @@ static const match_table_t tokens = { {Opt_cache, "cache=%s"}, {Opt_cache_loose, "loose"}, {Opt_fscache, "fscache"}, + {Opt_mmap, "mmap"}, {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, @@ -91,6 +92,9 @@ static int get_cache_mode(char *s) } else if (!strcmp(s, "fscache")) { version = CACHE_FSCACHE; p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n"); + } else if (!strcmp(s, "mmap")) { + version = CACHE_MMAP; + p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n"); } else if (!strcmp(s, "none")) { version = CACHE_NONE; p9_debug(P9_DEBUG_9P, "Cache mode: none\n"); @@ -220,6 +224,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) case Opt_fscache: v9ses->cache = CACHE_FSCACHE; break; + case Opt_mmap: + v9ses->cache = CACHE_MMAP; + break; case Opt_cachetag: #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = match_strdup(&args[0]); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index a8e127c8962..099c7712631 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -64,6 +64,7 @@ enum p9_session_flags { enum p9_cache_modes { CACHE_NONE, + CACHE_MMAP, CACHE_LOOSE, CACHE_FSCACHE, }; diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index dc95a252523..b83ebfbf3fd 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -50,6 +50,8 @@ extern const struct dentry_operations v9fs_dentry_operations; extern const struct dentry_operations v9fs_cached_dentry_operations; extern const struct file_operations v9fs_cached_file_operations; extern const struct file_operations v9fs_cached_file_operations_dotl; +extern const struct file_operations v9fs_mmap_file_operations; +extern const struct file_operations v9fs_mmap_file_operations_dotl; extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 9ff073f4090..c71e88602ff 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -202,6 +202,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) { int retval; + p9_debug(P9_DEBUG_VFS, "page %p\n", page); + retval = v9fs_vfs_writepage_locked(page); if (retval < 0) { if (retval == -EAGAIN) { @@ -282,6 +284,9 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = mapping->host; + + p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); + v9inode = V9FS_I(inode); start: page = grab_cache_page_write_begin(mapping, index, flags); @@ -312,6 +317,8 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, loff_t last_pos = pos + copied; struct inode *inode = page->mapping->host; + p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); + if (unlikely(copied < len)) { /* * zero out the rest of the area diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a0df3e73c2b..a16b0ff497c 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -45,6 +45,7 @@ #include "cache.h" static const struct vm_operations_struct v9fs_file_vm_ops; +static const struct vm_operations_struct v9fs_mmap_file_vm_ops; /** * v9fs_file_open - open a file (or directory) @@ -87,7 +88,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) file->private_data = fid; mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((file->f_flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -105,7 +107,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9inode->writeback_fid = (void *) fid; } mutex_unlock(&v9inode->v_mutex); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(inode, file); return 0; out_error: @@ -461,14 +463,12 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, int n; loff_t i_size; size_t total = 0; - struct p9_client *clnt; loff_t origin = *offset; unsigned long pg_start, pg_end; p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); - clnt = fid->clnt; do { n = p9_client_write(fid, NULL, data+total, origin+total, count); if (n <= 0) @@ -581,11 +581,12 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, } static int -v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) +v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) { int retval; - retval = generic_file_mmap(file, vma); + + retval = generic_file_mmap(filp, vma); if (!retval) vma->vm_ops = &v9fs_file_vm_ops; @@ -593,6 +594,43 @@ v9fs_file_mmap(struct file *file, struct vm_area_struct *vma) } static int +v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int retval; + struct inode *inode; + struct v9fs_inode *v9inode; + struct p9_fid *fid; + + inode = file_inode(filp); + v9inode = V9FS_I(inode); + mutex_lock(&v9inode->v_mutex); + if (!v9inode->writeback_fid && + (vma->vm_flags & VM_WRITE)) { + /* + * clone a fid and add it to writeback_fid + * we do it during mmap instead of + * page dirty time via write_begin/page_mkwrite + * because we want write after unlink usecase + * to work. + */ + fid = v9fs_writeback_fid(filp->f_path.dentry); + if (IS_ERR(fid)) { + retval = PTR_ERR(fid); + mutex_unlock(&v9inode->v_mutex); + return retval; + } + v9inode->writeback_fid = (void *) fid; + } + mutex_unlock(&v9inode->v_mutex); + + retval = generic_file_mmap(filp, vma); + if (!retval) + vma->vm_ops = &v9fs_mmap_file_vm_ops; + + return retval; +} + +static int v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct v9fs_inode *v9inode; @@ -660,6 +698,22 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, return do_sync_read(filp, data, count, offset); } +/** + * v9fs_mmap_file_read - read from a file + * @filp: file pointer to read + * @udata: user data buffer to read data into + * @count: size of buffer + * @offset: offset at which to read data + * + */ +static ssize_t +v9fs_mmap_file_read(struct file *filp, char __user *data, size_t count, + loff_t *offset) +{ + /* TODO: Check if there are dirty pages */ + return v9fs_file_read(filp, data, count, offset); +} + static ssize_t v9fs_direct_write(struct file *filp, const char __user * data, size_t count, loff_t *offsetp) @@ -730,12 +784,65 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, return do_sync_write(filp, data, count, offset); } + +/** + * v9fs_mmap_file_write - write to a file + * @filp: file pointer to write + * @data: data buffer to write data from + * @count: size of buffer + * @offset: offset at which to write data + * + */ +static ssize_t +v9fs_mmap_file_write(struct file *filp, const char __user *data, + size_t count, loff_t *offset) +{ + /* + * TODO: invalidate mmaps on filp's inode between + * offset and offset+count + */ + return v9fs_file_write(filp, data, count, offset); +} + +static void v9fs_mmap_vm_close(struct vm_area_struct *vma) +{ + struct inode *inode; + + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + .sync_mode = WB_SYNC_ALL, + .range_start = vma->vm_pgoff * PAGE_SIZE, + /* absolute end, byte at end included */ + .range_end = vma->vm_pgoff * PAGE_SIZE + + (vma->vm_end - vma->vm_start - 1), + }; + + + p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); + + inode = file_inode(vma->vm_file); + + if (!mapping_cap_writeback_dirty(inode->i_mapping)) + wbc.nr_to_write = 0; + + might_sleep(); + sync_inode(inode, &wbc); +} + + static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; +static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { + .close = v9fs_mmap_vm_close, + .fault = filemap_fault, + .page_mkwrite = v9fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, +}; + const struct file_operations v9fs_cached_file_operations = { .llseek = generic_file_llseek, @@ -786,3 +893,26 @@ const struct file_operations v9fs_file_operations_dotl = { .mmap = generic_file_readonly_mmap, .fsync = v9fs_file_fsync_dotl, }; + +const struct file_operations v9fs_mmap_file_operations = { + .llseek = generic_file_llseek, + .read = v9fs_mmap_file_read, + .write = v9fs_mmap_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock, + .mmap = v9fs_mmap_file_mmap, + .fsync = v9fs_file_fsync, +}; + +const struct file_operations v9fs_mmap_file_operations_dotl = { + .llseek = generic_file_llseek, + .read = v9fs_mmap_file_read, + .write = v9fs_mmap_file_write, + .open = v9fs_file_open, + .release = v9fs_dir_release, + .lock = v9fs_file_lock_dotl, + .flock = v9fs_file_flock_dotl, + .mmap = v9fs_mmap_file_mmap, + .fsync = v9fs_file_fsync_dotl, +}; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 4e65aa90334..bb7991c7e5c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -299,15 +299,22 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, case S_IFREG: if (v9fs_proto_dotl(v9ses)) { inode->i_op = &v9fs_file_inode_operations_dotl; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || + v9ses->cache == CACHE_FSCACHE) inode->i_fop = &v9fs_cached_file_operations_dotl; + else if (v9ses->cache == CACHE_MMAP) + inode->i_fop = &v9fs_mmap_file_operations_dotl; else inode->i_fop = &v9fs_file_operations_dotl; } else { inode->i_op = &v9fs_file_inode_operations; - if (v9ses->cache) - inode->i_fop = &v9fs_cached_file_operations; + if (v9ses->cache == CACHE_LOOSE || + v9ses->cache == CACHE_FSCACHE) + inode->i_fop = + &v9fs_cached_file_operations; + else if (v9ses->cache == CACHE_MMAP) + inode->i_fop = &v9fs_mmap_file_operations; else inode->i_fop = &v9fs_file_operations; } @@ -779,7 +786,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct dentry *res; - struct super_block *sb; struct v9fs_session_info *v9ses; struct p9_fid *dfid, *fid; struct inode *inode; @@ -791,7 +797,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); /* We can walk d_parent because we hold the dir->i_mutex */ dfid = v9fs_fid_lookup(dentry->d_parent); @@ -812,7 +817,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, * unlink. For cached mode create calls request for new * inode. But with cache disabled, lookup should do this. */ - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); else inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); @@ -863,7 +868,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, return finish_no_open(file, res); err = 0; - fid = NULL; + v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode); fid = v9fs_create(v9ses, dir, dentry, NULL, perm, @@ -878,7 +883,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9fs_invalidate_inode_attr(dir); v9inode = V9FS_I(dentry->d_inode); mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -901,7 +907,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, goto error; file->private_data = fid; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(dentry->d_inode, file); *opened |= FILE_CREATED; @@ -1479,7 +1485,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) */ i_size = inode->i_size; v9fs_stat2inode(st, inode, inode->i_sb); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode->i_size = i_size; spin_unlock(&inode->i_lock); out: diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4c10edec26a..59dc8e87647 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -330,7 +330,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); - if (v9ses->cache && !v9inode->writeback_fid && + if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && + !v9inode->writeback_fid && ((flags & O_ACCMODE) != O_RDONLY)) { /* * clone a fid and add it to writeback_fid @@ -353,7 +354,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) goto err_clunk_old_fid; file->private_data = ofid; - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) v9fs_cache_inode_set_cookie(inode, file); *opened |= FILE_CREATED; out: @@ -473,13 +474,11 @@ static int v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - int err; struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_stat_dotl *st; p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); - err = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { generic_fillattr(dentry->d_inode, stat); @@ -556,7 +555,6 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) { int retval; - struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_iattr_dotl p9attr; struct inode *inode = dentry->d_inode; @@ -577,8 +575,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) p9attr.mtime_sec = iattr->ia_mtime.tv_sec; p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; - retval = -EPERM; - v9ses = v9fs_dentry2v9ses(dentry); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -715,7 +711,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, } v9fs_invalidate_inode_attr(dir); - if (v9ses->cache) { + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { /* Now walk from the parent so we can get an unopened fid. */ fid = p9_client_walk(dfid, 1, &name, 1); if (IS_ERR(fid)) { @@ -768,7 +764,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int err; - char *name; struct dentry *dir_dentry; struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; @@ -786,8 +781,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, if (IS_ERR(oldfid)) return PTR_ERR(oldfid); - name = (char *) dentry->d_name.name; - err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); if (err < 0) { @@ -973,7 +966,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) */ i_size = inode->i_size; v9fs_stat2inode_dotl(st, inode); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) inode->i_size = i_size; spin_unlock(&inode->i_lock); out: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 2756dcd5de6..0afd0382822 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -144,7 +144,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } v9fs_fill_super(sb, v9ses, flags, data); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) sb->s_d_op = &v9fs_cached_dentry_operations; else sb->s_d_op = &v9fs_dentry_operations; @@ -282,7 +282,7 @@ static int v9fs_drop_inode(struct inode *inode) { struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); - if (v9ses->cache) + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) return generic_drop_inode(inode); /* * in case of non cached mode always drop the @@ -325,10 +325,12 @@ static int v9fs_write_inode_dotl(struct inode *inode, * send an fsync request to server irrespective of * wbc->sync_mode. */ - p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); v9inode = V9FS_I(inode); + p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n", + __func__, inode, v9inode->writeback_fid); if (!v9inode->writeback_fid) return 0; + ret = p9_client_fsync(v9inode->writeback_fid, 0); if (ret < 0) { __mark_inode_dirty(inode, I_DIRTY_DATASYNC); diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 3c28cdfb8c4..04133a1fd9c 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -138,8 +138,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, if (retval < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", retval); - p9_client_clunk(fid); - return retval; + goto err; } msize = fid->clnt->msize; while (value_len) { @@ -152,12 +151,15 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, if (write_count < 0) { /* error in xattr write */ retval = write_count; - break; + goto err; } offset += write_count; value_len -= write_count; } - return p9_client_clunk(fid); + retval = offset; +err: + p9_client_clunk(fid); + return retval; } ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) diff --git a/fs/Kconfig b/fs/Kconfig index c229f828eb0..7385e54be4b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -68,10 +68,6 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config GENERIC_ACL - bool - select FS_POSIX_ACL - menu "Caches" source "fs/fscache/Kconfig" @@ -119,7 +115,7 @@ config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" depends on TMPFS select TMPFS_XATTR - select GENERIC_ACL + select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support additional access rights for users and groups beyond the standard owner/group/world scheme, diff --git a/fs/Makefile b/fs/Makefile index 4fe6df3ec28..47ac07bb4ac 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -42,9 +42,8 @@ obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o -obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o +obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ -obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o obj-$(CONFIG_SYSCTL) += drop_caches.o @@ -53,7 +52,7 @@ obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ -obj-$(CONFIG_SYSFS) += sysfs/ +obj-$(CONFIG_SYSFS) += sysfs/ kernfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ diff --git a/fs/affs/super.c b/fs/affs/super.c index 45161a832bb..d098731b82f 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -49,11 +49,6 @@ affs_put_super(struct super_block *sb) pr_debug("AFFS: put_super()\n"); cancel_delayed_work_sync(&sbi->sb_work); - kfree(sbi->s_prefix); - affs_free_bitmap(sb); - affs_brelse(sbi->s_root_bh); - kfree(sbi); - sb->s_fs_info = NULL; } static int @@ -316,7 +311,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) unsigned long mount_flags; int tmp_flags; /* fix remount prototype... */ u8 sig[4]; - int ret = -EINVAL; + int ret; save_mount_options(sb, data); @@ -412,17 +407,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) if (!silent) printk(KERN_ERR "AFFS: No valid root block on device %s\n", sb->s_id); - goto out_error; + return -EINVAL; /* N.B. after this point bh must be released */ got_root: + /* Keep super block in cache */ + sbi->s_root_bh = root_bh; root_block = sbi->s_root_block; /* Find out which kind of FS we have */ boot_bh = sb_bread(sb, 0); if (!boot_bh) { printk(KERN_ERR "AFFS: Cannot read boot block\n"); - goto out_error; + return -EINVAL; } memcpy(sig, boot_bh->b_data, 4); brelse(boot_bh); @@ -471,7 +468,7 @@ got_root: default: printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n", sb->s_id, chksum); - goto out_error; + return -EINVAL; } if (mount_flags & SF_VERBOSE) { @@ -488,22 +485,17 @@ got_root: if (sbi->s_flags & SF_OFS) sbi->s_data_blksize -= 24; - /* Keep super block in cache */ - sbi->s_root_bh = root_bh; - /* N.B. after this point s_root_bh must be released */ - tmp_flags = sb->s_flags; - if (affs_init_bitmap(sb, &tmp_flags)) - goto out_error; + ret = affs_init_bitmap(sb, &tmp_flags); + if (ret) + return ret; sb->s_flags = tmp_flags; /* set up enough so that it can read an inode */ root_inode = affs_iget(sb, root_block); - if (IS_ERR(root_inode)) { - ret = PTR_ERR(root_inode); - goto out_error; - } + if (IS_ERR(root_inode)) + return PTR_ERR(root_inode); if (AFFS_SB(sb)->s_flags & SF_INTL) sb->s_d_op = &affs_intl_dentry_operations; @@ -513,22 +505,11 @@ got_root: sb->s_root = d_make_root(root_inode); if (!sb->s_root) { printk(KERN_ERR "AFFS: Get root inode failed\n"); - goto out_error; + return -ENOMEM; } pr_debug("AFFS: s_flags=%lX\n",sb->s_flags); return 0; - - /* - * Begin the cascaded cleanup ... - */ -out_error: - kfree(sbi->s_bitmap); - affs_brelse(root_bh); - kfree(sbi->s_prefix); - kfree(sbi); - sb->s_fs_info = NULL; - return ret; } static int @@ -615,11 +596,23 @@ static struct dentry *affs_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); } +static void affs_kill_sb(struct super_block *sb) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + kill_block_super(sb); + if (sbi) { + affs_free_bitmap(sb); + affs_brelse(sbi->s_root_bh); + kfree(sbi->s_prefix); + kfree(sbi); + } +} + static struct file_system_type affs_fs_type = { .owner = THIS_MODULE, .name = "affs", .mount = affs_mount, - .kill_sb = kill_block_super, + .kill_sb = affs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("affs"); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a306bb6d88d..6621f800812 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -195,7 +195,6 @@ struct afs_cell { struct list_head link; /* main cell list link */ struct key *anonymous_key; /* anonymous user key for this cell */ struct list_head proc_link; /* /proc cell list link */ - struct proc_dir_entry *proc_dir; /* /proc dir for this cell */ #ifdef CONFIG_AFS_FSCACHE struct fscache_cookie *cache; /* caching cookie */ #endif diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 526e4bbbde5..bddc5120ed4 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -41,11 +41,8 @@ static const struct file_operations afs_proc_cells_fops = { .write = afs_proc_cells_write, .llseek = seq_lseek, .release = seq_release, - .owner = THIS_MODULE, }; -static int afs_proc_rootcell_open(struct inode *inode, struct file *file); -static int afs_proc_rootcell_release(struct inode *inode, struct file *file); static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos); static ssize_t afs_proc_rootcell_write(struct file *file, @@ -53,17 +50,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file, size_t size, loff_t *_pos); static const struct file_operations afs_proc_rootcell_fops = { - .open = afs_proc_rootcell_open, .read = afs_proc_rootcell_read, .write = afs_proc_rootcell_write, .llseek = no_llseek, - .release = afs_proc_rootcell_release, - .owner = THIS_MODULE, }; static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); -static int afs_proc_cell_volumes_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v, loff_t *pos); @@ -81,14 +73,11 @@ static const struct file_operations afs_proc_cell_volumes_fops = { .open = afs_proc_cell_volumes_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_volumes_release, - .owner = THIS_MODULE, + .release = seq_release, }; static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file); -static int afs_proc_cell_vlservers_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v, loff_t *pos); @@ -106,13 +95,10 @@ static const struct file_operations afs_proc_cell_vlservers_fops = { .open = afs_proc_cell_vlservers_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_vlservers_release, - .owner = THIS_MODULE, + .release = seq_release, }; static int afs_proc_cell_servers_open(struct inode *inode, struct file *file); -static int afs_proc_cell_servers_release(struct inode *inode, - struct file *file); static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos); static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, loff_t *pos); @@ -130,8 +116,7 @@ static const struct file_operations afs_proc_cell_servers_fops = { .open = afs_proc_cell_servers_open, .read = seq_read, .llseek = seq_lseek, - .release = afs_proc_cell_servers_release, - .owner = THIS_MODULE, + .release = seq_release, }; /* @@ -139,29 +124,21 @@ static const struct file_operations afs_proc_cell_servers_fops = { */ int afs_proc_init(void) { - struct proc_dir_entry *p; - _enter(""); proc_afs = proc_mkdir("fs/afs", NULL); if (!proc_afs) goto error_dir; - p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); - if (!p) - goto error_cells; - - p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops); - if (!p) - goto error_rootcell; + if (!proc_create("cells", 0, proc_afs, &afs_proc_cells_fops) || + !proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops)) + goto error_tree; _leave(" = 0"); return 0; -error_rootcell: - remove_proc_entry("cells", proc_afs); -error_cells: - remove_proc_entry("fs/afs", NULL); +error_tree: + remove_proc_subtree("fs/afs", NULL); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -172,9 +149,7 @@ error_dir: */ void afs_proc_cleanup(void) { - remove_proc_entry("rootcell", proc_afs); - remove_proc_entry("cells", proc_afs); - remove_proc_entry("fs/afs", NULL); + remove_proc_subtree("fs/afs", NULL); } /* @@ -319,19 +294,6 @@ inval: goto done; } -/* - * Stubs for /proc/fs/afs/rootcell - */ -static int afs_proc_rootcell_open(struct inode *inode, struct file *file) -{ - return 0; -} - -static int afs_proc_rootcell_release(struct inode *inode, struct file *file) -{ - return 0; -} - static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf, size_t size, loff_t *_pos) { @@ -387,38 +349,27 @@ nomem: */ int afs_proc_cell_setup(struct afs_cell *cell) { - struct proc_dir_entry *p; + struct proc_dir_entry *dir; _enter("%p{%s}", cell, cell->name); - cell->proc_dir = proc_mkdir(cell->name, proc_afs); - if (!cell->proc_dir) + dir = proc_mkdir(cell->name, proc_afs); + if (!dir) goto error_dir; - p = proc_create_data("servers", 0, cell->proc_dir, - &afs_proc_cell_servers_fops, cell); - if (!p) - goto error_servers; - - p = proc_create_data("vlservers", 0, cell->proc_dir, - &afs_proc_cell_vlservers_fops, cell); - if (!p) - goto error_vlservers; - - p = proc_create_data("volumes", 0, cell->proc_dir, - &afs_proc_cell_volumes_fops, cell); - if (!p) - goto error_volumes; + if (!proc_create_data("servers", 0, dir, + &afs_proc_cell_servers_fops, cell) || + !proc_create_data("vlservers", 0, dir, + &afs_proc_cell_vlservers_fops, cell) || + !proc_create_data("volumes", 0, dir, + &afs_proc_cell_volumes_fops, cell)) + goto error_tree; _leave(" = 0"); return 0; -error_volumes: - remove_proc_entry("vlservers", cell->proc_dir); -error_vlservers: - remove_proc_entry("servers", cell->proc_dir); -error_servers: - remove_proc_entry(cell->name, proc_afs); +error_tree: + remove_proc_subtree(cell->name, proc_afs); error_dir: _leave(" = -ENOMEM"); return -ENOMEM; @@ -431,10 +382,7 @@ void afs_proc_cell_remove(struct afs_cell *cell) { _enter(""); - remove_proc_entry("volumes", cell->proc_dir); - remove_proc_entry("vlservers", cell->proc_dir); - remove_proc_entry("servers", cell->proc_dir); - remove_proc_entry(cell->name, proc_afs); + remove_proc_subtree(cell->name, proc_afs); _leave(""); } @@ -463,14 +411,6 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file) } /* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ @@ -569,15 +509,6 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file) } /* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_vlservers_release(struct inode *inode, - struct file *file) -{ - return seq_release(inode, file); -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ @@ -673,15 +604,6 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file) } /* - * close the file and release the ref to the cell - */ -static int afs_proc_cell_servers_release(struct inode *inode, - struct file *file) -{ - return seq_release(inode, file); -} - -/* * set up the iterator to start reading from the cells list and return the * first item */ @@ -244,9 +244,14 @@ static void aio_free_ring(struct kioctx *ctx) int i; for (i = 0; i < ctx->nr_pages; i++) { + struct page *page; pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, page_count(ctx->ring_pages[i])); - put_page(ctx->ring_pages[i]); + page = ctx->ring_pages[i]; + if (!page) + continue; + ctx->ring_pages[i] = NULL; + put_page(page); } put_aio_ring_file(ctx); @@ -280,18 +285,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, unsigned long flags; int rc; + rc = 0; + + /* Make sure the old page hasn't already been changed */ + spin_lock(&mapping->private_lock); + ctx = mapping->private_data; + if (ctx) { + pgoff_t idx; + spin_lock_irqsave(&ctx->completion_lock, flags); + idx = old->index; + if (idx < (pgoff_t)ctx->nr_pages) { + if (ctx->ring_pages[idx] != old) + rc = -EAGAIN; + } else + rc = -EINVAL; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else + rc = -EINVAL; + spin_unlock(&mapping->private_lock); + + if (rc != 0) + return rc; + /* Writeback must be complete */ BUG_ON(PageWriteback(old)); - put_page(old); + get_page(new); - rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { - get_page(old); + put_page(new); return rc; } - get_page(new); - /* We can potentially race against kioctx teardown here. Use the * address_space's private data lock to protect the mapping's * private_data. @@ -303,13 +328,24 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, spin_lock_irqsave(&ctx->completion_lock, flags); migrate_page_copy(new, old); idx = old->index; - if (idx < (pgoff_t)ctx->nr_pages) - ctx->ring_pages[idx] = new; + if (idx < (pgoff_t)ctx->nr_pages) { + /* And only do the move if things haven't changed */ + if (ctx->ring_pages[idx] == old) + ctx->ring_pages[idx] = new; + else + rc = -EAGAIN; + } else + rc = -EINVAL; spin_unlock_irqrestore(&ctx->completion_lock, flags); } else rc = -EBUSY; spin_unlock(&mapping->private_lock); + if (rc == MIGRATEPAGE_SUCCESS) + put_page(old); + else + put_page(new); + return rc; } #endif @@ -326,7 +362,7 @@ static int aio_setup_ring(struct kioctx *ctx) struct aio_ring *ring; unsigned nr_events = ctx->max_reqs; struct mm_struct *mm = current->mm; - unsigned long size, populate; + unsigned long size, unused; int nr_pages; int i; struct file *file; @@ -347,6 +383,20 @@ static int aio_setup_ring(struct kioctx *ctx) return -EAGAIN; } + ctx->aio_ring_file = file; + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) + / sizeof(struct io_event); + + ctx->ring_pages = ctx->internal_pages; + if (nr_pages > AIO_RING_PAGES) { + ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) { + put_aio_ring_file(ctx); + return -ENOMEM; + } + } + for (i = 0; i < nr_pages; i++) { struct page *page; page = find_or_create_page(file->f_inode->i_mapping, @@ -358,19 +408,14 @@ static int aio_setup_ring(struct kioctx *ctx) SetPageUptodate(page); SetPageDirty(page); unlock_page(page); + + ctx->ring_pages[i] = page; } - ctx->aio_ring_file = file; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) - / sizeof(struct io_event); + ctx->nr_pages = i; - ctx->ring_pages = ctx->internal_pages; - if (nr_pages > AIO_RING_PAGES) { - ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_KERNEL); - if (!ctx->ring_pages) { - put_aio_ring_file(ctx); - return -ENOMEM; - } + if (unlikely(i != nr_pages)) { + aio_free_ring(ctx); + return -EAGAIN; } ctx->mmap_size = nr_pages * PAGE_SIZE; @@ -379,9 +424,9 @@ static int aio_setup_ring(struct kioctx *ctx) down_write(&mm->mmap_sem); ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, 0, &populate); + MAP_SHARED, 0, &unused); + up_write(&mm->mmap_sem); if (IS_ERR((void *)ctx->mmap_base)) { - up_write(&mm->mmap_sem); ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; @@ -389,27 +434,6 @@ static int aio_setup_ring(struct kioctx *ctx) pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); - /* We must do this while still holding mmap_sem for write, as we - * need to be protected against userspace attempting to mremap() - * or munmap() the ring buffer. - */ - ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, - 1, 0, ctx->ring_pages, NULL); - - /* Dropping the reference here is safe as the page cache will hold - * onto the pages for us. It is also required so that page migration - * can unmap the pages and get the right reference count. - */ - for (i = 0; i < ctx->nr_pages; i++) - put_page(ctx->ring_pages[i]); - - up_write(&mm->mmap_sem); - - if (unlikely(ctx->nr_pages != nr_pages)) { - aio_free_ring(ctx); - return -EAGAIN; - } - ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ @@ -652,7 +676,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); - percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */ err = ioctx_add_table(ctx, mm); if (err) diff --git a/fs/attr.c b/fs/attr.c index 267968d9467..5d4e59d56e8 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -202,11 +202,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return -EPERM; } - if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) { - if (attr->ia_size != inode->i_size) - inode_inc_iversion(inode); - } - if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 4218e26df91..acf32054edd 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -104,7 +104,7 @@ struct autofs_sb_info { u32 magic; int pipefd; struct file *pipe; - pid_t oz_pgrp; + struct pid *oz_pgrp; int catatonic; int version; int sub_version; @@ -140,7 +140,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) filesystem without "magic".) */ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { - return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp; + return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } /* Does a dentry have some pending activity? */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 1818ce7f5a0..3182c0e68b4 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -346,6 +346,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, { int pipefd; int err = 0; + struct pid *new_pid = NULL; if (param->setpipefd.pipefd == -1) return -EINVAL; @@ -357,7 +358,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { - struct file *pipe = fget(pipefd); + struct file *pipe; + + new_pid = get_task_pid(current, PIDTYPE_PGID); + + if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { + AUTOFS_WARN("Not allowed to change PID namespace"); + err = -EINVAL; + goto out; + } + + pipe = fget(pipefd); if (!pipe) { err = -EBADF; goto out; @@ -367,12 +378,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, fput(pipe); goto out; } - sbi->oz_pgrp = task_pgrp_nr(current); + swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; } out: + put_pid(new_pid); mutex_unlock(&sbi->wq_mutex); return err; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 3d9d3f5d5dd..394e90b02c5 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -402,6 +402,20 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, goto next; } + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { + DPRINTK("checking symlink %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + /* + * A symlink can't be "busy" in the usual sense so + * just check last used for expire timeout. + */ + if (autofs4_can_expire(dentry, timeout, do_now)) { + expired = dentry; + goto found; + } + goto next; + } + if (simple_empty(dentry)) goto next; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 3b9cc9b973c..d7bd395ab58 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -56,8 +56,11 @@ void autofs4_kill_sb(struct super_block *sb) * just call kill_anon_super when we are called from * deactivate_super. */ - if (sbi) /* Free wait queues, close pipe */ + if (sbi) { + /* Free wait queues, close pipe */ autofs4_catatonic_mode(sbi); + put_pid(sbi->oz_pgrp); + } DPRINTK("shutting down"); kill_litter_super(sb); @@ -80,7 +83,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, root_inode->i_gid)); - seq_printf(m, ",pgrp=%d", sbi->oz_pgrp); + seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); @@ -124,7 +127,8 @@ static const match_table_t tokens = { }; static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, - pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto) + int *pgrp, bool *pgrp_set, unsigned int *type, + int *minproto, int *maxproto) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -132,7 +136,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, *uid = current_uid(); *gid = current_gid(); - *pgrp = task_pgrp_nr(current); *minproto = AUTOFS_MIN_PROTO_VERSION; *maxproto = AUTOFS_MAX_PROTO_VERSION; @@ -171,6 +174,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, if (match_int(args, &option)) return 1; *pgrp = option; + *pgrp_set = true; break; case Opt_minproto: if (match_int(args, &option)) @@ -206,10 +210,13 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; + int pgrp; + bool pgrp_set = false; + int ret = -EINVAL; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - goto fail_unlock; + return -ENOMEM; DPRINTK("starting up, sbi = %p",sbi); s->s_fs_info = sbi; @@ -218,7 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->pipe = NULL; sbi->catatonic = 1; sbi->exp_timeout = 0; - sbi->oz_pgrp = task_pgrp_nr(current); + sbi->oz_pgrp = NULL; sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; @@ -243,8 +250,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) * Get the root inode and dentry, but defer checking for errors. */ ino = autofs4_new_ino(sbi); - if (!ino) + if (!ino) { + ret = -ENOMEM; goto fail_free; + } root_inode = autofs4_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); if (!root) @@ -255,12 +264,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) /* Can this call block? */ if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &sbi->oz_pgrp, &sbi->type, &sbi->min_proto, - &sbi->max_proto)) { + &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, + &sbi->max_proto)) { printk("autofs: called with bogus options\n"); goto fail_dput; } + if (pgrp_set) { + sbi->oz_pgrp = find_get_pid(pgrp); + if (!sbi->oz_pgrp) { + pr_warn("autofs: could not find process group %d\n", + pgrp); + goto fail_dput; + } + } else { + sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + } + if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); @@ -284,14 +304,15 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->version = sbi->max_proto; sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp); + DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); - + if (!pipe) { printk("autofs: could not open pipe file descriptor\n"); goto fail_dput; } - if (autofs_prepare_pipe(pipe) < 0) + ret = autofs_prepare_pipe(pipe); + if (ret < 0) goto fail_fput; sbi->pipe = pipe; sbi->pipefd = pipefd; @@ -316,10 +337,10 @@ fail_dput: fail_ino: kfree(ino); fail_free: + put_pid(sbi->oz_pgrp); kfree(sbi); s->s_fs_info = NULL; -fail_unlock: - return -EINVAL; + return ret; } struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 92ef341ba0c..2caf36ac3e9 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -558,7 +558,7 @@ static int autofs4_dir_symlink(struct inode *dir, dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); dir->i_mtime = CURRENT_TIME; @@ -593,7 +593,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_dec(&p_ino->count); } dput(ino->dentry); @@ -732,7 +732,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m dget(dentry); atomic_inc(&ino->count); p_ino = autofs4_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) + if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); inc_nlink(dir); dir->i_mtime = CURRENT_TIME; diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index f27c094a191..1e8ea192be2 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -14,6 +14,10 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) { + struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); + if (ino && !autofs4_oz_mode(sbi)) + ino->last_used = jiffies; nd_set_link(nd, dentry->d_inode->i_private); return NULL; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 689e40d983a..116fd38ee47 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -347,11 +347,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, struct qstr qstr; char *name; int status, ret, type; + pid_t pid; + pid_t tgid; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) return -ENOENT; + /* + * Try translating pids to the namespace of the daemon. + * + * Zero means failure: we are in an unrelated pid namespace. + */ + pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + if (pid == 0 || tgid == 0) + return -ENOENT; + if (!dentry->d_inode) { /* * A wait for a negative dentry is invalid for certain @@ -417,8 +429,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->ino = autofs4_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); - wq->pid = current->pid; - wq->tgid = current->tgid; + wq->pid = pid; + wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ wq->wait_ctr = 2; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index daa15d6ba45..845d2d690ce 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -324,8 +324,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); inode = iget_locked(sb, ino); - if (IS_ERR(inode)) - return inode; + if (!inode) + return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 571a4232690..67be2951b98 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -543,9 +543,6 @@ out: * libraries. There is no binary dependent code anywhere else. */ -#define INTERPRETER_NONE 0 -#define INTERPRETER_ELF 2 - #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0890c83643e..ff9b3995d45 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -35,13 +35,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; - if (!IS_POSIXACL(inode)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -76,31 +69,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return acl; } -static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int ret = 0; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - acl = btrfs_get_acl(dentry->d_inode, type); - - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return ret; -} - /* * Needs to be called with fs_mutex held */ -static int btrfs_set_acl(struct btrfs_trans_handle *trans, +static int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; @@ -158,35 +130,9 @@ out: return ret; } -static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - int ret; - struct posix_acl *acl = NULL; - - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (!IS_POSIXACL(dentry->d_inode)) - return -EOPNOTSUPP; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto out; - } - } - - ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); -out: - posix_acl_release(acl); - - return ret; + return __btrfs_set_acl(NULL, inode, acl, type); } /* @@ -197,83 +143,31 @@ out: int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; int ret = 0; /* this happens with subvols */ if (!dir) return 0; - if (!S_ISLNK(inode->i_mode)) { - if (IS_POSIXACL(dir)) { - acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } + ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (ret) + return ret; - if (!acl) - inode->i_mode &= ~current_umask(); + if (default_acl) { + ret = __btrfs_set_acl(trans, inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); } - if (IS_POSIXACL(dir) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = btrfs_set_acl(trans, inode, acl, - ACL_TYPE_DEFAULT); - if (ret) - goto failed; - } - ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (ret < 0) - return ret; - - if (ret > 0) { - /* we need an acl */ - ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); - } else if (ret < 0) { - cache_no_acl(inode); - } - } else { - cache_no_acl(inode); + if (acl) { + if (!ret) + ret = __btrfs_set_acl(trans, inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); } -failed: - posix_acl_release(acl); - - return ret; -} -int btrfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int ret = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!IS_POSIXACL(inode)) - return 0; - - acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR_OR_NULL(acl)) - return PTR_ERR(acl); - - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (ret) - return ret; - ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS); - posix_acl_release(acl); + if (!default_acl && !acl) + cache_no_acl(inode); return ret; } - -const struct xattr_handler btrfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; - -const struct xattr_handler btrfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = btrfs_xattr_acl_get, - .set = btrfs_xattr_acl_set, -}; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 54ab86127f7..7506825211a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3899,20 +3899,17 @@ do { \ /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); +int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); -int btrfs_acl_chmod(struct inode *inode); #else #define btrfs_get_acl NULL +#define btrfs_set_acl NULL static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { return 0; } -static inline int btrfs_acl_chmod(struct inode *inode) -{ - return 0; -} #endif /* relocation.c */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 45d98d01028..9c01509dd8a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -767,20 +767,19 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - if (metadata) { - key.objectid = bytenr; - key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = offset; - } else { - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = offset; - } - if (!trans) { path->skip_locking = 1; path->search_commit_root = 1; } + +search_again: + key.objectid = bytenr; + key.offset = offset; + if (metadata) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + again: ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 0); @@ -788,7 +787,6 @@ again: goto out_free; if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { - metadata = 0; if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -855,7 +853,7 @@ again: mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - goto again; + goto search_again; } if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1a77449d03..514b291b135 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4354,8 +4354,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) - inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize != oldsize) { + inode_inc_iversion(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) + inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + } if (newsize > oldsize) { truncate_pagecache(inode, newsize); @@ -4464,7 +4468,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = btrfs_acl_chmod(inode); + err = posix_acl_chmod(inode, inode->i_mode); } return err; @@ -8649,12 +8653,14 @@ static const struct inode_operations btrfs_dir_inode_operations = { .removexattr = btrfs_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; @@ -8724,6 +8730,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_special_inode_operations = { @@ -8735,6 +8742,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, + .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { @@ -8748,7 +8756,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getxattr = btrfs_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, - .get_acl = btrfs_get_acl, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a111622598b..ad27dcea319 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2121,7 +2121,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); if (err == -EINTR) - goto out; + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2284,6 +2284,7 @@ out_dput: dput(dentry); out_unlock_dir: mutex_unlock(&dir->i_mutex); +out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); @@ -2685,14 +2686,11 @@ out_unlock: #define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) static long btrfs_ioctl_file_extent_same(struct file *file, - void __user *argp) + struct btrfs_ioctl_same_args __user *argp) { - struct btrfs_ioctl_same_args tmp; struct btrfs_ioctl_same_args *same; struct btrfs_ioctl_same_extent_info *info; - struct inode *src = file->f_dentry->d_inode; - struct file *dst_file = NULL; - struct inode *dst; + struct inode *src = file_inode(file); u64 off; u64 len; int i; @@ -2700,6 +2698,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, unsigned long size; u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; bool is_admin = capable(CAP_SYS_ADMIN); + u16 count; if (!(file->f_mode & FMODE_READ)) return -EINVAL; @@ -2708,17 +2707,14 @@ static long btrfs_ioctl_file_extent_same(struct file *file, if (ret) return ret; - if (copy_from_user(&tmp, - (struct btrfs_ioctl_same_args __user *)argp, - sizeof(tmp))) { + if (get_user(count, &argp->dest_count)) { ret = -EFAULT; goto out; } - size = sizeof(tmp) + - tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info); + size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); - same = memdup_user((struct btrfs_ioctl_same_args __user *)argp, size); + same = memdup_user(argp, size); if (IS_ERR(same)) { ret = PTR_ERR(same); @@ -2755,52 +2751,35 @@ static long btrfs_ioctl_file_extent_same(struct file *file, goto out; /* pre-format output fields to sane values */ - for (i = 0; i < same->dest_count; i++) { + for (i = 0; i < count; i++) { same->info[i].bytes_deduped = 0ULL; same->info[i].status = 0; } - ret = 0; - for (i = 0; i < same->dest_count; i++) { - info = &same->info[i]; - - dst_file = fget(info->fd); - if (!dst_file) { + for (i = 0, info = same->info; i < count; i++, info++) { + struct inode *dst; + struct fd dst_file = fdget(info->fd); + if (!dst_file.file) { info->status = -EBADF; - goto next; + continue; } + dst = file_inode(dst_file.file); - if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { + if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { info->status = -EINVAL; - goto next; - } - - info->status = -EXDEV; - if (file->f_path.mnt != dst_file->f_path.mnt) - goto next; - - dst = dst_file->f_dentry->d_inode; - if (src->i_sb != dst->i_sb) - goto next; - - if (S_ISDIR(dst->i_mode)) { + } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { + info->status = -EXDEV; + } else if (S_ISDIR(dst->i_mode)) { info->status = -EISDIR; - goto next; - } - - if (!S_ISREG(dst->i_mode)) { + } else if (!S_ISREG(dst->i_mode)) { info->status = -EACCES; - goto next; + } else { + info->status = btrfs_extent_same(src, off, len, dst, + info->logical_offset); + if (info->status == 0) + info->bytes_deduped += len; } - - info->status = btrfs_extent_same(src, off, len, dst, - info->logical_offset); - if (info->status == 0) - info->bytes_deduped += len; - -next: - if (dst_file) - fput(dst_file); + fdput(dst_file); } ret = copy_to_user(argp, same, size); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ce459a7cb16..429c73c374b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -571,7 +571,9 @@ static int is_cowonly_root(u64 root_objectid) root_objectid == BTRFS_CHUNK_TREE_OBJECTID || root_objectid == BTRFS_DEV_TREE_OBJECTID || root_objectid == BTRFS_TREE_LOG_OBJECTID || - root_objectid == BTRFS_CSUM_TREE_OBJECTID) + root_objectid == BTRFS_CSUM_TREE_OBJECTID || + root_objectid == BTRFS_UUID_TREE_OBJECTID || + root_objectid == BTRFS_QUOTA_TREE_OBJECTID) return 1; return 0; } @@ -1264,10 +1266,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) } /* - * helper to update/delete the 'address of tree root -> reloc tree' + * helper to delete the 'address of tree root -> reloc tree' * mapping */ -static int __update_reloc_root(struct btrfs_root *root, int del) +static void __del_reloc_root(struct btrfs_root *root) { struct rb_node *rb_node; struct mapping_node *node = NULL; @@ -1275,7 +1277,7 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); + root->node->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1283,23 +1285,45 @@ static int __update_reloc_root(struct btrfs_root *root, int del) spin_unlock(&rc->reloc_root_tree.lock); if (!node) - return 0; + return; BUG_ON((struct btrfs_root *)node->data != root); - if (!del) { - spin_lock(&rc->reloc_root_tree.lock); - node->bytenr = root->node->start; - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); - spin_unlock(&rc->reloc_root_tree.lock); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, node->bytenr); - } else { - spin_lock(&root->fs_info->trans_lock); - list_del_init(&root->root_list); - spin_unlock(&root->fs_info->trans_lock); - kfree(node); + spin_lock(&root->fs_info->trans_lock); + list_del_init(&root->root_list); + spin_unlock(&root->fs_info->trans_lock); + kfree(node); +} + +/* + * helper to update the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); } + spin_unlock(&rc->reloc_root_tree.lock); + + if (!node) + return 0; + BUG_ON((struct btrfs_root *)node->data != root); + + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = new_bytenr; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + if (rb_node) + backref_tree_panic(rb_node, -EEXIST, node->bytenr); return 0; } @@ -1420,7 +1444,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, { struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; - int del = 0; int ret; if (!root->reloc_root) @@ -1432,11 +1455,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, if (root->fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { root->reloc_root = NULL; - del = 1; + __del_reloc_root(reloc_root); } - __update_reloc_root(reloc_root, del); - if (reloc_root->commit_root != reloc_root->node) { btrfs_set_root_node(root_item, reloc_root->node); free_extent_buffer(reloc_root->commit_root); @@ -2287,7 +2308,7 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); - __update_reloc_root(reloc_root, 1); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); kfree(reloc_root); @@ -2332,7 +2353,7 @@ again: ret = merge_reloc_root(rc, root); if (ret) { - __update_reloc_root(reloc_root, 1); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); kfree(reloc_root); @@ -2388,6 +2409,13 @@ out: btrfs_std_error(root->fs_info, ret); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); + + /* new reloc root may be added */ + mutex_lock(&root->fs_info->reloc_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->reloc_mutex); + if (!list_empty(&reloc_roots)) + free_reloc_roots(&reloc_roots); } BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); @@ -4522,6 +4550,11 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(rc->stage == UPDATE_DATA_PTRS && root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (buf == root->node) + __update_reloc_root(root, cow->start); + } + level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item)) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6837fe87f3a..945d1db98f2 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4723,8 +4723,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } if (!access_ok(VERIFY_READ, arg->clone_sources, - sizeof(*arg->clone_sources * - arg->clone_sources_count))) { + sizeof(*arg->clone_sources) * + arg->clone_sources_count)) { ret = -EFAULT; goto out; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2d8ac1bf0cf..d71a11d13df 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -432,7 +432,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } else { printk(KERN_INFO "btrfs: setting nodatacow\n"); } - info->compress_type = BTRFS_COMPRESS_NONE; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); btrfs_set_opt(info->mount_opt, NODATACOW); @@ -461,7 +460,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_fs_incompat(info, COMPRESS_LZO); } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; - info->compress_type = BTRFS_COMPRESS_NONE; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; @@ -474,9 +472,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); pr_info("btrfs: force %s compression\n", compress_type); - } else + } else if (btrfs_test_opt(root, COMPRESS)) { pr_info("btrfs: use %s compression\n", compress_type); + } break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 6fc82010dc1..c8d9ddf84c6 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -101,7 +101,7 @@ static int test_extents(struct btrfs_block_group_cache *cache) ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); if (ret) { - test_msg("Error removing middle peice %d\n", ret); + test_msg("Error removing middle piece %d\n", ret); return ret; } @@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) } if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { - test_msg("Left over peices after removing overlapping\n"); + test_msg("Left over pieces after removing overlapping\n"); return -1; } diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 05740b9789e..3d1c301c926 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -22,6 +22,7 @@ #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -313,8 +314,8 @@ err: */ const struct xattr_handler *btrfs_xattr_handlers[] = { #ifdef CONFIG_BTRFS_FS_POSIX_ACL - &btrfs_xattr_acl_access_handler, - &btrfs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL, }; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index b3cc8039134..5049608d138 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -21,8 +21,6 @@ #include <linux/xattr.h> -extern const struct xattr_handler btrfs_xattr_acl_access_handler; -extern const struct xattr_handler btrfs_xattr_acl_default_handler; extern const struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index ac9a2ef5bb9..264e9bf83ff 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -25,3 +25,16 @@ config CEPH_FSCACHE caching support for Ceph clients using FS-Cache endif + +config CEPH_FS_POSIX_ACL + bool "Ceph POSIX Access Control Lists" + depends on CEPH_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 32e30106a2f..85a4230b9bf 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -10,3 +10,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ debugfs.o ceph-$(CONFIG_CEPH_FSCACHE) += cache.o +ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c new file mode 100644 index 00000000000..66d377a12f7 --- /dev/null +++ b/fs/ceph/acl.c @@ -0,0 +1,230 @@ +/* + * linux/fs/ceph/acl.c + * + * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include "super.h" + +static inline void ceph_set_cached_acl(struct inode *inode, + int type, struct posix_acl *acl) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + set_cached_acl(inode, type, acl); + spin_unlock(&ci->i_ceph_lock); +} + +static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, + int type) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct posix_acl *acl = ACL_NOT_CACHED; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + acl = get_cached_acl(inode, type); + spin_unlock(&ci->i_ceph_lock); + + return acl; +} + +void ceph_forget_all_cached_acls(struct inode *inode) +{ + forget_all_cached_acls(inode); +} + +struct posix_acl *ceph_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + if (!IS_POSIXACL(inode)) + return NULL; + + acl = ceph_get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __ceph_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __ceph_getxattr(inode, name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ERANGE || size == -ENODATA || size == 0) + acl = NULL; + else + acl = ERR_PTR(-EIO); + + kfree(value); + + if (!IS_ERR(acl)) + ceph_set_cached_acl(inode, type, acl); + + return acl; +} + +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret = 0, size = 0; + const char *name = NULL; + char *value = NULL; + struct iattr newattrs; + umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + struct dentry *dentry = d_find_alias(inode); + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + goto out; + } + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &new_mode); + if (ret < 0) + goto out; + if (ret == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + ret = acl ? -EINVAL : 0; + goto out; + } + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + ret = -EINVAL; + goto out; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) + goto out_free; + } + + if (new_mode != old_mode) { + newattrs.ia_mode = new_mode; + newattrs.ia_valid = ATTR_MODE; + ret = ceph_setattr(dentry, &newattrs); + if (ret) + goto out_free; + } + + if (value) + ret = __ceph_setxattr(dentry, name, value, size, 0); + else + ret = __ceph_removexattr(dentry, name); + + if (ret) { + if (new_mode != old_mode) { + newattrs.ia_mode = old_mode; + newattrs.ia_valid = ATTR_MODE; + ceph_setattr(dentry, &newattrs); + } + goto out_free; + } + + ceph_set_cached_acl(inode, type, acl); + +out_free: + kfree(value); +out: + return ret; +} + +int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int ret = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (IS_POSIXACL(dir)) { + acl = ceph_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + ret = PTR_ERR(acl); + goto out; + } + } + + if (!acl) + inode->i_mode &= ~current_umask(); + } + + if (IS_POSIXACL(dir) && acl) { + if (S_ISDIR(inode->i_mode)) { + ret = ceph_set_acl(inode, acl, ACL_TYPE_DEFAULT); + if (ret) + goto out_release; + } + ret = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + if (ret < 0) + goto out; + else if (ret > 0) + ret = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); + else + cache_no_acl(inode); + } else { + cache_no_acl(inode); + } + +out_release: + posix_acl_release(acl); +out: + return ret; +} diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1e561c05953..b53278c9fd9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -209,10 +209,15 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = 0; if (err < 0) { SetPageError(page); + ceph_fscache_readpage_cancel(inode, page); goto out; - } else if (err < PAGE_CACHE_SIZE) { + } else { + if (err < PAGE_CACHE_SIZE) { /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_CACHE_SIZE); + zero_user_segment(page, err, PAGE_CACHE_SIZE); + } else { + flush_dcache_page(page); + } } SetPageUptodate(page); @@ -252,6 +257,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; + if (rc < 0) + goto unlock; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; @@ -262,6 +269,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) flush_dcache_page(page); SetPageUptodate(page); ceph_readpage_to_fscache(inode, page); +unlock: unlock_page(page); page_cache_release(page); bytes -= PAGE_CACHE_SIZE; @@ -1203,6 +1211,41 @@ const struct address_space_operations ceph_aops = { /* * vm ops */ +static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; + int want, got, ret; + + dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", + inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); + + ret = filemap_fault(vma, vmf); + + dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + + return ret; +} /* * Reuse write_begin here for simplicity. @@ -1210,23 +1253,41 @@ const struct address_space_operations ceph_aops = { static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); - struct page *page = vmf->page; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct page *page = vmf->page; loff_t off = page_offset(page); - loff_t size, len; - int ret; - - /* Update time before taking page lock */ - file_update_time(vma->vm_file); + loff_t size = i_size_read(inode); + size_t len; + int want, got, ret; - size = i_size_read(inode); if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; else len = size & ~PAGE_CACHE_MASK; - dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, - off, len, page, page->index); + dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", + inode, ceph_vinop(inode), off, len, size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", + inode, off, len, ceph_cap_string(got)); + + /* Update time before taking page lock */ + file_update_time(vma->vm_file); lock_page(page); @@ -1248,14 +1309,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); - if (ret != VM_FAULT_LOCKED) + if (ret != VM_FAULT_LOCKED) { unlock_page(page); + } else { + int dirty; + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, len, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + return ret; } static struct vm_operations_struct ceph_vmops = { - .fault = filemap_fault, + .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index ba949408a33..da95f61b7a0 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -67,6 +67,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return fscache_maybe_release_page(ci->fscache, page, gfp); } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) + __fscache_uncache_page(ci->fscache, page); +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { @@ -145,6 +153,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) return 1; } +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + static inline void ceph_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3c0a4bd7499..17543383545 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -555,21 +555,34 @@ retry: cap->ci = ci; __insert_cap_node(ci, cap); - /* clear out old exporting info? (i.e. on cap import) */ - if (ci->i_cap_exporting_mds == mds) { - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; - } - /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; spin_unlock(&session->s_cap_lock); - } else if (new_cap) - ceph_put_cap(mdsc, new_cap); + } else { + if (new_cap) + ceph_put_cap(mdsc, new_cap); + + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } + } if (!ci->i_snap_realm) { /* @@ -611,15 +624,9 @@ retry: if (ci->i_auth_cap == NULL || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) ci->i_auth_cap = cap; - } else if (ci->i_auth_cap == cap) { - ci->i_auth_cap = NULL; - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); - } - spin_unlock(&mdsc->cap_dirty_lock); + ci->i_cap_exporting_issued = 0; + } else { + WARN_ON(ci->i_auth_cap == cap); } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", @@ -628,7 +635,7 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - if (mseq > cap->mseq) + if (ceph_seq_cmp(mseq, cap->mseq) > 0) cap->mds_wanted = wanted; else cap->mds_wanted |= wanted; @@ -816,7 +823,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (cap != ocap && __cap_is_valid(cap) && + if (cap != ocap && (cap->implemented & ~cap->issued & mask)) return 1; } @@ -888,7 +895,19 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) */ static int __ceph_is_any_caps(struct ceph_inode_info *ci) { - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; + return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued; +} + +int ceph_is_any_caps(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_is_any_caps(ci); + spin_unlock(&ci->i_ceph_lock); + + return ret; } /* @@ -1383,13 +1402,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ci->i_snap_realm->cached_context); dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); - if (ci->i_auth_cap) - list_add(&ci->i_dirty_item, &mdsc->cap_dirty); - else - list_add(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + list_add(&ci->i_dirty_item, &mdsc->cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); @@ -1735,13 +1751,12 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, - unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, unsigned *flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int unlock_session = session ? 0 : 1; int flushing = 0; + struct ceph_mds_session *session = NULL; retry: spin_lock(&ci->i_ceph_lock); @@ -1755,13 +1770,14 @@ retry: int want = __ceph_caps_wanted(ci); int delayed; - if (!session) { + if (!session || session != cap->session) { spin_unlock(&ci->i_ceph_lock); + if (session) + mutex_unlock(&session->s_mutex); session = cap->session; mutex_lock(&session->s_mutex); goto retry; } - BUG_ON(session != cap->session); if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; @@ -1780,7 +1796,7 @@ retry: out: spin_unlock(&ci->i_ceph_lock); out_unlocked: - if (session && unlock_session) + if (session) mutex_unlock(&session->s_mutex); return flushing; } @@ -1865,7 +1881,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; mutex_lock(&inode->i_mutex); - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); /* @@ -1900,7 +1916,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) dout("write_inode %p wait=%d\n", inode, wait); if (wait) { - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); @@ -2350,11 +2366,11 @@ static void invalidate_aliases(struct inode *inode) d_prune_aliases(inode); /* * For non-directory inode, d_find_alias() only returns - * connected dentry. After calling d_invalidate(), the - * dentry become disconnected. + * hashed dentry. After calling d_invalidate(), the + * dentry becomes unhashed. * * For directory inode, d_find_alias() can return - * disconnected dentry. But directory inode should have + * unhashed dentry. But directory inode should have * one alias at most. */ while ((dn = d_find_alias(inode))) { @@ -2408,6 +2424,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); + + /* + * auth mds of the inode changed. we received the cap export message, + * but still haven't received the cap import message. handle_cap_export + * updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message + * that was sent before the cap import message. So don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); + seq = cap->seq; + newcaps |= cap->issued; + } + /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we @@ -2434,6 +2466,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, issued |= implemented | __ceph_caps_dirty(ci); cap->cap_gen = session->s_cap_gen; + cap->seq = seq; __check_cap_issue(ci, cap, newcaps); @@ -2464,6 +2497,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; + ceph_forget_all_cached_acls(inode); } } @@ -2483,6 +2517,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, &atime); + + /* file layout may have changed */ + ci->i_layout = grant->layout; + /* max size increase? */ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); @@ -2511,11 +2549,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, check_caps = 1; } - cap->seq = seq; - - /* file layout may have changed */ - ci->i_layout = grant->layout; - /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; @@ -2741,65 +2774,114 @@ static void handle_cap_trunc(struct inode *inode, * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, - struct ceph_mds_session *session, - int *open_target_sessions) + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *tsession = NULL; + struct ceph_cap *cap, *tcap; struct ceph_inode_info *ci = ceph_inode(inode); - int mds = session->s_mds; + u64 t_cap_id; unsigned mseq = le32_to_cpu(ex->migrate_seq); - struct ceph_cap *cap = NULL, *t; - struct rb_node *p; - int remember = 1; + unsigned t_seq, t_mseq; + int target, issued; + int mds = session->s_mds; - dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (ph) { + t_cap_id = le64_to_cpu(ph->cap_id); + t_seq = le32_to_cpu(ph->seq); + t_mseq = le32_to_cpu(ph->mseq); + target = le32_to_cpu(ph->mds); + } else { + t_cap_id = t_seq = t_mseq = 0; + target = -1; + } + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", + inode, ci, mds, mseq, target); +retry: spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) + goto out_unlock; - /* make sure we haven't seen a higher mseq */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - t = rb_entry(p, struct ceph_cap, ci_node); - if (ceph_seq_cmp(t->mseq, mseq) > 0) { - dout(" higher mseq on cap from mds%d\n", - t->session->s_mds); - remember = 0; - } - if (t->session->s_mds == mds) - cap = t; + if (target < 0) { + __ceph_remove_cap(cap, false); + goto out_unlock; } - if (cap) { - if (remember) { - /* make note */ - ci->i_cap_exporting_mds = mds; - ci->i_cap_exporting_mseq = mseq; - ci->i_cap_exporting_issued = cap->issued; - - /* - * make sure we have open sessions with all possible - * export targets, so that we get the matching IMPORT - */ - *open_target_sessions = 1; + /* + * now we know we haven't received the cap import message yet + * because the exported cap still exist. + */ - /* - * we can't flush dirty caps that we've seen the - * EXPORT but no IMPORT for - */ - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", - inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + issued = cap->issued; + WARN_ON(issued != cap->implemented); + + tcap = __get_cap_for_mds(ci, target); + if (tcap) { + /* already have caps from the target */ + if (tcap->cap_id != t_cap_id || + ceph_seq_cmp(tcap->seq, t_seq) < 0) { + dout(" updating import cap %p mds%d\n", tcap, target); + tcap->cap_id = t_cap_id; + tcap->seq = t_seq - 1; + tcap->issue_seq = t_seq - 1; + tcap->mseq = t_mseq; + tcap->issued |= issued; + tcap->implemented |= issued; + if (cap == ci->i_auth_cap) + ci->i_auth_cap = tcap; + if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); } - spin_unlock(&mdsc->cap_dirty_lock); } __ceph_remove_cap(cap, false); + goto out_unlock; } - /* else, we already released it */ + if (tsession) { + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + spin_unlock(&ci->i_ceph_lock); + /* add placeholder for the export tagert */ + ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, + t_seq - 1, t_mseq, (u64)-1, flag, NULL); + goto retry; + } + + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + + /* open target session */ + tsession = ceph_mdsc_open_export_target_session(mdsc, target); + if (!IS_ERR(tsession)) { + if (mds > target) { + mutex_lock(&session->s_mutex); + mutex_lock_nested(&tsession->s_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&tsession->s_mutex); + mutex_lock_nested(&session->s_mutex, + SINGLE_DEPTH_NESTING); + } + ceph_add_cap_releases(mdsc, tsession); + } else { + WARN_ON(1); + tsession = NULL; + target = -1; + } + goto retry; + +out_unlock: spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + if (tsession) { + mutex_unlock(&tsession->s_mutex); + ceph_put_mds_session(tsession); + } } /* @@ -2810,10 +2892,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, void *snaptrace, int snaptrace_len) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; int mds = session->s_mds; unsigned issued = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); @@ -2821,28 +2905,44 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + int peer; - if (ci->i_cap_exporting_mds >= 0 && - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { - dout("handle_cap_import inode %p ci %p mds%d mseq %d" - " - cleared exporting from mds%d\n", - inode, ci, mds, mseq, - ci->i_cap_exporting_mds); - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); + } else { + p_cap_id = 0; + peer = -1; + } - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p back to cap_dirty\n", inode); - list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", + inode, ci, mds, mseq, peer); + + spin_lock(&ci->i_ceph_lock); + cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; + if (cap && cap->cap_id == p_cap_id) { + dout(" remove export cap %p mds%d flags %d\n", + cap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (cap->seq != le32_to_cpu(ph->seq) || + cap->mseq != le32_to_cpu(ph->mseq))) { + pr_err("handle_cap_import: mismatched seq/mseq: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "importer mds%d has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, cap->seq, + cap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); } - spin_unlock(&mdsc->cap_dirty_lock); - } else { - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + ci->i_cap_exporting_issued = cap->issued; + __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } + /* make sure we re-request max_size, if necessary */ + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + spin_unlock(&ci->i_ceph_lock); + down_write(&mdsc->snap_rwsem); ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, false); @@ -2853,11 +2953,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, kick_flushing_inode_caps(mdsc, session, inode); up_read(&mdsc->snap_rwsem); - /* make sure we re-request max_size, if necessary */ - spin_lock(&ci->i_ceph_lock); - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; - spin_unlock(&ci->i_ceph_lock); } /* @@ -2875,6 +2970,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; + struct ceph_mds_cap_peer *peer = NULL; int mds = session->s_mds; int op; u32 seq, mseq; @@ -2885,12 +2981,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *snaptrace; size_t snaptrace_len; void *flock; + void *end; u32 flock_len; - int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); /* decode */ + end = msg->front.iov_base + msg->front.iov_len; tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -2908,17 +3005,28 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace_len = le32_to_cpu(h->snap_trace_len); if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p, *end; - - p = snaptrace + snaptrace_len; - end = msg->front.iov_base + msg->front.iov_len; + void *p = snaptrace + snaptrace_len; ceph_decode_32_safe(&p, end, flock_len, bad); + if (p + flock_len > end) + goto bad; flock = p; } else { flock = NULL; flock_len = 0; } + if (le16_to_cpu(msg->hdr.version) >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + void *p = flock + flock_len; + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + } else if (op == CEPH_CAP_OP_EXPORT) { + /* recorded in unused fields */ + peer = (void *)&h->size; + } + } + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, @@ -2951,11 +3059,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_EXPORT: - handle_cap_export(inode, h, session, &open_target_sessions); - goto done; + handle_cap_export(inode, h, peer, session); + goto done_unlocked; case CEPH_CAP_OP_IMPORT: - handle_cap_import(mdsc, inode, h, session, + handle_cap_import(mdsc, inode, h, peer, session, snaptrace, snaptrace_len); } @@ -3007,8 +3115,6 @@ done: done_unlocked: if (inode) iput(inode); - if (open_target_sessions) - ceph_mdsc_open_export_target_sessions(mdsc, session); return; bad: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 2a0bcaeb189..6da4df84ba3 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -693,6 +693,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); + + if (!err) + err = ceph_init_acl(dentry, dentry->d_inode, dir); + if (err) d_drop(dentry); return err; @@ -1037,14 +1041,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) valid = 1; } else if (dentry_lease_is_valid(dentry) || dir_lease_is_valid(dir, dentry)) { - valid = 1; + if (dentry->d_inode) + valid = ceph_is_any_caps(dentry->d_inode); + else + valid = 1; } dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); - if (valid) + if (valid) { ceph_dentry_lru_touch(dentry); - else + } else { + ceph_dir_clear_complete(dir); d_drop(dentry); + } iput(dir); return valid; } @@ -1293,6 +1302,8 @@ const struct inode_operations ceph_dir_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, .mkdir = ceph_mkdir, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3de89829e2a..dfd2ce3419f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -408,51 +408,92 @@ more: * * If the read spans object boundary, just do multiple reads. */ -static ssize_t ceph_sync_read(struct file *file, char __user *data, - unsigned len, loff_t *poff, int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, + int *checkeof) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct page **pages; - u64 off = *poff; + u64 off = iocb->ki_pos; int num_pages, ret; + size_t len = i->count; - dout("sync_read on file %p %llu~%u %s\n", file, off, len, + dout("sync_read on file %p %llu~%u %s\n", file, off, + (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - - if (file->f_flags & O_DIRECT) { - num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, num_pages, true); - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } - if (IS_ERR(pages)) - return PTR_ERR(pages); - /* * flush any page cache pages in this range. this * will make concurrent normal and sync io slow, * but it will at least behave sensibly when they are * in sequence. */ - ret = filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait_range(inode->i_mapping, off, + off + len); if (ret < 0) - goto done; + return ret; - ret = striped_read(inode, off, len, pages, num_pages, checkeof, - file->f_flags & O_DIRECT, - (unsigned long)data & ~PAGE_MASK); + if (file->f_flags & O_DIRECT) { + while (iov_iter_count(i)) { + void __user *data = i->iov[0].iov_base + i->iov_offset; + size_t len = i->iov[0].iov_len - i->iov_offset; + + num_pages = calc_pages_for((unsigned long)data, len); + pages = ceph_get_direct_page_vector(data, + num_pages, true); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = striped_read(inode, off, len, + pages, num_pages, checkeof, + 1, (unsigned long)data & ~PAGE_MASK); + ceph_put_page_vector(pages, num_pages, true); + + if (ret <= 0) + break; + off += ret; + iov_iter_advance(i, ret); + if (ret < len) + break; + } + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof, 0, 0); + if (ret > 0) { + int l, k = 0; + size_t left = len = ret; + + while (left) { + void __user *data = i->iov[0].iov_base + + i->iov_offset; + l = min(i->iov[0].iov_len - i->iov_offset, + left); + + ret = ceph_copy_page_vector_to_user(&pages[k], + data, off, + l); + if (ret > 0) { + iov_iter_advance(i, ret); + left -= ret; + off += ret; + k = calc_pages_for(iocb->ki_pos, + len - left + 1) - 1; + BUG_ON(k >= num_pages && left); + } else + break; + } + } + ceph_release_page_vector(pages, num_pages); + } - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); - if (ret >= 0) - *poff = off + ret; + if (off > iocb->ki_pos) { + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + } -done: - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, true); - else - ceph_release_page_vector(pages, num_pages); dout("sync_read result %d\n", ret); return ret; } @@ -489,83 +530,79 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } } + /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t pos, loff_t *ppos) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, size_t count) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; - int num_ops = 1; struct page **pages; int num_pages; - u64 len; int written = 0; int flags; int check_caps = 0; - int page_align, io_align; - unsigned long buf_align; + int page_align; int ret; struct timespec mtime = CURRENT_TIME; - bool own_pages = false; + loff_t pos = iocb->ki_pos; + struct iov_iter i; if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u %s\n", file, pos, - (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + dout("sync_direct_write on file %p %lld~%u\n", file, pos, + (unsigned)count); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + left) >> PAGE_CACHE_SHIFT); + (pos + count) >> PAGE_CACHE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) - flags |= CEPH_OSD_FLAG_ACK; - else - num_ops++; /* Also include a 'startsync' command. */ - /* - * we may need to do multiple writes here if we span an object - * boundary. this isn't atomic, unfortunately. :( - */ -more: - io_align = pos & ~PAGE_MASK; - buf_align = (unsigned long)data & ~PAGE_MASK; - len = left; - - snapc = ci->i_snap_realm->cached_context; - vino = ceph_vino(inode); - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, num_ops, - CEPH_OSD_OP_WRITE, flags, snapc, - ci->i_truncate_seq, ci->i_truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); + iov_iter_init(&i, iov, nr_segs, count, 0); + + while (iov_iter_count(&i) > 0) { + void __user *data = i.iov->iov_base + i.iov_offset; + u64 len = i.iov->iov_len - i.iov_offset; + + page_align = (unsigned long)data & ~PAGE_MASK; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, + 2,/*include a 'startsync' command*/ + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } - /* write from beginning of first page, regardless of io alignment */ - page_align = file->f_flags & O_DIRECT ? buf_align : io_align; - num_pages = calc_pages_for(page_align, len); - if (file->f_flags & O_DIRECT) { + num_pages = calc_pages_for(page_align, len); pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { ret = PTR_ERR(pages); @@ -577,60 +614,175 @@ more: * may block. */ truncate_inode_pages_range(inode->i_mapping, pos, - (pos+len) | (PAGE_CACHE_SIZE-1)); - } else { + (pos+len) | (PAGE_CACHE_SIZE-1)); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, false); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_put_page_vector(pages, num_pages, false); + +out: + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + iov_iter_advance(&i, (size_t)len); + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } else + break; + } + + if (ret != -EOLDSNAPC && written > 0) { + iocb->ki_pos = pos; + ret = written; + } + return ret; +} + + +/* + * Synchronous write, straight from __user pointer or user pages. + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, size_t count) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; + struct ceph_osd_request *req; + struct page **pages; + u64 len; + int num_pages; + int written = 0; + int flags; + int check_caps = 0; + int ret; + struct timespec mtime = CURRENT_TIME; + loff_t pos = iocb->ki_pos; + struct iov_iter i; + + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK; + + iov_iter_init(&i, iov, nr_segs, count, 0); + + while ((len = iov_iter_count(&i)) > 0) { + size_t left; + int n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, 1, + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } + + /* + * write from beginning of first page, + * regardless of io alignment + */ + num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; } - ret = ceph_copy_user_to_page_vector(pages, data, pos, len); + + left = len; + for (n = 0; n < num_pages; n++) { + size_t plen = min_t(size_t, left, PAGE_SIZE); + ret = iov_iter_copy_from_user(pages[n], &i, 0, plen); + if (ret != plen) { + ret = -EFAULT; + break; + } + left -= ret; + iov_iter_advance(&i, ret); + } + if (ret < 0) { ceph_release_page_vector(pages, num_pages); goto out; } - if ((file->f_flags & O_SYNC) == 0) { - /* get a second commit callback */ - req->r_unsafe_callback = ceph_sync_write_unsafe; - req->r_inode = inode; - own_pages = true; - } - } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, - false, own_pages); + /* get a second commit callback */ + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + false, true); - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, false); - else if (file->f_flags & O_SYNC) - ceph_release_page_vector(pages, num_pages); + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); out: - ceph_osdc_put_request(req); - if (ret == 0) { - pos += len; - written += len; - left -= len; - data += len; - if (left) - goto more; + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } else + break; + } + if (ret != -EOLDSNAPC && written > 0) { ret = written; - *ppos = pos; - if (pos > i_size_read(inode)) - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, - NULL); - } else if (ret != -EOLDSNAPC && written > 0) { - ret = written; + iocb->ki_pos = pos; } return ret; } @@ -647,55 +799,84 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - loff_t *ppos = &iocb->ki_pos; - size_t len = iov->iov_len; + size_t len = iocb->ki_nbytes; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - void __user *base = iov->iov_base; ssize_t ret; int want, got = 0; int checkeof = 0, read = 0; - dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", - inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); if (ret < 0) - goto out; - dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)len, - ceph_cap_string(got)); + return ret; if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (fi->flags & CEPH_F_SYNC)) + (fi->flags & CEPH_F_SYNC)) { + struct iov_iter i; + + dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + + if (!read) { + ret = generic_segment_checks(iov, &nr_segs, + &len, VERIFY_WRITE); + if (ret) + goto out; + } + + iov_iter_init(&i, iov, nr_segs, len, read); + /* hmm, this isn't really async... */ - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); - else - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + ret = ceph_sync_read(iocb, &i, &checkeof); + } else { + /* + * We can't modify the content of iov, + * so we only read from beginning. + */ + if (read) { + iocb->ki_pos = pos; + len = iocb->ki_nbytes; + read = 0; + } + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)len, + ceph_cap_string(got)); + ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + } out: dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + int statret = ceph_do_getattr(inode, + CEPH_STAT_CAP_SIZE); /* hit EOF or hole? */ - if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); + if (statret == 0 && iocb->ki_pos < inode->i_size && + ret < len) { + dout("sync_read hit hole, ppos %lld < size %lld" + ", reading more\n", iocb->ki_pos, + inode->i_size); + read += ret; - base += ret; len -= ret; checkeof = 0; goto again; } } + if (ret >= 0) ret += read; @@ -772,11 +953,13 @@ retry_snap: inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (fi->flags & CEPH_F_SYNC)) { + (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { mutex_unlock(&inode->i_mutex); - written = ceph_sync_write(file, iov->iov_base, count, - pos, &iocb->ki_pos); + if (file->f_flags & O_DIRECT) + written = ceph_sync_direct_write(iocb, iov, + nr_segs, count); + else + written = ceph_sync_write(iocb, iov, nr_segs, count); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", @@ -1018,7 +1201,7 @@ static long ceph_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct ceph_file_info *fi = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 9a8e396aed8..32d519d8a2e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -9,6 +9,7 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> +#include <linux/posix_acl.h> #include "super.h" #include "mds_client.h" @@ -95,6 +96,8 @@ const struct inode_operations ceph_file_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; @@ -335,12 +338,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; INIT_LIST_HEAD(&ci->i_cap_delay_list); - ci->i_cap_exporting_mds = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_issued = 0; INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; + ci->i_cap_exporting_issued = 0; for (i = 0; i < CEPH_FILE_MODE_NUM; i++) ci->i_nr_by_mode[i] = 0; @@ -436,6 +437,16 @@ void ceph_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, ceph_i_callback); } +int ceph_drop_inode(struct inode *inode) +{ + /* + * Positve dentry and corresponding inode are always accompanied + * in MDS reply. So no need to keep inode in the cache after + * dropping all its aliases. + */ + return 1; +} + /* * Helpers to fill in size, ctime, mtime, and atime. We have to be * careful because either the client or MDS may have more up to date @@ -670,6 +681,7 @@ static int fill_inode(struct inode *inode, memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + ceph_forget_all_cached_acls(inode); xattr_blob = NULL; } @@ -978,7 +990,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); - int i = 0; int err = 0; dout("fill_trace %p is_dentry %d is_target %d\n", req, @@ -1039,6 +1050,29 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, } } + if (rinfo->head->is_target) { + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + req->r_target_inode = in; + + err = fill_inode(in, &rinfo->targeti, NULL, + session, req->r_request_started, + (le32_to_cpu(rinfo->head->result) == 0) ? + req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + goto done; + } + } + /* * ignore null lease/binding on snapdir ENOENT, or else we * will have trouble splicing in the virtual snapdir later @@ -1108,7 +1142,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ceph_dentry(req->r_old_dentry)->offset); dn = req->r_old_dentry; /* use old_dentry */ - in = dn->d_inode; } /* null dentry? */ @@ -1130,44 +1163,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, } /* attach proper inode */ - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = dn->d_inode; - if (!in) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_trace bad get_inode " - "%llx.%llx\n", vino.ino, vino.snap); - err = PTR_ERR(in); - d_drop(dn); - goto done; - } + if (!dn->d_inode) { + ihold(in); dn = splice_dentry(dn, in, &have_lease, true); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - ihold(in); - } else if (ceph_ino(in) == vino.ino && - ceph_snap(in) == vino.snap) { - ihold(in); - } else { + } else if (dn->d_inode && dn->d_inode != in) { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, in, ceph_ino(in), ceph_snap(in), - vino.ino, vino.snap); + dn, dn->d_inode, ceph_vinop(dn->d_inode), + ceph_vinop(in)); have_lease = false; - in = NULL; } if (have_lease) update_dentry_lease(dn, rinfo->dlease, session, req->r_request_started); dout(" final dn %p\n", dn); - i++; - } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) { + } else if (!req->r_aborted && + (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; /* fill out a snapdir LOOKUPSNAP dentry */ @@ -1177,52 +1194,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ininfo = rinfo->targeti.in; vino.ino = le64_to_cpu(ininfo->ino); vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_inode get_inode badness %llx.%llx\n", - vino.ino, vino.snap); - err = PTR_ERR(in); - d_delete(dn); - goto done; - } dout(" linking snapped dir %p to dn %p\n", in, dn); + ihold(in); dn = splice_dentry(dn, in, NULL, true); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - ihold(in); - rinfo->head->is_dentry = 1; /* fool notrace handlers */ } - - if (rinfo->head->is_target) { - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); - - if (in == NULL || ceph_ino(in) != vino.ino || - ceph_snap(in) != vino.snap) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - err = PTR_ERR(in); - goto done; - } - } - req->r_target_inode = in; - - err = fill_inode(in, - &rinfo->targeti, NULL, - session, req->r_request_started, - (le32_to_cpu(rinfo->head->result) == 0) ? - req->r_fmode : -1, - &req->r_caps_reservation); - if (err < 0) { - pr_err("fill_inode badness %p %llx.%llx\n", - in, ceph_vinop(in)); - goto done; - } - } - done: dout("fill_trace done err=%d\n", err); return err; @@ -1272,7 +1252,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct qstr dname; struct dentry *dn; struct inode *in; - int err = 0, i; + int err = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_dentry_info *di; @@ -1305,6 +1285,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); } + /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_vino vino; @@ -1329,9 +1310,10 @@ retry_lookup: err = -ENOMEM; goto out; } - err = ceph_init_dentry(dn); - if (err < 0) { + ret = ceph_init_dentry(dn); + if (ret < 0) { dput(dn); + err = ret; goto out; } } else if (dn->d_inode && @@ -1351,9 +1333,6 @@ retry_lookup: spin_unlock(&parent->d_lock); } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + r_readdir_offset); - /* inode */ if (dn->d_inode) { in = dn->d_inode; @@ -1366,26 +1345,39 @@ retry_lookup: err = PTR_ERR(in); goto out; } - dn = splice_dentry(dn, in, NULL, false); - if (IS_ERR(dn)) - dn = NULL; } if (fill_inode(in, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); + if (!dn->d_inode) + iput(in); + d_drop(dn); goto next_item; } - if (dn) - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, - req->r_request_started); + + if (!dn->d_inode) { + dn = splice_dentry(dn, in, NULL, false); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + dn = NULL; + goto next_item; + } + } + + di = dn->d_fsdata; + di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); next_item: if (dn) dput(dn); } - req->r_did_prepopulate = true; + if (err == 0) + req->r_did_prepopulate = true; out: if (snapdir) { @@ -1474,7 +1466,8 @@ static void ceph_invalidate_work(struct work_struct *work) dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { - /* nevermind! */ + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); goto out; @@ -1495,13 +1488,14 @@ static void ceph_invalidate_work(struct work_struct *work) dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", inode, orig_gen, ci->i_rdcache_gen, ci->i_rdcache_revoking); + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; } spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); - +out: if (check) ceph_check_caps(ci, 0, NULL); -out: iput(inode); } @@ -1622,6 +1616,8 @@ static const struct inode_operations ceph_symlink_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; /* @@ -1695,6 +1691,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || attr->ia_mode != inode->i_mode) { + inode->i_mode = attr->ia_mode; req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); mask |= CEPH_SETATTR_MODE; release |= CEPH_CAP_AUTH_SHARED; @@ -1810,6 +1807,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); + if (ia_valid & ATTR_MODE) { + err = posix_acl_chmod(inode, attr->ia_mode); + if (err) + goto out_put; + } + if (mask) { req->r_inode = inode; ihold(inode); @@ -1829,6 +1832,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) return err; out: spin_unlock(&ci->i_ceph_lock); +out_put: ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 669622fd1ae..dc66c9e023e 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -183,6 +183,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + struct ceph_object_id oid; u64 len = 1, olen; u64 tmp; struct ceph_pg pgid; @@ -211,8 +213,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, - ceph_file_layout_pg_pool(ci->i_layout)); + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + ceph_oid_set_name(&oid, dl.object_name); + + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); if (r < 0) { up_read(&osdc->map_sem); return r; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d90861f4521..f4f050a69a4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -63,7 +63,7 @@ static const struct ceph_connection_operations mds_con_ops; */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, - int features) + u64 features) { int err = -EIO; @@ -98,7 +98,7 @@ bad: */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { int err; @@ -145,7 +145,7 @@ out_bad: */ static int parse_reply_info_dir(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { u32 num, i = 0; int err; @@ -217,7 +217,7 @@ out_bad: */ static int parse_reply_info_filelock(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; @@ -238,7 +238,7 @@ bad: */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { if (*p == end) { @@ -262,7 +262,7 @@ bad: */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); @@ -280,7 +280,7 @@ static int parse_reply_info_extra(void **p, void *end, */ static int parse_reply_info(struct ceph_msg *msg, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { void *p, *end; u32 len; @@ -713,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc, struct dentry *dn = get_nonsnap_parent(parent); inode = dn->d_inode; dout("__choose_mds using nonsnap parent %p\n", inode); - } else if (req->r_dentry->d_inode) { + } else { /* dentry target */ inode = req->r_dentry->d_inode; - } else { - /* dir + name */ - inode = dir; - hash = ceph_dentry_hash(dir, req->r_dentry); - is_hash = true; + if (!inode || mode == USE_AUTH_MDS) { + /* dir + name */ + inode = dir; + hash = ceph_dentry_hash(dir, req->r_dentry); + is_hash = true; + } } } @@ -846,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc, * * called under mdsc->mutex */ +static struct ceph_mds_session * +__open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, target); + if (!session) { + session = register_session(mdsc, target); + if (IS_ERR(session)) + return session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + + return session; +} + +struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + dout("open_export_target_session to mds%d\n", target); + + mutex_lock(&mdsc->mutex); + session = __open_export_target_session(mdsc, target); + mutex_unlock(&mdsc->mutex); + + return session; +} + static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_info *mi; struct ceph_mds_session *ts; int i, mds = session->s_mds; - int target; if (mds >= mdsc->mdsmap->m_max_mds) return; + mi = &mdsc->mdsmap->m_info[mds]; dout("open_export_target_sessions for mds%d (%d targets)\n", session->s_mds, mi->num_export_targets); for (i = 0; i < mi->num_export_targets; i++) { - target = mi->export_targets[i]; - ts = __ceph_lookup_mds_session(mdsc, target); - if (!ts) { - ts = register_session(mdsc, target); - if (IS_ERR(ts)) - return; - } - if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) - __open_session(mdsc, session); - else - dout(" mds%d target mds%d %p is %s\n", session->s_mds, - i, ts, session_state_name(ts->s_state)); - ceph_put_mds_session(ts); + ts = __open_export_target_session(mdsc, mi->export_targets[i]); + if (!IS_ERR(ts)) + ceph_put_mds_session(ts); } } @@ -1136,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, return 0; } +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, u64 seq) +{ + struct ceph_msg *msg; + + dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", + session->s_mds, session_state_name(session->s_state), seq); + msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + + /* * Note new cap ttl, and any transition from stale -> not stale (fresh?). * @@ -1214,7 +1251,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_mds_session *session = arg; struct ceph_inode_info *ci = ceph_inode(inode); - int used, oissued, mine; + int used, wanted, oissued, mine; if (session->s_trim_caps <= 0) return -1; @@ -1222,14 +1259,19 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) spin_lock(&ci->i_ceph_lock); mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); + wanted = __ceph_caps_file_wanted(ci); oissued = __ceph_caps_issued_other(ci, cap); - dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), - ceph_cap_string(used)); - if (ci->i_dirty_caps) - goto out; /* dirty caps */ - if ((used & ~oissued) & mine) + ceph_cap_string(used), ceph_cap_string(wanted)); + if (cap == ci->i_auth_cap) { + if (ci->i_dirty_caps | ci->i_flushing_caps) + goto out; + if ((used | wanted) & CEPH_CAP_ANY_WR) + goto out; + } + if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ session->s_trim_caps--; @@ -2156,26 +2198,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) */ if (result == -ESTALE) { dout("got ESTALE on request %llu", req->r_tid); - if (!req->r_inode) { - /* do nothing; not an authority problem */ - } else if (req->r_direct_mode != USE_AUTH_MDS) { + if (req->r_direct_mode != USE_AUTH_MDS) { dout("not using auth, setting for that now"); req->r_direct_mode = USE_AUTH_MDS; __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; } else { - struct ceph_inode_info *ci = ceph_inode(req->r_inode); - struct ceph_cap *cap = NULL; - - if (req->r_session) - cap = ceph_get_cap_for_mds(ci, - req->r_session->s_mds); - - dout("already using auth"); - if ((!cap || cap != ci->i_auth_cap) || - (cap->mseq != req->r_sent_on_mseq)) { - dout("but cap changed, so resending"); + int mds = __choose_mds(mdsc, req); + if (mds >= 0 && mds != req->r_session->s_mds) { + dout("but auth changed, so resending"); __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; @@ -2400,6 +2432,10 @@ static void handle_session(struct ceph_mds_session *session, trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); break; + case CEPH_SESSION_FLUSHMSG: + send_flushmsg_ack(mdsc, session, seq); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4c053d099ae..68288917c73 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -383,6 +383,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg); +extern struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 89fa4a940a0..4440f447fd3 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_RENEWCAPS: return "renewcaps"; case CEPH_SESSION_STALE: return "stale"; case CEPH_SESSION_RECALL_STATE: return "recall_state"; + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; } return "???"; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6a0951e4304..2df963f1cf5 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -490,10 +490,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const unsigned supported_features = + const u64 supported_features = CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH; - const unsigned required_features = 0; + const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; @@ -686,6 +686,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, + .drop_inode = ceph_drop_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, @@ -818,7 +819,11 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ +#ifdef CONFIG_CEPH_FS_POSIX_ACL + s->s_flags |= MS_POSIXACL; +#endif + s->s_xattr = ceph_xattr_handlers; s->s_fs_info = fsc; fsc->sb = s; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ef4ac38bb61..aa260590f61 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -287,14 +287,12 @@ struct ceph_inode_info { unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ - int i_cap_exporting_mds; /* to handle cap migration between */ - unsigned i_cap_exporting_mseq; /* mds's. */ - unsigned i_cap_exporting_issued; struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ + unsigned i_cap_exporting_issued; int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ @@ -335,7 +333,6 @@ struct ceph_inode_info { u32 i_fscache_gen; /* sequence, for delayed fscache validate */ struct work_struct i_revalidate_work; #endif - struct inode vfs_inode; /* at end */ }; @@ -529,6 +526,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) } extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask); extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); @@ -691,6 +690,7 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); +extern int ceph_drop_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -718,12 +718,16 @@ extern void ceph_queue_writeback(struct inode *inode); extern int ceph_do_getattr(struct inode *inode, int mask); extern int ceph_permission(struct inode *inode, int mask); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); +extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); /* xattr.c */ extern int ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); +int __ceph_removexattr(struct dentry *, const char *); extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern int ceph_removexattr(struct dentry *, const char *); @@ -732,6 +736,38 @@ extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); extern void __init ceph_xattr_init(void); extern void ceph_xattr_exit(void); +/* acl.c */ +extern const struct xattr_handler *ceph_xattr_handlers[]; + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + +struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ceph_init_acl(struct dentry *, struct inode *, struct inode *); +void ceph_forget_all_cached_acls(struct inode *inode); + +#else + +#define ceph_get_acl NULL +#define ceph_set_acl NULL + +static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + return 0; +} + +static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) +{ + return 0; +} + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +} + +#endif + /* caps.c */ extern const char *ceph_cap_string(int c); extern void ceph_handle_caps(struct ceph_mds_session *session, @@ -744,6 +780,7 @@ extern int ceph_add_cap(struct inode *inode, extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); +extern int ceph_is_any_caps(struct inode *inode); extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, u64 cap_id, u32 migrate_seq, u32 issue_seq); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index be661d8f532..898b6565ad3 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -6,16 +6,30 @@ #include <linux/ceph/decode.h> #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> #include <linux/slab.h> #define XATTR_CEPH_PREFIX "ceph." #define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. + */ +const struct xattr_handler *ceph_xattr_handlers[] = { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + static bool ceph_is_valid_xattr(const char *name) { return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -663,10 +677,9 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) } } -ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, +ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); int err; struct ceph_inode_xattr *xattr; @@ -675,7 +688,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!ceph_is_valid_xattr(name)) return -ENODATA; - /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { @@ -725,6 +737,15 @@ out: return err; } +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, value, size); + + return __ceph_getxattr(dentry->d_inode, name, value, size); +} + ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = dentry->d_inode; @@ -863,8 +884,8 @@ out: return err; } -int ceph_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int __ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct ceph_vxattr *vxattr; @@ -879,9 +900,6 @@ int ceph_setxattr(struct dentry *dentry, const char *name, struct ceph_inode_xattr *xattr = NULL; int required_blob_size; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -958,6 +976,18 @@ out: return err; } +int ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + return __ceph_setxattr(dentry, name, value, size, flags); +} + static int ceph_send_removexattr(struct dentry *dentry, const char *name) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); @@ -984,7 +1014,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) return err; } -int ceph_removexattr(struct dentry *dentry, const char *name) +int __ceph_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct ceph_vxattr *vxattr; @@ -994,9 +1024,6 @@ int ceph_removexattr(struct dentry *dentry, const char *name) int required_blob_size; int dirty; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -1053,3 +1080,13 @@ out: return err; } +int ceph_removexattr(struct dentry *dentry, const char *name) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + return __ceph_removexattr(dentry, name); +} diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index aa339762034..2c29db6a247 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -477,9 +477,10 @@ extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon, const int netfid, __u64 *pExtAttrBits, __u64 *pMask); extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb); extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr); -extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, unsigned int xid); +extern int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + const unsigned char *path); extern int mdfour(unsigned char *, unsigned char *, int); extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, const struct nls_table *codepage); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 124aa0230c1..d707edb6b85 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4010,7 +4010,7 @@ QFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in QFileInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4179,7 +4179,7 @@ UnixQFileInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); @@ -4263,7 +4263,7 @@ UnixQPathInfoRetry: rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { - cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc); + cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc); } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 11ff5f116b2..a514e0a65f6 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -193,7 +193,7 @@ check_name(struct dentry *direntry) static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, struct tcon_link *tlink, unsigned oflags, umode_t mode, - __u32 *oplock, struct cifs_fid *fid, int *created) + __u32 *oplock, struct cifs_fid *fid) { int rc = -ENOENT; int create_options = CREATE_NOT_DIR; @@ -349,7 +349,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, .device = 0, }; - *created |= FILE_CREATED; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { args.uid = current_fsuid(); if (inode->i_mode & S_ISGID) @@ -480,13 +479,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, cifs_add_pending_open(&fid, tlink, &open); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid, opened); + &oplock, &fid); if (rc) { cifs_del_pending_open(&open); goto out; } + if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + *opened |= FILE_CREATED; + rc = finish_open(file, direntry, generic_file_open, opened); if (rc) { if (server->ops->close) @@ -529,7 +531,6 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, struct TCP_Server_Info *server; struct cifs_fid fid; __u32 oplock; - int created = FILE_CREATED; cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n", inode, direntry->d_name.name, direntry); @@ -546,7 +547,7 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, server->ops->new_lease_key(&fid); rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode, - &oplock, &fid, &created); + &oplock, &fid); if (!rc && server->ops->close) server->ops->close(xid, tcon, &fid); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 36f9ebb93ce..49719b8228e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -383,7 +383,8 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + int tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); } @@ -799,7 +800,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid); + tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr, + full_path); if (tmprc) cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc); } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index cc0234710dd..92aee08483a 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -354,34 +354,30 @@ open_query_close_cifs_symlink(const unsigned char *path, char *pbuf, int -CIFSCheckMFSymlink(struct cifs_fattr *fattr, - const unsigned char *path, - struct cifs_sb_info *cifs_sb, unsigned int xid) +CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, + const unsigned char *path) { - int rc = 0; + int rc; u8 *buf = NULL; unsigned int link_len = 0; unsigned int bytes_read = 0; - struct cifs_tcon *ptcon; if (!CIFSCouldBeMFSymlink(fattr)) /* it's not a symlink */ return 0; buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL); - if (!buf) { - rc = -ENOMEM; - goto out; - } + if (!buf) + return -ENOMEM; - ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb)); - if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink)) - rc = ptcon->ses->server->ops->query_mf_symlink(path, buf, - &bytes_read, cifs_sb, xid); + if (tcon->ses->server->ops->query_mf_symlink) + rc = tcon->ses->server->ops->query_mf_symlink(path, buf, + &bytes_read, cifs_sb, xid); else - goto out; + rc = -ENOSYS; - if (rc != 0) + if (rc) goto out; if (bytes_read == 0) /* not a symlink */ diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index dc52e13d58e..3881610b643 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, struct i2c_msg __user *tmsgs; struct i2c_msg32 __user *umsgs; compat_caddr_t datap; - int nmsgs, i; + u32 nmsgs; + int i; if (get_user(nmsgs, &udata->nmsgs)) return -EFAULT; diff --git a/fs/coredump.c b/fs/coredump.c index bc3fbcd3255..e3ad709a423 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -40,7 +40,6 @@ #include <trace/events/task.h> #include "internal.h" -#include "coredump.h" #include <trace/events/sched.h> diff --git a/fs/coredump.h b/fs/coredump.h deleted file mode 100644 index e39ff072110..00000000000 --- a/fs/coredump.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _FS_COREDUMP_H -#define _FS_COREDUMP_H - -extern int __get_dumpable(unsigned long mm_flags); - -#endif diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index e501ac3a49f..06610cf94d5 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -17,14 +17,30 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/blkdev.h> -#include <linux/cramfs_fs.h> #include <linux/slab.h> -#include <linux/cramfs_fs_sb.h> #include <linux/vfs.h> #include <linux/mutex.h> - +#include <uapi/linux/cramfs_fs.h> #include <asm/uaccess.h> +#include "internal.h" + +/* + * cramfs super-block data in memory + */ +struct cramfs_sb_info { + unsigned long magic; + unsigned long size; + unsigned long blocks; + unsigned long files; + unsigned long flags; +}; + +static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + static const struct super_operations cramfs_ops; static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; @@ -219,10 +235,11 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i return read_buffers[buffer] + offset; } -static void cramfs_put_super(struct super_block *sb) +static void cramfs_kill_sb(struct super_block *sb) { - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + kill_block_super(sb); + kfree(sbi); } static int cramfs_remount(struct super_block *sb, int *flags, char *data) @@ -261,7 +278,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) if (super.magic == CRAMFS_MAGIC_WEND) { if (!silent) printk(KERN_ERR "cramfs: wrong endianness\n"); - goto out; + return -EINVAL; } /* check at 512 byte offset */ @@ -273,20 +290,20 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_ERR "cramfs: wrong endianness\n"); else if (!silent) printk(KERN_ERR "cramfs: wrong magic\n"); - goto out; + return -EINVAL; } } /* get feature flags first */ if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { printk(KERN_ERR "cramfs: unsupported filesystem features\n"); - goto out; + return -EINVAL; } /* Check that the root inode is in a sane state */ if (!S_ISDIR(super.root.mode)) { printk(KERN_ERR "cramfs: root is not a directory\n"); - goto out; + return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); @@ -310,22 +327,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) (root_offset != 512 + sizeof(struct cramfs_super)))) { printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); - goto out; + return -EINVAL; } /* Set it all up.. */ sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, &super.root, 0); if (IS_ERR(root)) - goto out; + return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) - goto out; + return -ENOMEM; return 0; -out: - kfree(sbi); - sb->s_fs_info = NULL; - return -EINVAL; } static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -550,7 +563,6 @@ static const struct inode_operations cramfs_dir_inode_operations = { }; static const struct super_operations cramfs_ops = { - .put_super = cramfs_put_super, .remount_fs = cramfs_remount, .statfs = cramfs_statfs, }; @@ -565,7 +577,7 @@ static struct file_system_type cramfs_fs_type = { .owner = THIS_MODULE, .name = "cramfs", .mount = cramfs_mount, - .kill_sb = kill_block_super, + .kill_sb = cramfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("cramfs"); diff --git a/fs/cramfs/internal.h b/fs/cramfs/internal.h new file mode 100644 index 00000000000..349d7127215 --- /dev/null +++ b/fs/cramfs/internal.h @@ -0,0 +1,4 @@ +/* Uncompression interfaces to the underlying zlib */ +int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen); +int cramfs_uncompress_init(void); +void cramfs_uncompress_exit(void); diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index 023329800d2..1760c1b84d9 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -19,7 +19,7 @@ #include <linux/errno.h> #include <linux/vmalloc.h> #include <linux/zlib.h> -#include <linux/cramfs_fs.h> +#include "internal.h" static z_stream stream; static int initialized; diff --git a/fs/dcache.c b/fs/dcache.c index 4bdb300b16e..265e0ce9769 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -192,7 +192,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char if (!tcount) return 0; } - mask = ~(~0ul << tcount*8); + mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } @@ -3061,8 +3061,13 @@ char *d_path(const struct path *path, char *buf, int buflen) * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: + * + * Some pseudo inodes are mountable. When they are mounted + * path->dentry == path->mnt->mnt_root. In that case don't call d_dname + * and instead have d_path return the mounted path. */ - if (path->dentry->d_op && path->dentry->d_op->d_dname) + if (path->dentry->d_op && path->dentry->d_op->d_dname && + (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); rcu_read_lock(); @@ -3111,26 +3116,28 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) /* * Write full pathname from the root of the filesystem into the buffer. */ -static char *__dentry_path(struct dentry *dentry, char *buf, int buflen) +static char *__dentry_path(struct dentry *d, char *buf, int buflen) { + struct dentry *dentry; char *end, *retval; int len, seq = 0; int error = 0; + if (buflen < 2) + goto Elong; + rcu_read_lock(); restart: + dentry = d; end = buf + buflen; len = buflen; prepend(&end, &len, "\0", 1); - if (buflen < 1) - goto Elong; /* Get '/' right */ retval = end-1; *retval = '/'; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { struct dentry *parent = dentry->d_parent; - int error; prefetch(parent); error = prepend_name(&end, &len, &dentry->d_name); diff --git a/fs/dcookies.c b/fs/dcookies.c index ab5954b5026..ac44a69fbea 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -204,7 +204,7 @@ out: } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) { #ifdef __BIG_ENDIAN return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d90909ec6aa..3190ca973dd 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -649,6 +649,7 @@ static void process_sctp_notification(struct connection *con, struct msghdr *msg, char *buf) { union sctp_notification *sn = (union sctp_notification *)buf; + struct linger linger; switch (sn->sn_header.sn_type) { case SCTP_SEND_FAILED: @@ -713,11 +714,11 @@ static void process_sctp_notification(struct connection *con, return; /* Peel off a new sock */ - sctp_lock_sock(con->sock->sk); + lock_sock(con->sock->sk); ret = sctp_do_peeloff(con->sock->sk, sn->sn_assoc_change.sac_assoc_id, &new_con->sock); - sctp_release_sock(con->sock->sk); + release_sock(con->sock->sk); if (ret < 0) { log_print("Can't peel off a socket for " "connection %d to node %d: err=%d", @@ -727,6 +728,13 @@ static void process_sctp_notification(struct connection *con, } add_sock(new_con->sock, new_con); + linger.l_onoff = 1; + linger.l_linger = 0; + ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof(linger)); + if (ret < 0) + log_print("set socket option SO_LINGER failed"); + log_print("connecting to %d sctp association %d", nodeid, (int)sn->sn_assoc_change.sac_assoc_id); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index c36c4482447..b167ca48b8e 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -659,19 +659,17 @@ out_lock: return rc; } -static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, - size_t *bufsiz) +static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); char *lower_buf; + char *buf; mm_segment_t old_fs; int rc; lower_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!lower_buf) { - rc = -ENOMEM; - goto out; - } + if (!lower_buf) + return ERR_PTR(-ENOMEM); old_fs = get_fs(); set_fs(get_ds()); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, @@ -680,21 +678,18 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, set_fs(old_fs); if (rc < 0) goto out; - rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb, + rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb, lower_buf, rc); out: kfree(lower_buf); - return rc; + return rc ? ERR_PTR(rc) : buf; } static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - char *buf; - size_t len = PATH_MAX; - int rc; - - rc = ecryptfs_readlink_lower(dentry, &buf, &len); - if (rc) + size_t len; + char *buf = ecryptfs_readlink_lower(dentry, &len); + if (IS_ERR(buf)) goto out; fsstack_copy_attr_atime(dentry->d_inode, ecryptfs_dentry_to_lower(dentry)->d_inode); @@ -1003,10 +998,12 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, char *target; size_t targetsiz; - rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz); - if (!rc) { + target = ecryptfs_readlink_lower(dentry, &targetsiz); + if (!IS_ERR(target)) { kfree(target); stat->size = targetsiz; + } else { + rc = PTR_ERR(target); } } return rc; diff --git a/fs/efs/super.c b/fs/efs/super.c index c6f57a74a55..50215bbd646 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -26,11 +26,18 @@ static struct dentry *efs_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); } +static void efs_kill_sb(struct super_block *s) +{ + struct efs_sb_info *sbi = SUPER_INFO(s); + kill_block_super(s); + kfree(sbi); +} + static struct file_system_type efs_fs_type = { .owner = THIS_MODULE, .name = "efs", .mount = efs_mount, - .kill_sb = kill_block_super, + .kill_sb = efs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("efs"); @@ -105,12 +112,6 @@ static void destroy_inodecache(void) kmem_cache_destroy(efs_inode_cachep); } -static void efs_put_super(struct super_block *s) -{ - kfree(s->s_fs_info); - s->s_fs_info = NULL; -} - static int efs_remount(struct super_block *sb, int *flags, char *data) { *flags |= MS_RDONLY; @@ -120,7 +121,6 @@ static int efs_remount(struct super_block *sb, int *flags, char *data) static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .destroy_inode = efs_destroy_inode, - .put_super = efs_put_super, .statfs = efs_statfs, .remount_fs = efs_remount, }; @@ -259,7 +259,6 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; - int ret = -EINVAL; sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) @@ -270,7 +269,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { printk(KERN_ERR "EFS: device does not support %d byte blocks\n", EFS_BLOCKSIZE); - goto out_no_fs_ul; + return -EINVAL; } /* read the vh (volume header) block */ @@ -278,7 +277,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!bh) { printk(KERN_ERR "EFS: cannot read volume header\n"); - goto out_no_fs_ul; + return -EINVAL; } /* @@ -290,13 +289,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) brelse(bh); if (sb->fs_start == -1) { - goto out_no_fs_ul; + return -EINVAL; } bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { printk(KERN_ERR "EFS: cannot read superblock\n"); - goto out_no_fs_ul; + return -EINVAL; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { @@ -304,7 +303,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); #endif brelse(bh); - goto out_no_fs_ul; + return -EINVAL; } brelse(bh); @@ -319,24 +318,16 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) root = efs_iget(s, EFS_ROOTINODE); if (IS_ERR(root)) { printk(KERN_ERR "EFS: get root inode failed\n"); - ret = PTR_ERR(root); - goto out_no_fs; + return PTR_ERR(root); } s->s_root = d_make_root(root); if (!(s->s_root)) { printk(KERN_ERR "EFS: get root dentry failed\n"); - ret = -ENOMEM; - goto out_no_fs; + return -ENOMEM; } return 0; - -out_no_fs_ul: -out_no_fs: - s->s_fs_info = NULL; - kfree(sb); - return ret; } static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { diff --git a/fs/eventfd.c b/fs/eventfd.c index 35470d9b96e..d6a88e7812f 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -349,15 +349,12 @@ EXPORT_SYMBOL_GPL(eventfd_fget); */ struct eventfd_ctx *eventfd_ctx_fdget(int fd) { - struct file *file; struct eventfd_ctx *ctx; - - file = eventfd_fget(fd); - if (IS_ERR(file)) - return (struct eventfd_ctx *) file; - ctx = eventfd_ctx_get(file->private_data); - fput(file); - + struct fd f = fdget(fd); + if (!f.file) + return ERR_PTR(-EBADF); + ctx = eventfd_ctx_fileget(f.file); + fdput(f); return ctx; } EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8b5e2584c84..af903128891 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1907,10 +1907,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } } } - if (op == EPOLL_CTL_DEL && is_file_epoll(tf.file)) { - tep = tf.file->private_data; - mutex_lock_nested(&tep->mtx, 1); - } /* * Try to lookup the file inside our RB tree, Since we grabbed "mtx" diff --git a/fs/exec.c b/fs/exec.c index 7ea097f6b34..e1529b4c79b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -62,7 +62,6 @@ #include <trace/events/task.h> #include "internal.h" -#include "coredump.h" #include <trace/events/sched.h> @@ -843,7 +842,6 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); - arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); @@ -1088,8 +1086,8 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ set_fs(USER_DS); - current->flags &= - ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE); + current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); current->personality &= ~bprm->per_clear; @@ -1139,9 +1137,7 @@ void setup_new_exec(struct linux_binprm * bprm) /* An exec changes our domain. We are no longer part of the thread group */ - current->self_exec_id++; - flush_signal_handlers(current, 0); do_close_on_exec(current->files); } @@ -1173,6 +1169,10 @@ void free_bprm(struct linux_binprm *bprm) mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + } /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); @@ -1224,11 +1224,10 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH */ -static int check_unsafe_exec(struct linux_binprm *bprm) +static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; - int res = 0; if (p->ptrace) { if (p->ptrace & PT_PTRACE_CAP) @@ -1244,31 +1243,25 @@ static int check_unsafe_exec(struct linux_binprm *bprm) if (current->no_new_privs) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); - for (t = next_thread(p); t != p; t = next_thread(t)) { + while_each_thread(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); - if (p->fs->users > n_fs) { + if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; - } else { - res = -EAGAIN; - if (!p->fs->in_exec) { - p->fs->in_exec = 1; - res = 1; - } - } + else + p->fs->in_exec = 1; spin_unlock(&p->fs->lock); - - return res; } -/* - * Fill the binprm structure from the inode. +/* + * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes * * This may be called multiple times for binary chains (scripts for example). @@ -1430,14 +1423,7 @@ static int exec_binprm(struct linux_binprm *bprm) audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - current->did_exec = 1; proc_exec_connector(current); - - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; /* to catch use-after-free */ - } } return ret; @@ -1453,7 +1439,6 @@ static int do_execve_common(const char *filename, struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; - bool clear_in_exec; int retval; /* @@ -1485,10 +1470,7 @@ static int do_execve_common(const char *filename, if (retval) goto out_free; - retval = check_unsafe_exec(bprm); - if (retval < 0) - goto out_free; - clear_in_exec = retval; + check_unsafe_exec(bprm); current->in_execve = 1; file = open_exec(filename); @@ -1504,7 +1486,7 @@ static int do_execve_common(const char *filename, retval = bprm_mm_init(bprm); if (retval) - goto out_file; + goto out_unmark; bprm->argc = count(argv, MAX_ARG_STRINGS); if ((retval = bprm->argc) < 0) @@ -1551,15 +1533,8 @@ out: mmput(bprm->mm); } -out_file: - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } - out_unmark: - if (clear_in_exec) - current->fs->in_exec = 0; + current->fs->in_exec = 0; current->in_execve = 0; out_free: @@ -1609,67 +1584,22 @@ void set_binfmt(struct linux_binfmt *new) if (new) __module_get(new->module); } - EXPORT_SYMBOL(set_binfmt); /* - * set_dumpable converts traditional three-value dumpable to two flags and - * stores them into mm->flags. It modifies lower two bits of mm->flags, but - * these bits are not changed atomically. So get_dumpable can observe the - * intermediate state. To avoid doing unexpected behavior, get get_dumpable - * return either old dumpable or new one by paying attention to the order of - * modifying the bits. - * - * dumpable | mm->flags (binary) - * old new | initial interim final - * ---------+----------------------- - * 0 1 | 00 01 01 - * 0 2 | 00 10(*) 11 - * 1 0 | 01 00 00 - * 1 2 | 01 11 11 - * 2 0 | 11 10(*) 00 - * 2 1 | 11 11 01 - * - * (*) get_dumpable regards interim value of 10 as 11. + * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { - switch (value) { - case SUID_DUMP_DISABLE: - clear_bit(MMF_DUMPABLE, &mm->flags); - smp_wmb(); - clear_bit(MMF_DUMP_SECURELY, &mm->flags); - break; - case SUID_DUMP_USER: - set_bit(MMF_DUMPABLE, &mm->flags); - smp_wmb(); - clear_bit(MMF_DUMP_SECURELY, &mm->flags); - break; - case SUID_DUMP_ROOT: - set_bit(MMF_DUMP_SECURELY, &mm->flags); - smp_wmb(); - set_bit(MMF_DUMPABLE, &mm->flags); - break; - } -} - -int __get_dumpable(unsigned long mm_flags) -{ - int ret; + unsigned long old, new; - ret = mm_flags & MMF_DUMPABLE_MASK; - return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret; -} + if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) + return; -/* - * This returns the actual value of the suid_dumpable flag. For things - * that are using this for checking for privilege transitions, it must - * test against SUID_DUMP_USER rather than treating it as a boolean - * value. - */ -int get_dumpable(struct mm_struct *mm) -{ - return __get_dumpable(mm->flags); + do { + old = ACCESS_ONCE(mm->flags); + new = (old & ~MMF_DUMPABLE_MASK) | value; + } while (cmpxchg(&mm->flags, old, new) != old); } SYSCALL_DEFINE3(execve, diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index a52a5d23c30..ee4317faccb 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -577,7 +577,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) if (offset >= i_size) { *uptodate = true; - EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index); + EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index); return ZERO_PAGE(0); } @@ -596,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) *uptodate = true; else *uptodate = PageUptodate(page); - EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate); + EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); return page; } else { - EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n", + EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n", pcol->that_locked_page->index); *uptodate = true; return pcol->that_locked_page; @@ -611,11 +611,11 @@ static void __r4w_put_page(void *priv, struct page *page) struct page_collect *pcol = priv; if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { - EXOFS_DBGMSG("index=0x%lx\n", page->index); + EXOFS_DBGMSG2("index=0x%lx\n", page->index); page_cache_release(page); return; } - EXOFS_DBGMSG("that_locked_page index=0x%lx\n", + EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", ZERO_PAGE(0) == page ? -1 : page->index); } @@ -961,6 +961,14 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset, WARN_ON(1); } + + /* TODO: Should be easy enough to do proprly */ +static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + return 0; +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, @@ -974,7 +982,7 @@ const struct address_space_operations exofs_aops = { /* Not implemented Yet */ .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ - .direct_IO = NULL, /* TODO: Should be trivial to do */ + .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ .get_xip_mem = NULL, @@ -1010,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) if (likely(!ret)) truncate_setsize(inode, newsize); - EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", + EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n", inode->i_ino, newsize, ret); return ret; } @@ -1094,14 +1102,13 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, /* If object is lost on target we might as well enable it's * delete. */ - if ((ret == -ENOENT) || (ret == -EINVAL)) - ret = 0; + ret = 0; goto out; } ret = extract_attr_from_ios(ios, &attrs[0]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__); goto out; } WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); @@ -1109,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[1]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__); goto out; } if (attrs[1].len) { @@ -1124,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[2]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__); goto out; } if (attrs[2].len) { diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index b7442288860..dae884694bd 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -103,7 +103,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * - layout->group_width; + (layout->group_width - layout->parity); if (layout->parity) { unsigned stripe_length = (layout->group_width - layout->parity) * @@ -286,7 +286,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, if (length) { ore_calc_stripe_info(layout, offset, length, &ios->si); ios->length = ios->si.length; - ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + + ios->length + PAGE_SIZE - 1) / PAGE_SIZE; if (layout->parity) _ore_post_alloc_raid_stuff(ios); } @@ -430,8 +431,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) if (likely(!ret)) continue; - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ + if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) && + per_dev->bio) { + /* start read offset passed endof file. + * Note: if we do not have bio it means read-attributes + * In this case we should return error to caller. + */ _clear_bio(per_dev->bio); ORE_DBGMSG("start read offset passed end of file " "offset=0x%llx, length=0x%llx\n", @@ -536,6 +541,7 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, u64 H = LmodS - G * T; u32 N = div_u64(H, U); + u32 Nlast; /* "H - (N * U)" is just "H % U" so it's bound to u32 */ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; @@ -568,6 +574,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, si->length = T - H; if (si->length > length) si->length = length; + + Nlast = div_u64(H + si->length + U - 1, U); + si->maxdevUnits = Nlast - N; + si->M = M; } EXPORT_SYMBOL(ore_calc_stripe_info); @@ -583,13 +593,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, int ret; if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned nr_pages = ios->nr_pages * ios->layout->group_width / - (ios->layout->group_width - - ios->layout->parity); - unsigned bio_size = (nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned bio_size; + + if (!ios->reading) { + bio_size = ios->si.maxdevUnits; + } else { + bio_size = (ios->si.maxdevUnits + 1) * + (ios->layout->group_width - ios->layout->parity) / + ios->layout->group_width; + } + bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { @@ -609,8 +622,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) { - ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", - per_dev->bio->bi_vcnt); + /* If bi_vcnt == bi_max then this is a SW BUG */ + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " + "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", + per_dev->bio->bi_vcnt, + per_dev->bio->bi_max_vecs, + BIO_MAX_PAGES_KMALLOC, cur_len); ret = -ENOMEM; goto out; } @@ -1098,7 +1115,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, size_attr->attr = g_attr_logical_length; size_attr->attr.val_ptr = &size_attr->newsize; - ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", + ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 110b6b371a4..1b8001bbe94 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -148,13 +148,6 @@ ext2_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -189,19 +182,14 @@ ext2_get_acl(struct inode *inode, int type) /* * inode->i_mutex: down */ -static int -ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int +ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int name_index; void *value = NULL; size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - switch(type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -250,169 +238,21 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) int ext2_init_acl(struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext2_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) { - /* This is an extended ACL */ - error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext2_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error; + struct posix_acl *default_acl, *acl; + int error; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; - error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} -/* - * Extended attribut handlers - */ -static size_t -ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - - error = ext2_set_acl(dentry->d_inode, type, acl); - -release_and_out: - posix_acl_release(acl); + if (default_acl) { + error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } return error; } - -const struct xattr_handler ext2_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext2_xattr_list_acl_access, - .get = ext2_xattr_get_acl, - .set = ext2_xattr_set_acl, -}; - -const struct xattr_handler ext2_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext2_xattr_list_acl_default, - .get = ext2_xattr_get_acl, - .set = ext2_xattr_set_acl, -}; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 503bfb0ed79..44937f9fcf3 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -55,7 +55,7 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); -extern int ext2_acl_chmod (struct inode *); +extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); #else @@ -63,12 +63,6 @@ extern int ext2_init_acl (struct inode *, struct inode *); #define ext2_get_acl NULL #define ext2_set_acl NULL -static inline int -ext2_acl_chmod (struct inode *inode) -{ - return 0; -} - static inline int ext2_init_acl (struct inode *inode, struct inode *dir) { return 0; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a5b3a5db312..44c36e59076 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -103,5 +103,6 @@ const struct inode_operations ext2_file_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 8a337640a46..94ed36849b7 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1566,7 +1566,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) } setattr_copy(inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = ext2_acl_chmod(inode); + error = posix_acl_chmod(inode, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 256dd5f4c1c..c268d0af1db 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -421,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, .tmpfile = ext2_tmpfile, }; @@ -433,4 +434,5 @@ const struct inode_operations ext2_special_inode_operations = { #endif .setattr = ext2_setattr, .get_acl = ext2_get_acl, + .set_acl = ext2_set_acl, }; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 288534920fe..20d6697bd63 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1493,6 +1493,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; + tmp_bh.b_size = sb->s_blocksize; err = ext2_get_block(inode, blk, &tmp_bh, 1); if (err < 0) goto out; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 2d7557db3ae..91426141c33 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -103,8 +103,8 @@ static struct mb_cache *ext2_xattr_cache; static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL - [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, - [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext2_xattr_acl_default_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_SECURITY @@ -116,8 +116,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL - &ext2_xattr_acl_access_handler, - &ext2_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT2_FS_SECURITY &ext2_xattr_security_handler, diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 5e41cccff76..60edf298644 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -57,8 +57,6 @@ struct ext2_xattr_entry { extern const struct xattr_handler ext2_xattr_user_handler; extern const struct xattr_handler ext2_xattr_trusted_handler; -extern const struct xattr_handler ext2_xattr_acl_access_handler; -extern const struct xattr_handler ext2_xattr_acl_default_handler; extern const struct xattr_handler ext2_xattr_security_handler; extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index dbb5ad59a7f..8bbaf5bcf98 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -145,13 +145,6 @@ ext3_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -190,7 +183,7 @@ ext3_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext3_new_inode */ static int -ext3_set_acl(handle_t *handle, struct inode *inode, int type, +__ext3_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { int name_index; @@ -198,9 +191,6 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch(type) { case ACL_TYPE_ACCESS: name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -243,204 +233,49 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -/* - * Initialize the ACLs of a new inode. Called from ext3_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ int -ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext3_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (error < 0) - return error; - - if (error > 0) { - /* This is an extended ACL */ - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext3_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; handle_t *handle; - int retries = 0; - int error; + int error, retries = 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; retry: - handle = ext3_journal_start(inode, - EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext3_std_error(inode->i_sb, error); - goto out; - } - error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + error = __ext3_set_acl(handle, inode, type, acl); ext3_journal_stop(handle); - if (error == -ENOSPC && - ext3_should_retry_alloc(inode->i_sb, &retries)) + if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; -out: - posix_acl_release(acl); return error; } /* - * Extended attribute handlers + * Initialize the ACLs of a new inode. Called from ext3_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ -static size_t -ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) +int +ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext3_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - -retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - error = ext3_set_acl(handle, inode, type, acl); - ext3_journal_stop(handle); - if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; -release_and_out: - posix_acl_release(acl); + if (default_acl) { + error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); + } return error; } - -const struct xattr_handler ext3_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext3_xattr_list_acl_access, - .get = ext3_xattr_get_acl, - .set = ext3_xattr_set_acl, -}; - -const struct xattr_handler ext3_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext3_xattr_list_acl_default, - .get = ext3_xattr_get_acl, - .set = ext3_xattr_set_acl, -}; diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index dbc921e458c..ea1c69edab9 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -55,18 +55,13 @@ static inline int ext3_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext3_get_acl(struct inode *inode, int type); -extern int ext3_acl_chmod (struct inode *); +extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> #define ext3_get_acl NULL - -static inline int -ext3_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext3_set_acl NULL static inline int ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index bafdd48eefd..e66e4808719 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -309,43 +309,17 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); - while (fname) { - struct fname * old = fname; + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) + do { + struct fname *old = fname; fname = fname->next; - kfree (old); - } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } -} + kfree(old); + } while (fname); + *root = RB_ROOT; +} static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, loff_t pos) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 25cb413277e..aad05311392 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -75,6 +75,7 @@ const struct inode_operations ext3_file_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2bd85486b87..384b6ebb655 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -3365,7 +3365,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) mark_inode_dirty(inode); if (ia_valid & ATTR_MODE) - rc = ext3_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext3_std_error(inode->i_sb, error); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index f8cde46de9c..f197736dccf 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2569,6 +2569,7 @@ const struct inode_operations ext3_dir_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2580,4 +2581,5 @@ const struct inode_operations ext3_special_inode_operations = { .removexattr = generic_removexattr, #endif .get_acl = ext3_get_acl, + .set_acl = ext3_set_acl, }; diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index b1fc96383e0..c6874be6d58 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -102,8 +102,8 @@ static struct mb_cache *ext3_xattr_cache; static const struct xattr_handler *ext3_xattr_handler_map[] = { [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, - [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_SECURITY @@ -115,8 +115,8 @@ const struct xattr_handler *ext3_xattr_handlers[] = { &ext3_xattr_user_handler, &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL - &ext3_xattr_acl_access_handler, - &ext3_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT3_FS_SECURITY &ext3_xattr_security_handler, diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 2be4f69bfa6..32e93ebf803 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -60,8 +60,6 @@ struct ext3_xattr_entry { extern const struct xattr_handler ext3_xattr_user_handler; extern const struct xattr_handler ext3_xattr_trusted_handler; -extern const struct xattr_handler ext3_xattr_acl_access_handler; -extern const struct xattr_handler ext3_xattr_acl_default_handler; extern const struct xattr_handler ext3_xattr_security_handler; extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 39a54a0e9fe..d40c8dbbb0d 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -152,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type) struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -196,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext4_new_inode */ static int -ext4_set_acl(handle_t *handle, struct inode *inode, int type, +__ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { int name_index; @@ -204,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -248,208 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -/* - * Initialize the ACLs of a new inode. Called from ext4_new_inode. - * - * dir->i_mutex: down - * inode->i_mutex: up (access to inode is still exclusive) - */ int -ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) +ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - error = ext4_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (error < 0) - return error; - - if (error > 0) { - /* This is an extended ACL */ - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); - } - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. - * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext4_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; handle_t *handle; - int retries = 0; - int error; - + int error, retries = 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, ext4_jbd2_credits_xattr(inode)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext4_std_error(inode->i_sb, error); - goto out; - } - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + error = __ext4_set_acl(handle, inode, type, acl); ext4_journal_stop(handle); - if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; -out: - posix_acl_release(acl); return error; } /* - * Extended attribute handlers + * Initialize the ACLs of a new inode. Called from ext4_new_inode. + * + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ -static size_t -ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) +int +ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(dentry->d_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext4_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; -retry: - handle = ext4_journal_start(inode, EXT4_HT_XATTR, - ext4_jbd2_credits_xattr(inode)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto release_and_out; + if (default_acl) { + error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); } - error = ext4_set_acl(handle, inode, type, acl); - ext4_journal_stop(handle); - if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - -release_and_out: - posix_acl_release(acl); return error; } - -const struct xattr_handler ext4_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ext4_xattr_list_acl_access, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; - -const struct xattr_handler ext4_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ext4_xattr_list_acl_default, - .get = ext4_xattr_get_acl, - .set = ext4_xattr_set_acl, -}; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 18cb39ed7c7..da2c79577d7 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type); -extern int ext4_acl_chmod(struct inode *); +int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> #define ext4_get_acl NULL - -static inline int -ext4_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext4_set_acl NULL static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 3f11656bd72..41eb9dcfac7 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb) /* Called when the filesystem is unmounted */ void ext4_release_system_zone(struct super_block *sb) { - struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; - struct rb_node *parent; - struct ext4_system_zone *entry; + struct ext4_system_zone *entry, *n; - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - entry = rb_entry(n, struct ext4_system_zone, node); + rbtree_postorder_for_each_entry_safe(entry, n, + &EXT4_SB(sb)->system_blks, node) kmem_cache_free(ext4_system_zone_cachep, entry); - if (!parent) - EXT4_SB(sb)->system_blks = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } + EXT4_SB(sb)->system_blks = RB_ROOT; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 680bb338891..d638c57e996 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -353,41 +353,16 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if (n->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. - */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { struct fname *old = fname; fname = fname->next; kfree(old); } - if (!parent) - *root = RB_ROOT; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } + + *root = RB_ROOT; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e6185031c1c..ece55565b9c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -268,6 +268,16 @@ struct ext4_io_submit { /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 17ac112ab10..3fe29de832c 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -259,6 +259,15 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "journal_dirty_metadata failed: " + "handle type %u started at line %u, " + "credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); } } else { if (inode) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 35f65cf4f31..10cff4736b1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); + ext4_lblk_t last = lblock + len - 1; - if (len == 0) + if (lblock > last) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t pblock = 0; + ext4_lblk_t lblock = 0; + ext4_lblk_t prev = 0; + int len = 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; + + /* Check for overlapping extents */ + lblock = le32_to_cpu(ext->ee_block); + len = ext4_ext_get_actual_len(ext); + if ((lblock <= prev) && prev) { + pblock = ext4_ext_pblock(ext); + es->s_last_error_block = cpu_to_le64(pblock); + return 0; + } ext++; entries--; + prev = lblock + len - 1; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); @@ -1834,8 +1851,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, depth = ext_depth(inode); if (!path[depth].p_ext) goto out; - b2 = le32_to_cpu(path[depth].p_ext->ee_block); - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path @@ -1845,7 +1861,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ @@ -2504,7 +2520,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * extent, we have to mark the cluster as used (store negative * cluster number in partial_cluster). */ - unaligned = pblk & (sbi->s_cluster_ratio - 1); + unaligned = EXT4_PBLK_COFF(sbi, pblk); if (unaligned && (ee_len == num) && (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) *partial_cluster = EXT4_B2C(sbi, pblk); @@ -2598,7 +2614,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * accidentally freeing it later on */ pblk = ext4_ext_pblock(ex); - if (pblk & (sbi->s_cluster_ratio - 1)) + if (EXT4_PBLK_COFF(sbi, pblk)) *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); ex--; @@ -3461,7 +3477,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, WARN_ON(map->m_lblk < ee_block); /* * It is safe to convert extent to initialized via explicit - * zeroout only if extent is fully insde i_size or new_size. + * zeroout only if extent is fully inside i_size or new_size. */ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; @@ -3753,7 +3769,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; - lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; return ext4_find_delalloc_range(inode, lblk_start, lblk_end); @@ -3812,9 +3828,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); /* Check towards left side */ - c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start); if (c_offset) { - lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); lblk_to = lblk_from + c_offset - 1; if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) @@ -3822,7 +3838,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } /* Now check towards right. */ - c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); if (allocated_clusters && c_offset) { lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; @@ -4030,7 +4046,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); @@ -4048,8 +4064,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; - map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + - c_offset; + map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* @@ -4203,7 +4218,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); - cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned @@ -4271,7 +4286,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ - offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3da21945ff1..43e64f6022e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -617,6 +617,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bae987549dc..82edf5b9335 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -849,15 +849,16 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, handle_t *handle; struct page *page; struct ext4_iloc iloc; + int retries; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; +retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - handle = NULL; goto out; } @@ -867,7 +868,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, if (inline_size >= pos + len) { ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) - goto out; + goto out_journal; } if (ret == -ENOSPC) { @@ -875,6 +876,10 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, inode, flags, fsdata); + ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; goto out; } @@ -887,7 +892,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { ret = -ENOMEM; - goto out; + goto out_journal; } down_read(&EXT4_I(inode)->xattr_sem); @@ -904,16 +909,15 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, up_read(&EXT4_I(inode)->xattr_sem); *pagep = page; - handle = NULL; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); unlock_page(page); page_cache_release(page); +out_journal: + ext4_journal_stop(handle); out: - if (handle) - ext4_journal_stop(handle); brelse(iloc.bh); return ret; } @@ -1837,7 +1841,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle, { int error; struct ext4_xattr_entry *entry; - struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; @@ -1846,7 +1849,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle, return error; raw_inode = ext4_raw_inode(&iloc); - header = IHDR(inode, raw_inode); entry = (struct ext4_xattr_entry *)((void *)raw_inode + EXT4_I(inode)->i_inline_off); if (EXT4_XATTR_LEN(entry->e_name_len) + @@ -1924,9 +1926,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) } /* Clear the content within i_blocks. */ - if (i_size < EXT4_MIN_INLINE_DATA_SIZE) - memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, - EXT4_MIN_INLINE_DATA_SIZE - i_size); + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { + void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; + memset(p + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 07576347411..6e39895a91b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -144,8 +144,8 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ static int ext4_inode_is_fast_symlink(struct inode *inode) { - int ea_blocks = EXT4_I(inode)->i_file_acl ? - (inode->i_sb->s_blocksize >> 9) : 0; + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } @@ -1206,7 +1206,6 @@ static int ext4_journalled_write_end(struct file *file, */ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1218,7 +1217,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1238,10 +1236,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } return -ENOSPC; } ei->i_reserved_meta_blocks += md_needed; @@ -1255,7 +1249,6 @@ repeat: */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1277,7 +1270,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1297,10 +1289,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } @@ -1784,7 +1772,7 @@ static int __ext4_journalled_writepage(struct page *page, ret = err; if (!ext4_has_inline_data(inode)) - ext4_walk_page_buffers(handle, page_bufs, 0, len, + ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: @@ -3513,11 +3501,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (EXT4_SB(sb)->s_cluster_ratio > 1) { - /* TODO: Add support for bigalloc file systems */ - return -EOPNOTSUPP; - } - trace_ext4_punch_hole(inode, offset, length); /* @@ -4598,6 +4581,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } + + if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) + inode_inc_iversion(inode); + if (S_ISREG(inode->i_mode) && (attr->ia_size < inode->i_size)) { if (ext4_should_order_data(inode)) { @@ -4675,7 +4662,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_orphan_del(NULL, inode); if (!rc && (ia_valid & ATTR_MODE)) - rc = ext4_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext4_std_error(inode->i_sb, error); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 60589b60e9b..6bea80614d7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -101,9 +101,8 @@ static long swap_inode_boot_loader(struct super_block *sb, handle_t *handle; int err; struct inode *inode_bl; - struct ext4_inode_info *ei; struct ext4_inode_info *ei_bl; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = EXT4_SB(sb); if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { err = -EINVAL; @@ -115,9 +114,6 @@ static long swap_inode_boot_loader(struct super_block *sb, goto swap_boot_out; } - sbi = EXT4_SB(sb); - ei = EXT4_I(inode); - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); if (IS_ERR(inode_bl)) { err = PTR_ERR(inode_bl); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4d113efa024..04a5c7504be 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3442,6 +3442,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } @@ -3455,11 +3458,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, ext4_group_t grp; ext4_fsblk_t grp_blk; - if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) - return; - /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; @@ -4121,7 +4126,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; @@ -4663,7 +4668,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ - overflow = block & (sbi->s_cluster_ratio - 1); + overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; @@ -4677,7 +4682,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, count += overflow; } } - overflow = count & (sbi->s_cluster_ratio - 1); + overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5a0408d7b11..d050e043e88 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1425,9 +1425,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-EIO); } if (unlikely(ino == dir->i_ino)) { - EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir", - dentry->d_name.len, - dentry->d_name.name); + EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", + dentry); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); @@ -3225,6 +3224,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, }; @@ -3235,4 +3235,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, }; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c977f4e4e63..1f7784de05b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -792,7 +792,7 @@ static void ext4_put_super(struct super_block *sb) } ext4_es_unregister_shrinker(sbi); - del_timer(&sbi->s_err_report); + del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); @@ -3316,11 +3316,19 @@ int ext4_calculate_overhead(struct super_block *sb) } -static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) +static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; /* + * There's no need to reserve anything when we aren't using extents. + * The space estimates are exact, there are no unwritten extents, + * hole punching doesn't need new metadata... This is needed especially + * to keep ext2/3 backward compatibility. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return 0; + /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting @@ -3328,7 +3336,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ - resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> + EXT4_SB(sb)->s_cluster_bits; do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); @@ -4071,10 +4080,10 @@ no_journal: "available"); } - err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " - "reserved pool", ext4_calculate_resv_clusters(sbi)); + "reserved pool", ext4_calculate_resv_clusters(sb)); goto failed_mount4a; } @@ -4184,7 +4193,7 @@ failed_mount_wq: } failed_mount3: ext4_es_unregister_shrinker(sbi); - del_timer(&sbi->s_err_report); + del_timer_sync(&sbi->s_err_report); if (sbi->s_flex_groups) ext4_kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1423c4816a4..e175e94116a 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -95,8 +95,8 @@ static struct mb_cache *ext4_xattr_cache; static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, - [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -108,8 +108,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL - &ext4_xattr_acl_access_handler, - &ext4_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index c767dbdd7fc..819d6398833 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -96,8 +96,6 @@ struct ext4_xattr_ibody_find { extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; -extern const struct xattr_handler ext4_xattr_acl_access_handler; -extern const struct xattr_handler ext4_xattr_acl_default_handler; extern const struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 27a0820340b..2e35da12d29 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o -f2fs-y := dir.o file.o inode.o namei.o hash.o super.o +f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index d0fc287efef..fa8da4cb8c4 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -17,9 +17,6 @@ #include "xattr.h" #include "acl.h" -#define get_inode_mode(i) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ - (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) - static inline size_t f2fs_acl_size(int count) { if (count <= 4) { @@ -167,19 +164,11 @@ fail: struct posix_acl *f2fs_get_acl(struct inode *inode, int type) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; void *value = NULL; struct posix_acl *acl; int retval; - if (!test_opt(sbi, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - if (type == ACL_TYPE_ACCESS) name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -205,21 +194,15 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return acl; } -static int f2fs_set_acl(struct inode *inode, int type, +static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; int error; - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; @@ -261,154 +244,31 @@ static int f2fs_set_acl(struct inode *inode, int type, return error; } -int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) +int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(sbi, POSIX_ACL)) { - acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - - if (!test_opt(sbi, POSIX_ACL) || !acl) - goto cleanup; - - if (S_ISDIR(inode->i_mode)) { - error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl, ipage); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, ipage); -cleanup: - posix_acl_release(acl); - return error; + return __f2fs_set_acl(inode, type, acl, NULL); } -int f2fs_acl_chmod(struct inode *inode) +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct posix_acl *acl; - int error; - umode_t mode = get_inode_mode(inode); - - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(mode)) - return -EOPNOTSUPP; - - acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); + struct posix_acl *default_acl, *acl; + int error = 0; - error = posix_acl_chmod(&acl, GFP_KERNEL, mode); + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, NULL); - posix_acl_release(acl); - return error; -} - -static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const char *xname = POSIX_ACL_XATTR_DEFAULT; - size_t size; - - if (!test_opt(sbi, POSIX_ACL)) - return 0; - - if (type == ACL_TYPE_ACCESS) - xname = POSIX_ACL_XATTR_ACCESS; - - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = f2fs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else { - acl = NULL; + if (default_acl) { + error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, + ipage); + posix_acl_release(default_acl); + } + if (acl) { + if (error) + error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, + ipage); + posix_acl_release(acl); } - error = f2fs_set_acl(inode, type, acl, NULL); - -release_and_out: - posix_acl_release(acl); return error; } - -const struct xattr_handler f2fs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; - -const struct xattr_handler f2fs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 49633131e03..e0864651cdc 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -37,18 +37,13 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_acl_chmod(struct inode *); +extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *); #else #define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL -static inline int f2fs_acl_chmod(struct inode *inode) -{ - return 0; -} - static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *page) { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5716e5eb4e8..293d0486a40 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -30,7 +30,7 @@ static struct kmem_cache *inode_entry_slab; */ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: page = grab_cache_page(mapping, index); @@ -50,7 +50,7 @@ repeat: */ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page; repeat: page = grab_cache_page(mapping, index); @@ -61,11 +61,12 @@ repeat: if (PageUptodate(page)) goto out; - if (f2fs_readpage(sbi, page, index, READ_SYNC)) + if (f2fs_submit_page_bio(sbi, page, index, + READ_SYNC | REQ_META | REQ_PRIO)) goto repeat; lock_page(page); - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -81,13 +82,12 @@ static int f2fs_write_meta_page(struct page *page, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); /* Should not write any meta pages, if any IO error was occurred */ - if (wbc->for_reclaim || sbi->por_doing || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { - dec_page_count(sbi, F2FS_DIRTY_META); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } + if (unlikely(sbi->por_doing || + is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto redirty_out; + + if (wbc->for_reclaim) + goto redirty_out; wait_on_page_writeback(page); @@ -95,24 +95,31 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; + +redirty_out: + dec_page_count(sbi, F2FS_DIRTY_META); + wbc->pages_skipped++; + set_page_dirty(page); + return AOP_WRITEPAGE_ACTIVATE; } static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - struct block_device *bdev = sbi->sb->s_bdev; + int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); long written; if (wbc->for_kupdate) return 0; - if (get_pages(sbi, F2FS_DIRTY_META) == 0) + /* collect a number of dirty meta pages and write together */ + if (get_pages(sbi, F2FS_DIRTY_META) < nrpages) return 0; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); - written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev)); + written = sync_meta_pages(sbi, META, nrpages); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write -= written; return 0; @@ -121,7 +128,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = LONG_MAX; struct pagevec pvec; long nwritten = 0; @@ -136,7 +143,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) + if (unlikely(nr_pages == 0)) break; for (i = 0; i < nr_pages; i++) { @@ -149,7 +156,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, unlock_page(page); break; } - if (nwritten++ >= nr_to_write) + nwritten++; + if (unlikely(nwritten >= nr_to_write)) break; } pagevec_release(&pvec); @@ -157,7 +165,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, } if (nwritten) - f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX); + f2fs_submit_merged_bio(sbi, type, WRITE); return nwritten; } @@ -186,31 +194,24 @@ const struct address_space_operations f2fs_meta_aops = { int acquire_orphan_inode(struct f2fs_sb_info *sbi) { - unsigned int max_orphans; int err = 0; - /* - * considering 512 blocks in a segment 5 blocks are needed for cp - * and log segment summaries. Remaining blocks are used to keep - * orphan entries with the limitation one reserved segment - * for cp pack we can have max 1020*507 orphan entries - */ - max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK; - mutex_lock(&sbi->orphan_inode_mutex); - if (sbi->n_orphans >= max_orphans) + spin_lock(&sbi->orphan_inode_lock); + if (unlikely(sbi->n_orphans >= sbi->max_orphans)) err = -ENOSPC; else sbi->n_orphans++; - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); + return err; } void release_orphan_inode(struct f2fs_sb_info *sbi) { - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -218,27 +219,30 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct list_head *head, *this; struct orphan_inode_entry *new = NULL, *orphan = NULL; - mutex_lock(&sbi->orphan_inode_mutex); + new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); + new->ino = ino; + + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; list_for_each(this, head) { orphan = list_entry(this, struct orphan_inode_entry, list); - if (orphan->ino == ino) - goto out; + if (orphan->ino == ino) { + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, new); + return; + } + if (orphan->ino > ino) break; orphan = NULL; } - new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - new->ino = ino; - /* add new_oentry into list which is sorted by inode number */ if (orphan) list_add(&new->list, this->prev); else list_add_tail(&new->list, head); -out: - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -246,7 +250,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) struct list_head *head; struct orphan_inode_entry *orphan; - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { @@ -257,7 +261,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) break; } } - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -270,12 +274,12 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) iput(inode); } -int recover_orphan_inodes(struct f2fs_sb_info *sbi) +void recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blkaddr, i, j; if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) - return 0; + return; sbi->por_doing = true; start_blk = __start_cp_addr(sbi) + 1; @@ -295,29 +299,39 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); sbi->por_doing = false; - return 0; + return; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) { - struct list_head *head, *this, *next; + struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; - struct page *page = NULL; unsigned int nentries = 0; - unsigned short index = 1; - unsigned short orphan_blocks; - - orphan_blocks = (unsigned short)((sbi->n_orphans + + unsigned short index; + unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); + struct page *page = NULL; + struct orphan_inode_entry *orphan = NULL; + + for (index = 0; index < orphan_blocks; index++) + grab_meta_page(sbi, start_blk + index); - mutex_lock(&sbi->orphan_inode_mutex); + index = 1; + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; /* loop for each orphan inode entry and write them in Jornal block */ - list_for_each_safe(this, next, head) { - struct orphan_inode_entry *orphan; + list_for_each_entry(orphan, head, list) { + if (!page) { + page = find_get_page(META_MAPPING(sbi), start_blk++); + f2fs_bug_on(!page); + orphan_blk = + (struct f2fs_orphan_block *)page_address(page); + memset(orphan_blk, 0, sizeof(*orphan_blk)); + f2fs_put_page(page, 0); + } - orphan = list_entry(this, struct orphan_inode_entry, list); + orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); if (nentries == F2FS_ORPHANS_PER_BLOCK) { /* @@ -331,29 +345,20 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) set_page_dirty(page); f2fs_put_page(page, 1); index++; - start_blk++; nentries = 0; page = NULL; } - if (page) - goto page_exist; + } - page = grab_meta_page(sbi, start_blk); - orphan_blk = (struct f2fs_orphan_block *)page_address(page); - memset(orphan_blk, 0, sizeof(*orphan_blk)); -page_exist: - orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); + if (page) { + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); } - if (!page) - goto end; - orphan_blk->blk_addr = cpu_to_le16(index); - orphan_blk->blk_count = cpu_to_le16(orphan_blocks); - orphan_blk->entry_count = cpu_to_le32(nentries); - set_page_dirty(page); - f2fs_put_page(page, 1); -end: - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -428,7 +433,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); /* The second checkpoint pack should start at the next segment */ - cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + cp_start_blk_no += ((unsigned long long)1) << + le32_to_cpu(fsb->log_blocks_per_seg); cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); if (cp1 && cp2) { @@ -465,7 +471,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) list_for_each(this, head) { struct dir_inode_entry *entry; entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) + if (unlikely(entry->inode == inode)) return -EEXIST; } list_add_tail(&new->list, head); @@ -513,8 +519,8 @@ void add_dirty_dir_inode(struct inode *inode) void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + + struct list_head *this, *head; if (!S_ISDIR(inode->i_mode)) return; @@ -525,6 +531,7 @@ void remove_dirty_dir_inode(struct inode *inode) return; } + head = &sbi->dir_inode_list; list_for_each(this, head) { struct dir_inode_entry *entry; entry = list_entry(this, struct dir_inode_entry, list); @@ -546,11 +553,13 @@ void remove_dirty_dir_inode(struct inode *inode) struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + + struct list_head *this, *head; struct inode *inode = NULL; spin_lock(&sbi->dir_inode_lock); + + head = &sbi->dir_inode_list; list_for_each(this, head) { struct dir_inode_entry *entry; entry = list_entry(this, struct dir_inode_entry, list); @@ -565,11 +574,13 @@ struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) { - struct list_head *head = &sbi->dir_inode_list; + struct list_head *head; struct dir_inode_entry *entry; struct inode *inode; retry: spin_lock(&sbi->dir_inode_lock); + + head = &sbi->dir_inode_list; if (list_empty(head)) { spin_unlock(&sbi->dir_inode_lock); return; @@ -585,7 +596,7 @@ retry: * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_bio(sbi, DATA, WRITE); } goto retry; } @@ -760,8 +771,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* wait for previous submitted node/meta pages writeback */ wait_on_all_pages_writeback(sbi); - filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); - filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX); + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; @@ -770,7 +781,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { clear_prefree_segments(sbi); F2FS_RESET_SB_DIRT(sbi); } @@ -791,9 +802,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); - f2fs_submit_bio(sbi, DATA, true); - f2fs_submit_bio(sbi, NODE, true); - f2fs_submit_bio(sbi, META, true); + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); /* * update checkpoint pack index @@ -818,20 +829,28 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) void init_orphan_info(struct f2fs_sb_info *sbi) { - mutex_init(&sbi->orphan_inode_mutex); + spin_lock_init(&sbi->orphan_inode_lock); INIT_LIST_HEAD(&sbi->orphan_inode_list); sbi->n_orphans = 0; + /* + * considering 512 blocks in a segment 8 blocks are needed for cp + * and log segment summaries. Remaining blocks are used to keep + * orphan entries with the limitation one reserved segment + * for cp pack we can have max 1020*504 orphan entries + */ + sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) + * F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", sizeof(struct orphan_inode_entry), NULL); - if (unlikely(!orphan_entry_slab)) + if (!orphan_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", sizeof(struct dir_inode_entry), NULL); - if (unlikely(!inode_entry_slab)) { + if (!inode_entry_slab) { kmem_cache_destroy(orphan_entry_slab); return -ENOMEM; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa3438c571f..0ae55872350 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -24,6 +24,195 @@ #include "segment.h" #include <trace/events/f2fs.h> +static void f2fs_read_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (unlikely(!uptodate)) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +static void f2fs_write_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct f2fs_sb_info *sbi = F2FS_SB(bvec->bv_page->mapping->host->i_sb); + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (unlikely(!uptodate)) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + } + end_page_writeback(page); + dec_page_count(sbi, F2FS_WRITEBACK); + } while (bvec >= bio->bi_io_vec); + + if (bio->bi_private) + complete(bio->bi_private); + + if (!get_pages(sbi, F2FS_WRITEBACK) && + !list_empty(&sbi->cp_wait.task_list)) + wake_up(&sbi->cp_wait); + + bio_put(bio); +} + +/* + * Low-level block read/write IO operations. + */ +static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + int npages, bool is_read) +{ + struct bio *bio; + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + + bio->bi_bdev = sbi->sb->s_bdev; + bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + + return bio; +} + +static void __submit_merged_bio(struct f2fs_bio_info *io) +{ + struct f2fs_io_info *fio = &io->fio; + int rw; + + if (!io->bio) + return; + + rw = fio->rw; + + if (is_read_io(rw)) { + trace_f2fs_submit_read_bio(io->sbi->sb, rw, + fio->type, io->bio); + submit_bio(rw, io->bio); + } else { + trace_f2fs_submit_write_bio(io->sbi->sb, rw, + fio->type, io->bio); + /* + * META_FLUSH is only from the checkpoint procedure, and we + * should wait this metadata bio for FS consistency. + */ + if (fio->type == META_FLUSH) { + DECLARE_COMPLETION_ONSTACK(wait); + io->bio->bi_private = &wait; + submit_bio(rw, io->bio); + wait_for_completion(&wait); + } else { + submit_bio(rw, io->bio); + } + } + + io->bio = NULL; +} + +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + enum page_type type, int rw) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io; + + io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + + mutex_lock(&io->io_mutex); + + /* change META to META_FLUSH in the checkpoint procedure */ + if (type >= META_FLUSH) { + io->fio.type = META_FLUSH; + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + } + __submit_merged_bio(io); + mutex_unlock(&io->io_mutex); +} + +/* + * Fill the locked page with data located in the block address. + * Return unlocked page. + */ +int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, int rw) +{ + struct bio *bio; + + trace_f2fs_submit_page_bio(page, blk_addr, rw); + + /* Allocate a new bio */ + bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw)); + + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + bio_put(bio); + f2fs_put_page(page, 1); + return -EFAULT; + } + + submit_bio(rw, bio); + return 0; +} + +void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, struct f2fs_io_info *fio) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); + struct f2fs_bio_info *io; + bool is_read = is_read_io(fio->rw); + + io = is_read ? &sbi->read_io : &sbi->write_io[btype]; + + verify_block_addr(sbi, blk_addr); + + mutex_lock(&io->io_mutex); + + if (!is_read) + inc_page_count(sbi, F2FS_WRITEBACK); + + if (io->bio && (io->last_block_in_bio != blk_addr - 1 || + io->fio.rw != fio->rw)) + __submit_merged_bio(io); +alloc_new: + if (io->bio == NULL) { + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); + io->fio = *fio; + } + + if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + __submit_merged_bio(io); + goto alloc_new; + } + + io->last_block_in_bio = blk_addr; + + mutex_unlock(&io->io_mutex); + trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -37,7 +226,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) struct page *node_page = dn->node_page; unsigned int ofs_in_node = dn->ofs_in_node; - f2fs_wait_on_page_writeback(node_page, NODE, false); + f2fs_wait_on_page_writeback(node_page, NODE); rn = F2FS_NODE(node_page); @@ -51,19 +240,39 @@ int reserve_new_block(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; - if (!inc_valid_block_count(sbi, dn->inode, 1)) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) return -ENOSPC; trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); __set_data_blkaddr(dn, NEW_ADDR); dn->data_blkaddr = NEW_ADDR; + mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; } +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) +{ + bool need_put = dn->inode_page ? false : true; + int err; + + /* if inode_page exists, index should be zero */ + f2fs_bug_on(!need_put && index); + + err = get_dnode_of_data(dn, index, ALLOC_NODE); + if (err) + return err; + + if (dn->data_blkaddr == NULL_ADDR) + err = reserve_new_block(dn); + if (err || need_put) + f2fs_put_dnode(dn); + return err; +} + static int check_extent_cache(struct inode *inode, pgoff_t pgofs, struct buffer_head *bh_result) { @@ -71,6 +280,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, pgoff_t start_fofs, end_fofs; block_t start_blkaddr; + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return 0; + read_lock(&fi->ext.ext_lock); if (fi->ext.len == 0) { read_unlock(&fi->ext.ext_lock); @@ -109,6 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs, start_fofs, end_fofs; block_t start_blkaddr, end_blkaddr; + int need_update = true; f2fs_bug_on(blk_addr == NEW_ADDR); fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + @@ -117,6 +330,9 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) /* Update the page address in the parent node */ __set_data_blkaddr(dn, blk_addr); + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return; + write_lock(&fi->ext.ext_lock); start_fofs = fi->ext.fofs; @@ -163,14 +379,21 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) fofs - start_fofs + 1; fi->ext.len -= fofs - start_fofs + 1; } - goto end_update; + } else { + need_update = false; } - write_unlock(&fi->ext.ext_lock); - return; + /* Finally, if the extent is very fragmented, let's drop the cache. */ + if (fi->ext.len < F2FS_MIN_EXTENT_LEN) { + fi->ext.len = 0; + set_inode_flag(fi, FI_NO_EXTENT); + need_update = true; + } end_update: write_unlock(&fi->ext.ext_lock); - sync_inode_page(dn); + if (need_update) + sync_inode_page(dn); + return; } struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) @@ -196,7 +419,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return ERR_PTR(-ENOENT); /* By fallocate(), there is no cached page, but with NEW_ADDR */ - if (dn.data_blkaddr == NEW_ADDR) + if (unlikely(dn.data_blkaddr == NEW_ADDR)) return ERR_PTR(-EINVAL); page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -208,11 +431,14 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return page; } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, sync ? READ_SYNC : READA); + if (err) + return ERR_PTR(err); + if (sync) { wait_on_page_locked(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 0); return ERR_PTR(-EIO); } @@ -246,7 +472,7 @@ repeat: } f2fs_put_dnode(&dn); - if (dn.data_blkaddr == NULL_ADDR) { + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); } @@ -266,16 +492,16 @@ repeat: return page; } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); if (err) return ERR_PTR(err); lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -286,12 +512,12 @@ repeat: * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. * - * Also, caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). - * Note that, npage is set only by make_empty_dir. + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + * Note that, ipage is set only by make_empty_dir. */ struct page *get_new_data_page(struct inode *inode, - struct page *npage, pgoff_t index, bool new_i_size) + struct page *ipage, pgoff_t index, bool new_i_size) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -299,24 +525,16 @@ struct page *get_new_data_page(struct inode *inode, struct dnode_of_data dn; int err; - set_new_dnode(&dn, inode, npage, npage, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, index); if (err) return ERR_PTR(err); - - if (dn.data_blkaddr == NULL_ADDR) { - if (reserve_new_block(&dn)) { - if (!npage) - f2fs_put_dnode(&dn); - return ERR_PTR(-ENOSPC); - } - } - if (!npage) - f2fs_put_dnode(&dn); repeat: page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + if (!page) { + err = -ENOMEM; + goto put_err; + } if (PageUptodate(page)) return page; @@ -325,15 +543,18 @@ repeat: zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); if (err) - return ERR_PTR(err); + goto put_err; + lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + err = -EIO; + goto put_err; } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -344,140 +565,187 @@ repeat: i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); /* Only the directory inode sets new_i_size */ set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - mark_inode_dirty_sync(inode); } return page; -} - -static void read_end_io(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - bio_put(bio); +put_err: + f2fs_put_dnode(&dn); + return ERR_PTR(err); } -/* - * Fill the locked page with data located in the block address. - * Return unlocked page. - */ -int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, int type) +static int __allocate_data_block(struct dnode_of_data *dn) { - struct block_device *bdev = sbi->sb->s_bdev; - struct bio *bio; + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_summary sum; + block_t new_blkaddr; + struct node_info ni; + int type; - trace_f2fs_readpage(page, blk_addr, type); + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + return -EPERM; + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + return -ENOSPC; - down_read(&sbi->bio_sem); + __set_data_blkaddr(dn, NEW_ADDR); + dn->data_blkaddr = NEW_ADDR; - /* Allocate a new bio */ - bio = f2fs_bio_alloc(bdev, 1); + get_node_info(sbi, dn->nid, &ni); + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - /* Initialize the bio */ - bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - bio->bi_end_io = read_end_io; + type = CURSEG_WARM_DATA; - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - bio_put(bio); - up_read(&sbi->bio_sem); - f2fs_put_page(page, 1); - return -EFAULT; - } + allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type); - submit_bio(type, bio); - up_read(&sbi->bio_sem); + /* direct IO doesn't use extent cache to maximize the performance */ + set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + update_extent_cache(new_blkaddr, dn); + clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + + dn->data_blkaddr = new_blkaddr; return 0; } /* - * This function should be used by the data read flow only where it - * does not check the "create" flag that indicates block allocation. - * The reason for this special functionality is to exploit VFS readahead - * mechanism. + * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh. + * If original data blocks are allocated, then give them to blockdev. + * Otherwise, + * a. preallocate requested block addresses + * b. do not use extent cache for better performance + * c. give the block addresses to blockdev */ -static int get_data_block_ro(struct inode *inode, sector_t iblock, +static int get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; unsigned maxblocks = bh_result->b_size >> blkbits; struct dnode_of_data dn; - pgoff_t pgofs; - int err; + int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; + pgoff_t pgofs, end_offset; + int err = 0, ofs = 1; + bool allocated = false; /* Get the page offset from the block offset(iblock) */ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); - if (check_extent_cache(inode, pgofs, bh_result)) { - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); - return 0; - } + if (check_extent_cache(inode, pgofs, bh_result)) + goto out; + + if (create) + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, mode); if (err) { - trace_f2fs_get_data_block(inode, iblock, bh_result, err); - return (err == -ENOENT) ? 0 : err; + if (err == -ENOENT) + err = 0; + goto unlock_out; + } + if (dn.data_blkaddr == NEW_ADDR) + goto put_out; + + if (dn.data_blkaddr != NULL_ADDR) { + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else if (create) { + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else { + goto put_out; } - /* It does not support data allocation */ - f2fs_bug_on(create); + end_offset = IS_INODE(dn.node_page) ? + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + bh_result->b_size = (((size_t)1) << blkbits); + dn.ofs_in_node++; + pgofs++; + +get_next: + if (dn.ofs_in_node >= end_offset) { + if (allocated) + sync_inode_page(&dn); + allocated = false; + f2fs_put_dnode(&dn); - if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { - int i; - unsigned int end_offset; + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, mode); + if (err) { + if (err == -ENOENT) + err = 0; + goto unlock_out; + } + if (dn.data_blkaddr == NEW_ADDR) + goto put_out; end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE(F2FS_I(inode)) : - ADDRS_PER_BLOCK; - - clear_buffer_new(bh_result); + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + } + if (maxblocks > (bh_result->b_size >> blkbits)) { + block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (blkaddr == NULL_ADDR && create) { + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + blkaddr = dn.data_blkaddr; + } /* Give more consecutive addresses for the read ahead */ - for (i = 0; i < end_offset - dn.ofs_in_node; i++) - if (((datablock_addr(dn.node_page, - dn.ofs_in_node + i)) - != (dn.data_blkaddr + i)) || maxblocks == i) - break; - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); - bh_result->b_size = (i << blkbits); + if (blkaddr == (bh_result->b_blocknr + ofs)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + bh_result->b_size += (((size_t)1) << blkbits); + goto get_next; + } } +sync_out: + if (allocated) + sync_inode_page(&dn); +put_out: f2fs_put_dnode(&dn); - trace_f2fs_get_data_block(inode, iblock, bh_result, 0); - return 0; +unlock_out: + if (create) + f2fs_unlock_op(sbi); +out: + trace_f2fs_get_data_block(inode, iblock, bh_result, err); + return err; } static int f2fs_read_data_page(struct file *file, struct page *page) { - return mpage_readpage(page, get_data_block_ro); + struct inode *inode = page->mapping->host; + int ret; + + /* If the file has inline data, try to read it directlly */ + if (f2fs_has_inline_data(inode)) + ret = f2fs_read_inline_data(inode, page); + else + ret = mpage_readpage(page, get_data_block); + + return ret; } static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); + struct inode *inode = file->f_mapping->host; + + /* If the file has inline data, skip readpages */ + if (f2fs_has_inline_data(inode)) + return 0; + + return mpage_readpages(mapping, pages, nr_pages, get_data_block); } -int do_write_data_page(struct page *page) +int do_write_data_page(struct page *page, struct f2fs_io_info *fio) { struct inode *inode = page->mapping->host; - block_t old_blk_addr, new_blk_addr; + block_t old_blkaddr, new_blkaddr; struct dnode_of_data dn; int err = 0; @@ -486,10 +754,10 @@ int do_write_data_page(struct page *page) if (err) return err; - old_blk_addr = dn.data_blkaddr; + old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (old_blk_addr == NULL_ADDR) + if (old_blkaddr == NULL_ADDR) goto out_writepage; set_page_writeback(page); @@ -498,15 +766,13 @@ int do_write_data_page(struct page *page) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(old_blk_addr != NEW_ADDR && + if (unlikely(old_blkaddr != NEW_ADDR && !is_cold_data(page) && need_inplace_update(inode))) { - rewrite_data_page(F2FS_SB(inode->i_sb), page, - old_blk_addr); + rewrite_data_page(page, old_blkaddr, fio); } else { - write_data_page(inode, page, &dn, - old_blk_addr, &new_blk_addr); - update_extent_cache(new_blk_addr, &dn); + write_data_page(page, &dn, &new_blkaddr, fio); + update_extent_cache(new_blkaddr, &dn); } out_writepage: f2fs_put_dnode(&dn); @@ -521,9 +787,13 @@ static int f2fs_write_data_page(struct page *page, loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; - unsigned offset; + unsigned offset = 0; bool need_balance_fs = false; int err = 0; + struct f2fs_io_info fio = { + .type = DATA, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; if (page->index < end_index) goto write; @@ -543,7 +813,7 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: - if (sbi->por_doing) { + if (unlikely(sbi->por_doing)) { err = AOP_WRITEPAGE_ACTIVATE; goto redirty_out; } @@ -552,10 +822,18 @@ write: if (S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); - err = do_write_data_page(page); + err = do_write_data_page(page, &fio); } else { f2fs_lock_op(sbi); - err = do_write_data_page(page); + + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) { + err = f2fs_write_inline_data(inode, page, offset); + f2fs_unlock_op(sbi); + goto out; + } else { + err = do_write_data_page(page, &fio); + } + f2fs_unlock_op(sbi); need_balance_fs = true; } @@ -564,8 +842,10 @@ write: else if (err) goto redirty_out; - if (wbc->for_reclaim) - f2fs_submit_bio(sbi, DATA, true); + if (wbc->for_reclaim) { + f2fs_submit_merged_bio(sbi, DATA, WRITE); + need_balance_fs = false; + } clear_cold_data(page); out: @@ -617,7 +897,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); + + f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_dir_inode(inode); @@ -638,27 +919,28 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_balance_fs(sbi); repeat: + err = f2fs_convert_inline_data(inode, pos + len); + if (err) + return err; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; - f2fs_lock_op(sbi); + if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA) + goto inline_data; + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (err) - goto err; - - if (dn.data_blkaddr == NULL_ADDR) - err = reserve_new_block(&dn); - - f2fs_put_dnode(&dn); - if (err) - goto err; - + err = f2fs_reserve_block(&dn, index); f2fs_unlock_op(sbi); + if (err) { + f2fs_put_page(page, 1); + return err; + } +inline_data: if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -674,15 +956,19 @@ repeat: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); + if (f2fs_has_inline_data(inode)) + err = f2fs_read_inline_data(inode, page); + else + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); if (err) return err; lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return -EIO; } - if (page->mapping != mapping) { + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } @@ -691,11 +977,6 @@ out: SetPageUptodate(page); clear_cold_data(page); return 0; - -err: - f2fs_unlock_op(sbi); - f2fs_put_page(page, 1); - return err; } static int f2fs_write_end(struct file *file, @@ -714,23 +995,43 @@ static int f2fs_write_end(struct file *file, update_inode_page(inode); } - unlock_page(page); - page_cache_release(page); + f2fs_put_page(page, 1); return copied; } +static int check_direct_IO(struct inode *inode, int rw, + const struct iovec *iov, loff_t offset, unsigned long nr_segs) +{ + unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; + int i; + + if (rw == READ) + return 0; + + if (offset & blocksize_mask) + return -EINVAL; + + for (i = 0; i < nr_segs; i++) + if (iov[i].iov_len & blocksize_mask) + return -EINVAL; + return 0; +} + static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - if (rw == WRITE) + /* Let buffer I/O handle the inline data case. */ + if (f2fs_has_inline_data(inode)) + return 0; + + if (check_direct_IO(inode, rw, iov, offset, nr_segs)) return 0; - /* Needs synchronization with the cleaner */ return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - get_data_block_ro); + get_data_block); } static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, @@ -759,6 +1060,8 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); SetPageUptodate(page); + mark_inode_dirty(inode); + if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); set_dirty_dir_page(inode, page); @@ -769,7 +1072,7 @@ static int f2fs_set_data_page_dirty(struct page *page) static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { - return generic_block_bmap(mapping, block, get_data_block_ro); + return generic_block_bmap(mapping, block, get_data_block); } const struct address_space_operations f2fs_dblock_aops = { diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a84b0a8e685..3de9d20d0c1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -24,7 +24,7 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static struct dentry *debugfs_root; +static struct dentry *f2fs_debugfs_root; static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) @@ -45,14 +45,15 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_count = valid_user_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); + si->inline_inode = sbi->inline_inode; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = sbi->node_inode->i_mapping->nrpages; - si->meta_pages = sbi->meta_inode->i_mapping->nrpages; + si->node_pages = NODE_MAPPING(sbi)->nrpages; + si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->sits = SIT_I(sbi)->dirty_sentries; si->fnids = NM_I(sbi)->fcnt; @@ -165,9 +166,9 @@ get_cache: /* free nids */ si->cache_mem = NM_I(sbi)->fcnt; si->cache_mem += NM_I(sbi)->nat_cnt; - npages = sbi->node_inode->i_mapping->nrpages; + npages = NODE_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; - npages = sbi->meta_inode->i_mapping->nrpages; + npages = META_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); @@ -200,6 +201,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "Other: %u)\n - Data: %u\n", si->valid_node_count - si->valid_inode_count, si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_data Inode: %u\n", + si->inline_inode); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -242,14 +245,14 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node blocks : %d\n", si->node_blks); seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", si->hit_ext, si->total_ext); - seq_printf(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - nodes %4d in %4d\n", + seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents %4d in dirs:%4d\n", + seq_printf(s, " - dents: %4d in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta %4d in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs %5d > %lu\n", + seq_printf(s, " - NATs: %5d > %lu\n", si->nats, NM_WOUT_THRESHOLD); seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", si->sits, si->fnids); @@ -340,14 +343,32 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) void __init f2fs_create_root_stats(void) { - debugfs_root = debugfs_create_dir("f2fs", NULL); - if (debugfs_root) - debugfs_create_file("status", S_IRUGO, debugfs_root, - NULL, &stat_fops); + struct dentry *file; + + f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); + if (!f2fs_debugfs_root) + goto bail; + + file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, + NULL, &stat_fops); + if (!file) + goto free_debugfs_dir; + + return; + +free_debugfs_dir: + debugfs_remove(f2fs_debugfs_root); + +bail: + f2fs_debugfs_root = NULL; + return; } void f2fs_destroy_root_stats(void) { - debugfs_remove_recursive(debugfs_root); - debugfs_root = NULL; + if (!f2fs_debugfs_root) + return; + + debugfs_remove_recursive(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 594fc1bb64e..2b7c255bcbd 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -190,9 +190,6 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; - if (namelen > F2FS_NAME_LEN) - return NULL; - if (npages == 0) return NULL; @@ -259,20 +256,17 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, dir->i_mtime = dir->i_ctime = CURRENT_TIME; mark_inode_dirty(dir); - /* update parent inode number before releasing dentry page */ - F2FS_I(inode)->i_pino = dir->i_ino; - f2fs_put_page(page, 1); } static void init_dent_inode(const struct qstr *name, struct page *ipage) { - struct f2fs_node *rn; + struct f2fs_inode *ri; /* copy name info. to this inode page */ - rn = F2FS_NODE(ipage); - rn->i.i_namelen = cpu_to_le32(name->len); - memcpy(rn->i.i_name, name->name, name->len); + ri = F2FS_INODE(ipage); + ri->i_namelen = cpu_to_le32(name->len); + memcpy(ri->i_name, name->name, name->len); set_page_dirty(ipage); } @@ -348,11 +342,11 @@ static struct page *init_inode_metadata(struct inode *inode, err = f2fs_init_acl(inode, dir, page); if (err) - goto error; + goto put_error; err = f2fs_init_security(inode, dir, name, page); if (err) - goto error; + goto put_error; wait_on_page_writeback(page); } else { @@ -376,8 +370,9 @@ static struct page *init_inode_metadata(struct inode *inode, } return page; -error: +put_error: f2fs_put_page(page, 1); +error: remove_inode_page(inode); return ERR_PTR(err); } @@ -393,6 +388,8 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + if (F2FS_I(dir)->i_current_depth != current_depth) { F2FS_I(dir)->i_current_depth = current_depth; set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); @@ -400,8 +397,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) update_inode_page(dir); - else - mark_inode_dirty(dir); if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); @@ -432,10 +427,11 @@ next: } /* - * Caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode) { unsigned int bit_pos; unsigned int level; @@ -461,7 +457,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in } start: - if (current_depth == MAX_DIR_HASH_DEPTH) + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; /* Increase the depth, if required */ @@ -554,14 +550,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; - if (inode && S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - update_inode_page(dir); - } else { - mark_inode_dirty(dir); - } - if (inode) { + if (S_ISDIR(inode->i_mode)) { + drop_nlink(dir); + update_inode_page(dir); + } inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { @@ -636,7 +629,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); - for ( ; n < npages; n++) { + for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n); if (IS_ERR(dentry_page)) continue; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 89dc7508faf..fc3c558cb4f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,8 +22,10 @@ #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(condition) BUG_ON(condition) +#define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else #define f2fs_bug_on(condition) +#define f2fs_down_write(x, y) down_write(x) #endif /* @@ -37,6 +39,7 @@ #define F2FS_MOUNT_POSIX_ACL 0x00000020 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 #define F2FS_MOUNT_INLINE_XATTR 0x00000080 +#define F2FS_MOUNT_INLINE_DATA 0x00000100 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -97,6 +100,13 @@ struct dir_inode_entry { struct inode *inode; /* vfs inode pointer */ }; +/* for the list of blockaddresses to be discarded */ +struct discard_entry { + struct list_head list; /* list head */ + block_t blkaddr; /* block address to be discarded */ + int len; /* # of consecutive blocks of the discard */ +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ @@ -155,13 +165,15 @@ enum { LOOKUP_NODE, /* look up a node without readahead */ LOOKUP_NODE_RA, /* * look up a node with readahead called - * by get_datablock_ro. + * by get_data_block. */ }; #define F2FS_LINK_MAX 32000 /* maximum link count per file */ /* for in-memory extent cache entry */ +#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */ + struct extent_info { rwlock_t ext_lock; /* rwlock for consistency */ unsigned int fofs; /* start offset in a file */ @@ -308,6 +320,14 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; + + /* for small discard management */ + struct list_head discard_list; /* 4KB discard list */ + int nr_discards; /* # of discards in the list */ + int max_discards; /* max. discards to be issued */ + + unsigned int ipu_policy; /* in-place-update policy */ + unsigned int min_ipu_util; /* in-place-update threshold */ }; /* @@ -338,6 +358,7 @@ enum count_type { * with waiting the bio's completion * ... Only can be used with META. */ +#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) enum page_type { DATA, NODE, @@ -346,6 +367,20 @@ enum page_type { META_FLUSH, }; +struct f2fs_io_info { + enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ +}; + +#define is_read_io(rw) (((rw) & 1) == READ) +struct f2fs_bio_info { + struct f2fs_sb_info *sbi; /* f2fs superblock */ + struct bio *bio; /* bios to merge */ + sector_t last_block_in_bio; /* last block number */ + struct f2fs_io_info fio; /* store buffered io info. */ + struct mutex io_mutex; /* mutex for bio */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -359,9 +394,10 @@ struct f2fs_sb_info { /* for segment-related operations */ struct f2fs_sm_info *sm_info; /* segment manager */ - struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ - sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ - struct rw_semaphore bio_sem; /* IO semaphore */ + + /* for bio operations */ + struct f2fs_bio_info read_io; /* for read bios */ + struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -376,8 +412,9 @@ struct f2fs_sb_info { /* for orphan inode management */ struct list_head orphan_inode_list; /* orphan inode list */ - struct mutex orphan_inode_mutex; /* for orphan inode list */ + spinlock_t orphan_inode_lock; /* for orphan inode list */ unsigned int n_orphans; /* # of orphan inodes */ + unsigned int max_orphans; /* max orphan inodes */ /* for directory inode management */ struct list_head dir_inode_list; /* dir inode list */ @@ -414,6 +451,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* maximum # of trials to find a victim segment for SSR and GC */ + unsigned int max_victim_search; + /* * for stat information. * one is for the LFS mode, and the other is for the SSR mode. @@ -423,6 +463,7 @@ struct f2fs_sb_info { unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + int inline_inode; /* # of inline_data inodes */ int bg_gc; /* background gc calls */ unsigned int n_dirty_dirs; /* # of dir inodes */ #endif @@ -462,6 +503,11 @@ static inline struct f2fs_node *F2FS_NODE(struct page *page) return (struct f2fs_node *)page_address(page); } +static inline struct f2fs_inode *F2FS_INODE(struct page *page) +{ + return &((struct f2fs_node *)page_address(page))->i; +} + static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_nm_info *)(sbi->nm_info); @@ -487,6 +533,16 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); } +static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->meta_inode->i_mapping; +} + +static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->node_inode->i_mapping; +} + static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) { sbi->s_dirty = 1; @@ -534,7 +590,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - down_write_nest_lock(&sbi->cp_rwsem, &sbi->cp_mutex); + f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) @@ -548,7 +604,7 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { WARN_ON((nid >= NM_I(sbi)->max_nid)); - if (nid >= NM_I(sbi)->max_nid) + if (unlikely(nid >= NM_I(sbi)->max_nid)) return -EINVAL; return 0; } @@ -561,9 +617,9 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) static inline int F2FS_HAS_BLOCKS(struct inode *inode) { if (F2FS_I(inode)->i_xattr_nid) - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; else - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; } static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, @@ -574,7 +630,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + (block_t)count; - if (valid_block_count > sbi->user_block_count) { + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } @@ -585,7 +641,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, return true; } -static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, +static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t count) { @@ -595,7 +651,6 @@ static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - return 0; } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -686,50 +741,48 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) } static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode) { block_t valid_block_count; unsigned int valid_node_count; spin_lock(&sbi->stat_lock); - valid_block_count = sbi->total_valid_block_count + (block_t)count; - sbi->alloc_valid_block_count += (block_t)count; - valid_node_count = sbi->total_valid_node_count + count; - - if (valid_block_count > sbi->user_block_count) { + valid_block_count = sbi->total_valid_block_count + 1; + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } - if (valid_node_count > sbi->total_node_count) { + valid_node_count = sbi->total_valid_node_count + 1; + if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); return false; } if (inode) - inode->i_blocks += count; - sbi->total_valid_node_count = valid_node_count; - sbi->total_valid_block_count = valid_block_count; + inode->i_blocks++; + + sbi->alloc_valid_block_count++; + sbi->total_valid_node_count++; + sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); return true; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode) { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi->total_valid_block_count < count); - f2fs_bug_on(sbi->total_valid_node_count < count); - f2fs_bug_on(inode->i_blocks < count); + f2fs_bug_on(!sbi->total_valid_block_count); + f2fs_bug_on(!sbi->total_valid_node_count); + f2fs_bug_on(!inode->i_blocks); - inode->i_blocks -= count; - sbi->total_valid_node_count -= count; - sbi->total_valid_block_count -= (block_t)count; + inode->i_blocks--; + sbi->total_valid_node_count--; + sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); } @@ -751,13 +804,12 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) spin_unlock(&sbi->stat_lock); } -static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) +static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); f2fs_bug_on(!sbi->total_valid_inode_count); sbi->total_valid_inode_count--; spin_unlock(&sbi->stat_lock); - return 0; } static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) @@ -771,7 +823,7 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) static inline void f2fs_put_page(struct page *page, int unlock) { - if (!page || IS_ERR(page)) + if (!page) return; if (unlock) { @@ -876,7 +928,9 @@ enum { FI_NO_ALLOC, /* should not allocate any blocks */ FI_UPDATE_DIR, /* should update inode block for consistency */ FI_DELAY_IPUT, /* used for the recovery */ + FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -914,6 +968,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi, { if (ri->i_inline & F2FS_INLINE_XATTR) set_inode_flag(fi, FI_INLINE_XATTR); + if (ri->i_inline & F2FS_INLINE_DATA) + set_inode_flag(fi, FI_INLINE_DATA); } static inline void set_raw_inline(struct f2fs_inode_info *fi, @@ -923,6 +979,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, if (is_inode_flag_set(fi, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; + if (is_inode_flag_set(fi, FI_INLINE_DATA)) + ri->i_inline |= F2FS_INLINE_DATA; } static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) @@ -948,16 +1006,33 @@ static inline int inline_xattr_size(struct inode *inode) return 0; } +static inline int f2fs_has_inline_data(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); +} + +static inline void *inline_data_addr(struct page *page) +{ + struct f2fs_inode *ri; + ri = (struct f2fs_inode *)page_address(page); + return (void *)&(ri->i_addr[1]); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; } +#define get_inode_mode(i) \ + ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + /* * file.c */ int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); +int truncate_blocks(struct inode *, u64); void f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); @@ -1027,7 +1102,7 @@ int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -int remove_inode_page(struct inode *); +void remove_inode_page(struct inode *); struct page *new_inode_page(struct inode *, const struct qstr *); struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); @@ -1059,19 +1134,19 @@ void clear_prefree_segments(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -struct bio *f2fs_bio_alloc(struct block_device *, int); -void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, - block_t, block_t *); -void write_data_page(struct inode *, struct page *, struct dnode_of_data*, - block_t, block_t *); -void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); +void write_node_page(struct f2fs_sb_info *, struct page *, + struct f2fs_io_info *, unsigned int, block_t, block_t *); +void write_data_page(struct page *, struct dnode_of_data *, block_t *, + struct f2fs_io_info *); +void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); void rewrite_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); +void allocate_data_block(struct f2fs_sb_info *, struct page *, + block_t, block_t *, struct f2fs_summary *, int); +void f2fs_wait_on_page_writeback(struct page *, enum page_type); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, @@ -1079,6 +1154,8 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *, void flush_sit_entries(struct f2fs_sb_info *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); +int __init create_segment_manager_caches(void); +void destroy_segment_manager_caches(void); /* * checkpoint.c @@ -1090,7 +1167,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); +void recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void set_dirty_dir_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); @@ -1105,13 +1182,17 @@ void destroy_checkpoint_caches(void); /* * data.c */ +void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); +int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int); +void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t, + struct f2fs_io_info *); int reserve_new_block(struct dnode_of_data *); +int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); void update_extent_cache(block_t, struct dnode_of_data *); struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); -int do_write_data_page(struct page *); +int do_write_data_page(struct page *, struct f2fs_io_info *); /* * gc.c @@ -1144,7 +1225,7 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, sits, fnids; int total_count, utilization; - int bg_gc; + int bg_gc, inline_inode; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -1164,7 +1245,7 @@ struct f2fs_stat_info { static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) { - return (struct f2fs_stat_info*)sbi->stat_info; + return (struct f2fs_stat_info *)sbi->stat_info; } #define stat_inc_call_count(si) ((si)->call_count++) @@ -1173,6 +1254,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) #define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) #define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + ((F2FS_SB(inode->i_sb))->inline_inode++); \ + } while (0) +#define stat_dec_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + ((F2FS_SB(inode->i_sb))->inline_inode--); \ + } while (0) + #define stat_inc_seg_type(sbi, curseg) \ ((sbi)->segment_count[(curseg)->alloc_type]++) #define stat_inc_block_count(sbi, curseg) \ @@ -1216,6 +1308,8 @@ void f2fs_destroy_root_stats(void); #define stat_dec_dirty_dir(sbi) #define stat_inc_total_hit(sb) #define stat_inc_read_hit(sb) +#define stat_inc_inline_inode(inode) +#define stat_dec_inline_inode(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_seg_count(si, type) @@ -1238,4 +1332,13 @@ extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; + +/* + * inline.c + */ +bool f2fs_may_inline(struct inode *); +int f2fs_read_inline_data(struct inode *, struct page *); +int f2fs_convert_inline_data(struct inode *, pgoff_t); +int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); +int recover_inline_data(struct inode *, struct page *); #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7d714f4972d..0dfcef53a6e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -33,7 +33,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - block_t old_blk_addr; struct dnode_of_data dn; int err; @@ -44,30 +43,16 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, /* block allocation */ f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); - if (err) { - f2fs_unlock_op(sbi); - goto out; - } - - old_blk_addr = dn.data_blkaddr; - - if (old_blk_addr == NULL_ADDR) { - err = reserve_new_block(&dn); - if (err) { - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - goto out; - } - } - f2fs_put_dnode(&dn); + err = f2fs_reserve_block(&dn, page->index); f2fs_unlock_op(sbi); + if (err) + goto out; file_update_time(vma->vm_file); lock_page(page); - if (page->mapping != inode->i_mapping || + if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || - !PageUptodate(page)) { + !PageUptodate(page))) { unlock_page(page); err = -EFAULT; goto out; @@ -130,12 +115,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0; bool need_cp = false; struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, + .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; - if (f2fs_readonly(inode->i_sb)) + if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); @@ -217,7 +202,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + ofs; - for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { + for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); if (blkaddr == NULL_ADDR) continue; @@ -256,7 +241,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) return; lock_page(page); - if (page->mapping != inode->i_mapping) { + if (unlikely(page->mapping != inode->i_mapping)) { f2fs_put_page(page, 1); return; } @@ -266,21 +251,24 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) f2fs_put_page(page, 1); } -static int truncate_blocks(struct inode *inode, u64 from) +int truncate_blocks(struct inode *inode, u64 from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; - int count = 0; - int err; + int count = 0, err = 0; trace_f2fs_truncate_blocks_enter(inode, from); + if (f2fs_has_inline_data(inode)) + goto done; + free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { @@ -308,7 +296,7 @@ static int truncate_blocks(struct inode *inode, u64 from) free_next: err = truncate_inode_blocks(inode, free_from); f2fs_unlock_op(sbi); - +done: /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); @@ -382,6 +370,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { + err = f2fs_convert_inline_data(inode, attr->ia_size); + if (err) + return err; + truncate_setsize(inode, attr->ia_size); f2fs_truncate(inode); f2fs_balance_fs(F2FS_SB(inode->i_sb)); @@ -390,7 +382,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) __setattr_copy(inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = f2fs_acl_chmod(inode); + err = posix_acl_chmod(inode, get_inode_mode(inode)); if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { inode->i_mode = fi->i_acl_mode; clear_inode_flag(fi, FI_ACL_MODE); @@ -405,6 +397,7 @@ const struct inode_operations f2fs_file_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -459,12 +452,16 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return 0; } -static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) +static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; int ret = 0; + ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); + if (ret) + return ret; + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -499,12 +496,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) } } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) <= (offset + len)) { - i_size_write(inode, offset); - mark_inode_dirty(inode); - } - return ret; } @@ -521,6 +512,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (ret) return ret; + ret = f2fs_convert_inline_data(inode, offset + len); + if (ret) + return ret; + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -532,22 +527,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, ALLOC_NODE); - if (ret) { - f2fs_unlock_op(sbi); - break; - } - - if (dn.data_blkaddr == NULL_ADDR) { - ret = reserve_new_block(&dn); - if (ret) { - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - break; - } - } - f2fs_put_dnode(&dn); + ret = f2fs_reserve_block(&dn, index); f2fs_unlock_op(sbi); + if (ret) + break; if (pg_start == pg_end) new_size = offset + len; @@ -578,7 +561,7 @@ static long f2fs_fallocate(struct file *file, int mode, return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) - ret = punch_hole(inode, offset, len, mode); + ret = punch_hole(inode, offset, len); else ret = expand_inode_data(inode, offset, len, mode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b7ad1ec7e4c..ea0371e854b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -119,7 +119,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi) kfree(gc_th); sbi->gc_thread = NULL; } - out: return err; } @@ -164,8 +163,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > MAX_VICTIM_SEARCH) - p->max_search = MAX_VICTIM_SEARCH; + if (p->max_search > sbi->max_victim_search) + p->max_search = sbi->max_victim_search; p->offset = sbi->last_victim[p->gc_mode]; } @@ -429,7 +428,7 @@ next_step: /* set page dirty and write it */ if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE, true); + f2fs_wait_on_page_writeback(node_page, NODE); set_page_dirty(node_page); } else { if (!PageWriteback(node_page)) @@ -521,6 +520,11 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, static void move_data_page(struct inode *inode, struct page *page, int gc_type) { + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC, + }; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -529,7 +533,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) } else { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA); if (clear_page_dirty_for_io(page) && S_ISDIR(inode->i_mode)) { @@ -537,7 +541,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) inode_dec_dirty_dents(inode); } set_cold_data(page); - do_write_data_page(page); + do_write_data_page(page, &fio); clear_cold_data(page); } out: @@ -631,7 +635,7 @@ next_iput: goto next_step; if (gc_type == FG_GC) { - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_bio(sbi, DATA, WRITE); /* * In the case of FG_GC, it'd be better to reclaim this victim @@ -664,8 +668,6 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); - if (IS_ERR(sum_page)) - return; blk_start_plug(&plug); @@ -697,7 +699,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&ilist); gc_more: - if (!(sbi->sb->s_flags & MS_ACTIVE)) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 507056d2220..5d5eb6047bf 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -20,7 +20,7 @@ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ /* Search max. number of dirty segments to select a victim segment */ -#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */ +#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c new file mode 100644 index 00000000000..31ee5b164ff --- /dev/null +++ b/fs/f2fs/inline.c @@ -0,0 +1,222 @@ +/* + * fs/f2fs/inline.c + * Copyright (c) 2013, Intel Corporation + * Authors: Huajun Li <huajun.li@intel.com> + * Haicheng Li <haicheng.li@intel.com> + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" + +bool f2fs_may_inline(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + block_t nr_blocks; + loff_t i_size; + + if (!test_opt(sbi, INLINE_DATA)) + return false; + + nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2; + if (inode->i_blocks > nr_blocks) + return false; + + i_size = i_size_read(inode); + if (i_size > MAX_INLINE_DATA) + return false; + + return true; +} + +int f2fs_read_inline_data(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *ipage; + void *src_addr, *dst_addr; + + if (page->index) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + goto out; + } + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap(page); + f2fs_put_page(ipage, 1); + +out: + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + +static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) +{ + int err; + struct page *ipage; + struct dnode_of_data dn; + void *src_addr, *dst_addr; + block_t new_blk_addr; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC | REQ_PRIO, + }; + + f2fs_lock_op(sbi); + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + /* + * i_addr[0] is not used for inline data, + * so reserving new block will not destroy inline data + */ + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, 0); + if (err) { + f2fs_unlock_op(sbi); + return err; + } + + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap(page); + SetPageUptodate(page); + + /* write data page to try to make data consistent */ + set_page_writeback(page); + write_data_page(page, &dn, &new_blk_addr, &fio); + update_extent_cache(new_blk_addr, &dn); + f2fs_wait_on_page_writeback(page, DATA); + + /* clear inline data and flag after data writeback */ + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + stat_dec_inline_inode(inode); + + sync_inode_page(&dn); + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + return err; +} + +int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) +{ + struct page *page; + int err; + + if (!f2fs_has_inline_data(inode)) + return 0; + else if (to_size <= MAX_INLINE_DATA) + return 0; + + page = grab_cache_page_write_begin(inode->i_mapping, 0, AOP_FLAG_NOFS); + if (!page) + return -ENOMEM; + + err = __f2fs_convert_inline_data(inode, page); + f2fs_put_page(page, 1); + return err; +} + +int f2fs_write_inline_data(struct inode *inode, + struct page *page, unsigned size) +{ + void *src_addr, *dst_addr; + struct page *ipage; + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; + ipage = dn.inode_page; + + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + src_addr = kmap(page); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, size); + kunmap(page); + + /* Release the first data block if it is allocated */ + if (!f2fs_has_inline_data(inode)) { + truncate_data_blocks_range(&dn, 1); + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + stat_inc_inline_inode(inode); + } + + sync_inode_page(&dn); + f2fs_put_dnode(&dn); + + return 0; +} + +int recover_inline_data(struct inode *inode, struct page *npage) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_inode *ri = NULL; + void *src_addr, *dst_addr; + struct page *ipage; + + /* + * The inline_data recovery policy is as follows. + * [prev.] [next] of inline_data flag + * o o -> recover inline_data + * o x -> remove inline_data, and then recover data blocks + * x o -> remove inline_data, and then recover inline_data + * x x -> recover data blocks + */ + if (IS_INODE(npage)) + ri = F2FS_INODE(npage); + + if (f2fs_has_inline_data(inode) && + ri && ri->i_inline & F2FS_INLINE_DATA) { +process_inline: + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + src_addr = inline_data_addr(npage); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + return -1; + } + + if (f2fs_has_inline_data(inode)) { + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { + truncate_blocks(inode, 0); + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + goto process_inline; + } + return 0; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d0eaa9faeca..4d67ed736dc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -42,9 +42,11 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { if (ri->i_addr[0]) - inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); + inode->i_rdev = + old_decode_dev(le32_to_cpu(ri->i_addr[0])); else - inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = + new_decode_dev(le32_to_cpu(ri->i_addr[1])); } } @@ -52,11 +54,13 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); ri->i_addr[1] = 0; } else { ri->i_addr[0] = 0; - ri->i_addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); ri->i_addr[2] = 0; } } @@ -67,7 +71,6 @@ static int do_read_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; - struct f2fs_node *rn; struct f2fs_inode *ri; /* Check if ino is within scope */ @@ -81,8 +84,7 @@ static int do_read_inode(struct inode *inode) if (IS_ERR(node_page)) return PTR_ERR(node_page); - rn = F2FS_NODE(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_page); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); @@ -175,13 +177,11 @@ bad_inode: void update_inode(struct inode *inode, struct page *node_page) { - struct f2fs_node *rn; struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(node_page, NODE, false); + f2fs_wait_on_page_writeback(node_page, NODE); - rn = F2FS_NODE(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_page); ri->i_mode = cpu_to_le16(inode->i_mode); ri->i_advise = F2FS_I(inode)->i_advise; @@ -281,6 +281,7 @@ void f2fs_evict_inode(struct inode *inode) f2fs_lock_op(sbi); remove_inode_page(inode); + stat_dec_inline_inode(inode); f2fs_unlock_op(sbi); sb_end_intwrite(inode->i_sb); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 575adac17f8..397d459e97b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -424,11 +424,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_set_link(new_dir, new_entry, new_page, old_inode); + F2FS_I(old_inode)->i_pino = new_dir->i_ino; new_inode->i_ctime = CURRENT_TIME; if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + mark_inode_dirty(new_inode); if (!new_inode->i_nlink) add_orphan_inode(sbi, new_inode->i_ino); @@ -457,11 +459,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + F2FS_I(old_inode)->i_pino = new_dir->i_ino; + update_inode_page(old_inode); } else { kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } drop_nlink(old_dir); + mark_inode_dirty(old_dir); update_inode_page(old_dir); } @@ -496,6 +501,7 @@ const struct inode_operations f2fs_dir_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -522,6 +528,7 @@ const struct inode_operations f2fs_special_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4ac4150d421..b0649b76eb4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -87,17 +87,19 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) */ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); - struct blk_plug plug; struct page *page; pgoff_t index; int i; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; - blk_start_plug(&plug); for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { - if (nid >= nm_i->max_nid) + if (unlikely(nid >= nm_i->max_nid)) nid = 0; index = current_nat_addr(sbi, nid); @@ -105,15 +107,15 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) if (!page) continue; if (PageUptodate(page)) { + mark_page_accessed(page); f2fs_put_page(page, 1); continue; } - if (f2fs_readpage(sbi, page, index, READ)) - continue; - + f2fs_submit_page_mbio(sbi, page, index, &fio); + mark_page_accessed(page); f2fs_put_page(page, 0); } - blk_finish_plug(&plug); + f2fs_submit_merged_bio(sbi, META, READ); } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) @@ -391,8 +393,8 @@ got: /* * Caller should call f2fs_put_dnode(dn). - * Also, it should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op() only if ro is not set RDONLY_NODE. + * Also, it should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op() only if ro is not set RDONLY_NODE. * In the case of RDONLY_NODE, we don't need to care about mutex. */ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) @@ -502,7 +504,7 @@ static void truncate_node(struct dnode_of_data *dn) /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode, 1); + dec_valid_node_count(sbi, dn->inode); set_node_addr(sbi, &ni, NULL_ADDR); if (dn->nid == dn->inode->i_ino) { @@ -516,6 +518,10 @@ invalidate: F2FS_SET_SB_DIRT(sbi); f2fs_put_page(dn->node_page, 1); + + invalidate_mapping_pages(NODE_MAPPING(sbi), + dn->node_page->index, dn->node_page->index); + dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); } @@ -631,19 +637,19 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, return 0; /* get indirect nodes in the path */ - for (i = 0; i < depth - 1; i++) { + for (i = 0; i < idx + 1; i++) { /* refernece count'll be increased */ pages[i] = get_node_page(sbi, nid[i]); if (IS_ERR(pages[i])) { - depth = i + 1; err = PTR_ERR(pages[i]); + idx = i - 1; goto fail; } nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } /* free direct nodes linked to a partial indirect node */ - for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { + for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); if (!child_nid) continue; @@ -654,7 +660,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, set_nid(pages[idx], i, 0, false); } - if (offset[depth - 1] == 0) { + if (offset[idx + 1] == 0) { dn->node_page = pages[idx]; dn->nid = nid[idx]; truncate_node(dn); @@ -662,9 +668,10 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, f2fs_put_page(pages[idx], 1); } offset[idx]++; - offset[depth - 1] = 0; + offset[idx + 1] = 0; + idx--; fail: - for (i = depth - 3; i >= 0; i--) + for (i = idx; i >= 0; i--) f2fs_put_page(pages[i], 1); trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); @@ -678,11 +685,10 @@ fail: int truncate_inode_blocks(struct inode *inode, pgoff_t from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct address_space *node_mapping = sbi->node_inode->i_mapping; int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; - struct f2fs_node *rn; + struct f2fs_inode *ri; struct dnode_of_data dn; struct page *page; @@ -699,7 +705,7 @@ restart: set_new_dnode(&dn, inode, page, NULL, 0); unlock_page(page); - rn = F2FS_NODE(page); + ri = F2FS_INODE(page); switch (level) { case 0: case 1: @@ -709,7 +715,7 @@ restart: nofs = noffset[1]; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; nofs += 1 + NIDS_PER_BLOCK; @@ -718,7 +724,7 @@ restart: nofs = 5 + 2 * NIDS_PER_BLOCK; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; break; @@ -728,7 +734,7 @@ restart: skip_partial: while (cont) { - dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); + dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -751,14 +757,14 @@ skip_partial: if (err < 0 && err != -ENOENT) goto fail; if (offset[1] == 0 && - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { + ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (page->mapping != node_mapping) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto restart; } wait_on_page_writeback(page); - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; + ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); } @@ -794,38 +800,34 @@ int truncate_xattr_node(struct inode *inode, struct page *page) set_new_dnode(&dn, inode, page, npage, nid); if (page) - dn.inode_page_locked = 1; + dn.inode_page_locked = true; truncate_node(&dn); return 0; } /* - * Caller should grab and release a mutex by calling mutex_lock_op() and - * mutex_unlock_op(). + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). */ -int remove_inode_page(struct inode *inode) +void remove_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; nid_t ino = inode->i_ino; struct dnode_of_data dn; - int err; page = get_node_page(sbi, ino); if (IS_ERR(page)) - return PTR_ERR(page); + return; - err = truncate_xattr_node(inode, page); - if (err) { + if (truncate_xattr_node(inode, page)) { f2fs_put_page(page, 1); - return err; + return; } - /* 0 is possible, after f2fs_new_inode() is failed */ f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); set_new_dnode(&dn, inode, page, page, ino); truncate_node(&dn); - return 0; } struct page *new_inode_page(struct inode *inode, const struct qstr *name) @@ -843,19 +845,18 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; struct node_info old_ni, new_ni; struct page *page; int err; - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(mapping, dn->nid); + page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); if (!page) return ERR_PTR(-ENOMEM); - if (!inc_valid_node_count(sbi, dn->inode, 1)) { + if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { err = -ENOSPC; goto fail; } @@ -898,14 +899,14 @@ fail: * LOCKED_PAGE: f2fs_put_page(page, 1) * error: nothing */ -static int read_node_page(struct page *page, int type) +static int read_node_page(struct page *page, int rw) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); struct node_info ni; get_node_info(sbi, page->index, &ni); - if (ni.blk_addr == NULL_ADDR) { + if (unlikely(ni.blk_addr == NULL_ADDR)) { f2fs_put_page(page, 1); return -ENOENT; } @@ -913,7 +914,7 @@ static int read_node_page(struct page *page, int type) if (PageUptodate(page)) return LOCKED_PAGE; - return f2fs_readpage(sbi, page, ni.blk_addr, type); + return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw); } /* @@ -921,18 +922,17 @@ static int read_node_page(struct page *page, int type) */ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { - struct address_space *mapping = sbi->node_inode->i_mapping; struct page *apage; int err; - apage = find_get_page(mapping, nid); + apage = find_get_page(NODE_MAPPING(sbi), nid); if (apage && PageUptodate(apage)) { f2fs_put_page(apage, 0); return; } f2fs_put_page(apage, 0); - apage = grab_cache_page(mapping, nid); + apage = grab_cache_page(NODE_MAPPING(sbi), nid); if (!apage) return; @@ -945,11 +945,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - struct address_space *mapping = sbi->node_inode->i_mapping; struct page *page; int err; repeat: - page = grab_cache_page(mapping, nid); + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); @@ -960,11 +959,11 @@ repeat: goto got_it; lock_page(page); - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - if (page->mapping != mapping) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } @@ -981,7 +980,6 @@ got_it: struct page *get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; struct blk_plug plug; struct page *page; int err, i, end; @@ -992,7 +990,7 @@ struct page *get_node_page_ra(struct page *parent, int start) if (!nid) return ERR_PTR(-ENOENT); repeat: - page = grab_cache_page(mapping, nid); + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); @@ -1017,12 +1015,12 @@ repeat: blk_finish_plug(&plug); lock_page(page); - if (page->mapping != mapping) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } page_hit: - if (!PageUptodate(page)) { + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -1048,7 +1046,6 @@ void sync_inode_page(struct dnode_of_data *dn) int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, struct writeback_control *wbc) { - struct address_space *mapping = sbi->node_inode->i_mapping; pgoff_t index, end; struct pagevec pvec; int step = ino ? 2 : 0; @@ -1062,7 +1059,7 @@ next_step: while (index <= end) { int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) @@ -1095,7 +1092,7 @@ next_step: else if (!trylock_page(page)) continue; - if (unlikely(page->mapping != mapping)) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { continue_unlock: unlock_page(page); continue; @@ -1122,7 +1119,7 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); } - mapping->a_ops->writepage(page, wbc); + NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); wrote++; if (--wbc->nr_to_write == 0) @@ -1143,31 +1140,31 @@ continue_unlock: } if (wrote) - f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); - + f2fs_submit_merged_bio(sbi, NODE, WRITE); return nwritten; } int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - struct address_space *mapping = sbi->node_inode->i_mapping; pgoff_t index = 0, end = LONG_MAX; struct pagevec pvec; - int nr_pages; int ret2 = 0, ret = 0; pagevec_init(&pvec, 0); - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { - unsigned i; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; /* until radix tree lookup accepts end_index */ - if (page->index > end) + if (unlikely(page->index > end)) continue; if (ino && ino_of_node(page) == ino) { @@ -1180,9 +1177,9 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) cond_resched(); } - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) ret2 = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) + if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags))) ret2 = -EIO; if (!ret) ret = ret2; @@ -1196,8 +1193,12 @@ static int f2fs_write_node_page(struct page *page, nid_t nid; block_t new_addr; struct node_info ni; + struct f2fs_io_info fio = { + .type = NODE, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; - if (sbi->por_doing) + if (unlikely(sbi->por_doing)) goto redirty_out; wait_on_page_writeback(page); @@ -1209,7 +1210,7 @@ static int f2fs_write_node_page(struct page *page, get_node_info(sbi, nid, &ni); /* This page is already truncated */ - if (ni.blk_addr == NULL_ADDR) { + if (unlikely(ni.blk_addr == NULL_ADDR)) { dec_page_count(sbi, F2FS_DIRTY_NODES); unlock_page(page); return 0; @@ -1220,7 +1221,7 @@ static int f2fs_write_node_page(struct page *page, mutex_lock(&sbi->node_write); set_page_writeback(page); - write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); + write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); set_node_addr(sbi, &ni, new_addr); dec_page_count(sbi, F2FS_DIRTY_NODES); mutex_unlock(&sbi->node_write); @@ -1255,6 +1256,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, /* if mounting is failed, skip writing node pages */ wbc->nr_to_write = 3 * max_hw_blocks(sbi); + wbc->sync_mode = WB_SYNC_NONE; sync_node_pages(sbi, 0, wbc); wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - wbc->nr_to_write); @@ -1333,7 +1335,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) return -1; /* 0 nid should not be used */ - if (nid == 0) + if (unlikely(nid == 0)) return 0; if (build) { @@ -1386,7 +1388,7 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i, for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - if (start_nid >= nm_i->max_nid) + if (unlikely(start_nid >= nm_i->max_nid)) break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); @@ -1420,7 +1422,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); - if (nid >= nm_i->max_nid) + if (unlikely(nid >= nm_i->max_nid)) nid = 0; if (i++ == FREE_NID_PAGES) @@ -1454,7 +1456,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; struct list_head *this; retry: - if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) + if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) return false; spin_lock(&nm_i->free_nid_list_lock); @@ -1535,13 +1537,12 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { - struct address_space *mapping = sbi->node_inode->i_mapping; - struct f2fs_node *src, *dst; + struct f2fs_inode *src, *dst; nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; - ipage = grab_cache_page(mapping, ino); + ipage = grab_cache_page(NODE_MAPPING(sbi), ino); if (!ipage) return -ENOMEM; @@ -1552,19 +1553,19 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); - src = F2FS_NODE(page); - dst = F2FS_NODE(ipage); + src = F2FS_INODE(page); + dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); - dst->i.i_size = 0; - dst->i.i_blocks = cpu_to_le64(1); - dst->i.i_links = cpu_to_le32(1); - dst->i.i_xattr_nid = 0; + memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + dst->i_size = 0; + dst->i_blocks = cpu_to_le64(1); + dst->i_links = cpu_to_le32(1); + dst->i_xattr_nid = 0; new_ni = old_ni; new_ni.ino = ino; - if (!inc_valid_node_count(sbi, NULL, 1)) + if (unlikely(!inc_valid_node_count(sbi, NULL))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR); inc_valid_inode_count(sbi); @@ -1572,47 +1573,88 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) return 0; } +/* + * ra_sum_pages() merge contiguous pages into one bio and submit. + * these pre-readed pages are linked in pages list. + */ +static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages, + int start, int nrpages) +{ + struct page *page; + int page_idx = start; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; page_idx < start + nrpages; page_idx++) { + /* alloc temporal page for read node summary info*/ + page = alloc_page(GFP_F2FS_ZERO); + if (!page) { + struct page *tmp; + list_for_each_entry_safe(page, tmp, pages, lru) { + list_del(&page->lru); + unlock_page(page); + __free_pages(page, 0); + } + return -ENOMEM; + } + + lock_page(page); + page->index = page_idx; + list_add_tail(&page->lru, pages); + } + + list_for_each_entry(page, pages, lru) + f2fs_submit_page_mbio(sbi, page, page->index, &fio); + + f2fs_submit_merged_bio(sbi, META, READ); + return 0; +} + int restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; struct f2fs_summary *sum_entry; - struct page *page; + struct page *page, *tmp; block_t addr; - int i, last_offset; - - /* alloc temporal page for read node */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (!page) - return -ENOMEM; - lock_page(page); + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int i, last_offset, nrpages, err = 0; + LIST_HEAD(page_list); /* scan the node segment */ last_offset = sbi->blocks_per_seg; addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; i < last_offset; i++, sum_entry++) { - /* - * In order to read next node page, - * we must clear PageUptodate flag. - */ - ClearPageUptodate(page); + for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { + nrpages = min(last_offset - i, bio_blocks); - if (f2fs_readpage(sbi, page, addr, READ_SYNC)) - goto out; + /* read ahead node pages */ + err = ra_sum_pages(sbi, &page_list, addr, nrpages); + if (err) + return err; - lock_page(page); - rn = F2FS_NODE(page); - sum_entry->nid = rn->footer.nid; - sum_entry->version = 0; - sum_entry->ofs_in_node = 0; - addr++; + list_for_each_entry_safe(page, tmp, &page_list, lru) { + + lock_page(page); + if (unlikely(!PageUptodate(page))) { + err = -EIO; + } else { + rn = F2FS_NODE(page); + sum_entry->nid = rn->footer.nid; + sum_entry->version = 0; + sum_entry->ofs_in_node = 0; + sum_entry++; + } + + list_del(&page->lru); + unlock_page(page); + __free_pages(page, 0); + } } - unlock_page(page); -out: - __free_pages(page, 0); - return 0; + return err; } static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 3496bb3e15d..c4c79885c99 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -224,7 +224,13 @@ static inline block_t next_blkaddr_of_node(struct page *node_page) * | `- direct node (5 + N => 5 + 2N - 1) * `- double indirect node (5 + 2N) * `- indirect node (6 + 2N) - * `- direct node (x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) + * `- direct node */ static inline bool IS_DNODE(struct page *node_page) { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index fdc81161f25..976a7a934db 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -40,8 +40,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, static int recover_dentry(struct page *ipage, struct inode *inode) { - struct f2fs_node *raw_node = F2FS_NODE(ipage); - struct f2fs_inode *raw_inode = &(raw_node->i); + struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; struct qstr name; @@ -62,6 +61,12 @@ static int recover_dentry(struct page *ipage, struct inode *inode) name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; + + if (unlikely(name.len > F2FS_NAME_LEN)) { + WARN_ON(1); + err = -ENAMETOOLONG; + goto out; + } retry: de = f2fs_find_entry(dir, &name, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) @@ -90,17 +95,16 @@ out_unmap_put: kunmap(page); f2fs_put_page(page, 0); out: - f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: " - "ino = %x, name = %s, dir = %lx, err = %d", - ino_of_node(ipage), raw_inode->i_name, + f2fs_msg(inode->i_sb, KERN_NOTICE, + "%s: ino = %x, name = %s, dir = %lx, err = %d", + __func__, ino_of_node(ipage), raw_inode->i_name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } static int recover_inode(struct inode *inode, struct page *node_page) { - struct f2fs_node *raw_node = F2FS_NODE(node_page); - struct f2fs_inode *raw_inode = &(raw_node->i); + struct f2fs_inode *raw_inode = F2FS_INODE(node_page); if (!IS_INODE(node_page)) return 0; @@ -143,9 +147,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) while (1) { struct fsync_inode_entry *entry; - err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); if (err) - goto out; + return err; lock_page(page); @@ -191,9 +195,10 @@ next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); } + unlock_page(page); -out: __free_pages(page, 0); + return err; } @@ -293,6 +298,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct node_info ni; int err = 0, recovered = 0; + if (recover_inline_data(inode, page)) + goto out; + start = start_bidx_of_node(ofs_of_node(page), fi); if (IS_INODE(page)) end = start + ADDRS_PER_INODE(fi); @@ -300,12 +308,13 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, end = start + ADDRS_PER_BLOCK; f2fs_lock_op(sbi); + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { f2fs_unlock_op(sbi); - return err; + goto out; } wait_on_page_writeback(dn.node_page); @@ -356,10 +365,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, err: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - - f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, " - "recovered_data = %d blocks, err = %d", - inode->i_ino, recovered, err); +out: + f2fs_msg(sbi->sb, KERN_NOTICE, + "recover_data: ino = %lx, recovered = %d blocks, err = %d", + inode->i_ino, recovered, err); return err; } @@ -377,7 +386,7 @@ static int recover_data(struct f2fs_sb_info *sbi, blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); + page = alloc_page(GFP_F2FS_ZERO); if (!page) return -ENOMEM; @@ -386,9 +395,9 @@ static int recover_data(struct f2fs_sb_info *sbi, while (1) { struct fsync_inode_entry *entry; - err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); if (err) - goto out; + return err; lock_page(page); @@ -412,8 +421,8 @@ next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); } + unlock_page(page); -out: __free_pages(page, 0); if (!err) @@ -429,7 +438,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry), NULL); - if (unlikely(!fsync_entry_slab)) + if (!fsync_entry_slab) return -ENOMEM; INIT_LIST_HEAD(&inode_list); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fa284d39719..7caac5f2ca9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -14,12 +14,163 @@ #include <linux/blkdev.h> #include <linux/prefetch.h> #include <linux/vmalloc.h> +#include <linux/swap.h> #include "f2fs.h" #include "segment.h" #include "node.h" #include <trace/events/f2fs.h> +#define __reverse_ffz(x) __reverse_ffs(~(x)) + +static struct kmem_cache *discard_entry_slab; + +/* + * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since + * MSB and LSB are reversed in a byte by f2fs_set_bit. + */ +static inline unsigned long __reverse_ffs(unsigned long word) +{ + int num = 0; + +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } +#endif + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + if ((word & 0xf0) == 0) + num += 4; + else + word >>= 4; + if ((word & 0xc) == 0) + num += 2; + else + word >>= 2; + if ((word & 0x2) == 0) + num += 1; + return num; +} + +/* + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue + * f2fs_set_bit makes MSB and LSB reversed in a byte. + * Example: + * LSB <--> MSB + * f2fs_set_bit(0, bitmap) => 0000 0001 + * f2fs_set_bit(7, bitmap) => 1000 0000 + */ +static unsigned long __find_rev_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~0UL << quot; + submask = (unsigned char)(0xff << rest) >> rest; + submask <<= quot; + mask &= submask; + tmp &= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG-1)) { + tmp = *(p++); + if (tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffs(tmp); +} + +static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~(~0UL << quot); + submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest); + submask <<= quot; + mask += submask; + tmp |= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (~tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG - 1)) { + tmp = *(p++); + if (~tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp |= ~0UL << size; + if (tmp == ~0UL) /* Are any bits zero? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffz(tmp); +} + /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. @@ -116,6 +267,56 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static void f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); + sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); + blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); +} + +static void add_discard_addrs(struct f2fs_sb_info *sbi, + unsigned int segno, struct seg_entry *se) +{ + struct list_head *head = &SM_I(sbi)->discard_list; + struct discard_entry *new; + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + int max_blocks = sbi->blocks_per_seg; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long dmap[entries]; + unsigned int start = 0, end = -1; + int i; + + if (!test_opt(sbi, DISCARD)) + return; + + /* zero block will be discarded through the prefree list */ + if (!se->valid_blocks || se->valid_blocks == max_blocks) + return; + + /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ + for (i = 0; i < entries; i++) + dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; + + while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + start = __find_rev_next_bit(dmap, max_blocks, end + 1); + if (start >= max_blocks) + break; + + end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); + INIT_LIST_HEAD(&new->list); + new->blkaddr = START_BLOCK(sbi, segno) + start; + new->len = end - start; + + list_add_tail(&new->list, head); + SM_I(sbi)->nr_discards += end - start; + } +} + /* * Should call clear_prefree_segments after checkpoint is done. */ @@ -138,6 +339,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi) { + struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *this, *next; + struct discard_entry *entry; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int total_segs = TOTAL_SEGS(sbi); @@ -160,14 +364,19 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) if (!test_opt(sbi, DISCARD)) continue; - blkdev_issue_discard(sbi->sb->s_bdev, - START_BLOCK(sbi, start) << - sbi->log_sectors_per_block, - (1 << (sbi->log_sectors_per_block + - sbi->log_blocks_per_seg)) * (end - start), - GFP_NOFS, 0); + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + (end - start) << sbi->log_blocks_per_seg); } mutex_unlock(&dirty_i->seglist_lock); + + /* send small discards */ + list_for_each_safe(this, next, head) { + entry = list_entry(this, struct discard_entry, list); + f2fs_issue_discard(sbi, entry->blkaddr, entry->len); + list_del(&entry->list); + SM_I(sbi)->nr_discards -= entry->len; + kmem_cache_free(discard_entry_slab, entry); + } } static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -459,13 +668,18 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg, block_t start) { struct seg_entry *se = get_seg_entry(sbi, seg->segno); - block_t ofs; - for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { - if (!f2fs_test_bit(ofs, se->ckpt_valid_map) - && !f2fs_test_bit(ofs, se->cur_valid_map)) - break; - } - seg->next_blkoff = ofs; + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long target_map[entries]; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i, pos; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); + + seg->next_blkoff = pos; } /* @@ -573,148 +787,6 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; -static void f2fs_end_io_write(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_private *p = bio->bi_private; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (!uptodate) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); - p->sbi->sb->s_flags |= MS_RDONLY; - } - end_page_writeback(page); - dec_page_count(p->sbi, F2FS_WRITEBACK); - } while (bvec >= bio->bi_io_vec); - - if (p->is_sync) - complete(p->wait); - - if (!get_pages(p->sbi, F2FS_WRITEBACK) && - !list_empty(&p->sbi->cp_wait.task_list)) - wake_up(&p->sbi->cp_wait); - - kfree(p); - bio_put(bio); -} - -struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) -{ - struct bio *bio; - - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - bio->bi_bdev = bdev; - bio->bi_private = NULL; - - return bio; -} - -static void do_submit_bio(struct f2fs_sb_info *sbi, - enum page_type type, bool sync) -{ - int rw = sync ? WRITE_SYNC : WRITE; - enum page_type btype = type > META ? META : type; - - if (type >= META_FLUSH) - rw = WRITE_FLUSH_FUA; - - if (btype == META) - rw |= REQ_META; - - if (sbi->bio[btype]) { - struct bio_private *p = sbi->bio[btype]->bi_private; - p->sbi = sbi; - sbi->bio[btype]->bi_end_io = f2fs_end_io_write; - - trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]); - - if (type == META_FLUSH) { - DECLARE_COMPLETION_ONSTACK(wait); - p->is_sync = true; - p->wait = &wait; - submit_bio(rw, sbi->bio[btype]); - wait_for_completion(&wait); - } else { - p->is_sync = false; - submit_bio(rw, sbi->bio[btype]); - } - sbi->bio[btype] = NULL; - } -} - -void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) -{ - down_write(&sbi->bio_sem); - do_submit_bio(sbi, type, sync); - up_write(&sbi->bio_sem); -} - -static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, enum page_type type) -{ - struct block_device *bdev = sbi->sb->s_bdev; - int bio_blocks; - - verify_block_addr(sbi, blk_addr); - - down_write(&sbi->bio_sem); - - inc_page_count(sbi, F2FS_WRITEBACK); - - if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1) - do_submit_bio(sbi, type, false); -alloc_new: - if (sbi->bio[type] == NULL) { - struct bio_private *priv; -retry: - priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); - if (!priv) { - cond_resched(); - goto retry; - } - - bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - sbi->bio[type] = f2fs_bio_alloc(bdev, bio_blocks); - sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - sbi->bio[type]->bi_private = priv; - /* - * The end_io will be assigned at the sumbission phase. - * Until then, let bio_add_page() merge consecutive IOs as much - * as possible. - */ - } - - if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { - do_submit_bio(sbi, type, false); - goto alloc_new; - } - - sbi->last_block_in_bio[type] = blk_addr; - - up_write(&sbi->bio_sem); - trace_f2fs_submit_write_page(page, blk_addr, type); -} - -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool sync) -{ - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - if (PageWriteback(page)) { - f2fs_submit_bio(sbi, type, sync); - wait_on_page_writeback(page); - } -} - static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -782,16 +854,14 @@ static int __get_segment_type(struct page *page, enum page_type p_type) return __get_segment_type_6(page, p_type); } -static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, enum page_type p_type) +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int old_cursegno; - int type; - type = __get_segment_type(page, p_type); curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); @@ -824,49 +894,64 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); - if (p_type == NODE) + if (page && IS_NODESEG(type)) fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); - /* writeout dirty page into bdev */ - submit_write_page(sbi, page, *new_blkaddr, p_type); - mutex_unlock(&curseg->curseg_mutex); } +static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, struct f2fs_io_info *fio) +{ + int type = __get_segment_type(page, fio->type); + + allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type); + + /* writeout dirty page into bdev */ + f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio); +} + void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { + struct f2fs_io_info fio = { + .type = META, + .rw = WRITE_SYNC | REQ_META | REQ_PRIO + }; + set_page_writeback(page); - submit_write_page(sbi, page, page->index, META); + f2fs_submit_page_mbio(sbi, page, page->index, &fio); } void write_node_page(struct f2fs_sb_info *sbi, struct page *page, + struct f2fs_io_info *fio, unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) { struct f2fs_summary sum; set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); + do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio); } -void write_data_page(struct inode *inode, struct page *page, - struct dnode_of_data *dn, block_t old_blkaddr, - block_t *new_blkaddr) +void write_data_page(struct page *page, struct dnode_of_data *dn, + block_t *new_blkaddr, struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct f2fs_summary sum; struct node_info ni; - f2fs_bug_on(old_blkaddr == NULL_ADDR); + f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - do_write_page(sbi, page, old_blkaddr, - new_blkaddr, &sum, DATA); + do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio); } -void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blk_addr) +void rewrite_data_page(struct page *page, block_t old_blkaddr, + struct f2fs_io_info *fio) { - submit_write_page(sbi, page, old_blk_addr, DATA); + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio); } void recover_data_page(struct f2fs_sb_info *sbi, @@ -925,6 +1010,10 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, unsigned int segno, old_cursegno; block_t next_blkaddr = next_blkaddr_of_node(page); unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); + struct f2fs_io_info fio = { + .type = NODE, + .rw = WRITE_SYNC, + }; curseg = CURSEG_I(sbi, type); @@ -953,8 +1042,8 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, /* rewrite node page */ set_page_writeback(page); - submit_write_page(sbi, page, new_blkaddr, NODE); - f2fs_submit_bio(sbi, NODE, true); + f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); + f2fs_submit_merged_bio(sbi, NODE, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); locate_dirty_segment(sbi, old_cursegno); @@ -964,6 +1053,16 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, mutex_unlock(&curseg->curseg_mutex); } +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + if (PageWriteback(page)) { + f2fs_submit_merged_bio(sbi, type, WRITE); + wait_on_page_writeback(page); + } +} + static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1314,6 +1413,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + /* add discard candidates */ + if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) + add_discard_addrs(sbi, segno, se); + if (flushed) goto to_sit_page; @@ -1480,41 +1583,94 @@ static int build_curseg(struct f2fs_sb_info *sbi) return restore_curseg_summaries(sbi); } +static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages) +{ + struct address_space *mapping = META_MAPPING(sbi); + struct page *page; + block_t blk_addr, prev_blk_addr = 0; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + int blkno = start; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) { + + blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); + + if (blkno != start && prev_blk_addr + 1 != blk_addr) + break; + prev_blk_addr = blk_addr; +repeat: + page = grab_cache_page(mapping, blk_addr); + if (!page) { + cond_resched(); + goto repeat; + } + if (PageUptodate(page)) { + mark_page_accessed(page); + f2fs_put_page(page, 1); + continue; + } + + f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); + + mark_page_accessed(page); + f2fs_put_page(page, 0); + } + + f2fs_submit_merged_bio(sbi, META, READ); + return blkno - start; +} + static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - unsigned int start; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + unsigned int i, start, end; + unsigned int readed, start_blk = 0; + int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - for (start = 0; start < TOTAL_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; - struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; - struct page *page; - int i; - - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) == start) { - sit = sit_in_journal(sum, i); - mutex_unlock(&curseg->curseg_mutex); - goto got_it; + do { + readed = ra_sit_pages(sbi, start_blk, nrpages); + + start = start_blk * sit_i->sents_per_block; + end = (start_blk + readed) * sit_i->sents_per_block; + + for (; start < end && start < TOTAL_SEGS(sbi); start++) { + struct seg_entry *se = &sit_i->sentries[start]; + struct f2fs_sit_block *sit_blk; + struct f2fs_sit_entry sit; + struct page *page; + + mutex_lock(&curseg->curseg_mutex); + for (i = 0; i < sits_in_cursum(sum); i++) { + if (le32_to_cpu(segno_in_journal(sum, i)) + == start) { + sit = sit_in_journal(sum, i); + mutex_unlock(&curseg->curseg_mutex); + goto got_it; + } } - } - mutex_unlock(&curseg->curseg_mutex); - page = get_current_sit_page(sbi, start); - sit_blk = (struct f2fs_sit_block *)page_address(page); - sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; - f2fs_put_page(page, 1); + mutex_unlock(&curseg->curseg_mutex); + + page = get_current_sit_page(sbi, start); + sit_blk = (struct f2fs_sit_block *)page_address(page); + sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; + f2fs_put_page(page, 1); got_it: - check_block_count(sbi, start, &sit); - seg_info_from_raw_sit(se, &sit); - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + if (sbi->segs_per_sec > 1) { + struct sec_entry *e = get_sec_entry(sbi, start); + e->valid_blocks += se->valid_blocks; + } } - } + start_blk += readed; + } while (start_blk < sit_blk_cnt); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -1644,6 +1800,12 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; + sm_info->ipu_policy = F2FS_IPU_DISABLE; + sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; + + INIT_LIST_HEAD(&sm_info->discard_list); + sm_info->nr_discards = 0; + sm_info->max_discards = 0; err = build_sit_info(sbi); if (err) @@ -1760,3 +1922,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) sbi->sm_info = NULL; kfree(sm_info); } + +int __init create_segment_manager_caches(void) +{ + discard_entry_slab = f2fs_kmem_cache_create("discard_entry", + sizeof(struct discard_entry), NULL); + if (!discard_entry_slab) + return -ENOMEM; + return 0; +} + +void destroy_segment_manager_caches(void) +{ + kmem_cache_destroy(discard_entry_slab); +} diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 269f690b4e2..5731682d751 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -20,13 +20,8 @@ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) -#define IS_DATASEG(t) \ - ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ - (t == CURSEG_WARM_DATA)) - -#define IS_NODESEG(t) \ - ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \ - (t == CURSEG_WARM_NODE)) +#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) #define IS_CURSEG(sbi, seg) \ ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ @@ -83,25 +78,20 @@ (segno / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(sit_i, segno) \ (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) +#define SIT_BLK_CNT(sbi) \ + ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) #define TOTAL_SECS(sbi) (sbi->total_sections) #define SECTOR_FROM_BLOCK(sbi, blk_addr) \ - (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) + (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) #define SECTOR_TO_BLOCK(sbi, sectors) \ - (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) + (sectors >> (sbi)->log_sectors_per_block) #define MAX_BIO_BLOCKS(max_hw_blocks) \ (min((int)max_hw_blocks, BIO_MAX_PAGES)) -/* during checkpoint, bio_private is used to synchronize the last bio */ -struct bio_private { - struct f2fs_sb_info *sbi; - bool is_sync; - void *wait; -}; - /* * indicate a block allocation direction: RIGHT and LEFT. * RIGHT means allocating new sections towards the end of volume. @@ -458,8 +448,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) static inline bool need_SSR(struct f2fs_sb_info *sbi) { - return ((prefree_segments(sbi) / sbi->segs_per_sec) - + free_sections(sbi) < overprovision_sections(sbi)); + return (prefree_segments(sbi) / sbi->segs_per_sec) + + free_sections(sbi) < overprovision_sections(sbi); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) @@ -467,38 +457,71 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - if (sbi->por_doing) + if (unlikely(sbi->por_doing)) return false; - return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi))); + return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi)); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) { - return (prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments); + return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; } static inline int utilization(struct f2fs_sb_info *sbi) { - return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); + return div_u64((u64)valid_user_blocks(sbi) * 100, + sbi->user_block_count); } /* * Sometimes f2fs may be better to drop out-of-place update policy. - * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write - * data in the original place likewise other traditional file systems. - * But, currently set 100 in percentage, which means it is disabled. - * See below need_inplace_update(). + * And, users can control the policy through sysfs entries. + * There are five policies with triggering conditions as follows. + * F2FS_IPU_FORCE - all the time, + * F2FS_IPU_SSR - if SSR mode is activated, + * F2FS_IPU_UTIL - if FS utilization is over threashold, + * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over + * threashold, + * F2FS_IPUT_DISABLE - disable IPU. (=default option) */ -#define MIN_IPU_UTIL 100 +#define DEF_MIN_IPU_UTIL 70 + +enum { + F2FS_IPU_FORCE, + F2FS_IPU_SSR, + F2FS_IPU_UTIL, + F2FS_IPU_SSR_UTIL, + F2FS_IPU_DISABLE, +}; + static inline bool need_inplace_update(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + /* IPU can be done only for the user data */ if (S_ISDIR(inode->i_mode)) return false; - if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) + + switch (SM_I(sbi)->ipu_policy) { + case F2FS_IPU_FORCE: return true; + case F2FS_IPU_SSR: + if (need_SSR(sbi)) + return true; + break; + case F2FS_IPU_UTIL: + if (utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + break; + case F2FS_IPU_SSR_UTIL: + if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + break; + case F2FS_IPU_DISABLE: + break; + } return false; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bafff72de8e..1a85f83abd5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -50,6 +50,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_inline_data, Opt_err, }; @@ -65,6 +66,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_inline_data, "inline_data"}, {Opt_err, NULL}, }; @@ -72,6 +74,7 @@ static match_table_t f2fs_tokens = { enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ }; struct f2fs_attr { @@ -89,6 +92,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); + else if (struct_type == F2FS_SBI) + return (unsigned char *)sbi; return NULL; } @@ -175,6 +180,10 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -183,6 +192,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_idle), ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(max_victim_search), NULL, }; @@ -311,6 +324,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_disable_ext_identify: set_opt(sbi, DISABLE_EXT_IDENTIFY); break; + case Opt_inline_data: + set_opt(sbi, INLINE_DATA); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -325,7 +341,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); + fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) return NULL; @@ -508,7 +524,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) #endif if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) seq_puts(seq, ",disable_ext_identify"); - + if (test_opt(sbi, INLINE_DATA)) + seq_puts(seq, ",inline_data"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -518,7 +535,8 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); int i; for (i = 0; i < total_segs; i++) { @@ -618,7 +636,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (ino < F2FS_ROOT_INO(sbi)) + if (unlikely(ino < F2FS_ROOT_INO(sbi))) return ERR_PTR(-ESTALE); /* @@ -629,7 +647,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { + if (unlikely(generation && inode->i_generation != generation)) { /* we didn't find the right inode.. */ iput(inode); return ERR_PTR(-ESTALE); @@ -732,10 +750,10 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); - if (fsmeta >= total) + if (unlikely(fsmeta >= total)) return 1; - if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; } @@ -763,6 +781,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); @@ -798,9 +817,10 @@ retry: /* sanity checking of raw super */ if (sanity_check_raw_super(sb, *raw_super)) { brelse(*raw_super_buf); - f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " - "in %dth superblock", block + 1); - if(block == 0) { + f2fs_msg(sb, KERN_ERR, + "Can't find valid F2FS filesystem in %dth superblock", + block + 1); + if (block == 0) { block++; goto retry; } else { @@ -818,6 +838,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct buffer_head *raw_super_buf; struct inode *root; long err = -EINVAL; + int i; /* allocate memory for f2fs-specific super block info */ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); @@ -825,7 +846,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; /* set a block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { + if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } @@ -874,7 +895,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->node_write); sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->bio_sem); + + mutex_init(&sbi->read_io.io_mutex); + sbi->read_io.sbi = sbi; + sbi->read_io.bio = NULL; + for (i = 0; i < NR_PAGE_TYPE; i++) { + mutex_init(&sbi->write_io[i].io_mutex); + sbi->write_io[i].sbi = sbi; + sbi->write_io[i].bio = NULL; + } + init_rwsem(&sbi->cp_rwsem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); @@ -939,9 +969,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* if there are nt orphan nodes free them */ - err = -EINVAL; - if (recover_orphan_inodes(sbi)) - goto free_node_inode; + recover_orphan_inodes(sbi); /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -950,8 +978,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(root); goto free_node_inode; } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + err = -EINVAL; goto free_root_inode; + } sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { @@ -1053,7 +1083,7 @@ static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), NULL); - if (f2fs_inode_cachep == NULL) + if (!f2fs_inode_cachep) return -ENOMEM; return 0; } @@ -1078,9 +1108,12 @@ static int __init init_f2fs_fs(void) err = create_node_manager_caches(); if (err) goto free_inodecache; - err = create_gc_caches(); + err = create_segment_manager_caches(); if (err) goto free_node_manager_caches; + err = create_gc_caches(); + if (err) + goto free_segment_manager_caches; err = create_checkpoint_caches(); if (err) goto free_gc_caches; @@ -1102,6 +1135,8 @@ free_checkpoint_caches: destroy_checkpoint_caches(); free_gc_caches: destroy_gc_caches(); +free_segment_manager_caches: + destroy_segment_manager_caches(); free_node_manager_caches: destroy_node_manager_caches(); free_inodecache: @@ -1117,6 +1152,7 @@ static void __exit exit_f2fs_fs(void) unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); + destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); kset_unregister(f2fs_kset); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index aa7a3f139fe..89d0422a91a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -21,6 +21,7 @@ #include <linux/rwsem.h> #include <linux/f2fs_fs.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #include "f2fs.h" #include "xattr.h" @@ -216,8 +217,8 @@ const struct xattr_handler f2fs_xattr_security_handler = { static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler, - [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -229,8 +230,8 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { const struct xattr_handler *f2fs_xattr_handlers[] = { &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - &f2fs_xattr_acl_access_handler, - &f2fs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -522,7 +523,7 @@ static int __f2fs_setxattr(struct inode *inode, int name_index, if (found) free = free + ENTRY_SIZE(here); - if (free < newsize) { + if (unlikely(free < newsize)) { error = -ENOSPC; goto exit; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 02a08fb88a1..b21d9ebdeff 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -108,8 +108,6 @@ struct f2fs_xattr_entry { #ifdef CONFIG_F2FS_FS_XATTR extern const struct xattr_handler f2fs_xattr_user_handler; extern const struct xattr_handler f2fs_xattr_trusted_handler; -extern const struct xattr_handler f2fs_xattr_acl_access_handler; -extern const struct xattr_handler f2fs_xattr_acl_default_handler; extern const struct xattr_handler f2fs_xattr_advise_handler; extern const struct xattr_handler f2fs_xattr_security_handler; diff --git a/fs/file.c b/fs/file.c index 4a78f981557..771578b33fb 100644 --- a/fs/file.c +++ b/fs/file.c @@ -348,21 +348,16 @@ out: return NULL; } -static void close_files(struct files_struct * files) +static struct fdtable *close_files(struct files_struct * files) { - int i, j; - struct fdtable *fdt; - - j = 0; - /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the - * files structure. But use RCU to shut RCU-lockdep up. + * files structure. */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); + struct fdtable *fdt = rcu_dereference_raw(files->fdt); + int i, j = 0; + for (;;) { unsigned long set; i = j * BITS_PER_LONG; @@ -381,6 +376,8 @@ static void close_files(struct files_struct * files) set >>= 1; } } + + return fdt; } struct files_struct *get_files_struct(struct task_struct *task) @@ -398,14 +395,9 @@ struct files_struct *get_files_struct(struct task_struct *task) void put_files_struct(struct files_struct *files) { - struct fdtable *fdt; - if (atomic_dec_and_test(&files->count)) { - close_files(files); - /* not really needed, since nobody can see us */ - rcu_read_lock(); - fdt = files_fdtable(files); - rcu_read_unlock(); + struct fdtable *fdt = close_files(files); + /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); @@ -645,16 +637,16 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -struct file *fget(unsigned int fd) +static struct file *__fget(unsigned int fd, fmode_t mask) { - struct file *file; struct files_struct *files = current->files; + struct file *file; rcu_read_lock(); file = fcheck_files(files, fd); if (file) { /* File object ref couldn't be taken */ - if (file->f_mode & FMODE_PATH || + if ((file->f_mode & mask) || !atomic_long_inc_not_zero(&file->f_count)) file = NULL; } @@ -663,25 +655,16 @@ struct file *fget(unsigned int fd) return file; } +struct file *fget(unsigned int fd) +{ + return __fget(fd, FMODE_PATH); +} EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { - struct file *file; - struct files_struct *files = current->files; - - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - /* File object ref couldn't be taken */ - if (!atomic_long_inc_not_zero(&file->f_count)) - file = NULL; - } - rcu_read_unlock(); - - return file; + return __fget(fd, 0); } - EXPORT_SYMBOL(fget_raw); /* @@ -700,56 +683,33 @@ EXPORT_SYMBOL(fget_raw); * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. */ -struct file *fget_light(unsigned int fd, int *fput_needed) +struct file *__fget_light(unsigned int fd, fmode_t mask, int *fput_needed) { - struct file *file; struct files_struct *files = current->files; + struct file *file; *fput_needed = 0; if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); - if (file && (file->f_mode & FMODE_PATH)) + file = __fcheck_files(files, fd); + if (file && (file->f_mode & mask)) file = NULL; } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (!(file->f_mode & FMODE_PATH) && - atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); + file = __fget(fd, mask); + if (file) + *fput_needed = 1; } return file; } +struct file *fget_light(unsigned int fd, int *fput_needed) +{ + return __fget_light(fd, FMODE_PATH, fput_needed); +} EXPORT_SYMBOL(fget_light); struct file *fget_raw_light(unsigned int fd, int *fput_needed) { - struct file *file; - struct files_struct *files = current->files; - - *fput_needed = 0; - if (atomic_read(&files->count) == 1) { - file = fcheck_files(files, fd); - } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); - } - - return file; + return __fget_light(fd, 0, fput_needed); } void set_close_on_exec(unsigned int fd, int flag) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1f4a10ece2f..e0259a163f9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -516,13 +516,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, } WARN_ON(inode->i_state & I_SYNC); /* - * Skip inode if it is clean. We don't want to mess with writeback - * lists in this function since flusher thread may be doing for example - * sync in parallel and if we move the inode, it could get skipped. So - * here we make sure inode is on some writeback list and leave it there - * unless we have completely cleaned the inode. + * Skip inode if it is clean and we have no outstanding writeback in + * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this + * function since flusher thread may be doing for example sync in + * parallel and if we move the inode, it could get skipped. So here we + * make sure inode is on some writeback list and leave it there unless + * we have completely cleaned the inode. */ - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY) && + (wbc->sync_mode != WB_SYNC_ALL || + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ef74ad5fd36..0a648bb455a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); } -static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - -static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = generic_pipe_buf_release, - .steal = fuse_dev_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, buf->page = bufs[page_nr].page; buf->offset = bufs[page_nr].offset; buf->len = bufs[page_nr].len; - buf->ops = &fuse_dev_pipe_buf_ops; + /* + * Need to be careful about this. Having buf->ops in module + * code can Oops if the buffer persists after module unload. + */ + buf->ops = &nosteal_pipe_buf_ops; pipe->nrbufs++; page_nr++; @@ -1599,7 +1587,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!err && offset == 0 && (num != 0 || file_size == end)) + if (!err && offset == 0 && + (this_num == PAGE_CACHE_SIZE || file_size == end)) SetPageUptodate(page); unlock_page(page); page_cache_release(page); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c3eb2c46c8f..1d1292c581c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -112,6 +112,16 @@ void fuse_invalidate_attr(struct inode *inode) get_fuse_inode(inode)->i_time = 0; } +/** + * Mark the attributes as stale due to an atime change. Avoid the invalidate if + * atime is not used. + */ +void fuse_invalidate_atime(struct inode *inode) +{ + if (!IS_RDONLY(inode)) + fuse_invalidate_attr(inode); +} + /* * Just mark the entry as stale, so that a next attempt to look it up * will result in a new lookup call to userspace @@ -1371,7 +1381,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) } __free_page(page); - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); return err; } @@ -1404,7 +1414,7 @@ static char *read_link(struct dentry *dentry) link[req->out.args[0].size] = '\0'; out: fuse_put_request(fc, req); - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); return link; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7e70506297b..77bcc303c3a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -127,7 +127,15 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) if (atomic_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (sync) { + if (ff->fc->no_open) { + /* + * Drop the release request when client does not + * implement 'open' + */ + req->background = 0; + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else if (sync) { req->background = 0; fuse_request_send(ff->fc, req); path_put(&req->misc.release.path); @@ -144,27 +152,36 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir) { - struct fuse_open_out outarg; struct fuse_file *ff; - int err; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; ff = fuse_file_alloc(fc); if (!ff) return -ENOMEM; - err = fuse_send_open(fc, nodeid, file, opcode, &outarg); - if (err) { - fuse_file_free(ff); - return err; + ff->fh = 0; + ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ + if (!fc->no_open || isdir) { + struct fuse_open_out outarg; + int err; + + err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + if (!err) { + ff->fh = outarg.fh; + ff->open_flags = outarg.open_flags; + + } else if (err != -ENOSYS || isdir) { + fuse_file_free(ff); + return err; + } else { + fc->no_open = 1; + } } if (isdir) - outarg.open_flags &= ~FOPEN_DIRECT_IO; + ff->open_flags &= ~FOPEN_DIRECT_IO; - ff->fh = outarg.fh; ff->nodeid = nodeid; - ff->open_flags = outarg.open_flags; file->private_data = fuse_file_get(ff); return 0; @@ -687,7 +704,7 @@ static int fuse_readpage(struct file *file, struct page *page) SetPageUptodate(page); } - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); out: unlock_page(page); return err; @@ -716,7 +733,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) fuse_read_update_size(inode, pos, req->misc.read.attr_ver); } - fuse_invalidate_attr(inode); /* atime changed */ + fuse_invalidate_atime(inode); } for (i = 0; i < req->num_pages; i++) { @@ -2710,6 +2727,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, inode = file->f_mapping->host; i_size = i_size_read(inode); + if ((rw == READ) && (offset > i_size)) + return 0; + /* optimization for short read */ if (async_dio && rw != WRITE && offset + count > i_size) { if (offset >= i_size) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7d273091266..2da5db2c8bd 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -485,6 +485,9 @@ struct fuse_conn { * and hence races in setting them will not cause malfunction */ + /** Is open/release not implemented by fs? */ + unsigned no_open:1; + /** Is fsync not implemented by fs? */ unsigned no_fsync:1; @@ -788,6 +791,8 @@ void fuse_invalidate_attr(struct inode *inode); void fuse_invalidate_entry_cache(struct dentry *entry); +void fuse_invalidate_atime(struct inode *inode); + /** * Acquire reference to fuse_conn */ diff --git a/fs/generic_acl.c b/fs/generic_acl.c deleted file mode 100644 index b3f3676796d..00000000000 --- a/fs/generic_acl.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * (C) 2005 Andreas Gruenbacher <agruen@suse.de> - * - * This file is released under the GPL. - * - * Generic ACL support for in-memory filesystems. - */ - -#include <linux/sched.h> -#include <linux/gfp.h> -#include <linux/fs.h> -#include <linux/generic_acl.h> -#include <linux/posix_acl.h> -#include <linux/posix_acl_xattr.h> - - -static size_t -generic_acl_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - struct posix_acl *acl; - const char *xname; - size_t size; - - acl = get_cached_acl(dentry->d_inode, type); - if (!acl) - return 0; - posix_acl_release(acl); - - switch (type) { - case ACL_TYPE_ACCESS: - xname = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - xname = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return 0; - } - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int -generic_acl_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - - acl = get_cached_acl(dentry->d_inode, type); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -generic_acl_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto failed; - switch (type) { - case ACL_TYPE_ACCESS: - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - goto failed; - inode->i_ctime = CURRENT_TIME; - if (error == 0) { - posix_acl_release(acl); - acl = NULL; - } - break; - case ACL_TYPE_DEFAULT: - if (!S_ISDIR(inode->i_mode)) { - error = -EINVAL; - goto failed; - } - break; - } - } - set_cached_acl(inode, type, acl); - error = 0; -failed: - posix_acl_release(acl); - return error; -} - -/** - * generic_acl_init - Take care of acl inheritance at @inode create time - * - * Files created inside a directory with a default ACL inherit the - * directory's default ACL. - */ -int -generic_acl_init(struct inode *inode, struct inode *dir) -{ - struct posix_acl *acl = NULL; - int error; - - if (!S_ISLNK(inode->i_mode)) - acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); - if (acl) { - if (S_ISDIR(inode->i_mode)) - set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - } else { - inode->i_mode &= ~current_umask(); - } - error = 0; - - posix_acl_release(acl); - return error; -} - -/** - * generic_acl_chmod - change the access acl of @inode upon chmod() - * - * A chmod also changes the permissions of the owner, group/mask, and - * other ACL entries. - */ -int -generic_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error = 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - if (acl) { - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - } - return error; -} - -const struct xattr_handler generic_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = generic_acl_list, - .get = generic_acl_get, - .set = generic_acl_set, -}; - -const struct xattr_handler generic_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = generic_acl_list, - .get = generic_acl_get, - .set = generic_acl_set, -}; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index f69ac0af549..ba9456685f4 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -49,10 +49,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) if (!ip->i_eattr) return NULL; - acl = get_cached_acl(&ip->i_inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - name = gfs2_acl_name(type); if (name == NULL) return ERR_PTR(-EINVAL); @@ -80,7 +76,7 @@ static int gfs2_set_mode(struct inode *inode, umode_t mode) return error; } -static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) +int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error; int len; @@ -88,219 +84,49 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl) const char *name = gfs2_acl_name(type); BUG_ON(name == NULL); - len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); - if (len == 0) - return 0; - data = kmalloc(len, GFP_NOFS); - if (data == NULL) - return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, data, len); - if (error < 0) - goto out; - error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); - if (!error) - set_cached_acl(inode, type, acl); -out: - kfree(data); - return error; -} - -int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) -{ - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct posix_acl *acl; - umode_t mode = inode->i_mode; - int error = 0; - - if (!sdp->sd_args.ar_posix_acl) - return 0; - if (S_ISLNK(inode->i_mode)) - return 0; - - acl = gfs2_get_acl(&dip->i_inode, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) { - mode &= ~current_umask(); - return gfs2_set_mode(inode, mode); - } - - if (S_ISDIR(inode->i_mode)) { - error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto out; - } - - error = posix_acl_create(&acl, GFP_NOFS, &mode); - if (error < 0) - return error; - if (error == 0) - goto munge; - - error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl); - if (error) - goto out; -munge: - error = gfs2_set_mode(inode, mode); -out: - posix_acl_release(acl); - return error; -} - -int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) -{ - struct inode *inode = &ip->i_inode; - struct posix_acl *acl; - char *data; - unsigned int len; - int error; - - acl = gfs2_get_acl(&ip->i_inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return gfs2_setattr_simple(inode, attr); - - error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode); - if (error) - return error; - - len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); - data = kmalloc(len, GFP_NOFS); - error = -ENOMEM; - if (data == NULL) - goto out; - posix_acl_to_xattr(&init_user_ns, acl, data, len); - error = gfs2_xattr_acl_chmod(ip, attr, data); - kfree(data); - set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl); - -out: - posix_acl_release(acl); - return error; -} - -static int gfs2_acl_type(const char *name) -{ - if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0) - return ACL_TYPE_ACCESS; - if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0) - return ACL_TYPE_DEFAULT; - return -EINVAL; -} - -static int gfs2_xattr_system_get(struct dentry *dentry, const char *name, - void *buffer, size_t size, int xtype) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct posix_acl *acl; - int type; - int error; - - if (!sdp->sd_args.ar_posix_acl) - return -EOPNOTSUPP; - - type = gfs2_acl_type(name); - if (type < 0) - return type; - - acl = gfs2_get_acl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, - int xtype) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct posix_acl *acl = NULL; - int error = 0, type; - - if (!sdp->sd_args.ar_posix_acl) - return -EOPNOTSUPP; - - type = gfs2_acl_type(name); - if (type < 0) - return type; - if (flags & XATTR_CREATE) - return -EINVAL; - if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; - if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER)) - return -EPERM; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!value) - goto set_acl; - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (!acl) { - /* - * acl_set_file(3) may request that we set default ACLs with - * zero length -- defend (gracefully) against that here. - */ - goto out; - } - if (IS_ERR(acl)) { - error = PTR_ERR(acl); - goto out; - } - - error = posix_acl_valid(acl); - if (error) - goto out_release; - - error = -EINVAL; if (acl->a_count > GFS2_ACL_MAX_ENTRIES) - goto out_release; + return -EINVAL; if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; - if (error <= 0) { - posix_acl_release(acl); + if (error == 0) acl = NULL; - if (error < 0) - return error; - } - error = gfs2_set_mode(inode, mode); if (error) - goto out_release; + return error; } -set_acl: - error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS); - if (!error) { - if (acl) - set_cached_acl(inode, type, acl); - else - forget_cached_acl(inode, type); + if (acl) { + len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); + if (len == 0) + return 0; + data = kmalloc(len, GFP_NOFS); + if (data == NULL) + return -ENOMEM; + error = posix_acl_to_xattr(&init_user_ns, acl, data, len); + if (error < 0) + goto out; + } else { + data = NULL; + len = 0; } -out_release: - posix_acl_release(acl); + + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); + if (error) + goto out; + + if (acl) + set_cached_acl(inode, type, acl); + else + forget_cached_acl(inode, type); out: + kfree(data); return error; } - -const struct xattr_handler gfs2_xattr_system_handler = { - .prefix = XATTR_SYSTEM_PREFIX, - .flags = GFS2_EATYPE_SYS, - .get = gfs2_xattr_system_get, - .set = gfs2_xattr_system_set, -}; - diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 0da38dc7efe..301260c999b 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -17,8 +17,6 @@ #define GFS2_ACL_MAX_ENTRIES 25 extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); -extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); -extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); -extern const struct xattr_handler gfs2_xattr_system_handler; +extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index b7fc035a694..49436fa7cd4 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -986,6 +986,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int rv; @@ -1006,6 +1007,36 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, if (rv != 1) goto out; /* dio not valid, fall back to buffered i/o */ + /* + * Now since we are holding a deferred (CW) lock at this point, you + * might be wondering why this is ever needed. There is a case however + * where we've granted a deferred local lock against a cached exclusive + * glock. That is ok provided all granted local locks are deferred, but + * it also means that it is possible to encounter pages which are + * cached and possibly also mapped. So here we check for that and sort + * them out ahead of the dio. The glock state machine will take care of + * everything else. + * + * If in fact the cached glock state (gl->gl_state) is deferred (CW) in + * the first place, mapping->nr_pages will always be zero. + */ + if (mapping->nrpages) { + loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); + loff_t len = iov_length(iov, nr_segs); + loff_t end = PAGE_ALIGN(offset + len) - 1; + + rv = 0; + if (len == 0) + goto out; + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); + rv = filemap_write_and_wait_range(mapping, lstart, end); + if (rv) + goto out; + if (rw == WRITE) + truncate_inode_pages_range(mapping, lstart, end); + } + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, gfs2_get_block_direct, NULL, NULL, 0); @@ -1050,30 +1081,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bh = bh->b_this_page; } while(bh != head); spin_unlock(&sdp->sd_ail_lock); - gfs2_log_unlock(sdp); head = bh = page_buffers(page); do { - gfs2_log_lock(sdp); bd = bh->b_private; if (bd) { gfs2_assert_warn(sdp, bd->bd_bh == bh); - if (!list_empty(&bd->bd_list)) { - if (!buffer_pinned(bh)) - list_del_init(&bd->bd_list); - else - bd = NULL; - } - if (bd) - bd->bd_bh = NULL; + if (!list_empty(&bd->bd_list)) + list_del_init(&bd->bd_list); + bd->bd_bh = NULL; bh->b_private = NULL; - } - gfs2_log_unlock(sdp); - if (bd) kmem_cache_free(gfs2_bufdata_cachep, bd); + } bh = bh->b_this_page; } while (bh != head); + gfs2_log_unlock(sdp); return try_to_free_buffers(page); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 2e5fc268d32..fa32655449c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -834,6 +834,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct gfs2_leaf *leaf; struct gfs2_dirent *dent; struct qstr name = { .name = "" }; + struct timespec tv = CURRENT_TIME; error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) @@ -850,7 +851,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, leaf->lf_entries = 0; leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); leaf->lf_next = 0; - memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); + leaf->lf_inode = cpu_to_be64(ip->i_no_addr); + leaf->lf_dist = cpu_to_be32(1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2)); dent = (struct gfs2_dirent *)(leaf+1); gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); *pbh = bh; @@ -1612,11 +1617,31 @@ out: return ret; } +/** + * dir_new_leaf - Add a new leaf onto hash chain + * @inode: The directory + * @name: The name we are adding + * + * This adds a new dir leaf onto an existing leaf when there is not + * enough space to add a new dir entry. This is a last resort after + * we've expanded the hash table to max size and also split existing + * leaf blocks, so it will only occur for very large directories. + * + * The dist parameter is set to 1 for leaf blocks directly attached + * to the hash table, 2 for one layer of indirection, 3 for two layers + * etc. We are thus able to tell the difference between an old leaf + * with dist set to zero (i.e. "don't know") and a new one where we + * set this information for debug/fsck purposes. + * + * Returns: 0 on success, or -ve on error + */ + static int dir_new_leaf(struct inode *inode, const struct qstr *name) { struct buffer_head *bh, *obh; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_leaf *leaf, *oleaf; + u32 dist = 1; int error; u32 index; u64 bn; @@ -1626,6 +1651,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) if (error) return error; do { + dist++; oleaf = (struct gfs2_leaf *)obh->b_data; bn = be64_to_cpu(oleaf->lf_next); if (!bn) @@ -1643,6 +1669,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) brelse(obh); return -ENOSPC; } + leaf->lf_dist = cpu_to_be32(dist); oleaf->lf_next = cpu_to_be64(bh->b_blocknr); brelse(bh); brelse(obh); @@ -1659,39 +1686,53 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) /** * gfs2_dir_add - Add new filename into directory - * @dip: The GFS2 inode - * @filename: The new name - * @inode: The inode number of the entry - * @type: The type of the entry + * @inode: The directory inode + * @name: The new name + * @nip: The GFS2 inode to be linked in to the directory + * @da: The directory addition info + * + * If the call to gfs2_diradd_alloc_required resulted in there being + * no need to allocate any new directory blocks, then it will contain + * a pointer to the directory entry and the bh in which it resides. We + * can use that without having to repeat the search. If there was no + * free space, then we must now create more space. * * Returns: 0 on success, error code on failure */ int gfs2_dir_add(struct inode *inode, const struct qstr *name, - const struct gfs2_inode *nip) + const struct gfs2_inode *nip, struct gfs2_diradd *da) { struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; - struct gfs2_dirent *dent; + struct buffer_head *bh = da->bh; + struct gfs2_dirent *dent = da->dent; + struct timespec tv; struct gfs2_leaf *leaf; int error; while(1) { - dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, - &bh); + if (da->bh == NULL) { + dent = gfs2_dirent_search(inode, name, + gfs2_dirent_find_space, &bh); + } if (dent) { if (IS_ERR(dent)) return PTR_ERR(dent); dent = gfs2_init_dirent(inode, dent, name, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + tv = CURRENT_TIME; if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } + da->dent = NULL; + da->bh = NULL; brelse(bh); ip->i_entries++; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv; if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); mark_inode_dirty(inode); @@ -1742,6 +1783,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; + struct timespec tv = CURRENT_TIME; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1767,13 +1809,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) if (!entries) gfs2_consist_inode(dip); leaf->lf_entries = cpu_to_be16(--entries); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } brelse(bh); if (!dip->i_entries) gfs2_consist_inode(dip); dip->i_entries--; - dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; if (S_ISDIR(dentry->d_inode->i_mode)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); @@ -2017,22 +2061,36 @@ out: * gfs2_diradd_alloc_required - find if adding entry will require an allocation * @ip: the file being written to * @filname: the filename that's going to be added + * @da: The structure to return dir alloc info * - * Returns: 1 if alloc required, 0 if not, -ve on error + * Returns: 0 if ok, -ve on error */ -int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, + struct gfs2_diradd *da) { + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf); struct gfs2_dirent *dent; struct buffer_head *bh; + da->nr_blocks = 0; + da->bh = NULL; + da->dent = NULL; + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); if (!dent) { - return 1; + da->nr_blocks = sdp->sd_max_dirres; + if (!(ip->i_diskflags & GFS2_DIF_EXHASH) && + (GFS2_DIRENT_SIZE(name->len) < extra)) + da->nr_blocks = 1; + return 0; } if (IS_ERR(dent)) return PTR_ERR(dent); - brelse(bh); + da->bh = bh; + da->dent = dent; return 0; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 4f03bbd1873..126c65dda02 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -16,6 +16,14 @@ struct inode; struct gfs2_inode; struct gfs2_inum; +struct buffer_head; +struct gfs2_dirent; + +struct gfs2_diradd { + unsigned nr_blocks; + struct gfs2_dirent *dent; + struct buffer_head *bh; +}; extern struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename, @@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir, extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip); + const struct gfs2_inode *ip, struct gfs2_diradd *da); +static inline void gfs2_dir_no_add(struct gfs2_diradd *da) +{ + if (da->bh) + brelse(da->bh); + da->bh = NULL; +} extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, struct file_ra_state *f_ra); @@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); extern int gfs2_diradd_alloc_required(struct inode *dir, - const struct qstr *filename); + const struct qstr *filename, + struct gfs2_diradd *da); extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head **bhp); extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c8420f7e4db..ca0be6c69a2 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1552,13 +1552,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp) glock_hash_walk(thaw_glock, sdp); } -static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl) { - int ret; spin_lock(&gl->gl_spin); - ret = gfs2_dump_glock(seq, gl); + gfs2_dump_glock(seq, gl); spin_unlock(&gl->gl_spin); - return ret; } static void dump_glock_func(struct gfs2_glock *gl) @@ -1647,14 +1645,14 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) * @seq: the seq_file struct * @gh: the glock holder * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) +static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { struct task_struct *gh_owner = NULL; char flags_buf[32]; + rcu_read_lock(); if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", @@ -1664,7 +1662,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, gh_owner ? gh_owner->comm : "(ended)", (void *)gh->gh_ip); - return 0; + rcu_read_unlock(); } static const char *gflags2str(char *buf, const struct gfs2_glock *gl) @@ -1719,16 +1717,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) * example. The field's are n = number (id of the object), f = flags, * t = type, s = state, r = refcount, e = error, p = pid. * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned long long dtime; const struct gfs2_holder *gh; char gflags_buf[32]; - int error = 0; dtime = jiffies - gl->gl_demote_time; dtime *= 1000000/HZ; /* demote time in uSec */ @@ -1745,15 +1741,11 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) atomic_read(&gl->gl_revokes), (int)gl->gl_lockref.count, gl->gl_hold_time); - list_for_each_entry(gh, &gl->gl_holders, gh_list) { - error = dump_holder(seq, gh); - if (error) - goto out; - } + list_for_each_entry(gh, &gl->gl_holders, gh_list) + dump_holder(seq, gh); + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) - error = glops->go_dump(seq, gl); -out: - return error; + glops->go_dump(seq, gl); } static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) @@ -1951,7 +1943,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { - return dump_glock(seq, iter_ptr); + dump_glock(seq, iter_ptr); + return 0; } static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 6647d77366b..32572f71f02 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -199,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index db908f69713..3bf0631b5d5 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -133,7 +133,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) static void rgrp_go_sync(struct gfs2_glock *gl) { - struct address_space *metamapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd; int error; @@ -141,10 +142,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl) return; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(gl->gl_sbd, gl); - filemap_fdatawrite(metamapping); - error = filemap_fdatawait(metamapping); - mapping_set_error(metamapping, error); + gfs2_log_flush(sdp, gl); + filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + mapping_set_error(mapping, error); gfs2_ail_empty_gl(gl); spin_lock(&gl->gl_spin); @@ -166,11 +167,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl) static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { - struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; WARN_ON_ONCE(!(flags & DIO_METADATA)); - gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); - truncate_inode_pages(mapping, 0); + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); if (gl->gl_object) { struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; @@ -192,8 +194,11 @@ static void inode_go_sync(struct gfs2_glock *gl) if (ip && !S_ISREG(ip->i_inode.i_mode)) ip = NULL; - if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) - unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + if (ip) { + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + inode_dio_wait(&ip->i_inode); + } if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; @@ -410,6 +415,9 @@ static int inode_go_lock(struct gfs2_holder *gh) return error; } + if (gh->gh_state != LM_ST_DEFERRED) + inode_dio_wait(&ip->i_inode); + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && (gh->gh_state == LM_ST_EXCLUSIVE)) { @@ -429,21 +437,19 @@ static int inode_go_lock(struct gfs2_holder *gh) * @seq: The iterator * @ip: the inode * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_inode *ip = gl->gl_object; if (ip == NULL) - return 0; + return; gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, IF2DT(ip->i_inode.i_mode), ip->i_flags, (unsigned int)ip->i_diskflags, (unsigned long long)i_size_read(&ip->i_inode)); - return 0; } /** @@ -552,7 +558,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, - .go_flags = GLOF_ASPACE | GLOF_LVB, + .go_flags = GLOF_LVB, }; const struct gfs2_glock_operations gfs2_trans_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index ba1ea67f4ee..cf0e34400f7 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -93,6 +93,7 @@ struct gfs2_rgrpd { struct gfs2_rgrp_lvb *rd_rgl; u32 rd_last_alloc; u32 rd_flags; + u32 rd_extfail_pt; /* extent failure point */ #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ @@ -217,7 +218,7 @@ struct gfs2_glock_operations { int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); - int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; const unsigned long go_flags; @@ -350,7 +351,15 @@ struct gfs2_glock { atomic_t gl_ail_count; atomic_t gl_revokes; struct delayed_work gl_work; - struct work_struct gl_delete; + union { + /* For inode and iopen glocks only */ + struct work_struct gl_delete; + /* For rgrp glocks only */ + struct { + loff_t start; + loff_t end; + } gl_vm; + }; struct rcu_head gl_rcu; }; @@ -419,10 +428,13 @@ enum { }; struct gfs2_quota_data { + struct hlist_bl_node qd_hlist; struct list_head qd_list; struct kqid qd_id; + struct gfs2_sbd *qd_sbd; struct lockref qd_lockref; struct list_head qd_lru; + unsigned qd_hash; unsigned long qd_flags; /* QDF_... */ @@ -441,6 +453,7 @@ struct gfs2_quota_data { u64 qd_sync_gen; unsigned long qd_last_warn; + struct rcu_head qd_rcu; }; struct gfs2_trans { @@ -720,13 +733,15 @@ struct gfs2_sbd { spinlock_t sd_trunc_lock; unsigned int sd_quota_slots; - unsigned int sd_quota_chunks; - unsigned char **sd_quota_bitmap; + unsigned long *sd_quota_bitmap; + spinlock_t sd_bitmap_lock; u64 sd_quota_sync_gen; /* Log stuff */ + struct address_space sd_aspace; + spinlock_t sd_log_lock; struct gfs2_trans *sd_log_tr; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 7119504159f..5c524180c98 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, ip = GFS2_I(inode); if (!inode) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -469,14 +469,36 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, brelse(dibh); } +/** + * gfs2_trans_da_blocks - Calculate number of blocks to link inode + * @dip: The directory we are linking into + * @da: The dir add information + * @nr_inodes: The number of inodes involved + * + * This calculate the number of blocks we need to reserve in a + * transaction to link @nr_inodes into a directory. In most cases + * @nr_inodes will be 2 (the directory plus the inode being linked in) + * but in case of rename, 4 may be required. + * + * Returns: Number of blocks + */ + +static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip, + const struct gfs2_diradd *da, + unsigned nr_inodes) +{ + return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) + + (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS; +} + static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip, int arq) + struct gfs2_inode *ip, struct gfs2_diradd *da) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + struct gfs2_alloc_parms ap = { .target = da->nr_blocks, }; int error; - if (arq) { + if (da->nr_blocks) { error = gfs2_quota_lock_check(dip); if (error) goto fail_quota_locks; @@ -485,10 +507,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_quota_locks; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - dip->i_rgd->rd_length + - 2 * RES_DINODE + - RES_STATFS + RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0); if (error) goto fail_ipreserv; } else { @@ -497,7 +516,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; } - error = gfs2_dir_add(&dip->i_inode, name, ip); + error = gfs2_dir_add(&dip->i_inode, name, ip, da); if (error) goto fail_end_trans; @@ -552,6 +571,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, unsigned int size, int excl, int *opened) { const struct qstr *name = &dentry->d_name; + struct posix_acl *default_acl, *acl; struct gfs2_holder ghs[2]; struct inode *inode = NULL; struct gfs2_inode *dip = GFS2_I(dir), *ip; @@ -560,7 +580,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct dentry *d; int error; u32 aflags = 0; - int arq; + struct gfs2_diradd da = { .bh = NULL, }; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -585,6 +605,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = PTR_ERR(inode); if (!IS_ERR(inode)) { d = d_splice_alias(inode, dentry); + error = PTR_ERR(d); + if (IS_ERR(d)) + goto fail_gunlock; error = 0; if (file) { if (S_ISREG(inode->i_mode)) { @@ -602,7 +625,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock; } - arq = error = gfs2_diradd_alloc_required(dir, name); + error = gfs2_diradd_alloc_required(dir, name, &da); if (error < 0) goto fail_gunlock; @@ -611,10 +634,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!inode) goto fail_gunlock; + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + goto fail_free_vfs_inode; + ip = GFS2_I(inode); error = gfs2_rs_alloc(ip); if (error) - goto fail_free_inode; + goto fail_free_acls; inode->i_mode = mode; set_nlink(inode, S_ISDIR(mode) ? 2 : 1); @@ -682,7 +709,16 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_set_iop(inode); insert_inode_hash(inode); - error = gfs2_acl_create(dip, inode); + if (default_acl) { + error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + if (error) goto fail_gunlock3; @@ -690,7 +726,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock3; - error = link_dinode(dip, name, ip, arq); + error = link_dinode(dip, name, ip, &da); if (error) goto fail_gunlock3; @@ -716,9 +752,16 @@ fail_free_inode: if (ip->i_gl) gfs2_glock_put(ip->i_gl); gfs2_rs_delete(ip, NULL); +fail_free_acls: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); +fail_free_vfs_inode: free_inode_nonrcu(inode); inode = NULL; fail_gunlock: + gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { clear_nlink(inode); @@ -779,6 +822,11 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, } d = d_splice_alias(inode, dentry); + if (IS_ERR(d)) { + iput(inode); + gfs2_glock_dq_uninit(&gh); + return d; + } if (file && S_ISREG(inode->i_mode)) error = finish_open(file, dentry, gfs2_open_common, opened); @@ -817,7 +865,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; - int alloc_required; + struct gfs2_diradd da = { .bh = NULL, }; int error; if (S_ISDIR(inode->i_mode)) @@ -872,13 +920,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (ip->i_inode.i_nlink == (u32)-1) goto out_gunlock; - alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); + error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da); if (error < 0) goto out_gunlock; - error = 0; - if (alloc_required) { - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; error = gfs2_quota_lock_check(dip); if (error) goto out_gunlock; @@ -887,10 +934,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(dip, sdp->sd_max_dirres) + - 2 * RES_DINODE + RES_STATFS + - RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0); if (error) goto out_ipres; } else { @@ -903,7 +947,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_end_trans; - error = gfs2_dir_add(dir, &dentry->d_name, ip); + error = gfs2_dir_add(dir, &dentry->d_name, ip, &da); if (error) goto out_brelse; @@ -919,12 +963,13 @@ out_brelse: out_end_trans: gfs2_trans_end(sdp); out_ipres: - if (alloc_required) + if (da.nr_blocks) gfs2_inplace_release(dip); out_gunlock_q: - if (alloc_required) + if (da.nr_blocks) gfs2_quota_unlock(dip); out_gunlock: + gfs2_dir_no_add(&da); gfs2_glock_dq(ghs + 1); out_child: gfs2_glock_dq(ghs); @@ -1254,7 +1299,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; - int alloc_required = 0; + struct gfs2_diradd da = { .nr_blocks = 0, }; unsigned int x; int error; @@ -1388,14 +1433,14 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; } - if (nip == NULL) - alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); - error = alloc_required; - if (error < 0) - goto out_gunlock; + if (nip == NULL) { + error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da); + if (error) + goto out_gunlock; + } - if (alloc_required) { - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; error = gfs2_quota_lock_check(ndip); if (error) goto out_gunlock; @@ -1404,10 +1449,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + - 4 * RES_DINODE + 4 * RES_LEAF + - RES_STATFS + RES_QUOTA + 4, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) + + 4 * RES_LEAF + 4, 0); if (error) goto out_ipreserv; } else { @@ -1441,19 +1484,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; - error = gfs2_dir_add(ndir, &ndentry->d_name, ip); + error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da); if (error) goto out_end_trans; out_end_trans: gfs2_trans_end(sdp); out_ipreserv: - if (alloc_required) + if (da.nr_blocks) gfs2_inplace_release(ndip); out_gunlock_q: - if (alloc_required) + if (da.nr_blocks) gfs2_quota_unlock(ndip); out_gunlock: + gfs2_dir_no_add(&da); while (x--) { gfs2_glock_dq(ghs + x); gfs2_holder_uninit(ghs + x); @@ -1607,10 +1651,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) ogid = ngid = NO_GID_QUOTA_CHANGE; - error = gfs2_quota_lock(ip, nuid, ngid); + error = get_write_access(inode); if (error) return error; + error = gfs2_rs_alloc(ip); + if (error) + goto out; + + error = gfs2_rindex_update(sdp); + if (error) + goto out; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out; + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { error = gfs2_quota_check(ip, nuid, ngid); @@ -1637,6 +1693,8 @@ out_end_trans: gfs2_trans_end(sdp); out_gunlock_q: gfs2_quota_unlock(ip); +out: + put_write_access(inode); return error; } @@ -1678,10 +1736,11 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) error = gfs2_setattr_size(inode, attr->ia_size); else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) error = setattr_chown(inode, attr); - else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) - error = gfs2_acl_chmod(ip, attr); - else + else { error = gfs2_setattr_simple(inode, attr); + if (!error && attr->ia_valid & ATTR_MODE) + error = posix_acl_chmod(inode, inode->i_mode); + } out: if (!error) @@ -1841,6 +1900,7 @@ const struct inode_operations gfs2_file_iops = { .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, }; const struct inode_operations gfs2_dir_iops = { @@ -1862,6 +1922,7 @@ const struct inode_operations gfs2_dir_iops = { .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, .get_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, .atomic_open = gfs2_atomic_open, }; @@ -1877,6 +1938,5 @@ const struct inode_operations gfs2_symlink_iops = { .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, .fiemap = gfs2_fiemap, - .get_acl = gfs2_get_acl, }; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 610613fb65b..9dcb9777a5f 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -551,10 +551,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct buffer_head *bh = bd->bd_bh; struct gfs2_glock *gl = bd->bd_gl; - gfs2_remove_from_ail(bd); - bd->bd_bh = NULL; bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; + gfs2_remove_from_ail(bd); /* drops ref on bh */ + bd->bd_bh = NULL; bd->bd_ops = &gfs2_revoke_lops; sdp->sd_log_num_revoke++; atomic_inc(&gl->gl_revokes); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 010b9fb9fec..58f06400b7b 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); clear_bit(GBF_FULL, &bi->bi_flags); rgd->rd_free_clone = rgd->rd_free; + rgd->rd_extfail_pt = rgd->rd_free; } /** @@ -588,8 +589,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, static void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; int error; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + filemap_fdatawrite(mapping); error = filemap_fdatawait(mapping); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 0650db2541e..c272e73063d 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void) gfs2_str2qstr(&gfs2_qdot, "."); gfs2_str2qstr(&gfs2_qdotdot, ".."); + gfs2_quota_hash_init(); error = gfs2_sys_init(); if (error) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 93241505054..c7f24690ed0 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -116,6 +116,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) unsigned long index; unsigned int bufnum; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; index = blkno >> shift; /* convert block to page */ bufnum = blkno - (index << shift); /* block buf index within page */ @@ -258,6 +261,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int struct address_space *mapping = bh->b_page->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; + int was_pinned = 0; if (test_clear_buffer_pinned(bh)) { trace_gfs2_pin(bd, 0); @@ -273,12 +277,16 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int tr->tr_num_databuf_rm++; } tr->tr_touched = 1; + was_pinned = 1; brelse(bh); } if (bd) { spin_lock(&sdp->sd_ail_lock); if (bd->bd_tr) { gfs2_trans_add_revoke(sdp, bd); + } else if (was_pinned) { + bh->b_private = NULL; + kmem_cache_free(gfs2_bufdata_cachep, bd); } spin_unlock(&sdp->sd_ail_lock); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 82303b47495..1e712b566d7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -36,6 +36,7 @@ #include "log.h" #include "quota.h" #include "dir.h" +#include "meta_io.h" #include "trace_gfs2.h" #define DO 0 @@ -62,6 +63,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt) static struct gfs2_sbd *init_sbd(struct super_block *sb) { struct gfs2_sbd *sdp; + struct address_space *mapping; sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); if (!sdp) @@ -97,6 +99,18 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_quota_wait); INIT_LIST_HEAD(&sdp->sd_trunc_list); spin_lock_init(&sdp->sd_trunc_lock); + spin_lock_init(&sdp->sd_bitmap_lock); + + mapping = &sdp->sd_aspace; + + address_space_init_once(mapping); + mapping->a_ops = &gfs2_meta_aops; + mapping->host = sb->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->backing_dev_info = sb->s_bdi; + mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); atomic_set(&sdp->sd_log_pinned, 0); @@ -217,7 +231,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) page = alloc_page(GFP_NOFS); if (unlikely(!page)) - return -ENOBUFS; + return -ENOMEM; ClearPageUptodate(page); ClearPageDirty(page); @@ -956,40 +970,6 @@ fail: return error; } -static int init_threads(struct gfs2_sbd *sdp, int undo) -{ - struct task_struct *p; - int error = 0; - - if (undo) - goto fail_quotad; - - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start logd thread: %d\n", error); - return error; - } - sdp->sd_logd_process = p; - - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start quotad thread: %d\n", error); - goto fail; - } - sdp->sd_quotad_process = p; - - return 0; - - -fail_quotad: - kthread_stop(sdp->sd_quotad_process); -fail: - kthread_stop(sdp->sd_logd_process); - return error; -} - static const match_table_t nolock_tokens = { { Opt_jid, "jid=%d\n", }, { Opt_err, NULL }, @@ -1254,15 +1234,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent goto fail_per_node; } - error = init_threads(sdp, DO); - if (error) - goto fail_per_node; - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_rw(sdp); if (error) { fs_err(sdp, "can't make FS RW: %d\n", error); - goto fail_threads; + goto fail_per_node; } } @@ -1270,8 +1246,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent gfs2_online_uevent(sdp); return 0; -fail_threads: - init_threads(sdp, UNDO); fail_per_node: init_per_node(sdp, UNDO); fail_inodes: @@ -1366,8 +1340,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (IS_ERR(s)) goto error_bdev; - if (s->s_root) + if (s->s_root) { + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. + */ + up_write(&s->s_umount); blkdev_put(bdev, mode); + down_write(&s->s_umount); + } memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 98236d0df3c..8bec0e3192d 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -52,6 +52,11 @@ #include <linux/dqblk_xfs.h> #include <linux/lockref.h> #include <linux/list_lru.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/bit_spinlock.h> +#include <linux/jhash.h> +#include <linux/vmalloc.h> #include "gfs2.h" #include "incore.h" @@ -67,16 +72,44 @@ #include "inode.h" #include "util.h" -struct gfs2_quota_change_host { - u64 qc_change; - u32 qc_flags; /* GFS2_QCF_... */ - struct kqid qc_id; -}; +#define GFS2_QD_HASH_SHIFT 12 +#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) -/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ +/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ +/* -> sd_bitmap_lock */ static DEFINE_SPINLOCK(qd_lock); struct list_lru gfs2_qd_lru; +static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE]; + +static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp, + const struct kqid qid) +{ + unsigned int h; + + h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0); + h = jhash(&qid, sizeof(struct kqid), h); + + return h & GFS2_QD_HASH_MASK; +} + +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&qd_hash_table[hash]); +} + +static inline void spin_unlock_bucket(unsigned int hash) +{ + hlist_bl_unlock(&qd_hash_table[hash]); +} + +static void gfs2_qd_dealloc(struct rcu_head *rcu) +{ + struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); + kmem_cache_free(gfs2_quotad_cachep, qd); +} + static void gfs2_qd_dispose(struct list_head *list) { struct gfs2_quota_data *qd; @@ -93,6 +126,10 @@ static void gfs2_qd_dispose(struct list_head *list) list_del(&qd->qd_list); spin_unlock(&qd_lock); + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); @@ -101,7 +138,7 @@ static void gfs2_qd_dispose(struct list_head *list) atomic_dec(&sdp->sd_quota_count); /* Delete it from the common reclaim list */ - kmem_cache_free(gfs2_quotad_cachep, qd); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); } } @@ -171,83 +208,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd) return offset; } -static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; int error; qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); if (!qd) - return -ENOMEM; + return NULL; + qd->qd_sbd = sdp; qd->qd_lockref.count = 1; spin_lock_init(&qd->qd_lockref.lock); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); + qd->qd_hash = hash; error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); if (error) goto fail; - *qdp = qd; - - return 0; + return qd; fail: kmem_cache_free(gfs2_quotad_cachep, qd); - return error; + return NULL; } -static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + struct kqid qid) { - struct gfs2_quota_data *qd = NULL, *new_qd = NULL; - int error, found; - - *qdp = NULL; + struct gfs2_quota_data *qd; + struct hlist_bl_node *h; - for (;;) { - found = 0; - spin_lock(&qd_lock); - list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (qid_eq(qd->qd_id, qid) && - lockref_get_not_dead(&qd->qd_lockref)) { - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); - found = 1; - break; - } + hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) { + if (!qid_eq(qd->qd_id, qid)) + continue; + if (qd->qd_sbd != sdp) + continue; + if (lockref_get_not_dead(&qd->qd_lockref)) { + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + return qd; } + } - if (!found) - qd = NULL; + return NULL; +} - if (!qd && new_qd) { - qd = new_qd; - list_add(&qd->qd_list, &sdp->sd_quota_list); - atomic_inc(&sdp->sd_quota_count); - new_qd = NULL; - } - spin_unlock(&qd_lock); +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd, *new_qd; + unsigned int hash = gfs2_qd_hash(sdp, qid); - if (qd) { - if (new_qd) { - gfs2_glock_put(new_qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, new_qd); - } - *qdp = qd; - return 0; - } + rcu_read_lock(); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + rcu_read_unlock(); - error = qd_alloc(sdp, qid, &new_qd); - if (error) - return error; + if (qd) + return 0; + + new_qd = qd_alloc(hash, sdp, qid); + if (!new_qd) + return -ENOMEM; + + spin_lock(&qd_lock); + spin_lock_bucket(hash); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + if (qd == NULL) { + *qdp = new_qd; + list_add(&new_qd->qd_list, &sdp->sd_quota_list); + hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); + atomic_inc(&sdp->sd_quota_count); } + spin_unlock_bucket(hash); + spin_unlock(&qd_lock); + + if (qd) { + gfs2_glock_put(new_qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, new_qd); + } + + return 0; } + static void qd_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; @@ -268,88 +317,48 @@ static void qd_put(struct gfs2_quota_data *qd) static int slot_get(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - unsigned int c, o = 0, b; - unsigned char byte = 0; + struct gfs2_sbd *sdp = qd->qd_sbd; + unsigned int bit; + int error = 0; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); + if (qd->qd_slot_count != 0) + goto out; - if (qd->qd_slot_count++) { - spin_unlock(&qd_lock); - return 0; + error = -ENOSPC; + bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); + if (bit < sdp->sd_quota_slots) { + set_bit(bit, sdp->sd_quota_bitmap); + qd->qd_slot = bit; +out: + qd->qd_slot_count++; } + spin_unlock(&sdp->sd_bitmap_lock); - for (c = 0; c < sdp->sd_quota_chunks; c++) - for (o = 0; o < PAGE_SIZE; o++) { - byte = sdp->sd_quota_bitmap[c][o]; - if (byte != 0xFF) - goto found; - } - - goto fail; - -found: - for (b = 0; b < 8; b++) - if (!(byte & (1 << b))) - break; - qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b; - - if (qd->qd_slot >= sdp->sd_quota_slots) - goto fail; - - sdp->sd_quota_bitmap[c][o] |= 1 << b; - - spin_unlock(&qd_lock); - - return 0; - -fail: - qd->qd_slot_count--; - spin_unlock(&qd_lock); - return -ENOSPC; + return error; } static void slot_hold(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); qd->qd_slot_count++; - spin_unlock(&qd_lock); -} - -static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, - unsigned int bit, int new_value) -{ - unsigned int c, o, b = bit; - int old_value; - - c = b / (8 * PAGE_SIZE); - b %= 8 * PAGE_SIZE; - o = b / 8; - b %= 8; - - old_value = (bitmap[c][o] & (1 << b)); - gfs2_assert_withdraw(sdp, !old_value != !new_value); - - if (new_value) - bitmap[c][o] |= 1 << b; - else - bitmap[c][o] &= ~(1 << b); + spin_unlock(&sdp->sd_bitmap_lock); } static void slot_put(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); if (!--qd->qd_slot_count) { - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); + BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } - spin_unlock(&qd_lock); + spin_unlock(&sdp->sd_bitmap_lock); } static int bh_get(struct gfs2_quota_data *qd) @@ -427,8 +436,7 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, list_move_tail(&qd->qd_list, &sdp->sd_quota_list); set_bit(QDF_LOCKED, &qd->qd_flags); qd->qd_change_sync = qd->qd_change; - gfs2_assert_warn(sdp, qd->qd_slot_count); - qd->qd_slot_count++; + slot_hold(qd); return 1; } @@ -1214,17 +1222,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) return error; } -static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf) -{ - const struct gfs2_quota_change *str = buf; - - qc->qc_change = be64_to_cpu(str->qc_change); - qc->qc_flags = be32_to_cpu(str->qc_flags); - qc->qc_id = make_kqid(&init_user_ns, - (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA, - be32_to_cpu(str->qc_id)); -} - int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); @@ -1232,6 +1229,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; + unsigned int hash; + unsigned int bm_size; u64 dblock; u32 extlen = 0; int error; @@ -1240,23 +1239,20 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) return -EIO; sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; - sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); - + bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); + bm_size *= sizeof(unsigned long); error = -ENOMEM; - - sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, - sizeof(unsigned char *), GFP_NOFS); + sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN); + if (sdp->sd_quota_bitmap == NULL) + sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); if (!sdp->sd_quota_bitmap) return error; - for (x = 0; x < sdp->sd_quota_chunks; x++) { - sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS); - if (!sdp->sd_quota_bitmap[x]) - goto fail; - } + memset(sdp->sd_quota_bitmap, 0, bm_size); for (x = 0; x < blocks; x++) { struct buffer_head *bh; + const struct gfs2_quota_change *qc; unsigned int y; if (!extlen) { @@ -1274,34 +1270,42 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) goto fail; } + qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header)); for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; y++, slot++) { - struct gfs2_quota_change_host qc; struct gfs2_quota_data *qd; - - gfs2_quota_change_in(&qc, bh->b_data + - sizeof(struct gfs2_meta_header) + - y * sizeof(struct gfs2_quota_change)); - if (!qc.qc_change) + s64 qc_change = be64_to_cpu(qc->qc_change); + u32 qc_flags = be32_to_cpu(qc->qc_flags); + enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ? + USRQUOTA : GRPQUOTA; + struct kqid qc_id = make_kqid(&init_user_ns, qtype, + be32_to_cpu(qc->qc_id)); + qc++; + if (!qc_change) continue; - error = qd_alloc(sdp, qc.qc_id, &qd); - if (error) { + hash = gfs2_qd_hash(sdp, qc_id); + qd = qd_alloc(hash, sdp, qc_id); + if (qd == NULL) { brelse(bh); goto fail; } set_bit(QDF_CHANGE, &qd->qd_flags); - qd->qd_change = qc.qc_change; + qd->qd_change = qc_change; qd->qd_slot = slot; qd->qd_slot_count = 1; spin_lock(&qd_lock); - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); spin_unlock(&qd_lock); + spin_lock_bucket(hash); + hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]); + spin_unlock_bucket(hash); + found++; } @@ -1324,44 +1328,28 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { struct list_head *head = &sdp->sd_quota_list; struct gfs2_quota_data *qd; - unsigned int x; spin_lock(&qd_lock); while (!list_empty(head)) { qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); - /* - * To be removed in due course... we should be able to - * ensure that all refs to the qd have done by this point - * so that this rather odd test is not required - */ - spin_lock(&qd->qd_lockref.lock); - if (qd->qd_lockref.count > 1 || - (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { - spin_unlock(&qd->qd_lockref.lock); - list_move(&qd->qd_list, head); - spin_unlock(&qd_lock); - schedule(); - spin_lock(&qd_lock); - continue; - } - spin_unlock(&qd->qd_lockref.lock); - list_del(&qd->qd_list); + /* Also remove if this qd exists in the reclaim list */ list_lru_del(&gfs2_qd_lru, &qd->qd_lru); atomic_dec(&sdp->sd_quota_count); spin_unlock(&qd_lock); - if (!qd->qd_lockref.count) { - gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); - } else - gfs2_assert_warn(sdp, qd->qd_slot_count == 1); + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_glock_put(qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, qd); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); spin_lock(&qd_lock); } @@ -1370,9 +1358,11 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); if (sdp->sd_quota_bitmap) { - for (x = 0; x < sdp->sd_quota_chunks; x++) - kfree(sdp->sd_quota_bitmap[x]); - kfree(sdp->sd_quota_bitmap); + if (is_vmalloc_addr(sdp->sd_quota_bitmap)) + vfree(sdp->sd_quota_bitmap); + else + kfree(sdp->sd_quota_bitmap); + sdp->sd_quota_bitmap = NULL; } } @@ -1656,3 +1646,11 @@ const struct quotactl_ops gfs2_quotactl_ops = { .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; + +void __init gfs2_quota_hash_init(void) +{ + unsigned i; + + for(i = 0; i < GFS2_QD_HASH_SIZE; i++) + INIT_HLIST_BL_HEAD(&qd_hash_table[i]); +} diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 96e4f34a03b..55d506eb3c4 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) extern const struct quotactl_ops gfs2_quotactl_ops; extern struct shrinker gfs2_qd_shrinker; extern struct list_lru gfs2_qd_lru; +extern void __init gfs2_quota_hash_init(void); #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c8d6161bd68..a1da2134923 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -57,6 +57,11 @@ * 3 = Used (metadata) */ +struct gfs2_extent { + struct gfs2_rbm rbm; + u32 len; +}; + static const char valid_change[16] = { /* current */ /* n */ 0, 1, 1, 1, @@ -65,8 +70,9 @@ static const char valid_change[16] = { 1, 0, 0, 0 }; -static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, - const struct gfs2_inode *ip, bool nowrap); +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap); /** @@ -635,9 +641,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) /* return reserved blocks to the rgrp */ BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; + /* The rgrp extent failure point is likely not to increase; + it will only do so if the freed blocks are somehow + contiguous with a span of free blocks that follows. Still, + it will force the number to be recalculated later. */ + rgd->rd_extfail_pt += rs->rs_free; rs->rs_free = 0; clear_bit(GBF_FULL, &bi->bi_flags); - smp_mb__after_clear_bit(); } } @@ -876,6 +886,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd) static int read_rindex_entry(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); struct gfs2_rindex buf; int error; @@ -913,6 +924,8 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_gl->gl_object = rgd; + rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize; + rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; if (rgd->rd_data > sdp->sd_max_rg_data) @@ -1126,6 +1139,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; + /* max out the rgrp allocation failure point */ + rgd->rd_extfail_pt = rgd->rd_free; } if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); @@ -1184,7 +1199,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh) if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) return 0; - return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); + return gfs2_rgrp_bh_get(rgd); } /** @@ -1455,7 +1470,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) return; - ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap); if (ret == 0) { rs->rs_rbm = rbm; rs->rs_free = extlen; @@ -1520,6 +1535,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, * @rbm: The current position in the resource group * @ip: The inode for which we are searching for blocks * @minext: The minimum extent length + * @maxext: A pointer to the maximum extent structure * * This checks the current position in the rgrp to see whether there is * a reservation covering this block. If not then this function is a @@ -1532,7 +1548,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, const struct gfs2_inode *ip, - u32 minext) + u32 minext, + struct gfs2_extent *maxext) { u64 block = gfs2_rbm_to_block(rbm); u32 extlen = 1; @@ -1545,8 +1562,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, */ if (minext) { extlen = gfs2_free_extlen(rbm, minext); - nblock = block + extlen; - if (extlen < minext) + if (extlen <= maxext->len) goto fail; } @@ -1555,9 +1571,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, * and skip if parts of it are already reserved */ nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); - if (nblock == block) - return 0; + if (nblock == block) { + if (!minext || extlen >= minext) + return 0; + + if (extlen > maxext->len) { + maxext->len = extlen; + maxext->rbm = *rbm; + } fail: + nblock = block + extlen; + } ret = gfs2_rbm_from_block(rbm, nblock); if (ret < 0) return ret; @@ -1568,30 +1592,38 @@ fail: * gfs2_rbm_find - Look for blocks of a particular state * @rbm: Value/result starting position and final position * @state: The state which we want to find - * @minext: The requested extent length (0 for a single block) + * @minext: Pointer to the requested extent length (NULL for a single block) + * This is updated to be the actual reservation size. * @ip: If set, check for reservations * @nowrap: Stop looking at the end of the rgrp, rather than wrapping * around until we've reached the starting point. + * @ap: the allocation parameters * * Side effects: * - If looking for free blocks, we set GBF_FULL on each bitmap which * has no free blocks in it. + * - If looking for free blocks, we set rd_extfail_pt on each rgrp which + * has come up short on a free block search. * * Returns: 0 on success, -ENOSPC if there is no block of the requested state */ -static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, - const struct gfs2_inode *ip, bool nowrap) +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap) { struct buffer_head *bh; int initial_bii; u32 initial_offset; + int first_bii = rbm->bii; + u32 first_offset = rbm->offset; u32 offset; u8 *buffer; int n = 0; int iters = rbm->rgd->rd_length; int ret; struct gfs2_bitmap *bi; + struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, }; /* If we are not starting at the beginning of a bitmap, then we * need to add one to the bitmap count to ensure that we search @@ -1620,7 +1652,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, return 0; initial_bii = rbm->bii; - ret = gfs2_reservation_check_and_update(rbm, ip, minext); + ret = gfs2_reservation_check_and_update(rbm, ip, + minext ? *minext : 0, + &maxext); if (ret == 0) return 0; if (ret > 0) { @@ -1655,6 +1689,24 @@ next_iter: break; } + if (minext == NULL || state != GFS2_BLKST_FREE) + return -ENOSPC; + + /* If the extent was too small, and it's smaller than the smallest + to have failed before, remember for future reference that it's + useless to search this rgrp again for this amount or more. */ + if ((first_offset == 0) && (first_bii == 0) && + (*minext < rbm->rgd->rd_extfail_pt)) + rbm->rgd->rd_extfail_pt = *minext; + + /* If the maximum extent we found is big enough to fulfill the + minimum requirements, use it anyway. */ + if (maxext.len) { + *rbm = maxext.rbm; + *minext = maxext.len; + return 0; + } + return -ENOSPC; } @@ -1680,7 +1732,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip while (1) { down_write(&sdp->sd_log_flush_lock); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, + true, NULL); up_write(&sdp->sd_log_flush_lock); if (error == -ENOSPC) break; @@ -1891,7 +1944,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a } /* Skip unuseable resource groups */ - if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) + if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | + GFS2_RDF_ERROR)) || + (ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) @@ -1911,15 +1966,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a return 0; } - /* Drop reservation, if we couldn't use reserved rgrp */ - if (gfs2_rs_active(rs)) - gfs2_rs_deltree(rs); check_rgrp: /* Check for unlinked inodes which can be reclaimed */ if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, ip->i_no_addr); skip_rgrp: + /* Drop reservation, if we couldn't use reserved rgrp */ + if (gfs2_rs_active(rs)) + gfs2_rs_deltree(rs); + /* Unlock rgrp if required */ if (!rg_locked) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); @@ -2064,25 +2120,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, * */ -int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) { struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_blkreserv *trs; const struct rb_node *n; if (rgd == NULL) - return 0; - gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", + return; + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, - rgd->rd_reserved); + rgd->rd_reserved, rgd->rd_extfail_pt); spin_lock(&rgd->rd_rsspin); for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { trs = rb_entry(n, struct gfs2_blkreserv, rs_node); dump_rs(seq, trs); } spin_unlock(&rgd->rd_rsspin); - return 0; } static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) @@ -2184,18 +2239,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, int error; gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL); if (error == -ENOSPC) { gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false, + NULL); } /* Since all blocks are reserved in advance, this shouldn't happen */ if (error) { - fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", + fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n", (unsigned long long)ip->i_no_addr, error, *nblocks, - test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); + test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags), + rbm.rgd->rd_extfail_pt); goto rgrp_error; } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3a10d2ffbbe..463ab2e95d1 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 35da5b19c0d..60f60f6181f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -369,6 +369,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) return 0; } +static int init_threads(struct gfs2_sbd *sdp) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + return 0; + +fail: + kthread_stop(sdp->sd_logd_process); + return error; +} + /** * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one * @sdp: the filesystem @@ -384,10 +411,14 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) struct gfs2_log_header_host head; int error; - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + error = init_threads(sdp); if (error) return error; + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + if (error) + goto fail_threads; + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -417,7 +448,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) fail: t_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&t_gh); - +fail_threads: + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); return error; } @@ -800,6 +833,9 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) struct gfs2_holder t_gh; int error; + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + flush_workqueue(gfs2_delete_workqueue); gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); @@ -857,9 +893,6 @@ restart: } spin_unlock(&sdp->sd_jindex_spin); - kthread_stop(sdp->sd_quotad_process); - kthread_stop(sdp->sd_logd_process); - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_ro(sdp); if (error) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 8c6a6f6bdba..0b81f783f78 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -13,6 +13,7 @@ #include <linux/buffer_head.h> #include <linux/xattr.h> #include <linux/gfs2_ondisk.h> +#include <linux/posix_acl_xattr.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -1500,7 +1501,8 @@ static const struct xattr_handler gfs2_xattr_security_handler = { const struct xattr_handler *gfs2_xattr_handlers[] = { &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, - &gfs2_xattr_system_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, NULL, }; diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h index 07c0d494752..95c8ed9ec17 100644 --- a/fs/hfsplus/acl.h +++ b/fs/hfsplus/acl.h @@ -12,16 +12,13 @@ /* posix_acl.c */ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); -extern int hfsplus_posix_acl_chmod(struct inode *); +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type); extern int hfsplus_init_posix_acl(struct inode *, struct inode *); #else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ #define hfsplus_get_posix_acl NULL - -static inline int hfsplus_posix_acl_chmod(struct inode *inode) -{ - return 0; -} +#define hfsplus_set_posix_acl NULL static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) { diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 4a4fea00267..9ee62985e73 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -532,6 +532,7 @@ const struct inode_operations hfsplus_dir_inode_operations = { .removexattr = hfsplus_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, + .set_acl = hfsplus_set_posix_acl, #endif }; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 37213d075f3..4551cbd6bd4 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -178,64 +178,6 @@ const struct dentry_operations hfsplus_dentry_operations = { .d_compare = hfsplus_compare_dentry, }; -static struct dentry *hfsplus_file_lookup(struct inode *dir, - struct dentry *dentry, unsigned int flags) -{ - struct hfs_find_data fd; - struct super_block *sb = dir->i_sb; - struct inode *inode = NULL; - struct hfsplus_inode_info *hip; - int err; - - if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) - goto out; - - inode = HFSPLUS_I(dir)->rsrc_inode; - if (inode) - goto out; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - hip = HFSPLUS_I(inode); - inode->i_ino = dir->i_ino; - INIT_LIST_HEAD(&hip->open_dir_list); - mutex_init(&hip->extents_lock); - hip->extent_state = 0; - hip->flags = 0; - hip->userflags = 0; - set_bit(HFSPLUS_I_RSRC, &hip->flags); - - err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); - if (!err) { - err = hfsplus_find_cat(sb, dir->i_ino, &fd); - if (!err) - err = hfsplus_cat_read_inode(inode, &fd); - hfs_find_exit(&fd); - } - if (err) { - iput(inode); - return ERR_PTR(err); - } - hip->rsrc_inode = dir; - HFSPLUS_I(dir)->rsrc_inode = inode; - igrab(dir); - - /* - * __mark_inode_dirty expects inodes to be hashed. Since we don't - * want resource fork inodes in the regular inode space, we make them - * appear hashed, but do not put on any lists. hlist_del() - * will work fine and require no locking. - */ - hlist_add_fake(&inode->i_hash); - - mark_inode_dirty(inode); -out: - d_add(dentry, inode); - return NULL; -} - static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) { @@ -319,7 +261,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) mark_inode_dirty(inode); if (attr->ia_valid & ATTR_MODE) { - error = hfsplus_posix_acl_chmod(inode); + error = posix_acl_chmod(inode, inode->i_mode); if (unlikely(error)) return error; } @@ -385,7 +327,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, } static const struct inode_operations hfsplus_file_inode_operations = { - .lookup = hfsplus_file_lookup, .setattr = hfsplus_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -393,6 +334,7 @@ static const struct inode_operations hfsplus_file_inode_operations = { .removexattr = hfsplus_removexattr, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL .get_acl = hfsplus_get_posix_acl, + .set_acl = hfsplus_set_posix_acl, #endif }; diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index b609cc14c72..df0c9af68d0 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -17,9 +17,7 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) char *value = NULL; ssize_t size; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); switch (type) { case ACL_TYPE_ACCESS: @@ -56,17 +54,15 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) return acl; } -static int hfsplus_set_posix_acl(struct inode *inode, - int type, - struct posix_acl *acl) +int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, + int type) { int err; char *xattr_name; size_t size = 0; char *value = NULL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); switch (type) { case ACL_TYPE_ACCESS: @@ -115,7 +111,7 @@ end_set_acl: int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) { int err = 0; - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; hfs_dbg(ACL_MOD, "[%s]: ino %lu, dir->ino %lu\n", @@ -124,151 +120,21 @@ int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) if (S_ISLNK(inode->i_mode)) return 0; - acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - if (S_ISDIR(inode->i_mode)) { - err = hfsplus_set_posix_acl(inode, - ACL_TYPE_DEFAULT, - acl); - if (unlikely(err)) - goto init_acl_cleanup; - } - - err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (unlikely(err < 0)) - return err; - - if (err > 0) - err = hfsplus_set_posix_acl(inode, - ACL_TYPE_ACCESS, - acl); - } else - inode->i_mode &= ~current_umask(); - -init_acl_cleanup: - posix_acl_release(acl); - return err; -} - -int hfsplus_posix_acl_chmod(struct inode *inode) -{ - int err; - struct posix_acl *acl; - - hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (unlikely(err)) + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) return err; - err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return err; -} - -static int hfsplus_xattr_get_posix_acl(struct dentry *dentry, - const char *name, - void *buffer, - size_t size, - int type) -{ - int err = 0; - struct posix_acl *acl; - - hfs_dbg(ACL_MOD, - "[%s]: ino %lu, buffer %p, size %zu, type %#x\n", - __func__, dentry->d_inode->i_ino, buffer, size, type); - - if (strcmp(name, "") != 0) - return -EINVAL; - - acl = hfsplus_get_posix_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return err; -} - -static int hfsplus_xattr_set_posix_acl(struct dentry *dentry, - const char *name, - const void *value, - size_t size, - int flags, - int type) -{ - int err = 0; - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - - hfs_dbg(ACL_MOD, - "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n", - __func__, inode->i_ino, value, size, flags, type); - - if (strcmp(name, "") != 0) - return -EINVAL; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - err = posix_acl_valid(acl); - if (err) - goto end_xattr_set_acl; - } + if (default_acl) { + err = hfsplus_set_posix_acl(inode, default_acl, + ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); } - err = hfsplus_set_posix_acl(inode, type, acl); - -end_xattr_set_acl: - posix_acl_release(acl); + if (acl) { + if (!err) + err = hfsplus_set_posix_acl(inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); + } return err; } - -static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry, - char *list, - size_t list_size, - const char *name, - size_t name_len, - int type) -{ - /* - * This method is not used. - * It is used hfsplus_listxattr() instead of generic_listxattr(). - */ - return -EOPNOTSUPP; -} - -const struct xattr_handler hfsplus_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = hfsplus_xattr_list_posix_acl, - .get = hfsplus_xattr_get_posix_acl, - .set = hfsplus_xattr_set_posix_acl, -}; - -const struct xattr_handler hfsplus_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = hfsplus_xattr_list_posix_acl, - .get = hfsplus_xattr_get_posix_acl, - .set = hfsplus_xattr_set_posix_acl, -}; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 3c6136f98c7..0b4a5c9b93c 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -7,6 +7,7 @@ */ #include "hfsplus_fs.h" +#include <linux/posix_acl_xattr.h> #include "xattr.h" #include "acl.h" @@ -15,8 +16,8 @@ const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, #ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - &hfsplus_xattr_acl_access_handler, - &hfsplus_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &hfsplus_xattr_security_handler, NULL @@ -51,82 +52,6 @@ static inline int is_known_namespace(const char *name) return true; } -static int can_set_system_xattr(struct inode *inode, const char *name, - const void *value, size_t size) -{ -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - struct posix_acl *acl; - int err; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - /* - * POSIX_ACL_XATTR_ACCESS is tied to i_mode - */ - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - err = posix_acl_equiv_mode(acl, &inode->i_mode); - posix_acl_release(acl); - if (err < 0) - return err; - mark_inode_dirty(inode); - } - /* - * We're changing the ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_ACCESS); - - return 0; - } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - posix_acl_release(acl); - - /* - * We're changing the default ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_DEFAULT); - - return 0; - } -#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ - return -EOPNOTSUPP; -} - -static int can_set_xattr(struct inode *inode, const char *name, - const void *value, size_t value_len) -{ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return can_set_system_xattr(inode, name, value, value_len); - - if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { - /* - * This makes sure that we aren't trying to set an - * attribute in a different namespace by prefixing it - * with "osx." - */ - if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN)) - return -EOPNOTSUPP; - - return 0; - } - - /* - * Don't allow setting an attribute in an unknown namespace. - */ - if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && - strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && - strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EOPNOTSUPP; - - return 0; -} - static void hfsplus_init_header_node(struct inode *attr_file, u32 clump_size, char *buf, u16 node_size) @@ -349,10 +274,6 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; - err = can_set_xattr(inode, name, value, size); - if (err) - return err; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN) == 0) name += XATTR_MAC_OSX_PREFIX_LEN; @@ -840,10 +761,6 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name) if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; - err = can_set_xattr(inode, name, NULL, 0); - if (err) - return err; - if (strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN) == 0) name += XATTR_MAC_OSX_PREFIX_LEN; @@ -940,6 +857,9 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, if (len > HFSPLUS_ATTR_MAX_STRLEN) return -EOPNOTSUPP; + if (is_known_namespace(name)) + return -EOPNOTSUPP; + strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 841b5698c0f..9e214490c31 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -14,8 +14,6 @@ extern const struct xattr_handler hfsplus_xattr_osx_handler; extern const struct xattr_handler hfsplus_xattr_user_handler; extern const struct xattr_handler hfsplus_xattr_trusted_handler; -extern const struct xattr_handler hfsplus_xattr_acl_access_handler; -extern const struct xattr_handler hfsplus_xattr_acl_default_handler; extern const struct xattr_handler hfsplus_xattr_security_handler; extern const struct xattr_handler *hfsplus_xattr_handlers[]; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index db23ce1bd90..fe649d325b1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -186,7 +186,7 @@ static struct inode *hostfs_iget(struct super_block *sb) return inode; } -int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) +static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) { /* * do_statfs uses struct statfs64 internally, but the linux kernel @@ -268,7 +268,7 @@ static const struct super_operations hostfs_sbops = { .show_options = hostfs_show_options, }; -int hostfs_readdir(struct file *file, struct dir_context *ctx) +static int hostfs_readdir(struct file *file, struct dir_context *ctx) { void *dir; char *name; @@ -293,7 +293,7 @@ int hostfs_readdir(struct file *file, struct dir_context *ctx) return 0; } -int hostfs_file_open(struct inode *ino, struct file *file) +static int hostfs_file_open(struct inode *ino, struct file *file) { static DEFINE_MUTEX(open_mutex); char *name; @@ -359,7 +359,8 @@ static int hostfs_file_release(struct inode *inode, struct file *file) return 0; } -int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +static int hostfs_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_mapping->host; int ret; @@ -394,7 +395,7 @@ static const struct file_operations hostfs_dir_fops = { .read = generic_read_dir, }; -int hostfs_writepage(struct page *page, struct writeback_control *wbc) +static int hostfs_writepage(struct page *page, struct writeback_control *wbc) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; @@ -430,7 +431,7 @@ int hostfs_writepage(struct page *page, struct writeback_control *wbc) return err; } -int hostfs_readpage(struct file *file, struct page *page) +static int hostfs_readpage(struct file *file, struct page *page) { char *buffer; long long start; @@ -455,9 +456,9 @@ int hostfs_readpage(struct file *file, struct page *page) return err; } -int hostfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) +static int hostfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_CACHE_SHIFT; @@ -467,9 +468,9 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping, return 0; } -int hostfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static int hostfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct inode *inode = mapping->host; void *buffer; @@ -549,8 +550,8 @@ static int read_name(struct inode *ino, char *name) return 0; } -int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) { struct inode *inode; char *name; @@ -591,8 +592,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return error; } -struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, - unsigned int flags) +static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, + unsigned int flags) { struct inode *inode; char *name; @@ -628,7 +629,8 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, return ERR_PTR(err); } -int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) +static int hostfs_link(struct dentry *to, struct inode *ino, + struct dentry *from) { char *from_name, *to_name; int err; @@ -646,7 +648,7 @@ int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) return err; } -int hostfs_unlink(struct inode *ino, struct dentry *dentry) +static int hostfs_unlink(struct inode *ino, struct dentry *dentry) { char *file; int err; @@ -662,7 +664,8 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry) return err; } -int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) +static int hostfs_symlink(struct inode *ino, struct dentry *dentry, + const char *to) { char *file; int err; @@ -674,7 +677,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) return err; } -int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) +static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) { char *file; int err; @@ -686,7 +689,7 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) return err; } -int hostfs_rmdir(struct inode *ino, struct dentry *dentry) +static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) { char *file; int err; @@ -738,8 +741,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -int hostfs_rename(struct inode *from_ino, struct dentry *from, - struct inode *to_ino, struct dentry *to) +static int hostfs_rename(struct inode *from_ino, struct dentry *from, + struct inode *to_ino, struct dentry *to) { char *from_name, *to_name; int err; @@ -756,7 +759,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from, return err; } -int hostfs_permission(struct inode *ino, int desired) +static int hostfs_permission(struct inode *ino, int desired) { char *name; int r = 0, w = 0, x = 0, err; @@ -782,7 +785,7 @@ int hostfs_permission(struct inode *ino, int desired) return err; } -int hostfs_setattr(struct dentry *dentry, struct iattr *attr) +static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct hostfs_iattr attrs; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 2d04f9afafd..06fe11e0abf 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -573,7 +573,7 @@ int log_wait_commit(journal_t *journal, tid_t tid) #ifdef CONFIG_JBD_DEBUG spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } @@ -604,10 +604,8 @@ int log_wait_commit(journal_t *journal, tid_t tid) out_unlock: spin_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } @@ -2136,7 +2134,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); #endif jbd_remove_debugfs_entry(); journal_destroy_caches(); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index aa603e017d2..1695ba8334a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -675,7 +675,7 @@ repeat: jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); @@ -898,7 +898,7 @@ repeat: if (!jh->b_committed_data) { committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", + printk(KERN_ERR "%s: No memory for committed data\n", __func__); err = -ENOMEM; goto out; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 52032647dd4..5fa344afb49 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -702,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } @@ -718,10 +718,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } read_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } @@ -1527,13 +1525,13 @@ static int journal_get_superblock(journal_t *journal) if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 " + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " "at the same time!\n"); goto out; } if (!jbd2_verify_csum_type(journal, sb)) { - printk(KERN_ERR "JBD: Unknown checksum type\n"); + printk(KERN_ERR "JBD2: Unknown checksum type\n"); goto out; } @@ -1541,7 +1539,7 @@ static int journal_get_superblock(journal_t *journal) if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD: Cannot load crc32c driver.\n"); + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); err = PTR_ERR(journal->j_chksum_driver); journal->j_chksum_driver = NULL; goto out; @@ -1550,7 +1548,7 @@ static int journal_get_superblock(journal_t *journal) /* Check superblock checksum */ if (!jbd2_superblock_csum_verify(journal, sb)) { - printk(KERN_ERR "JBD: journal checksum error\n"); + printk(KERN_ERR "JBD2: journal checksum error\n"); goto out; } @@ -1836,7 +1834,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD: Cannot load crc32c " + printk(KERN_ERR "JBD2: Cannot load crc32c " "driver.\n"); journal->j_chksum_driver = NULL; return 0; @@ -2645,7 +2643,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD2_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n); #endif jbd2_remove_jbd_stats_proc_entry(); jbd2_journal_destroy_caches(); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 3929c50428b..3b6bb19d60b 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -594,7 +594,7 @@ static int do_one_pass(journal_t *journal, be32_to_cpu(tmp->h_sequence))) { brelse(obh); success = -EIO; - printk(KERN_ERR "JBD: Invalid " + printk(KERN_ERR "JBD2: Invalid " "checksum recovering " "block %llu in log\n", blocknr); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 7aa9a32573b..8360674c85b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -932,7 +932,7 @@ repeat: jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); @@ -1166,7 +1166,7 @@ repeat: if (!jh->b_committed_data) { committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", + printk(KERN_ERR "%s: No memory for committed data\n", __func__); err = -ENOMEM; goto out; @@ -1290,7 +1290,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * once a transaction -bzzz */ jh->b_modified = 1; - J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + if (handle->h_buffer_credits <= 0) { + ret = -ENOSPC; + goto out_unlock_bh; + } handle->h_buffer_credits--; } @@ -1305,7 +1308,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "fastpath"); if (unlikely(jh->b_transaction != journal->j_running_transaction)) { - printk(KERN_EMERG "JBD: %s: " + printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " "journal->j_running_transaction (%p, %u)", journal->j_devname, @@ -1332,7 +1335,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "already on other transaction"); if (unlikely(jh->b_transaction != journal->j_committing_transaction)) { - printk(KERN_EMERG "JBD: %s: " + printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " "journal->j_committing_transaction (%p, %u)", journal->j_devname, @@ -1345,7 +1348,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) ret = -EINVAL; } if (unlikely(jh->b_next_transaction != transaction)) { - printk(KERN_EMERG "JBD: %s: " + printk(KERN_ERR "JBD2: %s: " "jh->b_next_transaction (%llu, %p, %u) != " "transaction (%p, %u)", journal->j_devname, @@ -1373,7 +1376,6 @@ out_unlock_bh: jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); - WARN_ON(ret); /* All errors are bugs, so dump the stack */ return ret; } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 223283c3011..009ec0b5993 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -178,10 +178,6 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type) char *value = NULL; int rc, xprefix; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; @@ -232,13 +228,10 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) +int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int rc, xprefix; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; @@ -277,30 +270,21 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int rc; cache_no_acl(inode); - if (S_ISLNK(*i_mode)) - return 0; /* Symlink always has no-ACL */ - - acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) { - *i_mode &= ~current_umask(); - } else { - if (S_ISDIR(*i_mode)) - set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); - - rc = posix_acl_create(&acl, GFP_KERNEL, i_mode); - if (rc < 0) - return rc; - if (rc > 0) - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + rc = posix_acl_create(dir_i, i_mode, &default_acl, &acl); + if (rc) + return rc; + if (default_acl) { + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } + if (acl) { + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); } return 0; @@ -324,106 +308,3 @@ int jffs2_init_acl_post(struct inode *inode) return 0; } - -int jffs2_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int rc; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (rc) - return rc; - rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return rc; -} - -static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (list && retlen <= list_size) - strcpy(list, POSIX_ACL_XATTR_ACCESS); - return retlen; -} - -static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (list && retlen <= list_size) - strcpy(list, POSIX_ACL_XATTR_DEFAULT); - return retlen; -} - -static int jffs2_acl_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct posix_acl *acl; - int rc; - - if (name[0] != '\0') - return -EINVAL; - - acl = jffs2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return -ENODATA; - rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return rc; -} - -static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct posix_acl *acl; - int rc; - - if (name[0] != '\0') - return -EINVAL; - if (!inode_owner_or_capable(dentry->d_inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - rc = posix_acl_valid(acl); - if (rc) - goto out; - } - } else { - acl = NULL; - } - rc = jffs2_set_acl(dentry->d_inode, type, acl); - out: - posix_acl_release(acl); - return rc; -} - -const struct xattr_handler jffs2_acl_access_xattr_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_DEFAULT, - .list = jffs2_acl_access_listxattr, - .get = jffs2_acl_getxattr, - .set = jffs2_acl_setxattr, -}; - -const struct xattr_handler jffs2_acl_default_xattr_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = jffs2_acl_default_listxattr, - .get = jffs2_acl_getxattr, - .set = jffs2_acl_setxattr, -}; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 9b477246f2a..2e2b5745c3b 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -27,17 +27,14 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type); -extern int jffs2_acl_chmod(struct inode *); +int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); -extern const struct xattr_handler jffs2_acl_access_xattr_handler; -extern const struct xattr_handler jffs2_acl_default_xattr_handler; - #else #define jffs2_get_acl (NULL) -#define jffs2_acl_chmod(inode) (0) +#define jffs2_set_acl (NULL) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index e3aac222472..938556025d6 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -59,6 +59,7 @@ const struct inode_operations jffs2_dir_inode_operations = .mknod = jffs2_mknod, .rename = jffs2_rename, .get_acl = jffs2_get_acl, + .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 1506673c087..256cd19a3b7 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -66,6 +66,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { .get_acl = jffs2_get_acl, + .set_acl = jffs2_set_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 09b3ed45572..a69e426435d 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,15 +190,16 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) { + struct inode *inode = dentry->d_inode; int rc; - rc = inode_change_ok(dentry->d_inode, iattr); + rc = inode_change_ok(inode, iattr); if (rc) return rc; - rc = jffs2_do_setattr(dentry->d_inode, iattr); + rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = jffs2_acl_chmod(dentry->d_inode); + rc = posix_acl_chmod(inode, inode->i_mode); return rc; } diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index 4f47aa24b55..b8fd651307a 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -288,6 +288,8 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) struct jffs2_xattr_datum *xd; xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); dbg_memalloc("%p\n", xd); + if (!xd) + return NULL; xd->class = RAWNODE_CLASS_XATTR_DATUM; xd->node = (void *)xd; @@ -306,6 +308,8 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) struct jffs2_xattr_ref *ref; ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); dbg_memalloc("%p\n", ref); + if (!ref) + return NULL; ref->class = RAWNODE_CLASS_XATTR_REF; ref->node = (void *)ref; diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index 975a1f562c1..9a5449bc3af 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -564,25 +564,10 @@ struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_ they're killed. */ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) { - struct jffs2_node_frag *frag; - struct jffs2_node_frag *parent; - - if (!root->rb_node) - return; + struct jffs2_node_frag *frag, *next; dbg_fragtree("killing\n"); - - frag = (rb_entry(root->rb_node, struct jffs2_node_frag, rb)); - while(frag) { - if (frag->rb.rb_left) { - frag = frag_left(frag); - continue; - } - if (frag->rb.rb_right) { - frag = frag_right(frag); - continue; - } - + rbtree_postorder_for_each_entry_safe(frag, next, root, rb) { if (frag->node && !(--frag->node->frags)) { /* Not a hole, and it's the final remaining frag of this node. Free the node */ @@ -591,17 +576,8 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c) jffs2_free_full_dnode(frag->node); } - parent = frag_parent(frag); - if (parent) { - if (frag_left(parent) == frag) - parent->rb.rb_left = NULL; - else - parent->rb.rb_right = NULL; - } jffs2_free_node_frag(frag); - frag = parent; - cond_resched(); } } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index ae81b01e6fd..386303dca38 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -543,33 +543,13 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c, static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) { - struct rb_node *this; - struct jffs2_tmp_dnode_info *tn; - - this = list->rb_node; + struct jffs2_tmp_dnode_info *tn, *next; - /* Now at bottom of tree */ - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb); + rbtree_postorder_for_each_entry_safe(tn, next, list, rb) { jffs2_free_full_dnode(tn->fn); jffs2_free_tmp_dnode_info(tn); - - this = rb_parent(this); - if (!this) - break; - - if (this->rb_left == &tn->rb) - this->rb_left = NULL; - else if (this->rb_right == &tn->rb) - this->rb_right = NULL; - else BUG(); - } } + *list = RB_ROOT; } diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 6e563332bb2..c7c77b0dfcc 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -22,7 +22,6 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .get_acl = jffs2_get_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 3034e970eb9..ad0f2e2a170 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -22,6 +22,7 @@ #include <linux/crc32.h> #include <linux/jffs2.h> #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> #include <linux/mtd/mtd.h> #include "nodelist.h" /* -------- xdatum related functions ---------------- @@ -921,8 +922,8 @@ const struct xattr_handler *jffs2_xattr_handlers[] = { &jffs2_security_xattr_handler, #endif #ifdef CONFIG_JFFS2_FS_POSIX_ACL - &jffs2_acl_access_xattr_handler, - &jffs2_acl_default_xattr_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &jffs2_trusted_xattr_handler, NULL @@ -942,10 +943,10 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) { #endif #ifdef CONFIG_JFFS2_FS_POSIX_ACL case JFFS2_XPREFIX_ACL_ACCESS: - ret = &jffs2_acl_access_xattr_handler; + ret = &posix_acl_access_xattr_handler; break; case JFFS2_XPREFIX_ACL_DEFAULT: - ret = &jffs2_acl_default_xattr_handler; + ret = &posix_acl_default_xattr_handler; break; #endif case JFFS2_XPREFIX_TRUSTED: diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index d254d6d3599..e973b85d6af 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -72,7 +72,7 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) return acl; } -static int jfs_set_acl(tid_t tid, struct inode *inode, int type, +static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, struct posix_acl *acl) { char *ea_name; @@ -80,21 +80,22 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type, int size = 0; char *value = NULL; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - switch(type) { - case ACL_TYPE_ACCESS: - ea_name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - ea_name = POSIX_ACL_XATTR_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; - default: - return -EINVAL; + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = POSIX_ACL_XATTR_ACCESS; + rc = posix_acl_equiv_mode(acl, &inode->i_mode); + if (rc < 0) + return rc; + if (rc == 0) + acl = NULL; + break; + case ACL_TYPE_DEFAULT: + ea_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EINVAL; } + if (acl) { size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_KERNEL); @@ -114,65 +115,43 @@ out: return rc; } +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int rc; + tid_t tid; + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&JFS_IP(inode)->commit_mutex); + rc = __jfs_set_acl(tid, inode, type, acl); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + return rc; +} + int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; + struct posix_acl *default_acl, *acl; int rc = 0; - if (S_ISLNK(inode->i_mode)) - return 0; + rc = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (rc) + return rc; - acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); + if (default_acl) { + rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } if (acl) { - if (S_ISDIR(inode->i_mode)) { - rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); - if (rc) - goto cleanup; - } - rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (rc < 0) - goto cleanup; /* posix_acl_release(NULL) is no-op */ - if (rc > 0) - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); -cleanup: + if (!rc) + rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); posix_acl_release(acl); - } else - inode->i_mode &= ~current_umask(); + } JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | inode->i_mode; return rc; } - -int jfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int rc; - tid_t tid; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (rc) - return rc; - - tid = txBegin(inode->i_sb, 0); - mutex_lock(&JFS_IP(inode)->commit_mutex); - rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); - if (!rc) - rc = txCommit(tid, 1, &inode, 0); - txEnd(tid); - mutex_unlock(&JFS_IP(inode)->commit_mutex); - - posix_acl_release(acl); - return rc; -} diff --git a/fs/jfs/file.c b/fs/jfs/file.c index dd7442c5835..794da944d5c 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/fs.h> +#include <linux/posix_acl.h> #include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_inode.h" @@ -131,7 +132,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = jfs_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); return rc; } @@ -143,6 +144,7 @@ const struct inode_operations jfs_file_inode_operations = { .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index ad84fe50ca9..489f993b7b1 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -21,8 +21,8 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type); +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); -int jfs_acl_chmod(struct inode *inode); #else @@ -32,10 +32,5 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode, return 0; } -static inline int jfs_acl_chmod(struct inode *inode) -{ - return 0; -} - #endif #endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index e9e100fd7c0..e8d717dabca 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -61,6 +61,8 @@ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); extern int jfs_removexattr(struct dentry *, const char *); +extern const struct xattr_handler *jfs_xattr_handlers[]; + #ifdef CONFIG_JFS_SECURITY extern int jfs_init_security(tid_t, struct inode *, struct inode *, const struct qstr *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index aa8a3370631..d59c7defb1e 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1524,6 +1524,7 @@ const struct inode_operations jfs_dir_inode_operations = { .setattr = jfs_setattr, #ifdef CONFIG_JFS_POSIX_ACL .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, #endif }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 6669aa2042c..e2b7483444f 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -44,6 +44,7 @@ #include "jfs_imap.h" #include "jfs_acl.h" #include "jfs_debug.h" +#include "jfs_xattr.h" MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); @@ -522,6 +523,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) */ sb->s_op = &jfs_super_operations; sb->s_export_op = &jfs_export_operations; + sb->s_xattr = jfs_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; sb->s_qcop = &dquot_quotactl_ops; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index d3472f4cd53..5324e4e2b99 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -666,81 +666,12 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, } /* - * can_set_system_xattr - * - * This code is specific to the system.* namespace. It contains policy - * which doesn't belong in the main xattr codepath. - */ -static int can_set_system_xattr(struct inode *inode, const char *name, - const void *value, size_t value_len) -{ -#ifdef CONFIG_JFS_POSIX_ACL - struct posix_acl *acl; - int rc; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - /* - * POSIX_ACL_XATTR_ACCESS is tied to i_mode - */ - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, value_len); - if (IS_ERR(acl)) { - rc = PTR_ERR(acl); - printk(KERN_ERR "posix_acl_from_xattr returned %d\n", - rc); - return rc; - } - if (acl) { - rc = posix_acl_equiv_mode(acl, &inode->i_mode); - posix_acl_release(acl); - if (rc < 0) { - printk(KERN_ERR - "posix_acl_equiv_mode returned %d\n", - rc); - return rc; - } - mark_inode_dirty(inode); - } - /* - * We're changing the ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_ACCESS); - - return 0; - } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { - acl = posix_acl_from_xattr(&init_user_ns, value, value_len); - if (IS_ERR(acl)) { - rc = PTR_ERR(acl); - printk(KERN_ERR "posix_acl_from_xattr returned %d\n", - rc); - return rc; - } - posix_acl_release(acl); - - /* - * We're changing the default ACL. Get rid of the cached one - */ - forget_cached_acl(inode, ACL_TYPE_DEFAULT); - - return 0; - } -#endif /* CONFIG_JFS_POSIX_ACL */ - return -EOPNOTSUPP; -} - -/* * Most of the permission checking is done by xattr_permission in the vfs. - * The local file system is responsible for handling the system.* namespace. * We also need to verify that this is a namespace that we recognize. */ static int can_set_xattr(struct inode *inode, const char *name, const void *value, size_t value_len) { - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return can_set_system_xattr(inode, name, value, value_len); - if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { /* * This makes sure that we aren't trying to set an @@ -748,7 +679,7 @@ static int can_set_xattr(struct inode *inode, const char *name, * with "os2." */ if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) - return -EOPNOTSUPP; + return -EOPNOTSUPP; return 0; } @@ -913,6 +844,14 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, if ((rc = can_set_xattr(inode, name, value, value_len))) return rc; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, value_len, flags); + if (value == NULL) { /* empty EA, do not remove */ value = ""; value_len = 0; @@ -986,6 +925,14 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, { int err; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, data, buf_size); + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { /* * skip past "os2." prefix @@ -1077,6 +1024,14 @@ int jfs_removexattr(struct dentry *dentry, const char *name) if ((rc = can_set_xattr(inode, name, NULL, 0))) return rc; + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); @@ -1088,6 +1043,19 @@ int jfs_removexattr(struct dentry *dentry, const char *name) return rc; } +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +const struct xattr_handler *jfs_xattr_handlers[] = { +#ifdef JFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + + #ifdef CONFIG_JFS_SECURITY static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile new file mode 100644 index 00000000000..674337c7667 --- /dev/null +++ b/fs/kernfs/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the kernfs pseudo filesystem +# + +obj-y := mount.o inode.o dir.o file.o symlink.o diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c new file mode 100644 index 00000000000..5104cf5d25c --- /dev/null +++ b/fs/kernfs/dir.c @@ -0,0 +1,1073 @@ +/* + * fs/kernfs/dir.c - kernfs directory implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/idr.h> +#include <linux/slab.h> +#include <linux/security.h> +#include <linux/hash.h> + +#include "kernfs-internal.h" + +DEFINE_MUTEX(kernfs_mutex); + +#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) + +/** + * kernfs_name_hash + * @name: Null terminated string to hash + * @ns: Namespace tag to hash + * + * Returns 31 bit hash of ns + name (so it fits in an off_t ) + */ +static unsigned int kernfs_name_hash(const char *name, const void *ns) +{ + unsigned long hash = init_name_hash(); + unsigned int len = strlen(name); + while (len--) + hash = partial_name_hash(*name++, hash); + hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); + hash &= 0x7fffffffU; + /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ + if (hash < 1) + hash += 2; + if (hash >= INT_MAX) + hash = INT_MAX - 1; + return hash; +} + +static int kernfs_name_compare(unsigned int hash, const char *name, + const void *ns, const struct kernfs_node *kn) +{ + if (hash != kn->hash) + return hash - kn->hash; + if (ns != kn->ns) + return ns - kn->ns; + return strcmp(name, kn->name); +} + +static int kernfs_sd_compare(const struct kernfs_node *left, + const struct kernfs_node *right) +{ + return kernfs_name_compare(left->hash, left->name, left->ns, right); +} + +/** + * kernfs_link_sibling - link kernfs_node into sibling rbtree + * @kn: kernfs_node of interest + * + * Link @kn into its sibling rbtree which starts from + * @kn->parent->dir.children. + * + * Locking: + * mutex_lock(kernfs_mutex) + * + * RETURNS: + * 0 on susccess -EEXIST on failure. + */ +static int kernfs_link_sibling(struct kernfs_node *kn) +{ + struct rb_node **node = &kn->parent->dir.children.rb_node; + struct rb_node *parent = NULL; + + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs++; + + while (*node) { + struct kernfs_node *pos; + int result; + + pos = rb_to_kn(*node); + parent = *node; + result = kernfs_sd_compare(kn, pos); + if (result < 0) + node = &pos->rb.rb_left; + else if (result > 0) + node = &pos->rb.rb_right; + else + return -EEXIST; + } + /* add new node and rebalance the tree */ + rb_link_node(&kn->rb, parent, node); + rb_insert_color(&kn->rb, &kn->parent->dir.children); + return 0; +} + +/** + * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree + * @kn: kernfs_node of interest + * + * Unlink @kn from its sibling rbtree which starts from + * kn->parent->dir.children. + * + * Locking: + * mutex_lock(kernfs_mutex) + */ +static void kernfs_unlink_sibling(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) + kn->parent->dir.subdirs--; + + rb_erase(&kn->rb, &kn->parent->dir.children); +} + +/** + * kernfs_get_active - get an active reference to kernfs_node + * @kn: kernfs_node to get an active reference to + * + * Get an active reference of @kn. This function is noop if @kn + * is NULL. + * + * RETURNS: + * Pointer to @kn on success, NULL on failure. + */ +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) +{ + if (unlikely(!kn)) + return NULL; + + if (!atomic_inc_unless_negative(&kn->active)) + return NULL; + + if (kn->flags & KERNFS_LOCKDEP) + rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); + return kn; +} + +/** + * kernfs_put_active - put an active reference to kernfs_node + * @kn: kernfs_node to put an active reference to + * + * Put an active reference to @kn. This function is noop if @kn + * is NULL. + */ +void kernfs_put_active(struct kernfs_node *kn) +{ + int v; + + if (unlikely(!kn)) + return; + + if (kn->flags & KERNFS_LOCKDEP) + rwsem_release(&kn->dep_map, 1, _RET_IP_); + v = atomic_dec_return(&kn->active); + if (likely(v != KN_DEACTIVATED_BIAS)) + return; + + /* + * atomic_dec_return() is a mb(), we'll always see the updated + * kn->u.completion. + */ + complete(kn->u.completion); +} + +/** + * kernfs_deactivate - deactivate kernfs_node + * @kn: kernfs_node to deactivate + * + * Deny new active references and drain existing ones. + */ +static void kernfs_deactivate(struct kernfs_node *kn) +{ + DECLARE_COMPLETION_ONSTACK(wait); + int v; + + BUG_ON(!(kn->flags & KERNFS_REMOVED)); + + if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) + return; + + kn->u.completion = (void *)&wait; + + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + /* atomic_add_return() is a mb(), put_active() will always see + * the updated kn->u.completion. + */ + v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active); + + if (v != KN_DEACTIVATED_BIAS) { + lock_contended(&kn->dep_map, _RET_IP_); + wait_for_completion(&wait); + } + + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); +} + +/** + * kernfs_get - get a reference count on a kernfs_node + * @kn: the target kernfs_node + */ +void kernfs_get(struct kernfs_node *kn) +{ + if (kn) { + WARN_ON(!atomic_read(&kn->count)); + atomic_inc(&kn->count); + } +} +EXPORT_SYMBOL_GPL(kernfs_get); + +/** + * kernfs_put - put a reference count on a kernfs_node + * @kn: the target kernfs_node + * + * Put a reference count of @kn and destroy it if it reached zero. + */ +void kernfs_put(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + struct kernfs_root *root; + + if (!kn || !atomic_dec_and_test(&kn->count)) + return; + root = kernfs_root(kn); + repeat: + /* Moving/renaming is always done while holding reference. + * kn->parent won't change beneath us. + */ + parent = kn->parent; + + WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", + parent ? parent->name : "", kn->name); + + if (kernfs_type(kn) == KERNFS_LINK) + kernfs_put(kn->symlink.target_kn); + if (!(kn->flags & KERNFS_STATIC_NAME)) + kfree(kn->name); + if (kn->iattr) { + if (kn->iattr->ia_secdata) + security_release_secctx(kn->iattr->ia_secdata, + kn->iattr->ia_secdata_len); + simple_xattrs_free(&kn->iattr->xattrs); + } + kfree(kn->iattr); + ida_simple_remove(&root->ino_ida, kn->ino); + kmem_cache_free(kernfs_node_cache, kn); + + kn = parent; + if (kn) { + if (atomic_dec_and_test(&kn->count)) + goto repeat; + } else { + /* just released the root kn, free @root too */ + ida_destroy(&root->ino_ida); + kfree(root); + } +} +EXPORT_SYMBOL_GPL(kernfs_put); + +static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct kernfs_node *kn; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* Always perform fresh lookup for negatives */ + if (!dentry->d_inode) + goto out_bad_unlocked; + + kn = dentry->d_fsdata; + mutex_lock(&kernfs_mutex); + + /* The kernfs node has been deleted */ + if (kn->flags & KERNFS_REMOVED) + goto out_bad; + + /* The kernfs node has been moved? */ + if (dentry->d_parent->d_fsdata != kn->parent) + goto out_bad; + + /* The kernfs node has been renamed */ + if (strcmp(dentry->d_name.name, kn->name) != 0) + goto out_bad; + + /* The kernfs node has been moved to a different namespace */ + if (kn->parent && kernfs_ns_enabled(kn->parent) && + kernfs_info(dentry->d_sb)->ns != kn->ns) + goto out_bad; + + mutex_unlock(&kernfs_mutex); +out_valid: + return 1; +out_bad: + mutex_unlock(&kernfs_mutex); +out_bad_unlocked: + /* + * @dentry doesn't match the underlying kernfs node, drop the + * dentry and force lookup. If we have submounts we must allow the + * vfs caches to lie about the state of the filesystem to prevent + * leaks and other nasty things, so use check_submounts_and_drop() + * instead of d_drop(). + */ + if (check_submounts_and_drop(dentry) != 0) + goto out_valid; + + return 0; +} + +static void kernfs_dop_release(struct dentry *dentry) +{ + kernfs_put(dentry->d_fsdata); +} + +const struct dentry_operations kernfs_dops = { + .d_revalidate = kernfs_dop_revalidate, + .d_release = kernfs_dop_release, +}; + +static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, + const char *name, umode_t mode, + unsigned flags) +{ + char *dup_name = NULL; + struct kernfs_node *kn; + int ret; + + if (!(flags & KERNFS_STATIC_NAME)) { + name = dup_name = kstrdup(name, GFP_KERNEL); + if (!name) + return NULL; + } + + kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); + if (!kn) + goto err_out1; + + ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto err_out2; + kn->ino = ret; + + atomic_set(&kn->count, 1); + atomic_set(&kn->active, 0); + + kn->name = name; + kn->mode = mode; + kn->flags = flags | KERNFS_REMOVED; + + return kn; + + err_out2: + kmem_cache_free(kernfs_node_cache, kn); + err_out1: + kfree(dup_name); + return NULL; +} + +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags) +{ + struct kernfs_node *kn; + + kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags); + if (kn) { + kernfs_get(parent); + kn->parent = parent; + } + return kn; +} + +/** + * kernfs_addrm_start - prepare for kernfs_node add/remove + * @acxt: pointer to kernfs_addrm_cxt to be used + * + * This function is called when the caller is about to add or remove + * kernfs_node. This function acquires kernfs_mutex. @acxt is used + * to keep and pass context to other addrm functions. + * + * LOCKING: + * Kernel thread context (may sleep). kernfs_mutex is locked on + * return. + */ +void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) + __acquires(kernfs_mutex) +{ + memset(acxt, 0, sizeof(*acxt)); + + mutex_lock(&kernfs_mutex); +} + +/** + * kernfs_add_one - add kernfs_node to parent without warning + * @acxt: addrm context to use + * @kn: kernfs_node to be added + * + * The caller must already have initialized @kn->parent. This + * function increments nlink of the parent's inode if @kn is a + * directory and link into the children list of the parent. + * + * This function should be called between calls to + * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed + * the same @acxt as passed to kernfs_addrm_start(). + * + * LOCKING: + * Determined by kernfs_addrm_start(). + * + * RETURNS: + * 0 on success, -EEXIST if entry with the given name already + * exists. + */ +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) +{ + struct kernfs_node *parent = kn->parent; + bool has_ns = kernfs_ns_enabled(parent); + struct kernfs_iattrs *ps_iattr; + int ret; + + if (has_ns != (bool)kn->ns) { + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, kn->name); + return -EINVAL; + } + + if (kernfs_type(parent) != KERNFS_DIR) + return -EINVAL; + + if (parent->flags & KERNFS_REMOVED) + return -ENOENT; + + kn->hash = kernfs_name_hash(kn->name, kn->ns); + + ret = kernfs_link_sibling(kn); + if (ret) + return ret; + + /* Update timestamps on the parent */ + ps_iattr = parent->iattr; + if (ps_iattr) { + struct iattr *ps_iattrs = &ps_iattr->ia_iattr; + ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + } + + /* Mark the entry added into directory tree */ + kn->flags &= ~KERNFS_REMOVED; + + return 0; +} + +/** + * kernfs_remove_one - remove kernfs_node from parent + * @acxt: addrm context to use + * @kn: kernfs_node to be removed + * + * Mark @kn removed and drop nlink of parent inode if @kn is a + * directory. @kn is unlinked from the children list. + * + * This function should be called between calls to + * kernfs_addrm_start() and kernfs_addrm_finish() and should be + * passed the same @acxt as passed to kernfs_addrm_start(). + * + * LOCKING: + * Determined by kernfs_addrm_start(). + */ +static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, + struct kernfs_node *kn) +{ + struct kernfs_iattrs *ps_iattr; + + /* + * Removal can be called multiple times on the same node. Only the + * first invocation is effective and puts the base ref. + */ + if (kn->flags & KERNFS_REMOVED) + return; + + if (kn->parent) { + kernfs_unlink_sibling(kn); + + /* Update timestamps on the parent */ + ps_iattr = kn->parent->iattr; + if (ps_iattr) { + ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; + ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + } + } + + kn->flags |= KERNFS_REMOVED; + kn->u.removed_list = acxt->removed; + acxt->removed = kn; +} + +/** + * kernfs_addrm_finish - finish up kernfs_node add/remove + * @acxt: addrm context to finish up + * + * Finish up kernfs_node add/remove. Resources acquired by + * kernfs_addrm_start() are released and removed kernfs_nodes are + * cleaned up. + * + * LOCKING: + * kernfs_mutex is released. + */ +void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) + __releases(kernfs_mutex) +{ + /* release resources acquired by kernfs_addrm_start() */ + mutex_unlock(&kernfs_mutex); + + /* kill removed kernfs_nodes */ + while (acxt->removed) { + struct kernfs_node *kn = acxt->removed; + + acxt->removed = kn->u.removed_list; + + kernfs_deactivate(kn); + kernfs_unmap_bin_file(kn); + kernfs_put(kn); + } +} + +/** + * kernfs_find_ns - find kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent. Returns pointer to + * the found kernfs_node on success, %NULL on failure. + */ +static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, + const unsigned char *name, + const void *ns) +{ + struct rb_node *node = parent->dir.children.rb_node; + bool has_ns = kernfs_ns_enabled(parent); + unsigned int hash; + + lockdep_assert_held(&kernfs_mutex); + + if (has_ns != (bool)ns) { + WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, name); + return NULL; + } + + hash = kernfs_name_hash(name, ns); + while (node) { + struct kernfs_node *kn; + int result; + + kn = rb_to_kn(node); + result = kernfs_name_compare(hash, name, ns, kn); + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return kn; + } + return NULL; +} + +/** + * kernfs_find_and_get_ns - find and get kernfs_node with the given name + * @parent: kernfs_node to search under + * @name: name to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with name @name under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, + const char *name, const void *ns) +{ + struct kernfs_node *kn; + + mutex_lock(&kernfs_mutex); + kn = kernfs_find_ns(parent, name, ns); + kernfs_get(kn); + mutex_unlock(&kernfs_mutex); + + return kn; +} +EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); + +/** + * kernfs_create_root - create a new kernfs hierarchy + * @kdops: optional directory syscall operations for the hierarchy + * @priv: opaque data associated with the new directory + * + * Returns the root of the new hierarchy on success, ERR_PTR() value on + * failure. + */ +struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) +{ + struct kernfs_root *root; + struct kernfs_node *kn; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + ida_init(&root->ino_ida); + + kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, + KERNFS_DIR); + if (!kn) { + ida_destroy(&root->ino_ida); + kfree(root); + return ERR_PTR(-ENOMEM); + } + + kn->flags &= ~KERNFS_REMOVED; + kn->priv = priv; + kn->dir.root = root; + + root->dir_ops = kdops; + root->kn = kn; + + return root; +} + +/** + * kernfs_destroy_root - destroy a kernfs hierarchy + * @root: root of the hierarchy to destroy + * + * Destroy the hierarchy anchored at @root by removing all existing + * directories and destroying @root. + */ +void kernfs_destroy_root(struct kernfs_root *root) +{ + kernfs_remove(root->kn); /* will also free @root */ +} + +/** + * kernfs_create_dir_ns - create a directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * @mode: mode of the new directory + * @priv: opaque data associated with the new directory + * @ns: optional namespace tag of the directory + * + * Returns the created node on success, ERR_PTR() value on failure. + */ +struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, + const char *name, umode_t mode, + void *priv, const void *ns) +{ + struct kernfs_addrm_cxt acxt; + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->dir.root = parent->dir.root; + kn->ns = ns; + kn->priv = priv; + + /* link in */ + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn); + kernfs_addrm_finish(&acxt); + + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + +static struct dentry *kernfs_iop_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct dentry *ret; + struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *kn; + struct inode *inode; + const void *ns = NULL; + + mutex_lock(&kernfs_mutex); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dir->i_sb)->ns; + + kn = kernfs_find_ns(parent, dentry->d_name.name, ns); + + /* no such entry */ + if (!kn) { + ret = NULL; + goto out_unlock; + } + kernfs_get(kn); + dentry->d_fsdata = kn; + + /* attach dentry and inode */ + inode = kernfs_get_inode(dir->i_sb, kn); + if (!inode) { + ret = ERR_PTR(-ENOMEM); + goto out_unlock; + } + + /* instantiate and hash dentry */ + ret = d_materialise_unique(dentry, inode); + out_unlock: + mutex_unlock(&kernfs_mutex); + return ret; +} + +static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode) +{ + struct kernfs_node *parent = dir->i_private; + struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops; + + if (!kdops || !kdops->mkdir) + return -EPERM; + + return kdops->mkdir(parent, dentry->d_name.name, mode); +} + +static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + + if (!kdops || !kdops->rmdir) + return -EPERM; + + return kdops->rmdir(kn); +} + +static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *new_parent = new_dir->i_private; + struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + + if (!kdops || !kdops->rename) + return -EPERM; + + return kdops->rename(kn, new_parent, new_dentry->d_name.name); +} + +const struct inode_operations kernfs_dir_iops = { + .lookup = kernfs_iop_lookup, + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, + + .mkdir = kernfs_iop_mkdir, + .rmdir = kernfs_iop_rmdir, + .rename = kernfs_iop_rename, +}; + +static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) +{ + struct kernfs_node *last; + + while (true) { + struct rb_node *rbn; + + last = pos; + + if (kernfs_type(pos) != KERNFS_DIR) + break; + + rbn = rb_first(&pos->dir.children); + if (!rbn) + break; + + pos = rb_to_kn(rbn); + } + + return last; +} + +/** + * kernfs_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: kernfs_node whose descendants to walk + * + * Find the next descendant to visit for post-order traversal of @root's + * descendants. @root is included in the iteration and the last node to be + * visited. + */ +static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, + struct kernfs_node *root) +{ + struct rb_node *rbn; + + lockdep_assert_held(&kernfs_mutex); + + /* if first iteration, visit leftmost descendant which may be root */ + if (!pos) + return kernfs_leftmost_descendant(root); + + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + + /* if there's an unvisited sibling, visit its leftmost descendant */ + rbn = rb_next(&pos->rb); + if (rbn) + return kernfs_leftmost_descendant(rb_to_kn(rbn)); + + /* no sibling left, visit parent */ + return pos->parent; +} + +static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, + struct kernfs_node *kn) +{ + struct kernfs_node *pos, *next; + + if (!kn) + return; + + pr_debug("kernfs %s: removing\n", kn->name); + + next = NULL; + do { + pos = next; + next = kernfs_next_descendant_post(pos, kn); + if (pos) + kernfs_remove_one(acxt, pos); + } while (next); +} + +/** + * kernfs_remove - remove a kernfs_node recursively + * @kn: the kernfs_node to remove + * + * Remove @kn along with all its subdirectories and files. + */ +void kernfs_remove(struct kernfs_node *kn) +{ + struct kernfs_addrm_cxt acxt; + + kernfs_addrm_start(&acxt); + __kernfs_remove(&acxt, kn); + kernfs_addrm_finish(&acxt); +} + +/** + * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it + * @parent: parent of the target + * @name: name of the kernfs_node to remove + * @ns: namespace tag of the kernfs_node to remove + * + * Look for the kernfs_node with @name and @ns under @parent and remove it. + * Returns 0 on success, -ENOENT if such entry doesn't exist. + */ +int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, + const void *ns) +{ + struct kernfs_addrm_cxt acxt; + struct kernfs_node *kn; + + if (!parent) { + WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", + name); + return -ENOENT; + } + + kernfs_addrm_start(&acxt); + + kn = kernfs_find_ns(parent, name, ns); + if (kn) + __kernfs_remove(&acxt, kn); + + kernfs_addrm_finish(&acxt); + + if (kn) + return 0; + else + return -ENOENT; +} + +/** + * kernfs_rename_ns - move and rename a kernfs_node + * @kn: target node + * @new_parent: new parent to put @sd under + * @new_name: new name + * @new_ns: new namespace tag + */ +int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name, const void *new_ns) +{ + int error; + + mutex_lock(&kernfs_mutex); + + error = -ENOENT; + if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) + goto out; + + error = 0; + if ((kn->parent == new_parent) && (kn->ns == new_ns) && + (strcmp(kn->name, new_name) == 0)) + goto out; /* nothing to rename */ + + error = -EEXIST; + if (kernfs_find_ns(new_parent, new_name, new_ns)) + goto out; + + /* rename kernfs_node */ + if (strcmp(kn->name, new_name) != 0) { + error = -ENOMEM; + new_name = kstrdup(new_name, GFP_KERNEL); + if (!new_name) + goto out; + + if (kn->flags & KERNFS_STATIC_NAME) + kn->flags &= ~KERNFS_STATIC_NAME; + else + kfree(kn->name); + + kn->name = new_name; + } + + /* + * Move to the appropriate place in the appropriate directories rbtree. + */ + kernfs_unlink_sibling(kn); + kernfs_get(new_parent); + kernfs_put(kn->parent); + kn->ns = new_ns; + kn->hash = kernfs_name_hash(kn->name, kn->ns); + kn->parent = new_parent; + kernfs_link_sibling(kn); + + error = 0; + out: + mutex_unlock(&kernfs_mutex); + return error; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct kernfs_node *kn) +{ + return (kn->mode >> 12) & 15; +} + +static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) +{ + kernfs_put(filp->private_data); + return 0; +} + +static struct kernfs_node *kernfs_dir_pos(const void *ns, + struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) +{ + if (pos) { + int valid = !(pos->flags & KERNFS_REMOVED) && + pos->parent == parent && hash == pos->hash; + kernfs_put(pos); + if (!valid) + pos = NULL; + } + if (!pos && (hash > 1) && (hash < INT_MAX)) { + struct rb_node *node = parent->dir.children.rb_node; + while (node) { + pos = rb_to_kn(node); + + if (hash < pos->hash) + node = node->rb_left; + else if (hash > pos->hash) + node = node->rb_right; + else + break; + } + } + /* Skip over entries in the wrong namespace */ + while (pos && pos->ns != ns) { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } + return pos; +} + +static struct kernfs_node *kernfs_dir_next_pos(const void *ns, + struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) +{ + pos = kernfs_dir_pos(ns, parent, ino, pos); + if (pos) + do { + struct rb_node *node = rb_next(&pos->rb); + if (!node) + pos = NULL; + else + pos = rb_to_kn(node); + } while (pos && pos->ns != ns); + return pos; +} + +static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry = file->f_path.dentry; + struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *pos = file->private_data; + const void *ns = NULL; + + if (!dir_emit_dots(file, ctx)) + return 0; + mutex_lock(&kernfs_mutex); + + if (kernfs_ns_enabled(parent)) + ns = kernfs_info(dentry->d_sb)->ns; + + for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); + pos; + pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { + const char *name = pos->name; + unsigned int type = dt_type(pos); + int len = strlen(name); + ino_t ino = pos->ino; + + ctx->pos = pos->hash; + file->private_data = pos; + kernfs_get(pos); + + mutex_unlock(&kernfs_mutex); + if (!dir_emit(ctx, name, len, ino, type)) + return 0; + mutex_lock(&kernfs_mutex); + } + mutex_unlock(&kernfs_mutex); + file->private_data = NULL; + ctx->pos = INT_MAX; + return 0; +} + +static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, + int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} + +const struct file_operations kernfs_dir_fops = { + .read = generic_read_dir, + .iterate = kernfs_fop_readdir, + .release = kernfs_dir_fop_release, + .llseek = kernfs_dir_fop_llseek, +}; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c new file mode 100644 index 00000000000..dbf397bfdff --- /dev/null +++ b/fs/kernfs/file.c @@ -0,0 +1,867 @@ +/* + * fs/kernfs/file.c - kernfs file implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/pagemap.h> +#include <linux/sched.h> + +#include "kernfs-internal.h" + +/* + * There's one kernfs_open_file for each open file and one kernfs_open_node + * for each kernfs_node with one or more open files. + * + * kernfs_node->attr.open points to kernfs_open_node. attr.open is + * protected by kernfs_open_node_lock. + * + * filp->private_data points to seq_file whose ->private points to + * kernfs_open_file. kernfs_open_files are chained at + * kernfs_open_node->files, which is protected by kernfs_open_file_mutex. + */ +static DEFINE_SPINLOCK(kernfs_open_node_lock); +static DEFINE_MUTEX(kernfs_open_file_mutex); + +struct kernfs_open_node { + atomic_t refcnt; + atomic_t event; + wait_queue_head_t poll; + struct list_head files; /* goes through kernfs_open_file.list */ +}; + +static struct kernfs_open_file *kernfs_of(struct file *file) +{ + return ((struct seq_file *)file->private_data)->private; +} + +/* + * Determine the kernfs_ops for the given kernfs_node. This function must + * be called while holding an active reference. + */ +static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) +{ + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); + return kn->attr.ops; +} + +/* + * As kernfs_seq_stop() is also called after kernfs_seq_start() or + * kernfs_seq_next() failure, it needs to distinguish whether it's stopping + * a seq_file iteration which is fully initialized with an active reference + * or an aborted kernfs_seq_start() due to get_active failure. The + * position pointer is the only context for each seq_file iteration and + * thus the stop condition should be encoded in it. As the return value is + * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable + * choice to indicate get_active failure. + * + * Unfortunately, this is complicated due to the optional custom seq_file + * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() + * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or + * custom seq_file operations and thus can't decide whether put_active + * should be performed or not only on ERR_PTR(-ENODEV). + * + * This is worked around by factoring out the custom seq_stop() and + * put_active part into kernfs_seq_stop_active(), skipping it from + * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after + * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures + * that kernfs_seq_stop_active() is skipped only after get_active failure. + */ +static void kernfs_seq_stop_active(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_stop) + ops->seq_stop(sf, v); + kernfs_put_active(of->kn); +} + +static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) + return ERR_PTR(-ENODEV); + + ops = kernfs_ops(of->kn); + if (ops->seq_start) { + void *next = ops->seq_start(sf, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; + } +} + +static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) +{ + struct kernfs_open_file *of = sf->private; + const struct kernfs_ops *ops = kernfs_ops(of->kn); + + if (ops->seq_next) { + void *next = ops->seq_next(sf, v, ppos); + /* see the comment above kernfs_seq_stop_active() */ + if (next == ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, next); + return next; + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; + } +} + +static void kernfs_seq_stop(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + if (v != ERR_PTR(-ENODEV)) + kernfs_seq_stop_active(sf, v); + mutex_unlock(&of->mutex); +} + +static int kernfs_seq_show(struct seq_file *sf, void *v) +{ + struct kernfs_open_file *of = sf->private; + + of->event = atomic_read(&of->kn->attr.open->event); + + return of->kn->attr.ops->seq_show(sf, v); +} + +static const struct seq_operations kernfs_seq_ops = { + .start = kernfs_seq_start, + .next = kernfs_seq_next, + .stop = kernfs_seq_stop, + .show = kernfs_seq_show, +}; + +/* + * As reading a bin file can have side-effects, the exact offset and bytes + * specified in read(2) call should be passed to the read callback making + * it difficult to use seq_file. Implement simplistic custom buffering for + * bin files. + */ +static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, + char __user *user_buf, size_t count, + loff_t *ppos) +{ + ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + len = -ENODEV; + mutex_unlock(&of->mutex); + goto out_free; + } + + ops = kernfs_ops(of->kn); + if (ops->read) + len = ops->read(of, buf, len, *ppos); + else + len = -EINVAL; + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + + if (len < 0) + goto out_free; + + if (copy_to_user(user_buf, buf, len)) { + len = -EFAULT; + goto out_free; + } + + *ppos += len; + + out_free: + kfree(buf); + return len; +} + +/** + * kernfs_fop_read - kernfs vfs read callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + */ +static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct kernfs_open_file *of = kernfs_of(file); + + if (of->kn->flags & KERNFS_HAS_SEQ_SHOW) + return seq_read(file, user_buf, count, ppos); + else + return kernfs_file_direct_read(of, user_buf, count, ppos); +} + +/** + * kernfs_fop_write - kernfs vfs write callback + * @file: file pointer + * @user_buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Copy data in from userland and pass it to the matching kernfs write + * operation. + * + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come on + * the first write. Hint: if you're writing a value, first read the file, + * modify only the the value you're changing, then write entire buffer + * back. + */ +static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct kernfs_open_file *of = kernfs_of(file); + ssize_t len = min_t(size_t, count, PAGE_SIZE); + const struct kernfs_ops *ops; + char *buf; + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_free; + } + buf[len] = '\0'; /* guarantee string termination */ + + /* + * @of->mutex nests outside active ref and is just to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + mutex_unlock(&of->mutex); + len = -ENODEV; + goto out_free; + } + + ops = kernfs_ops(of->kn); + if (ops->write) + len = ops->write(of, buf, len, *ppos); + else + len = -EINVAL; + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + + if (len > 0) + *ppos += len; +out_free: + kfree(buf); + return len; +} + +static void kernfs_vma_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + + if (!of->vm_ops) + return; + + if (!kernfs_get_active(of->kn)) + return; + + if (of->vm_ops->open) + of->vm_ops->open(vma); + + kernfs_put_active(of->kn); +} + +static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = VM_FAULT_SIGBUS; + if (of->vm_ops->fault) + ret = of->vm_ops->fault(vma, vmf); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return VM_FAULT_SIGBUS; + + if (!kernfs_get_active(of->kn)) + return VM_FAULT_SIGBUS; + + ret = 0; + if (of->vm_ops->page_mkwrite) + ret = of->vm_ops->page_mkwrite(vma, vmf); + else + file_update_time(file); + + kernfs_put_active(of->kn); + return ret; +} + +static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return -EINVAL; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = -EINVAL; + if (of->vm_ops->access) + ret = of->vm_ops->access(vma, addr, buf, len, write); + + kernfs_put_active(of->kn); + return ret; +} + +#ifdef CONFIG_NUMA +static int kernfs_vma_set_policy(struct vm_area_struct *vma, + struct mempolicy *new) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!kernfs_get_active(of->kn)) + return -EINVAL; + + ret = 0; + if (of->vm_ops->set_policy) + ret = of->vm_ops->set_policy(vma, new); + + kernfs_put_active(of->kn); + return ret; +} + +static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + struct mempolicy *pol; + + if (!of->vm_ops) + return vma->vm_policy; + + if (!kernfs_get_active(of->kn)) + return vma->vm_policy; + + pol = vma->vm_policy; + if (of->vm_ops->get_policy) + pol = of->vm_ops->get_policy(vma, addr); + + kernfs_put_active(of->kn); + return pol; +} + +static int kernfs_vma_migrate(struct vm_area_struct *vma, + const nodemask_t *from, const nodemask_t *to, + unsigned long flags) +{ + struct file *file = vma->vm_file; + struct kernfs_open_file *of = kernfs_of(file); + int ret; + + if (!of->vm_ops) + return 0; + + if (!kernfs_get_active(of->kn)) + return 0; + + ret = 0; + if (of->vm_ops->migrate) + ret = of->vm_ops->migrate(vma, from, to, flags); + + kernfs_put_active(of->kn); + return ret; +} +#endif + +static const struct vm_operations_struct kernfs_vm_ops = { + .open = kernfs_vma_open, + .fault = kernfs_vma_fault, + .page_mkwrite = kernfs_vma_page_mkwrite, + .access = kernfs_vma_access, +#ifdef CONFIG_NUMA + .set_policy = kernfs_vma_set_policy, + .get_policy = kernfs_vma_get_policy, + .migrate = kernfs_vma_migrate, +#endif +}; + +static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct kernfs_open_file *of = kernfs_of(file); + const struct kernfs_ops *ops; + int rc; + + /* + * mmap path and of->mutex are prone to triggering spurious lockdep + * warnings and we don't want to add spurious locking dependency + * between the two. Check whether mmap is actually implemented + * without grabbing @of->mutex by testing HAS_MMAP flag. See the + * comment in kernfs_file_open() for more details. + */ + if (!(of->kn->flags & KERNFS_HAS_MMAP)) + return -ENODEV; + + mutex_lock(&of->mutex); + + rc = -ENODEV; + if (!kernfs_get_active(of->kn)) + goto out_unlock; + + ops = kernfs_ops(of->kn); + rc = ops->mmap(of, vma); + + /* + * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() + * to satisfy versions of X which crash if the mmap fails: that + * substitutes a new vm_file, and we don't then want bin_vm_ops. + */ + if (vma->vm_file != file) + goto out_put; + + rc = -EINVAL; + if (of->mmapped && of->vm_ops != vma->vm_ops) + goto out_put; + + /* + * It is not possible to successfully wrap close. + * So error if someone is trying to use close. + */ + rc = -EINVAL; + if (vma->vm_ops && vma->vm_ops->close) + goto out_put; + + rc = 0; + of->mmapped = 1; + of->vm_ops = vma->vm_ops; + vma->vm_ops = &kernfs_vm_ops; +out_put: + kernfs_put_active(of->kn); +out_unlock: + mutex_unlock(&of->mutex); + + return rc; +} + +/** + * kernfs_get_open_node - get or create kernfs_open_node + * @kn: target kernfs_node + * @of: kernfs_open_file for this instance of open + * + * If @kn->attr.open exists, increment its reference count; otherwise, + * create one. @of is chained to the files list. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int kernfs_get_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on, *new_on = NULL; + + retry: + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irq(&kernfs_open_node_lock); + + if (!kn->attr.open && new_on) { + kn->attr.open = new_on; + new_on = NULL; + } + + on = kn->attr.open; + if (on) { + atomic_inc(&on->refcnt); + list_add_tail(&of->list, &on->files); + } + + spin_unlock_irq(&kernfs_open_node_lock); + mutex_unlock(&kernfs_open_file_mutex); + + if (on) { + kfree(new_on); + return 0; + } + + /* not there, initialize a new one and retry */ + new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); + if (!new_on) + return -ENOMEM; + + atomic_set(&new_on->refcnt, 0); + atomic_set(&new_on->event, 1); + init_waitqueue_head(&new_on->poll); + INIT_LIST_HEAD(&new_on->files); + goto retry; +} + +/** + * kernfs_put_open_node - put kernfs_open_node + * @kn: target kernfs_nodet + * @of: associated kernfs_open_file + * + * Put @kn->attr.open and unlink @of from the files list. If + * reference count reaches zero, disassociate and free it. + * + * LOCKING: + * None. + */ +static void kernfs_put_open_node(struct kernfs_node *kn, + struct kernfs_open_file *of) +{ + struct kernfs_open_node *on = kn->attr.open; + unsigned long flags; + + mutex_lock(&kernfs_open_file_mutex); + spin_lock_irqsave(&kernfs_open_node_lock, flags); + + if (of) + list_del(&of->list); + + if (atomic_dec_and_test(&on->refcnt)) + kn->attr.open = NULL; + else + on = NULL; + + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + mutex_unlock(&kernfs_open_file_mutex); + + kfree(on); +} + +static int kernfs_fop_open(struct inode *inode, struct file *file) +{ + struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + const struct kernfs_ops *ops; + struct kernfs_open_file *of; + bool has_read, has_write, has_mmap; + int error = -EACCES; + + if (!kernfs_get_active(kn)) + return -ENODEV; + + ops = kernfs_ops(kn); + + has_read = ops->seq_show || ops->read || ops->mmap; + has_write = ops->write || ops->mmap; + has_mmap = ops->mmap; + + /* check perms and supported operations */ + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; + + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + + /* allocate a kernfs_open_file for the file */ + error = -ENOMEM; + of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); + if (!of) + goto err_out; + + /* + * The following is done to give a different lockdep key to + * @of->mutex for files which implement mmap. This is a rather + * crude way to avoid false positive lockdep warning around + * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and + * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under + * which mm->mmap_sem nests, while holding @of->mutex. As each + * open file has a separate mutex, it's okay as long as those don't + * happen on the same file. At this point, we can't easily give + * each file a separate locking class. Let's differentiate on + * whether the file has mmap or not for now. + * + * Both paths of the branch look the same. They're supposed to + * look that way and give @of->mutex different static lockdep keys. + */ + if (has_mmap) + mutex_init(&of->mutex); + else + mutex_init(&of->mutex); + + of->kn = kn; + of->file = file; + + /* + * Always instantiate seq_file even if read access doesn't use + * seq_file or is not requested. This unifies private data access + * and readable regular files are the vast majority anyway. + */ + if (ops->seq_show) + error = seq_open(file, &kernfs_seq_ops); + else + error = seq_open(file, NULL); + if (error) + goto err_free; + + ((struct seq_file *)file->private_data)->private = of; + + /* seq_file clears PWRITE unconditionally, restore it if WRITE */ + if (file->f_mode & FMODE_WRITE) + file->f_mode |= FMODE_PWRITE; + + /* make sure we have open node struct */ + error = kernfs_get_open_node(kn, of); + if (error) + goto err_close; + + /* open succeeded, put active references */ + kernfs_put_active(kn); + return 0; + +err_close: + seq_release(inode, file); +err_free: + kfree(of); +err_out: + kernfs_put_active(kn); + return error; +} + +static int kernfs_fop_release(struct inode *inode, struct file *filp) +{ + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_open_file *of = kernfs_of(filp); + + kernfs_put_open_node(kn, of); + seq_release(inode, filp); + kfree(of); + + return 0; +} + +void kernfs_unmap_bin_file(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + struct kernfs_open_file *of; + + if (!(kn->flags & KERNFS_HAS_MMAP)) + return; + + spin_lock_irq(&kernfs_open_node_lock); + on = kn->attr.open; + if (on) + atomic_inc(&on->refcnt); + spin_unlock_irq(&kernfs_open_node_lock); + if (!on) + return; + + mutex_lock(&kernfs_open_file_mutex); + list_for_each_entry(of, &on->files, list) { + struct inode *inode = file_inode(of->file); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); + } + mutex_unlock(&kernfs_open_file_mutex); + + kernfs_put_open_node(kn, NULL); +} + +/* + * Kernfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, or seek to 0 and read again. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) +{ + struct kernfs_open_file *of = kernfs_of(filp); + struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_open_node *on = kn->attr.open; + + /* need parent for the kobj, grab both */ + if (!kernfs_get_active(kn)) + goto trigger; + + poll_wait(filp, &on->poll, wait); + + kernfs_put_active(kn); + + if (of->event != atomic_read(&on->event)) + goto trigger; + + return DEFAULT_POLLMASK; + + trigger: + return DEFAULT_POLLMASK|POLLERR|POLLPRI; +} + +/** + * kernfs_notify - notify a kernfs file + * @kn: file to notify + * + * Notify @kn such that poll(2) on @kn wakes up. + */ +void kernfs_notify(struct kernfs_node *kn) +{ + struct kernfs_open_node *on; + unsigned long flags; + + spin_lock_irqsave(&kernfs_open_node_lock, flags); + + if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) { + on = kn->attr.open; + if (on) { + atomic_inc(&on->event); + wake_up_interruptible(&on->poll); + } + } + + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); +} +EXPORT_SYMBOL_GPL(kernfs_notify); + +const struct file_operations kernfs_file_fops = { + .read = kernfs_fop_read, + .write = kernfs_fop_write, + .llseek = generic_file_llseek, + .mmap = kernfs_fop_mmap, + .open = kernfs_fop_open, + .release = kernfs_fop_release, + .poll = kernfs_fop_poll, +}; + +/** + * __kernfs_create_file - kernfs internal function to create a file + * @parent: directory to create the file in + * @name: name of the file + * @mode: mode of the file + * @size: size of the file + * @ops: kernfs operations for the file + * @priv: private data for the file + * @ns: optional namespace tag of the file + * @static_name: don't copy file name + * @key: lockdep key for the file's active_ref, %NULL to disable lockdep + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, + const char *name, + umode_t mode, loff_t size, + const struct kernfs_ops *ops, + void *priv, const void *ns, + bool name_is_static, + struct lock_class_key *key) +{ + struct kernfs_addrm_cxt acxt; + struct kernfs_node *kn; + unsigned flags; + int rc; + + flags = KERNFS_FILE; + if (name_is_static) + flags |= KERNFS_STATIC_NAME; + + kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->attr.ops = ops; + kn->attr.size = size; + kn->ns = ns; + kn->priv = priv; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (key) { + lockdep_init_map(&kn->dep_map, "s_active", key, 0); + kn->flags |= KERNFS_LOCKDEP; + } +#endif + + /* + * kn->attr.ops is accesible only while holding active ref. We + * need to know whether some ops are implemented outside active + * ref. Cache their existence in flags. + */ + if (ops->seq_show) + kn->flags |= KERNFS_HAS_SEQ_SHOW; + if (ops->mmap) + kn->flags |= KERNFS_HAS_MMAP; + + kernfs_addrm_start(&acxt); + rc = kernfs_add_one(&acxt, kn); + kernfs_addrm_finish(&acxt); + + if (rc) { + kernfs_put(kn); + return ERR_PTR(rc); + } + return kn; +} diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c new file mode 100644 index 00000000000..e55126f85bd --- /dev/null +++ b/fs/kernfs/inode.c @@ -0,0 +1,377 @@ +/* + * fs/kernfs/inode.c - kernfs inode implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/pagemap.h> +#include <linux/backing-dev.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include <linux/security.h> + +#include "kernfs-internal.h" + +static const struct address_space_operations kernfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, +}; + +static struct backing_dev_info kernfs_bdi = { + .name = "kernfs", + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static const struct inode_operations kernfs_iops = { + .permission = kernfs_iop_permission, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, +}; + +void __init kernfs_inode_init(void) +{ + if (bdi_init(&kernfs_bdi)) + panic("failed to init kernfs_bdi"); +} + +static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) +{ + struct iattr *iattrs; + + if (kn->iattr) + return kn->iattr; + + kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); + if (!kn->iattr) + return NULL; + iattrs = &kn->iattr->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = kn->mode; + iattrs->ia_uid = GLOBAL_ROOT_UID; + iattrs->ia_gid = GLOBAL_ROOT_GID; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + simple_xattrs_init(&kn->iattr->xattrs); + + return kn->iattr; +} + +static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + struct kernfs_iattrs *attrs; + struct iattr *iattrs; + unsigned int ia_valid = iattr->ia_valid; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + iattrs = &attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = iattr->ia_atime; + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = iattr->ia_mtime; + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = iattr->ia_ctime; + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + iattrs->ia_mode = kn->mode = mode; + } + return 0; +} + +/** + * kernfs_setattr - set iattr on a node + * @kn: target node + * @iattr: iattr to set + * + * Returns 0 on success, -errno on failure. + */ +int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) +{ + int ret; + + mutex_lock(&kernfs_mutex); + ret = __kernfs_setattr(kn, iattr); + mutex_unlock(&kernfs_mutex); + return ret; +} + +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + struct kernfs_node *kn = dentry->d_fsdata; + int error; + + if (!kn) + return -EINVAL; + + mutex_lock(&kernfs_mutex); + error = inode_change_ok(inode, iattr); + if (error) + goto out; + + error = __kernfs_setattr(kn, iattr); + if (error) + goto out; + + /* this ignores size changes */ + setattr_copy(inode, iattr); + +out: + mutex_unlock(&kernfs_mutex); + return error; +} + +static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, + u32 *secdata_len) +{ + struct kernfs_iattrs *attrs; + void *old_secdata; + size_t old_secdata_len; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + old_secdata = attrs->ia_secdata; + old_secdata_len = attrs->ia_secdata_len; + + attrs->ia_secdata = *secdata; + attrs->ia_secdata_len = *secdata_len; + + *secdata = old_secdata; + *secdata_len = old_secdata_len; + return 0; +} + +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + void *secdata; + int error; + u32 secdata_len = 0; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + return error; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + return error; + + mutex_lock(&kernfs_mutex); + error = kernfs_node_setsecdata(kn, &secdata, &secdata_len); + mutex_unlock(&kernfs_mutex); + + if (secdata) + security_release_secctx(secdata, secdata_len); + return error; + } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { + return simple_xattr_set(&attrs->xattrs, name, value, size, + flags); + } + + return -EINVAL; +} + +int kernfs_iop_removexattr(struct dentry *dentry, const char *name) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_remove(&attrs->xattrs, name); +} + +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_get(&attrs->xattrs, name, buf, size); +} + +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_iattrs *attrs; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + return simple_xattr_list(&attrs->xattrs, buf, size); +} + +static inline void set_default_inode_attr(struct inode *inode, umode_t mode) +{ + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) +{ + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) +{ + struct kernfs_iattrs *attrs = kn->iattr; + + inode->i_mode = kn->mode; + if (attrs) { + /* + * kernfs_node has non-default attributes get them from + * persistent copy in kernfs_node. + */ + set_inode_attr(inode, &attrs->ia_iattr); + security_inode_notifysecctx(inode, attrs->ia_secdata, + attrs->ia_secdata_len); + } + + if (kernfs_type(kn) == KERNFS_DIR) + set_nlink(inode, kn->dir.subdirs + 2); +} + +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct inode *inode = dentry->d_inode; + + mutex_lock(&kernfs_mutex); + kernfs_refresh_inode(kn, inode); + mutex_unlock(&kernfs_mutex); + + generic_fillattr(inode, stat); + return 0; +} + +static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) +{ + kernfs_get(kn); + inode->i_private = kn; + inode->i_mapping->a_ops = &kernfs_aops; + inode->i_mapping->backing_dev_info = &kernfs_bdi; + inode->i_op = &kernfs_iops; + + set_default_inode_attr(inode, kn->mode); + kernfs_refresh_inode(kn, inode); + + /* initialize inode according to type */ + switch (kernfs_type(kn)) { + case KERNFS_DIR: + inode->i_op = &kernfs_dir_iops; + inode->i_fop = &kernfs_dir_fops; + break; + case KERNFS_FILE: + inode->i_size = kn->attr.size; + inode->i_fop = &kernfs_file_fops; + break; + case KERNFS_LINK: + inode->i_op = &kernfs_symlink_iops; + break; + default: + BUG(); + } + + unlock_new_inode(inode); +} + +/** + * kernfs_get_inode - get inode for kernfs_node + * @sb: super block + * @kn: kernfs_node to allocate inode for + * + * Get inode for @kn. If such inode doesn't exist, a new inode is + * allocated and basics are initialized. New inode is returned + * locked. + * + * LOCKING: + * Kernel thread context (may sleep). + * + * RETURNS: + * Pointer to allocated inode on success, NULL on failure. + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) +{ + struct inode *inode; + + inode = iget_locked(sb, kn->ino); + if (inode && (inode->i_state & I_NEW)) + kernfs_init_inode(kn, inode); + + return inode; +} + +/* + * The kernfs_node serves as both an inode and a directory entry for + * kernfs. To prevent the kernfs inode numbers from being freed + * prematurely we take a reference to kernfs_node from the kernfs inode. A + * super_operations.evict_inode() implementation is needed to drop that + * reference upon inode destruction. + */ +void kernfs_evict_inode(struct inode *inode) +{ + struct kernfs_node *kn = inode->i_private; + + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + kernfs_put(kn); +} + +int kernfs_iop_permission(struct inode *inode, int mask) +{ + struct kernfs_node *kn; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + kn = inode->i_private; + + mutex_lock(&kernfs_mutex); + kernfs_refresh_inode(kn, inode); + mutex_unlock(&kernfs_mutex); + + return generic_permission(inode, mask); +} diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h new file mode 100644 index 00000000000..eb536b76374 --- /dev/null +++ b/fs/kernfs/kernfs-internal.h @@ -0,0 +1,122 @@ +/* + * fs/kernfs/kernfs-internal.h - kernfs internal header file + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> + * + * This file is released under the GPLv2. + */ + +#ifndef __KERNFS_INTERNAL_H +#define __KERNFS_INTERNAL_H + +#include <linux/lockdep.h> +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/xattr.h> + +#include <linux/kernfs.h> + +struct kernfs_iattrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; + + struct simple_xattrs xattrs; +}; + +#define KN_DEACTIVATED_BIAS INT_MIN + +/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ + +/** + * kernfs_root - find out the kernfs_root a kernfs_node belongs to + * @kn: kernfs_node of interest + * + * Return the kernfs_root @kn belongs to. + */ +static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) +{ + /* if parent exists, it's always a dir; otherwise, @sd is a dir */ + if (kn->parent) + kn = kn->parent; + return kn->dir.root; +} + +/* + * Context structure to be used while adding/removing nodes. + */ +struct kernfs_addrm_cxt { + struct kernfs_node *removed; +}; + +/* + * mount.c + */ +struct kernfs_super_info { + /* + * The root associated with this super_block. Each super_block is + * identified by the root and ns it's associated with. + */ + struct kernfs_root *root; + + /* + * Each sb is associated with one namespace tag, currently the + * network namespace of the task which mounted this kernfs + * instance. If multiple tags become necessary, make the following + * an array and compare kernfs_node tag against every entry. + */ + const void *ns; +}; +#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) + +extern struct kmem_cache *kernfs_node_cache; + +/* + * inode.c + */ +struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); +void kernfs_evict_inode(struct inode *inode); +int kernfs_iop_permission(struct inode *inode, int mask); +int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); +int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); +int kernfs_iop_removexattr(struct dentry *dentry, const char *name); +ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); +void kernfs_inode_init(void); + +/* + * dir.c + */ +extern struct mutex kernfs_mutex; +extern const struct dentry_operations kernfs_dops; +extern const struct file_operations kernfs_dir_fops; +extern const struct inode_operations kernfs_dir_iops; + +struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); +void kernfs_put_active(struct kernfs_node *kn); +void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); +int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn); +void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt); +struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, + const char *name, umode_t mode, + unsigned flags); + +/* + * file.c + */ +extern const struct file_operations kernfs_file_fops; + +void kernfs_unmap_bin_file(struct kernfs_node *kn); + +/* + * symlink.c + */ +extern const struct inode_operations kernfs_symlink_iops; + +#endif /* __KERNFS_INTERNAL_H */ diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c new file mode 100644 index 00000000000..0d6ce895a9e --- /dev/null +++ b/fs/kernfs/mount.c @@ -0,0 +1,165 @@ +/* + * fs/kernfs/mount.c - kernfs mount implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/init.h> +#include <linux/magic.h> +#include <linux/slab.h> +#include <linux/pagemap.h> + +#include "kernfs-internal.h" + +struct kmem_cache *kernfs_node_cache; + +static const struct super_operations kernfs_sops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .evict_inode = kernfs_evict_inode, +}; + +static int kernfs_fill_super(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = SYSFS_MAGIC; + sb->s_op = &kernfs_sops; + sb->s_time_gran = 1; + + /* get root inode, initialize and unlock it */ + mutex_lock(&kernfs_mutex); + inode = kernfs_get_inode(sb, info->root->kn); + mutex_unlock(&kernfs_mutex); + if (!inode) { + pr_debug("kernfs: could not get root inode\n"); + return -ENOMEM; + } + + /* instantiate and link root dentry */ + root = d_make_root(inode); + if (!root) { + pr_debug("%s: could not get root dentry!\n", __func__); + return -ENOMEM; + } + kernfs_get(info->root->kn); + root->d_fsdata = info->root->kn; + sb->s_root = root; + sb->s_d_op = &kernfs_dops; + return 0; +} + +static int kernfs_test_super(struct super_block *sb, void *data) +{ + struct kernfs_super_info *sb_info = kernfs_info(sb); + struct kernfs_super_info *info = data; + + return sb_info->root == info->root && sb_info->ns == info->ns; +} + +static int kernfs_set_super(struct super_block *sb, void *data) +{ + int error; + error = set_anon_super(sb, data); + if (!error) + sb->s_fs_info = data; + return error; +} + +/** + * kernfs_super_ns - determine the namespace tag of a kernfs super_block + * @sb: super_block of interest + * + * Return the namespace tag associated with kernfs super_block @sb. + */ +const void *kernfs_super_ns(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + + return info->ns; +} + +/** + * kernfs_mount_ns - kernfs mount helper + * @fs_type: file_system_type of the fs being mounted + * @flags: mount flags specified for the mount + * @root: kernfs_root of the hierarchy being mounted + * @ns: optional namespace tag of the mount + * + * This is to be called from each kernfs user's file_system_type->mount() + * implementation, which should pass through the specified @fs_type and + * @flags, and specify the hierarchy and namespace tag to mount via @root + * and @ns, respectively. + * + * The return value can be passed to the vfs layer verbatim. + */ +struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, + struct kernfs_root *root, const void *ns) +{ + struct super_block *sb; + struct kernfs_super_info *info; + int error; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + info->root = root; + info->ns = ns; + + sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); + if (IS_ERR(sb) || sb->s_fs_info != info) + kfree(info); + if (IS_ERR(sb)) + return ERR_CAST(sb); + if (!sb->s_root) { + error = kernfs_fill_super(sb); + if (error) { + deactivate_locked_super(sb); + return ERR_PTR(error); + } + sb->s_flags |= MS_ACTIVE; + } + + return dget(sb->s_root); +} + +/** + * kernfs_kill_sb - kill_sb for kernfs + * @sb: super_block being killed + * + * This can be used directly for file_system_type->kill_sb(). If a kernfs + * user needs extra cleanup, it can implement its own kill_sb() and call + * this function at the end. + */ +void kernfs_kill_sb(struct super_block *sb) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_node *root_kn = sb->s_root->d_fsdata; + + /* + * Remove the superblock from fs_supers/s_instances + * so we can't find it, before freeing kernfs_super_info. + */ + kill_anon_super(sb); + kfree(info); + kernfs_put(root_kn); +} + +void __init kernfs_init(void) +{ + kernfs_node_cache = kmem_cache_create("kernfs_node_cache", + sizeof(struct kernfs_node), + 0, SLAB_PANIC, NULL); + kernfs_inode_init(); +} diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c new file mode 100644 index 00000000000..4d457055acb --- /dev/null +++ b/fs/kernfs/symlink.c @@ -0,0 +1,151 @@ +/* + * fs/kernfs/symlink.c - kernfs symlink implementation + * + * Copyright (c) 2001-3 Patrick Mochel + * Copyright (c) 2007 SUSE Linux Products GmbH + * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + */ + +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/namei.h> + +#include "kernfs-internal.h" + +/** + * kernfs_create_link - create a symlink + * @parent: directory to create the symlink in + * @name: name of the symlink + * @target: target node for the symlink to point to + * + * Returns the created node on success, ERR_PTR() value on error. + */ +struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, + const char *name, + struct kernfs_node *target) +{ + struct kernfs_node *kn; + struct kernfs_addrm_cxt acxt; + int error; + + kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); + if (!kn) + return ERR_PTR(-ENOMEM); + + if (kernfs_ns_enabled(parent)) + kn->ns = target->ns; + kn->symlink.target_kn = target; + kernfs_get(target); /* ref owned by symlink */ + + kernfs_addrm_start(&acxt); + error = kernfs_add_one(&acxt, kn); + kernfs_addrm_finish(&acxt); + + if (!error) + return kn; + + kernfs_put(kn); + return ERR_PTR(error); +} + +static int kernfs_get_target_path(struct kernfs_node *parent, + struct kernfs_node *target, char *path) +{ + struct kernfs_node *base, *kn; + char *s = path; + int len = 0; + + /* go up to the root, stop at the base */ + base = parent; + while (base->parent) { + kn = target->parent; + while (kn->parent && base != kn) + kn = kn->parent; + + if (base == kn) + break; + + strcpy(s, "../"); + s += 3; + base = base->parent; + } + + /* determine end of target string for reverse fillup */ + kn = target; + while (kn->parent && kn != base) { + len += strlen(kn->name) + 1; + kn = kn->parent; + } + + /* check limits */ + if (len < 2) + return -EINVAL; + len--; + if ((s - path) + len > PATH_MAX) + return -ENAMETOOLONG; + + /* reverse fillup of target string from target to base */ + kn = target; + while (kn->parent && kn != base) { + int slen = strlen(kn->name); + + len -= slen; + strncpy(s + len, kn->name, slen); + if (len) + s[--len] = '/'; + + kn = kn->parent; + } + + return 0; +} + +static int kernfs_getlink(struct dentry *dentry, char *path) +{ + struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *parent = kn->parent; + struct kernfs_node *target = kn->symlink.target_kn; + int error; + + mutex_lock(&kernfs_mutex); + error = kernfs_get_target_path(parent, target, path); + mutex_unlock(&kernfs_mutex); + + return error; +} + +static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + int error = -ENOMEM; + unsigned long page = get_zeroed_page(GFP_KERNEL); + if (page) { + error = kernfs_getlink(dentry, (char *) page); + if (error < 0) + free_page((unsigned long)page); + } + nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); + return NULL; +} + +static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *page = nd_get_link(nd); + if (!IS_ERR(page)) + free_page((unsigned long)page); +} + +const struct inode_operations kernfs_symlink_iops = { + .setxattr = kernfs_iop_setxattr, + .removexattr = kernfs_iop_removexattr, + .getxattr = kernfs_iop_getxattr, + .listxattr = kernfs_iop_listxattr, + .readlink = generic_readlink, + .follow_link = kernfs_iop_follow_link, + .put_link = kernfs_iop_put_link, + .setattr = kernfs_iop_setattr, + .getattr = kernfs_iop_getattr, + .permission = kernfs_iop_permission, +}; diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index d448a777166..7f9b096d8d5 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -62,7 +62,8 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, page = read_cache_page(mapping, index, filler, sb); else { page = find_or_create_page(mapping, index, GFP_NOFS); - unlock_page(page); + if (page) + unlock_page(page); } return page; } diff --git a/fs/mount.h b/fs/mount.h index d64c594be6c..a17458ca6f2 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -74,7 +74,7 @@ static inline int mnt_has_parent(struct mount *mnt) static inline int is_mounted(struct vfsmount *mnt) { /* neither detached nor internal? */ - return !IS_ERR_OR_NULL(real_mount(mnt)); + return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns); } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); diff --git a/fs/namei.c b/fs/namei.c index c53d3a9547f..bcb838e2e52 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -235,27 +235,9 @@ static int check_acl(struct inode *inode, int mask) return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); } - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - - /* - * A filesystem can force a ACL callback by just never filling the - * ACL cache. But normally you'd fill the cache either at inode - * instantiation time, or on the first ->get_acl call. - * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. - */ - if (acl == ACL_NOT_CACHED) { - if (inode->i_op->get_acl) { - acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } else { - set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); - return -EAGAIN; - } - } - + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); @@ -1598,11 +1580,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) * do a "get_unaligned()" if this helps and is sufficiently * fast. * - * - Little-endian machines (so that we can generate the mask - * of low bytes efficiently). Again, we *could* do a byte - * swapping load on big-endian architectures if that is not - * expensive enough to make the optimization worthless. - * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. @@ -1646,7 +1623,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) if (!len) goto done; } - mask = ~(~0ul << len*8); + mask = bytemask_from_count(len); hash += mask & a; done: return fold_hash(hash); diff --git a/fs/namespace.c b/fs/namespace.c index ac2ce8a766e..22e536705c4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2790,6 +2790,8 @@ void __init mnt_init(void) for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mountpoint_hashtable[u]); + kernfs_init(); + err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", @@ -2886,7 +2888,7 @@ bool fs_fully_visible(struct file_system_type *type) struct inode *inode = child->mnt_mountpoint->d_inode; if (!S_ISDIR(inode->i_mode)) goto next; - if (inode->i_nlink != 2) + if (inode->i_nlink > 2) goto next; } visible = true; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 812154aff98..b266f734bd5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1404,7 +1404,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, /* Expect a negative dentry */ BUG_ON(dentry->d_inode); - dfprintk(VFS, "NFS: atomic_open(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); err = nfs_check_flags(open_flags); @@ -1594,7 +1594,7 @@ int nfs_create(struct inode *dir, struct dentry *dentry, int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; int error; - dfprintk(VFS, "NFS: create(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; @@ -1621,7 +1621,7 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) struct iattr attr; int status; - dfprintk(VFS, "NFS: mknod(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); if (!new_valid_dev(rdev)) @@ -1650,7 +1650,7 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct iattr attr; int error; - dfprintk(VFS, "NFS: mkdir(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_valid = ATTR_MODE; @@ -1678,7 +1678,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; - dfprintk(VFS, "NFS: rmdir(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); @@ -1747,7 +1747,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) int error; int need_rehash = 0; - dfprintk(VFS, "NFS: unlink(%s/%ld, %pd)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_unlink_enter(dir, dentry); @@ -1798,7 +1798,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) unsigned int pathlen = strlen(symname); int error; - dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry, symname); if (pathlen > PAGE_SIZE) @@ -1821,7 +1821,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { - dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s) error %d\n", + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); @@ -2304,7 +2304,7 @@ out: if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) res = -EACCES; - dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", + dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); return res; out_notsup: diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index d71d66c9e0a..b8797ae6831 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -222,14 +222,31 @@ out: * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust * the iocb is still valid here if this is a synchronous request. */ -static void nfs_direct_complete(struct nfs_direct_req *dreq) +static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) { + struct inode *inode = dreq->inode; + + if (dreq->iocb && write) { + loff_t pos = dreq->iocb->ki_pos + dreq->count; + + spin_lock(&inode->i_lock); + if (i_size_read(inode) < pos) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + if (write) + nfs_zap_mapping(inode, inode->i_mapping); + + inode_dio_done(inode); + if (dreq->iocb) { long res = (long) dreq->error; if (!res) res = (long) dreq->count; aio_complete(dreq->iocb, res, 0); } + complete_all(&dreq->completion); nfs_direct_req_release(dreq); @@ -237,9 +254,9 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) static void nfs_direct_readpage_release(struct nfs_page *req) { - dprintk("NFS: direct read done (%s/%lld %d@%lld)\n", + dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); @@ -272,7 +289,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) } out_put: if (put_dreq(dreq)) - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, false); hdr->release(hdr); } @@ -402,6 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, loff_t pos, bool uio) { struct nfs_pageio_descriptor desc; + struct inode *inode = dreq->inode; ssize_t result = -EINVAL; size_t requested_bytes = 0; unsigned long seg; @@ -410,6 +428,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, &nfs_direct_read_completion_ops); get_dreq(dreq); desc.pg_dreq = dreq; + atomic_inc(&inode->i_dio_count); for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; @@ -429,26 +448,69 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * generic layer handle the completion. */ if (requested_bytes == 0) { + inode_dio_done(inode); nfs_direct_req_release(dreq); return result < 0 ? result : -EIO; } if (put_dreq(dreq)) - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, false); return 0; } -static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iov: vector of user buffers into which to read data + * @nr_segs: size of iov vector + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file. For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change. Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, bool uio) { - ssize_t result = -ENOMEM; - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; + ssize_t result = -EINVAL; + size_t count; + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + + dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", + file, count, (long long) pos); + + result = 0; + if (!count) + goto out; + + mutex_lock(&inode->i_mutex); + result = nfs_sync_mapping(mapping); + if (result) + goto out_unlock; + + task_io_account_read(count); + + result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) - goto out; + goto out_unlock; dreq->inode = inode; dreq->bytes_left = iov_length(iov, nr_segs); @@ -464,20 +526,26 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, NFS_I(inode)->read_io += iov_length(iov, nr_segs); result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); - if (!result) + + mutex_unlock(&inode->i_mutex); + + if (!result) { result = nfs_direct_wait(dreq); + if (result > 0) + iocb->ki_pos = pos + result; + } + + nfs_direct_req_release(dreq); + return result; + out_release: nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); out: return result; } -static void nfs_inode_dio_write_done(struct inode *inode) -{ - nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_done(inode); -} - #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { @@ -593,8 +661,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: - nfs_inode_dio_write_done(dreq->inode); - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, true); } } @@ -610,8 +677,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) { - nfs_inode_dio_write_done(inode); - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, true); } #endif @@ -842,93 +908,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, return 0; } -static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - size_t count, bool uio) -{ - ssize_t result = -ENOMEM; - struct inode *inode = iocb->ki_filp->f_mapping->host; - struct nfs_direct_req *dreq; - struct nfs_lock_context *l_ctx; - - dreq = nfs_direct_req_alloc(); - if (!dreq) - goto out; - - dreq->inode = inode; - dreq->bytes_left = count; - dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); - l_ctx = nfs_get_lock_context(dreq->ctx); - if (IS_ERR(l_ctx)) { - result = PTR_ERR(l_ctx); - goto out_release; - } - dreq->l_ctx = l_ctx; - if (!is_sync_kiocb(iocb)) - dreq->iocb = iocb; - - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); - if (!result) - result = nfs_direct_wait(dreq); -out_release: - nfs_direct_req_release(dreq); -out: - return result; -} - -/** - * nfs_file_direct_read - file direct read operation for NFS files - * @iocb: target I/O control block - * @iov: vector of user buffers into which to read data - * @nr_segs: size of iov vector - * @pos: byte offset in file where reading starts - * - * We use this function for direct reads instead of calling - * generic_file_aio_read() in order to avoid gfar's check to see if - * the request starts before the end of the file. For that check - * to work, we must generate a GETATTR before each direct read, and - * even then there is a window between the GETATTR and the subsequent - * READ where the file size could change. Our preference is simply - * to do all reads the application wants, and the server will take - * care of managing the end of file boundary. - * - * This function also eliminates unnecessarily updating the file's - * atime locally, as the NFS server sets the file's atime, and this - * client must read the updated atime from the server back into its - * cache. - */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) -{ - ssize_t retval = -EINVAL; - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - size_t count; - - count = iov_length(iov, nr_segs); - nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); - - dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", - file, count, (long long) pos); - - retval = 0; - if (!count) - goto out; - - retval = nfs_sync_mapping(mapping); - if (retval) - goto out; - - task_io_account_read(count); - - retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio); - if (retval > 0) - iocb->ki_pos = pos + retval; - -out: - return retval; -} - /** * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block @@ -954,46 +933,96 @@ out: ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, bool uio) { - ssize_t retval = -EINVAL; + ssize_t result = -EINVAL; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; + loff_t end; size_t count; count = iov_length(iov, nr_segs); + end = (pos + count - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, count, (long long) pos); - retval = generic_write_checks(file, &pos, &count, 0); - if (retval) + result = generic_write_checks(file, &pos, &count, 0); + if (result) goto out; - retval = -EINVAL; + result = -EINVAL; if ((ssize_t) count < 0) goto out; - retval = 0; + result = 0; if (!count) goto out; - retval = nfs_sync_mapping(mapping); - if (retval) - goto out; + mutex_lock(&inode->i_mutex); + + result = nfs_sync_mapping(mapping); + if (result) + goto out_unlock; + + if (mapping->nrpages) { + result = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + if (result) + goto out_unlock; + } task_io_account_write(count); - retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio); - if (retval > 0) { - struct inode *inode = mapping->host; + result = -ENOMEM; + dreq = nfs_direct_req_alloc(); + if (!dreq) + goto out_unlock; - iocb->ki_pos = pos + retval; - spin_lock(&inode->i_lock); - if (i_size_read(inode) < iocb->ki_pos) - i_size_write(inode, iocb->ki_pos); - spin_unlock(&inode->i_lock); + dreq->inode = inode; + dreq->bytes_left = count; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); + goto out_release; + } + dreq->l_ctx = l_ctx; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); + + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); } + + mutex_unlock(&inode->i_mutex); + + if (!result) { + result = nfs_direct_wait(dreq); + if (result > 0) { + struct inode *inode = mapping->host; + + iocb->ki_pos = pos + result; + spin_lock(&inode->i_lock); + if (i_size_read(inode) < iocb->ki_pos) + i_size_write(inode, iocb->ki_pos); + spin_unlock(&inode->i_lock); + } + } + nfs_direct_req_release(dreq); + return result; + +out_release: + nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); out: - return retval; + return result; } /** diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e2fcacf07de..5bb790a69c7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page *page; int once_thru = 0; - dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%ld), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); start: @@ -395,7 +395,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, struct nfs_open_context *ctx = nfs_file_open_context(file); int status; - dfprintk(PAGECACHE, "NFS: write_end(%pD2(%ld), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); /* @@ -585,7 +585,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) int ret = VM_FAULT_NOPAGE; struct address_space *mapping; - dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%ld), offset %lld)\n", + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", filp, filp->f_mapping->host->i_ino, (long long)page_offset(page)); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 00ad1c2b217..ea00b34ff07 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -458,9 +458,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); - dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", + dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), atomic_read(&inode->i_count)); @@ -870,8 +870,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); - dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", - inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", + inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); trace_nfs_revalidate_inode_enter(inode); @@ -895,9 +895,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label); if (status != 0) { - dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), status); + (unsigned long long)NFS_FILEID(inode), status); if (status == -ESTALE) { nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) @@ -908,9 +908,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = nfs_refresh_inode(inode, fattr); if (status) { - dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), status); + (unsigned long long)NFS_FILEID(inode), status); goto err_out; } @@ -919,9 +919,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) nfs_setsecurity(inode, fattr, label); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", + dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode)); + (unsigned long long)NFS_FILEID(inode)); err_out: nfs4_label_free(label); @@ -985,8 +985,9 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", - inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode)); return 0; } @@ -1282,12 +1283,28 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } +/* + * Don't trust the change_attribute, mtime, ctime or size if + * a pnfs LAYOUTCOMMIT is outstanding + */ +static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, + struct nfs_fattr *fattr) +{ + if (pnfs_layoutcommit_outstanding(inode)) + fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | + NFS_ATTR_FATTR_MTIME | + NFS_ATTR_FATTR_CTIME | + NFS_ATTR_FATTR_SIZE); +} + static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int ret; trace_nfs_refresh_inode_enter(inode); + nfs_inode_attrs_handle_layoutcommit(inode, fattr); + if (nfs_inode_attrs_need_update(inode, fattr)) ret = nfs_update_inode(inode, fattr); else @@ -1434,7 +1451,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long now = jiffies; unsigned long save_cache_validity; - dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n", + dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); @@ -1455,7 +1472,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* * Big trouble! The inode has become a different object. */ - printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n", + printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", __func__, inode->i_ino, inode->i_mode, fattr->mode); goto out_err; } @@ -1517,8 +1534,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || - new_isize > cur_isize) { + if ((nfsi->npages == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } @@ -1641,10 +1657,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; -#ifdef CONFIG_NFS_V3_ACL - nfsi->acl_access = ERR_PTR(-EAGAIN); - nfsi->acl_default = ERR_PTR(-EAGAIN); -#endif #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 4a1aafba6a2..9a5ca03fa53 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -10,179 +10,7 @@ #define NFSDBG_FACILITY NFSDBG_PROC -ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int pos=0, len=0; - -# define output(s) do { \ - if (pos + sizeof(s) <= size) { \ - memcpy(buffer + pos, s, sizeof(s)); \ - pos += sizeof(s); \ - } \ - len += sizeof(s); \ - } while(0) - - acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - output("system.posix_acl_access"); - posix_acl_release(acl); - } - - if (S_ISDIR(inode->i_mode)) { - acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - output("system.posix_acl_default"); - posix_acl_release(acl); - } - } - -# undef output - - if (!buffer || len <= size) - return len; - return -ERANGE; -} - -ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int type, error = 0; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - acl = nfs3_proc_getacl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - if (type == ACL_TYPE_ACCESS && acl->a_count == 0) - error = -ENODATA; - else - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - } else - error = -ENODATA; - - return error; -} - -int nfs3_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; - int type, error; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - error = nfs3_proc_setacl(inode, type, acl); - posix_acl_release(acl); - - return error; -} - -int nfs3_removexattr(struct dentry *dentry, const char *name) -{ - struct inode *inode = dentry->d_inode; - int type; - - if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) - type = ACL_TYPE_ACCESS; - else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) - type = ACL_TYPE_DEFAULT; - else - return -EOPNOTSUPP; - - return nfs3_proc_setacl(inode, type, NULL); -} - -static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) -{ - if (!IS_ERR(nfsi->acl_access)) { - posix_acl_release(nfsi->acl_access); - nfsi->acl_access = ERR_PTR(-EAGAIN); - } - if (!IS_ERR(nfsi->acl_default)) { - posix_acl_release(nfsi->acl_default); - nfsi->acl_default = ERR_PTR(-EAGAIN); - } -} - -void nfs3_forget_cached_acls(struct inode *inode) -{ - dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, - inode->i_ino); - spin_lock(&inode->i_lock); - __nfs3_forget_cached_acls(NFS_I(inode)); - spin_unlock(&inode->i_lock); -} - -static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) -{ - struct nfs_inode *nfsi = NFS_I(inode); - struct posix_acl *acl = ERR_PTR(-EINVAL); - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - acl = nfsi->acl_access; - break; - - case ACL_TYPE_DEFAULT: - acl = nfsi->acl_default; - break; - - default: - goto out; - } - if (IS_ERR(acl)) - acl = ERR_PTR(-EAGAIN); - else - acl = posix_acl_dup(acl); -out: - spin_unlock(&inode->i_lock); - dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, - inode->i_ino, type, acl); - return acl; -} - -static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, - inode->i_ino, acl, dfacl); - spin_lock(&inode->i_lock); - __nfs3_forget_cached_acls(NFS_I(inode)); - if (!IS_ERR(acl)) - nfsi->acl_access = posix_acl_dup(acl); - if (!IS_ERR(dfacl)) - nfsi->acl_default = posix_acl_dup(dfacl); - spin_unlock(&inode->i_lock); -} - -struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +struct posix_acl *nfs3_get_acl(struct inode *inode, int type) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFSACL_MAXPAGES] = { }; @@ -198,7 +26,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) .rpc_argp = &args, .rpc_resp = &res, }; - struct posix_acl *acl; int status, count; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -207,10 +34,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) status = nfs_revalidate_inode(server, inode); if (status < 0) return ERR_PTR(status); - acl = nfs3_get_cached_acl(inode, type); - if (acl != ERR_PTR(-EAGAIN)) - return acl; - acl = NULL; /* * Only get the access acl when explicitly requested: We don't @@ -257,40 +80,41 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) } if (res.acl_access != NULL) { - if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { + if (posix_acl_equiv_mode(res.acl_access, NULL) || + res.acl_access->a_count == 0) { posix_acl_release(res.acl_access); res.acl_access = NULL; } } - nfs3_cache_acls(inode, - (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL), - (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL)); - switch(type) { - case ACL_TYPE_ACCESS: - acl = res.acl_access; - res.acl_access = NULL; - break; + if (res.mask & NFS_ACL) + set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access); + else + forget_cached_acl(inode, ACL_TYPE_ACCESS); - case ACL_TYPE_DEFAULT: - acl = res.acl_default; - res.acl_default = NULL; + if (res.mask & NFS_DFACL) + set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default); + else + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + nfs_free_fattr(res.fattr); + if (type == ACL_TYPE_ACCESS) { + posix_acl_release(res.acl_default); + return res.acl_access; + } else { + posix_acl_release(res.acl_access); + return res.acl_default; } getout: posix_acl_release(res.acl_access); posix_acl_release(res.acl_default); nfs_free_fattr(res.fattr); - - if (status != 0) { - posix_acl_release(acl); - acl = ERR_PTR(status); - } - return acl; + return ERR_PTR(status); } -static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) +int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) { struct nfs_server *server = NFS_SERVER(inode); struct nfs_fattr *fattr; @@ -353,7 +177,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, switch (status) { case 0: status = nfs_refresh_inode(inode, fattr); - nfs3_cache_acls(inode, acl, dfacl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: @@ -373,33 +198,27 @@ out: return status; } -int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { struct posix_acl *alloc = NULL, *dfacl = NULL; int status; if (S_ISDIR(inode->i_mode)) { switch(type) { - case ACL_TYPE_ACCESS: - alloc = dfacl = nfs3_proc_getacl(inode, - ACL_TYPE_DEFAULT); - if (IS_ERR(alloc)) - goto fail; - break; - - case ACL_TYPE_DEFAULT: - dfacl = acl; - alloc = acl = nfs3_proc_getacl(inode, - ACL_TYPE_ACCESS); - if (IS_ERR(alloc)) - goto fail; - break; - - default: - return -EINVAL; + case ACL_TYPE_ACCESS: + alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(alloc)) + goto fail; + break; + + case ACL_TYPE_DEFAULT: + dfacl = acl; + alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(alloc)) + goto fail; + break; } - } else if (type != ACL_TYPE_ACCESS) - return -EINVAL; + } if (acl == NULL) { alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); @@ -417,24 +236,24 @@ fail: int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, umode_t mode) { - struct posix_acl *dfacl, *acl; - int error = 0; + struct posix_acl *default_acl, *acl; + int error; - dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(dfacl)) { - error = PTR_ERR(dfacl); + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) return (error == -EOPNOTSUPP) ? 0 : error; - } - if (!dfacl) - return 0; - acl = posix_acl_dup(dfacl); - error = posix_acl_create(&acl, GFP_KERNEL, &mode); - if (error < 0) - goto out_release_dfacl; - error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? - dfacl : NULL); - posix_acl_release(acl); -out_release_dfacl: - posix_acl_release(dfacl); + + error = nfs3_proc_setacls(inode, acl, default_acl); + + if (acl) + posix_acl_release(acl); + if (default_acl) + posix_acl_release(default_acl); return error; } + +const struct xattr_handler *nfs3_xattr_handlers[] = { + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, + NULL, +}; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 01b6f6a49d1..aa9bc973f36 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -317,8 +317,8 @@ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call create %pd\n", dentry); @@ -340,7 +340,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.verifier[1] = cpu_to_be32(current->pid); } - sattr->ia_mode &= ~current_umask(); + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; for (;;) { status = nfs3_do_create(dir, dentry, data); @@ -366,7 +368,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, } if (status != 0) - goto out; + goto out_release_acls; /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ @@ -385,9 +387,14 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) - goto out; + goto out_release_acls; } - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply create: %d\n", status); @@ -572,18 +579,20 @@ out: static int nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); - sattr->ia_mode &= ~current_umask(); - data = nfs3_alloc_createdata(); if (data == NULL) goto out; + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR]; data->arg.mkdir.fh = NFS_FH(dir); data->arg.mkdir.name = dentry->d_name.name; @@ -592,9 +601,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = nfs3_do_create(dir, dentry, data); if (status != 0) - goto out; + goto out_release_acls; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply mkdir: %d\n", status); @@ -691,19 +704,21 @@ static int nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { + struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; - umode_t mode = sattr->ia_mode; int status = -ENOMEM; dprintk("NFS call mknod %pd %u:%u\n", dentry, MAJOR(rdev), MINOR(rdev)); - sattr->ia_mode &= ~current_umask(); - data = nfs3_alloc_createdata(); if (data == NULL) goto out; + status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl); + if (status) + goto out; + data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD]; data->arg.mknod.fh = NFS_FH(dir); data->arg.mknod.name = dentry->d_name.name; @@ -731,8 +746,13 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = nfs3_do_create(dir, dentry, data); if (status != 0) - goto out; - status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); + goto out_release_acls; + + status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + +out_release_acls: + posix_acl_release(acl); + posix_acl_release(default_acl); out: nfs3_free_createdata(data); dprintk("NFS reply mknod: %d\n", status); @@ -904,20 +924,28 @@ static const struct inode_operations nfs3_dir_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, +#ifdef CONFIG_NFS_V3_ACL + .listxattr = generic_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif }; static const struct inode_operations nfs3_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, - .listxattr = nfs3_listxattr, - .getxattr = nfs3_getxattr, - .setxattr = nfs3_setxattr, - .removexattr = nfs3_removexattr, +#ifdef CONFIG_NFS_V3_ACL + .listxattr = generic_listxattr, + .getxattr = generic_getxattr, + .setxattr = generic_setxattr, + .removexattr = generic_removexattr, + .get_acl = nfs3_get_acl, + .set_acl = nfs3_set_acl, +#endif }; const struct nfs_rpc_ops nfs_v3_clientops = { @@ -965,7 +993,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .commit_rpc_prepare = nfs3_proc_commit_rpc_prepare, .commit_done = nfs3_commit_done, .lock = nfs3_proc_lock, - .clear_acl_cache = nfs3_forget_cached_acls, + .clear_acl_cache = forget_all_cached_acls, .close_context = nfs_close_context, .have_delegation = nfs3_have_delegation, .return_delegation = nfs3_return_delegation, diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index cc471c72523..d6a98949af1 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -12,6 +12,9 @@ static struct nfs_subversion nfs_v3 = { .rpc_vers = &nfs_version3, .rpc_ops = &nfs_v3_clientops, .sops = &nfs_sops, +#ifdef CONFIG_NFS_V3_ACL + .xattr = nfs3_xattr_handlers, +#endif }; static int __init init_nfs_v3(void) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index b4a160a405c..73d4ecda1e3 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -10,6 +10,7 @@ #include <linux/sunrpc/auth.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/bc_xprt.h> +#include <linux/sunrpc/rpc_pipe_fs.h> #include "internal.h" #include "callback.h" #include "delegation.h" @@ -370,7 +371,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + + error = -EINVAL; + if (gssd_running(clp->cl_net)) + error = nfs_create_rpc_client(clp, timeparms, + RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) @@ -409,13 +414,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - nfs_put_client(clp); - if (clp != old) { - clp->cl_preserve_clid = true; - clp = old; - } - return clp; + if (clp != old) + clp->cl_preserve_clid = true; + nfs_put_client(clp); + return old; error: nfs_mark_client_ready(clp, error); @@ -493,9 +496,10 @@ int nfs40_walk_client_list(struct nfs_client *new, prev = pos; status = nfs_wait_client_init_complete(pos); - spin_lock(&nn->nfs_client_lock); if (status < 0) - continue; + goto out; + status = -NFS4ERR_STALE_CLIENTID; + spin_lock(&nn->nfs_client_lock); } if (pos->cl_cons_state != NFS_CS_READY) continue; @@ -633,7 +637,8 @@ int nfs41_walk_client_list(struct nfs_client *new, } spin_lock(&nn->nfs_client_lock); if (status < 0) - continue; + break; + status = -NFS4ERR_STALE_CLIENTID; } if (pos->cl_cons_state != NFS_CS_READY) continue; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index b86464ba25e..03fd8be8c0c 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -91,10 +91,10 @@ static void filelayout_reset_write(struct nfs_write_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " - "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, data->task.tk_pid, hdr->inode->i_sb->s_id, - (long long)NFS_FILEID(hdr->inode), + (unsigned long long)NFS_FILEID(hdr->inode), data->args.count, (unsigned long long)data->args.offset); @@ -112,10 +112,10 @@ static void filelayout_reset_read(struct nfs_read_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " - "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, data->task.tk_pid, hdr->inode->i_sb->s_id, - (long long)NFS_FILEID(hdr->inode), + (unsigned long long)NFS_FILEID(hdr->inode), data->args.count, (unsigned long long)data->args.offset); @@ -1216,17 +1216,17 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, struct pnfs_commit_bucket *b; int i; - /* NOTE cinfo->lock is NOT held, relying on fact that this is - * only called on single thread per dreq. - * Can't take the lock because need to do pnfs_put_lseg - */ + spin_lock(cinfo->lock); for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { + spin_unlock(cinfo->lock); pnfs_put_lseg(b->wlseg); b->wlseg = NULL; + spin_lock(cinfo->lock); } } cinfo->ds->nwritten = 0; + spin_unlock(cinfo->lock); } static unsigned int diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index c7c295e556e..efac602edb3 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -95,7 +95,7 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) b6 = (struct sockaddr_in6 *)addr2; /* LINKLOCAL addresses must have matching scope_id */ - if (ipv6_addr_scope(&a6->sin6_addr) == + if (ipv6_addr_src_scope(&a6->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && a6->sin6_scope_id != b6->sin6_scope_id) return false; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 15052b81df4..a1965329a12 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7409,9 +7409,9 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; struct nfs4_state *state = NULL; - unsigned long timeo, giveup; + unsigned long timeo, now, giveup; - dprintk("--> %s\n", __func__); + dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); if (!nfs41_sequence_done(task, &lgp->res.seq_res)) goto out; @@ -7419,12 +7419,38 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: goto out; + /* + * NFS4ERR_LAYOUTTRYLATER is a conflict with another client + * (or clients) writing to the same RAID stripe + */ case -NFS4ERR_LAYOUTTRYLATER: + /* + * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall + * existing layout before getting a new one). + */ case -NFS4ERR_RECALLCONFLICT: timeo = rpc_get_timeout(task->tk_client); giveup = lgp->args.timestamp + timeo; - if (time_after(giveup, jiffies)) - task->tk_status = -NFS4ERR_DELAY; + now = jiffies; + if (time_after(giveup, now)) { + unsigned long delay; + + /* Delay for: + * - Not less then NFS4_POLL_RETRY_MIN. + * - One last time a jiffie before we give up + * - exponential backoff (time_now minus start_attempt) + */ + delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, + min((giveup - now - 1), + now - lgp->args.timestamp)); + + dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", + __func__, delay); + rpc_delay(task, delay); + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out; /* Do not call nfs4_async_handle_error() */ + } break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: @@ -7780,10 +7806,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case -NFS4ERR_BADLAYOUT: /* no layout */ case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; - break; case 0: - nfs_post_op_update_inode_force_wcc(data->args.inode, - data->res.fattr); break; default: if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { @@ -7798,6 +7821,8 @@ static void nfs4_layoutcommit_release(void *calldata) struct nfs4_layoutcommit_data *data = calldata; pnfs_cleanup_layoutcommit(data); + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); put_rpccred(data->cred); kfree(data); } @@ -7920,7 +7945,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, switch (err) { case 0: case -NFS4ERR_WRONGSEC: - case -NFS4ERR_NOTSUPP: + case -ENOTSUPP: goto out; default: err = nfs4_handle_exception(server, err, &exception); @@ -7954,7 +7979,7 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, * Fall back on "guess and check" method if * the server doesn't support SECINFO_NO_NAME */ - if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) { + if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { err = nfs4_find_root_sec(server, fhandle, info); goto out_freepage; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 059c01b67a7..e5be72518bd 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1071,7 +1071,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid) /* * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error() */ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) { @@ -1116,7 +1116,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) /* * Increment the seqid if the LOCK/LOCKU succeeded, or * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error() */ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) { diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 65ab0a0ca1c..808f2957441 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -77,17 +77,9 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret = nfs_write_inode(inode, wbc); - if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { - int status; - bool sync = true; - - if (wbc->sync_mode == WB_SYNC_NONE) - sync = false; - - status = pnfs_layoutcommit_inode(inode, sync); - if (status < 0) - return status; - } + if (ret == 0) + ret = pnfs_layoutcommit_inode(inode, + wbc->sync_mode == WB_SYNC_ALL); return ret; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5be2868c02f..8c21d69a9dc 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3097,7 +3097,8 @@ out_overflow: return -EIO; } -static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, + int *nfs_retval) { __be32 *p; uint32_t opnum; @@ -3107,19 +3108,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) if (unlikely(!p)) goto out_overflow; opnum = be32_to_cpup(p++); - if (opnum != expected) { - dprintk("nfs: Server returned operation" - " %d but we issued a request for %d\n", - opnum, expected); - return -EIO; - } + if (unlikely(opnum != expected)) + goto out_bad_operation; nfserr = be32_to_cpup(p); - if (nfserr != NFS_OK) - return nfs4_stat_to_errno(nfserr); - return 0; + if (nfserr == NFS_OK) + *nfs_retval = 0; + else + *nfs_retval = nfs4_stat_to_errno(nfserr); + return true; +out_bad_operation: + dprintk("nfs: Server returned operation" + " %d but we issued a request for %d\n", + opnum, expected); + *nfs_retval = -EREMOTEIO; + return false; out_overflow: print_overflow_msg(__func__, xdr); - return -EIO; + *nfs_retval = -EIO; + return false; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ + int retval; + + __decode_op_hdr(xdr, expected, &retval); + return retval; } /* Dummy routine */ @@ -5001,11 +5015,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) uint32_t savewords, bmlen, i; int status; - status = decode_op_hdr(xdr, OP_OPEN); - if (status != -EIO) - nfs_increment_open_seqid(status, res->seqid); - if (!status) - status = decode_stateid(xdr, &res->stateid); + if (!__decode_op_hdr(xdr, OP_OPEN, &status)) + return status; + nfs_increment_open_seqid(status, res->seqid); + if (status) + return status; + status = decode_stateid(xdr, &res->stateid); if (unlikely(status)) return status; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d75d938d36c..4755858e37a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1790,6 +1790,15 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) } EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); +static void pnfs_clear_layoutcommitting(struct inode *inode) +{ + unsigned long *bitlock = &NFS_I(inode)->flags; + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + /* * There can be multiple RW segments. */ @@ -1807,7 +1816,6 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) { struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(inode)->flags; /* Matched by references in pnfs_set_layoutcommit */ list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { @@ -1815,9 +1823,7 @@ static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *lis pnfs_put_lseg(lseg); } - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + pnfs_clear_layoutcommitting(inode); } void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) @@ -1881,43 +1887,37 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) struct nfs4_layoutcommit_data *data; struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; - int status = 0; + int status; - dprintk("--> %s inode %lu\n", __func__, inode->i_ino); - - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + if (!pnfs_layoutcommit_outstanding(inode)) return 0; - /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ - data = kzalloc(sizeof(*data), GFP_NOFS); - if (!data) { - status = -ENOMEM; - goto out; - } - - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) - goto out_free; + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + status = -EAGAIN; if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { - if (!sync) { - status = -EAGAIN; - goto out_free; - } - status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, - nfs_wait_bit_killable, TASK_KILLABLE); + if (!sync) + goto out; + status = wait_on_bit_lock(&nfsi->flags, + NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, + TASK_KILLABLE); if (status) - goto out_free; + goto out; } - INIT_LIST_HEAD(&data->lseg_list); + status = -ENOMEM; + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) + goto clear_layoutcommitting; + + status = 0; spin_lock(&inode->i_lock); - if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); - spin_unlock(&inode->i_lock); - wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); - goto out_free; - } + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_unlock; + INIT_LIST_HEAD(&data->lseg_list); pnfs_list_write_lseg(inode, &data->lseg_list); end_pos = nfsi->layout->plh_lwb; @@ -1940,8 +1940,11 @@ out: mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; -out_free: +out_unlock: + spin_unlock(&inode->i_lock); kfree(data); +clear_layoutcommitting: + pnfs_clear_layoutcommitting(inode); goto out; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index a4f41810a7f..02379390977 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -359,6 +359,15 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) PNFS_LAYOUTRET_ON_SETATTR; } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 || + test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0; +} + static inline int pnfs_return_layout(struct inode *ino) { struct nfs_inode *nfsi = NFS_I(ino); @@ -515,6 +524,13 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, return false; } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { return NULL; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 31db5c366b8..411aedda14b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -163,9 +163,9 @@ static void nfs_readpage_release(struct nfs_page *req) unlock_page(req->wb_page); - dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", + dprintk("NFS: read done (%s/%Lu %d@%Ld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); @@ -228,11 +228,11 @@ int nfs_initiate_read(struct rpc_clnt *clnt, /* Set up the initial task struct. */ NFS_PROTO(inode)->read_setup(data, &msg); - dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ " + dprintk("NFS: %5u initiated read call (req %s/%llu, %u bytes @ " "offset %llu)\n", data->task.tk_pid, inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), data->args.count, (unsigned long long)data->args.offset); @@ -630,9 +630,9 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, unsigned long npages; int ret = -ESTALE; - dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", + dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c1d548211c3..a44a87268a6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -922,19 +922,20 @@ out: * extend the write to cover the entire page in order to avoid fragmentation * inefficiencies. * - * If the file is opened for synchronous writes or if we have a write delegation - * from the server then we can just skip the rest of the checks. + * If the file is opened for synchronous writes then we can just skip the rest + * of the checks. */ static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) { if (file->f_flags & O_DSYNC) return 0; + if (!nfs_write_pageuptodate(page, inode)) + return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; - if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL || - (inode->i_flock->fl_start == 0 && + if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 && inode->i_flock->fl_end == OFFSET_MAX && - inode->i_flock->fl_type != F_RDLCK))) + inode->i_flock->fl_type != F_RDLCK)) return 1; return 0; } @@ -1013,10 +1014,10 @@ int nfs_initiate_write(struct rpc_clnt *clnt, NFS_PROTO(inode)->write_setup(data, &msg); dprintk("NFS: %5u initiated write call " - "(req %s/%lld, %u bytes @ offset %llu)\n", + "(req %s/%llu, %u bytes @ offset %llu)\n", data->task.tk_pid, inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), data->args.count, (unsigned long long)data->args.offset); @@ -1606,9 +1607,9 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_list_remove_request(req); nfs_clear_page_commit(req->wb_page); - dprintk("NFS: commit (%s/%lld %d@%lld)", + dprintk("NFS: commit (%s/%llu %d@%lld)", req->wb_context->dentry->d_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); if (status < 0) { diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index afd3e0eae64..a812fd1b92a 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -35,7 +35,9 @@ #ifndef LINUX_NFS4_ACL_H #define LINUX_NFS4_ACL_H -#include <linux/posix_acl.h> +struct nfs4_acl; +struct svc_fh; +struct svc_rqst; /* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to * fit in a page: */ @@ -45,13 +47,9 @@ struct nfs4_acl *nfs4_acl_new(int); int nfs4_acl_get_whotype(char *, u32); __be32 nfs4_acl_write_who(int who, __be32 **p, int *len); -#define NFS4_ACL_TYPE_DEFAULT 0x01 -#define NFS4_ACL_DIR 0x02 -#define NFS4_ACL_OWNER 0x04 - -struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *, - struct posix_acl *, unsigned int flags); -int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **, - struct posix_acl **, unsigned int flags); +int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl); +__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl); #endif /* LINUX_NFS4_ACL_H */ diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 95d76dc6c5d..11c1fba2931 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -30,8 +30,9 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) { - svc_fh *fh; struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; __be32 nfserr = 0; dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); @@ -41,6 +42,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); + inode = fh->fh_dentry->d_inode; + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; @@ -50,21 +53,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, goto fail; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ - - struct inode *inode = fh->fh_dentry->d_inode; acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } resp->acl_access = acl; @@ -72,17 +67,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - - acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + acl = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } resp->acl_default = acl; } @@ -103,31 +91,51 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, struct nfsd3_setaclargs *argp, struct nfsd_attrstat *resp) { + struct inode *inode; svc_fh *fh; __be32 nfserr = 0; + int error; dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); fh = fh_copy(&resp->fh, &argp->fh); nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (nfserr) + goto out; - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_ACCESS, argp->acl_access) ); - } - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_DEFAULT, argp->acl_default) ); - } - if (!nfserr) { - nfserr = fh_getattr(fh, &resp->stat); + inode = fh->fh_dentry->d_inode; + if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { + error = -EOPNOTSUPP; + goto out_errno; } + error = fh_want_write(fh); + if (error) + goto out_errno; + + error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + if (error) + goto out_drop_write; + error = inode->i_op->set_acl(inode, argp->acl_default, + ACL_TYPE_DEFAULT); + if (error) + goto out_drop_write; + + fh_drop_write(fh); + + nfserr = fh_getattr(fh, &resp->stat); + +out: /* argp->acl_{access,default} may have been allocated in nfssvc_decode_setaclargs. */ posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); return nfserr; +out_drop_write: + fh_drop_write(fh); +out_errno: + nfserr = nfserrno(error); + goto out; } /* diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 9cbc1a841f8..adc5f1b1dc2 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -29,8 +29,9 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) { - svc_fh *fh; struct posix_acl *acl; + struct inode *inode; + svc_fh *fh; __be32 nfserr = 0; fh = fh_copy(&resp->fh, &argp->fh); @@ -38,26 +39,20 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); + inode = fh->fh_dentry->d_inode; + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); resp->mask = argp->mask; if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { - acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } if (acl == NULL) { /* Solaris returns the inode's minimum ACL. */ - - struct inode *inode = fh->fh_dentry->d_inode; acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); } resp->acl_access = acl; @@ -65,17 +60,10 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { /* Check how Solaris handles requests for the Default ACL of a non-directory! */ - - acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + acl = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) { - int err = PTR_ERR(acl); - - if (err == -ENODATA || err == -EOPNOTSUPP) - acl = NULL; - else { - nfserr = nfserrno(err); - goto fail; - } + nfserr = nfserrno(PTR_ERR(acl)); + goto fail; } resp->acl_default = acl; } @@ -96,21 +84,37 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, struct nfsd3_setaclargs *argp, struct nfsd3_attrstat *resp) { + struct inode *inode; svc_fh *fh; __be32 nfserr = 0; + int error; fh = fh_copy(&resp->fh, &argp->fh); nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR); + if (nfserr) + goto out; - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_ACCESS, argp->acl_access) ); - } - if (!nfserr) { - nfserr = nfserrno( nfsd_set_posix_acl( - fh, ACL_TYPE_DEFAULT, argp->acl_default) ); + inode = fh->fh_dentry->d_inode; + if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { + error = -EOPNOTSUPP; + goto out_errno; } + error = fh_want_write(fh); + if (error) + goto out_errno; + + error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + if (error) + goto out_drop_write; + error = inode->i_op->set_acl(inode, argp->acl_default, + ACL_TYPE_DEFAULT); + +out_drop_write: + fh_drop_write(fh); +out_errno: + nfserr = nfserrno(error); +out: /* argp->acl_{access,default} may have been allocated in nfs3svc_decode_setaclargs. */ posix_acl_release(argp->acl_access); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index eea24c9a561..d3a58714422 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -37,9 +37,14 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> #include <linux/export.h> +#include "nfsfh.h" #include "nfsd.h" #include "acl.h" +#include "vfs.h" +#define NFS4_ACL_TYPE_DEFAULT 0x01 +#define NFS4_ACL_DIR 0x02 +#define NFS4_ACL_OWNER 0x04 /* mode bit translations: */ #define NFS4_READ_MODE (NFS4_ACE_READ_DATA) @@ -131,36 +136,50 @@ static short ace2type(struct nfs4_ace *); static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); -struct nfs4_acl * -nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, - unsigned int flags) +int +nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, + struct nfs4_acl **acl) { - struct nfs4_acl *acl; + struct inode *inode = dentry->d_inode; + int error = 0; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; int size = 0; - if (pacl) { - if (posix_acl_valid(pacl) < 0) - return ERR_PTR(-EINVAL); - size += 2*pacl->a_count; + pacl = get_acl(inode, ACL_TYPE_ACCESS); + if (!pacl) { + pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(pacl)) + return PTR_ERR(pacl); + /* allocate for worst case: one (deny, allow) pair each: */ + size += 2 * pacl->a_count; } - if (dpacl) { - if (posix_acl_valid(dpacl) < 0) - return ERR_PTR(-EINVAL); - size += 2*dpacl->a_count; + + if (S_ISDIR(inode->i_mode)) { + flags = NFS4_ACL_DIR; + dpacl = get_acl(inode, ACL_TYPE_DEFAULT); + if (dpacl) + size += 2 * dpacl->a_count; + } else { + dpacl = NULL; } - /* Allocate for worst case: one (deny, allow) pair each: */ - acl = nfs4_acl_new(size); - if (acl == NULL) - return ERR_PTR(-ENOMEM); + *acl = nfs4_acl_new(size); + if (*acl == NULL) { + error = -ENOMEM; + goto out; + } if (pacl) - _posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT); + _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); if (dpacl) - _posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT); + _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); - return acl; + out: + posix_acl_release(pacl); + posix_acl_release(dpacl); + return error; } struct posix_acl_summary { @@ -720,8 +739,9 @@ static void process_one_v4_ace(struct posix_acl_state *state, } } -int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, - struct posix_acl **dpacl, unsigned int flags) +static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, + struct posix_acl **pacl, struct posix_acl **dpacl, + unsigned int flags) { struct posix_acl_state effective_acl_state, default_acl_state; struct nfs4_ace *ace; @@ -781,6 +801,57 @@ out_estate: return ret; } +__be32 +nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfs4_acl *acl) +{ + __be32 error; + int host_error; + struct dentry *dentry; + struct inode *inode; + struct posix_acl *pacl = NULL, *dpacl = NULL; + unsigned int flags = 0; + + /* Get inode */ + error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); + if (error) + return error; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + if (!inode->i_op->set_acl || !IS_POSIXACL(inode)) + return nfserr_attrnotsupp; + + if (S_ISDIR(inode->i_mode)) + flags = NFS4_ACL_DIR; + + host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); + if (host_error == -EINVAL) + return nfserr_attrnotsupp; + if (host_error < 0) + goto out_nfserr; + + host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS); + if (host_error < 0) + goto out_release; + + if (S_ISDIR(inode->i_mode)) { + host_error = inode->i_op->set_acl(inode, dpacl, + ACL_TYPE_DEFAULT); + } + +out_release: + posix_acl_release(pacl); + posix_acl_release(dpacl); +out_nfserr: + if (host_error == -EOPNOTSUPP) + return nfserr_attrnotsupp; + else + return nfserrno(host_error); +} + + static short ace2type(struct nfs4_ace *ace) { @@ -799,9 +870,6 @@ ace2type(struct nfs4_ace *ace) return -1; } -EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4); -EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix); - struct nfs4_acl * nfs4_acl_new(int n) { @@ -868,7 +936,3 @@ __be32 nfs4_acl_write_who(int who, __be32 **p, int *len) WARN_ON_ONCE(1); return -1; } - -EXPORT_SYMBOL(nfs4_acl_new); -EXPORT_SYMBOL(nfs4_acl_get_whotype); -EXPORT_SYMBOL(nfs4_acl_write_who); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ef76ba63238..82189b208af 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -41,6 +41,7 @@ #include "vfs.h" #include "current_stateid.h" #include "netns.h" +#include "acl.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a41302a0065..017d3cb5e99 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -459,158 +459,7 @@ out: return err; } -#if defined(CONFIG_NFSD_V2_ACL) || \ - defined(CONFIG_NFSD_V3_ACL) || \ - defined(CONFIG_NFSD_V4) -static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) -{ - ssize_t buflen; - ssize_t ret; - - buflen = vfs_getxattr(dentry, key, NULL, 0); - if (buflen <= 0) - return buflen; - - *buf = kmalloc(buflen, GFP_KERNEL); - if (!*buf) - return -ENOMEM; - - ret = vfs_getxattr(dentry, key, *buf, buflen); - if (ret < 0) - kfree(*buf); - return ret; -} -#endif - #if defined(CONFIG_NFSD_V4) -static int -set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) -{ - int len; - size_t buflen; - char *buf = NULL; - int error = 0; - - buflen = posix_acl_xattr_size(pacl->a_count); - buf = kmalloc(buflen, GFP_KERNEL); - error = -ENOMEM; - if (buf == NULL) - goto out; - - len = posix_acl_to_xattr(&init_user_ns, pacl, buf, buflen); - if (len < 0) { - error = len; - goto out; - } - - error = vfs_setxattr(dentry, key, buf, len, 0); -out: - kfree(buf); - return error; -} - -__be32 -nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct nfs4_acl *acl) -{ - __be32 error; - int host_error; - struct dentry *dentry; - struct inode *inode; - struct posix_acl *pacl = NULL, *dpacl = NULL; - unsigned int flags = 0; - - /* Get inode */ - error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR); - if (error) - return error; - - dentry = fhp->fh_dentry; - inode = dentry->d_inode; - if (S_ISDIR(inode->i_mode)) - flags = NFS4_ACL_DIR; - - host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); - if (host_error == -EINVAL) { - return nfserr_attrnotsupp; - } else if (host_error < 0) - goto out_nfserr; - - host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); - if (host_error < 0) - goto out_release; - - if (S_ISDIR(inode->i_mode)) - host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); - -out_release: - posix_acl_release(pacl); - posix_acl_release(dpacl); -out_nfserr: - if (host_error == -EOPNOTSUPP) - return nfserr_attrnotsupp; - else - return nfserrno(host_error); -} - -static struct posix_acl * -_get_posix_acl(struct dentry *dentry, char *key) -{ - void *buf = NULL; - struct posix_acl *pacl = NULL; - int buflen; - - buflen = nfsd_getxattr(dentry, key, &buf); - if (!buflen) - buflen = -ENODATA; - if (buflen <= 0) - return ERR_PTR(buflen); - - pacl = posix_acl_from_xattr(&init_user_ns, buf, buflen); - kfree(buf); - return pacl; -} - -int -nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) -{ - struct inode *inode = dentry->d_inode; - int error = 0; - struct posix_acl *pacl = NULL, *dpacl = NULL; - unsigned int flags = 0; - - pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); - if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) - pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); - if (IS_ERR(pacl)) { - error = PTR_ERR(pacl); - pacl = NULL; - goto out; - } - - if (S_ISDIR(inode->i_mode)) { - dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); - if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) - dpacl = NULL; - else if (IS_ERR(dpacl)) { - error = PTR_ERR(dpacl); - dpacl = NULL; - goto out; - } - flags = NFS4_ACL_DIR; - } - - *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); - if (IS_ERR(*acl)) { - error = PTR_ERR(*acl); - *acl = NULL; - } - out: - posix_acl_release(pacl); - posix_acl_release(dpacl); - return error; -} - /* * NFS junction information is stored in an extended attribute. */ @@ -2258,93 +2107,3 @@ out_nomem: nfsd_racache_shutdown(); return -ENOMEM; } - -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -struct posix_acl * -nfsd_get_posix_acl(struct svc_fh *fhp, int type) -{ - struct inode *inode = fhp->fh_dentry->d_inode; - char *name; - void *value = NULL; - ssize_t size; - struct posix_acl *acl; - - if (!IS_POSIXACL(inode)) - return ERR_PTR(-EOPNOTSUPP); - - switch (type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return ERR_PTR(-EOPNOTSUPP); - } - - size = nfsd_getxattr(fhp->fh_dentry, name, &value); - if (size < 0) - return ERR_PTR(size); - - acl = posix_acl_from_xattr(&init_user_ns, value, size); - kfree(value); - return acl; -} - -int -nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) -{ - struct inode *inode = fhp->fh_dentry->d_inode; - char *name; - void *value = NULL; - size_t size; - int error; - - if (!IS_POSIXACL(inode) || - !inode->i_op->setxattr || !inode->i_op->removexattr) - return -EOPNOTSUPP; - switch(type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return -EOPNOTSUPP; - } - - if (acl && acl->a_count) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_KERNEL); - if (!value) - return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (error < 0) - goto getout; - size = error; - } else - size = 0; - - error = fh_want_write(fhp); - if (error) - goto getout; - if (size) - error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); - else { - if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) - error = 0; - else { - error = vfs_removexattr(fhp->fh_dentry, name); - if (error == -ENODATA) - error = 0; - } - } - fh_drop_write(fhp); - -getout: - kfree(value); - return error; -} -#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index fd8c0cc9c25..fbe90bdb221 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -52,9 +52,6 @@ __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, struct iattr *, int, time_t); int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 -__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *, - struct nfs4_acl *); -int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **); __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); #endif /* CONFIG_NFSD_V4 */ @@ -99,11 +96,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); -int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); -#endif - static inline int fh_want_write(struct svc_fh *fh) { int ret = mnt_want_write(fh->fh_export->ex_path.mnt); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index b44bdb291b8..2b34021948e 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -37,7 +37,26 @@ #include "sufile.h" #include "dat.h" - +/** + * nilfs_ioctl_wrap_copy - wrapping function of get/set metadata info + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @dir: set of direction flags + * @dofunc: concrete function of get/set metadata info + * + * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by means of + * calling dofunc() function on the basis of @argv argument. + * + * Return Value: On success, 0 is returned and requested metadata info + * is copied into userspace. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, struct nilfs_argv *argv, int dir, ssize_t (*dofunc)(struct the_nilfs *, @@ -57,6 +76,14 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, if (argv->v_size > PAGE_SIZE) return -EINVAL; + /* + * Reject pairs of a start item position (argv->v_index) and a + * total count (argv->v_nmembs) which leads position 'pos' to + * overflow by the increment at the end of the loop. + */ + if (argv->v_index > ~(__u64)0 - argv->v_nmembs) + return -EINVAL; + buf = (void *)__get_free_pages(GFP_NOFS, 0); if (unlikely(!buf)) return -ENOMEM; @@ -99,6 +126,9 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_ioctl_getflags - ioctl to support lsattr + */ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) { unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE; @@ -106,6 +136,9 @@ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp) return put_user(flags, (int __user *)argp); } +/** + * nilfs_ioctl_setflags - ioctl to support chattr + */ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, void __user *argp) { @@ -158,11 +191,33 @@ out: return ret; } +/** + * nilfs_ioctl_getversion - get info about a file's version (generation number) + */ static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) { return put_user(inode->i_generation, (int __user *)argp); } +/** + * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot) + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_change_cpmode() function changes mode of + * given checkpoint between checkpoint and snapshot state. This ioctl + * is used in chcp and mkcp utilities. + * + * Return Value: On success, 0 is returned and mode of a checkpoint is + * changed. On error, one of the following negative error codes + * is returned. + * + * %-EPERM - Operation not permitted. + * + * %-EFAULT - Failure during checkpoint mode changing. + */ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -198,6 +253,25 @@ out: return ret; } +/** + * nilfs_ioctl_delete_checkpoint - remove checkpoint + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_delete_checkpoint() function removes + * checkpoint from NILFS2 file system. This ioctl is used in rmcp + * utility. + * + * Return Value: On success, 0 is returned and a checkpoint is + * removed. On error, one of the following negative error codes + * is returned. + * + * %-EPERM - Operation not permitted. + * + * %-EFAULT - Failure during checkpoint removing. + */ static int nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) @@ -229,6 +303,21 @@ out: return ret; } +/** + * nilfs_ioctl_do_get_cpinfo - callback method getting info about checkpoints + * @nilfs: nilfs object + * @posp: pointer on array of checkpoint's numbers + * @flags: checkpoint mode (checkpoint or snapshot) + * @buf: buffer for storing checkponts' info + * @size: size in bytes of one checkpoint info item in array + * @nmembs: number of checkpoints in array (numbers and infos) + * + * Description: nilfs_ioctl_do_get_cpinfo() function returns info about + * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in + * lscp utility and by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_cpinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -242,6 +331,27 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_get_cpstat - get checkpoints statistics + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints. + * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities + * and by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and checkpoints information is + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting checkpoints statistics. + */ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -260,6 +370,21 @@ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_do_get_suinfo - callback method getting segment usage info + * @nilfs: nilfs object + * @posp: pointer on array of segment numbers + * @flags: *not used* + * @buf: buffer for storing suinfo array + * @size: size in bytes of one suinfo item in array + * @nmembs: count of segment numbers and suinfos in array + * + * Description: nilfs_ioctl_do_get_suinfo() function returns segment usage + * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used + * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_suinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -273,6 +398,27 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_get_sustat - get segment usage statistics + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_get_sustat() returns segment usage statistics. + * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities + * and by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and segment usage information is + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting segment usage statistics. + */ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -291,6 +437,21 @@ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_do_get_vinfo - callback method getting virtual blocks info + * @nilfs: nilfs object + * @posp: *not used* + * @flags: *not used* + * @buf: buffer for storing array of nilfs_vinfo structures + * @size: size in bytes of one vinfo item in array + * @nmembs: count of vinfos in array + * + * Description: nilfs_ioctl_do_get_vinfo() function returns information + * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used + * by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_vinfo structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -303,6 +464,21 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, return ret; } +/** + * nilfs_ioctl_do_get_bdescs - callback method getting disk block descriptors + * @nilfs: nilfs object + * @posp: *not used* + * @flags: *not used* + * @buf: buffer for storing array of nilfs_bdesc structures + * @size: size in bytes of one bdesc item in array + * @nmembs: count of bdescs in array + * + * Description: nilfs_ioctl_do_get_bdescs() function returns information + * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl + * is used by nilfs_cleanerd daemon. + * + * Return value: count of nilfs_bdescs structures in output buffer. + */ static ssize_t nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, void *buf, size_t size, size_t nmembs) @@ -329,6 +505,29 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, return nmembs; } +/** + * nilfs_ioctl_get_bdescs - get disk block descriptors + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_do_get_bdescs() function returns information + * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl + * is used by nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned, and disk block descriptors are + * copied into userspace pointer @argp. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during getting disk block descriptors. + */ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -352,6 +551,26 @@ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, return ret; } +/** + * nilfs_ioctl_move_inode_block - prepare data/node block for moving by GC + * @inode: inode object + * @vdesc: descriptor of virtual block number + * @buffers: list of moving buffers + * + * Description: nilfs_ioctl_move_inode_block() function registers data/node + * buffer in the GC pagecache and submit read request. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - Requested block doesn't exist. + * + * %-EEXIST - Blocks conflict is detected. + */ static int nilfs_ioctl_move_inode_block(struct inode *inode, struct nilfs_vdesc *vdesc, struct list_head *buffers) @@ -397,6 +616,19 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, return 0; } +/** + * nilfs_ioctl_move_blocks - move valid inode's blocks during garbage collection + * @sb: superblock object + * @argv: vector of arguments from userspace + * @buf: array of nilfs_vdesc structures + * + * Description: nilfs_ioctl_move_blocks() function reads valid data/node + * blocks that garbage collector specified with the array of nilfs_vdesc + * structures and stores them into page caches of GC inodes. + * + * Return Value: Number of processed nilfs_vdesc structures or + * error code, otherwise. + */ static int nilfs_ioctl_move_blocks(struct super_block *sb, struct nilfs_argv *argv, void *buf) { @@ -462,6 +694,25 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, return ret; } +/** + * nilfs_ioctl_delete_checkpoints - delete checkpoints + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of periods of checkpoints numbers + * + * Description: nilfs_ioctl_delete_checkpoints() function deletes checkpoints + * in the period from p_start to p_end, excluding p_end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: Number of processed nilfs_period structures or + * error code, otherwise. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. + */ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -479,6 +730,24 @@ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, return nmembs; } +/** + * nilfs_ioctl_free_vblocknrs - free virtual block numbers + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of virtual block numbers + * + * Description: nilfs_ioctl_free_vblocknrs() function frees + * the virtual block numbers specified by @buf and @argv->v_nmembs. + * + * Return Value: Number of processed virtual block numbers or + * error code, otherwise. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number have not been allocated. + */ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -490,6 +759,24 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, return (ret < 0) ? ret : nmembs; } +/** + * nilfs_ioctl_mark_blocks_dirty - mark blocks dirty + * @nilfs: nilfs object + * @argv: vector of arguments from userspace + * @buf: array of block descriptors + * + * Description: nilfs_ioctl_mark_blocks_dirty() function marks + * metadata file or data blocks as dirty. + * + * Return Value: Number of processed block descriptors or + * error code, otherwise. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + */ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, struct nilfs_argv *argv, void *buf) { @@ -571,6 +858,20 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, return ret; } +/** + * nilfs_ioctl_clean_segments - clean segments + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_clean_segments() function makes garbage + * collection operation in the environment of requested parameters + * from userspace. The NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by + * nilfs_cleanerd daemon. + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -682,6 +983,33 @@ out: return ret; } +/** + * nilfs_ioctl_sync - make a checkpoint + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * + * Description: nilfs_ioctl_sync() function constructs a logical segment + * for checkpointing. This function guarantees that all modified data + * and metadata are written out to the device when it successfully + * returned. + * + * Return Value: On success, 0 is retured. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { @@ -710,6 +1038,14 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, return 0; } +/** + * nilfs_ioctl_resize - resize NILFS2 volume + * @inode: inode object + * @filp: file object + * @argp: pointer on argument from userspace + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, void __user *argp) { @@ -735,6 +1071,17 @@ out: return ret; } +/** + * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated + * @inode: inode object + * @argp: pointer on argument from userspace + * + * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit + * of segments in bytes and upper limit of segments in bytes. + * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility. + * + * Return Value: On success, 0 is returned or error code, otherwise. + */ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) { struct the_nilfs *nilfs = inode->i_sb->s_fs_info; @@ -767,6 +1114,28 @@ out: return ret; } +/** + * nilfs_ioctl_get_info - wrapping function of get metadata info + * @inode: inode object + * @filp: file object + * @cmd: ioctl's request code + * @argp: pointer on argument from userspace + * @membsz: size of an item in bytes + * @dofunc: concrete function of getting metadata info + * + * Description: nilfs_ioctl_get_info() gets metadata info by means of + * calling dofunc() function. + * + * Return Value: On success, 0 is returned and requested metadata info + * is copied into userspace. On error, one of the following + * negative error codes is returned. + * + * %-EINVAL - Invalid arguments from userspace. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EFAULT - Failure during execution of requested operation. + */ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp, size_t membsz, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9f6b486b6c0..a1a191634ab 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, nilfs_clear_logs(&sci->sc_segbufs); - err = nilfs_segctor_extend_segments(sci, nilfs, nadd); - if (unlikely(err)) - return err; - if (sci->sc_stage.flags & NILFS_CF_SUFREED) { err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, NULL); WARN_ON(err); /* do not happen */ + sci->sc_stage.flags &= ~NILFS_CF_SUFREED; } + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); sci->sc_stage = prev_stage; } diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c index 634a8b717b0..266c2d7d50b 100644 --- a/fs/nls/mac-celtic.c +++ b/fs/nls/mac-celtic.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macceltic(void) diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c index 979e6265ac5..9789c605755 100644 --- a/fs/nls/mac-centeuro.c +++ b/fs/nls/mac-centeuro.c @@ -513,7 +513,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccenteuro(void) diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c index dd3f675911e..bb19e7a07d4 100644 --- a/fs/nls/mac-croatian.c +++ b/fs/nls/mac-croatian.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccroatian(void) diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c index 1112c84dd8b..2a7dea36acb 100644 --- a/fs/nls/mac-cyrillic.c +++ b/fs/nls/mac-cyrillic.c @@ -478,7 +478,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maccyrillic(void) diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c index 2de9158409c..77b00165358 100644 --- a/fs/nls/mac-gaelic.c +++ b/fs/nls/mac-gaelic.c @@ -548,7 +548,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macgaelic(void) diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c index a8631008280..1eccf499e2e 100644 --- a/fs/nls/mac-greek.c +++ b/fs/nls/mac-greek.c @@ -478,7 +478,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macgreek(void) diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c index babe2998d5c..cbd0875c6d6 100644 --- a/fs/nls/mac-iceland.c +++ b/fs/nls/mac-iceland.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_maciceland(void) diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c index 312364f010d..fba8357aaf0 100644 --- a/fs/nls/mac-inuit.c +++ b/fs/nls/mac-inuit.c @@ -513,7 +513,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macinuit(void) diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c index 53ce0809cbd..b6a98a5208c 100644 --- a/fs/nls/mac-roman.c +++ b/fs/nls/mac-roman.c @@ -618,7 +618,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macroman(void) diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c index add6f7a0c66..25547f02363 100644 --- a/fs/nls/mac-romanian.c +++ b/fs/nls/mac-romanian.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macromanian(void) diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c index dffa96d5de0..b5454bc7b7f 100644 --- a/fs/nls/mac-turkish.c +++ b/fs/nls/mac-turkish.c @@ -583,7 +583,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_macturkish(void) diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c index 7020e940f74..a2620650d5e 100644 --- a/fs/nls/nls_ascii.c +++ b/fs/nls/nls_ascii.c @@ -148,7 +148,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_ascii(void) diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index fea6bd5831d..52ccd34b1e7 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -232,13 +232,14 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian, } EXPORT_SYMBOL(utf16s_to_utf8s); -int register_nls(struct nls_table * nls) +int __register_nls(struct nls_table *nls, struct module *owner) { struct nls_table ** tmp = &tables; if (nls->next) return -EBUSY; + nls->owner = owner; spin_lock(&nls_lock); while (*tmp) { if (nls == *tmp) { @@ -252,6 +253,7 @@ int register_nls(struct nls_table * nls) spin_unlock(&nls_lock); return 0; } +EXPORT_SYMBOL(__register_nls); int unregister_nls(struct nls_table * nls) { @@ -538,7 +540,6 @@ struct nls_table *load_nls_default(void) return &default_table; } -EXPORT_SYMBOL(register_nls); EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c index c8471fe78e4..ace3e19d340 100644 --- a/fs/nls/nls_cp1250.c +++ b/fs/nls/nls_cp1250.c @@ -329,7 +329,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1250(void) diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c index 1939b46e772..9273ddfd08a 100644 --- a/fs/nls/nls_cp1251.c +++ b/fs/nls/nls_cp1251.c @@ -283,7 +283,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1251(void) diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c index 8120ae2e091..1caf5dfed85 100644 --- a/fs/nls/nls_cp1255.c +++ b/fs/nls/nls_cp1255.c @@ -365,7 +365,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp1255(void) diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c index ff37a4628ce..7ddb830da3f 100644 --- a/fs/nls/nls_cp437.c +++ b/fs/nls/nls_cp437.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp437(void) diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c index f5576b8be1b..c593f683a0c 100644 --- a/fs/nls/nls_cp737.c +++ b/fs/nls/nls_cp737.c @@ -332,7 +332,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp737(void) diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c index 4905635d1c0..554c863745f 100644 --- a/fs/nls/nls_cp775.c +++ b/fs/nls/nls_cp775.c @@ -301,7 +301,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp775(void) diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c index fe5bdad50e2..56cccd14b40 100644 --- a/fs/nls/nls_cp850.c +++ b/fs/nls/nls_cp850.c @@ -297,7 +297,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp850(void) diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c index ceb1c0166dd..7cdc05ac1d4 100644 --- a/fs/nls/nls_cp852.c +++ b/fs/nls/nls_cp852.c @@ -319,7 +319,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp852(void) diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c index cc7f5fb2e0c..7426eea0566 100644 --- a/fs/nls/nls_cp855.c +++ b/fs/nls/nls_cp855.c @@ -281,7 +281,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp855(void) diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c index e418e198e8d..098309733eb 100644 --- a/fs/nls/nls_cp857.c +++ b/fs/nls/nls_cp857.c @@ -283,7 +283,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp857(void) diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c index a86c97d1aa3..84224478e73 100644 --- a/fs/nls/nls_cp860.c +++ b/fs/nls/nls_cp860.c @@ -346,7 +346,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp860(void) diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c index bd920227acd..dc873e4be09 100644 --- a/fs/nls/nls_cp861.c +++ b/fs/nls/nls_cp861.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp861(void) diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c index e9b68eb3daf..d5263e3c556 100644 --- a/fs/nls/nls_cp862.c +++ b/fs/nls/nls_cp862.c @@ -403,7 +403,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp862(void) diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c index f8a9b07ab4e..051c9832e36 100644 --- a/fs/nls/nls_cp863.c +++ b/fs/nls/nls_cp863.c @@ -363,7 +363,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp863(void) diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c index 8d31f435fc6..97eb1273b2f 100644 --- a/fs/nls/nls_cp864.c +++ b/fs/nls/nls_cp864.c @@ -389,7 +389,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp864(void) diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c index 4bd902fe3ec..11121422852 100644 --- a/fs/nls/nls_cp865.c +++ b/fs/nls/nls_cp865.c @@ -369,7 +369,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp865(void) diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c index bdc7cb39139..ffdcbc3fc38 100644 --- a/fs/nls/nls_cp866.c +++ b/fs/nls/nls_cp866.c @@ -287,7 +287,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp866(void) diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c index 9f283a2b151..3b5a3458935 100644 --- a/fs/nls/nls_cp869.c +++ b/fs/nls/nls_cp869.c @@ -297,7 +297,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp869(void) diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c index 0b3c4886f8c..8dfaa10710f 100644 --- a/fs/nls/nls_cp874.c +++ b/fs/nls/nls_cp874.c @@ -256,7 +256,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp874(void) diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c index 0ffed6f1ceb..67b7398e848 100644 --- a/fs/nls/nls_cp932.c +++ b/fs/nls/nls_cp932.c @@ -7914,7 +7914,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp932(void) diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c index 82770301bc3..c96546cfec9 100644 --- a/fs/nls/nls_cp936.c +++ b/fs/nls/nls_cp936.c @@ -11092,7 +11092,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp936(void) diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c index 8a7a2fe85c6..199171e97aa 100644 --- a/fs/nls/nls_cp949.c +++ b/fs/nls/nls_cp949.c @@ -13927,7 +13927,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp949(void) diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c index ef2536829aa..8e141870820 100644 --- a/fs/nls/nls_cp950.c +++ b/fs/nls/nls_cp950.c @@ -9463,7 +9463,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_cp950(void) diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index 7424929a278..162b3f16035 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -553,7 +553,6 @@ static struct nls_table table = { .charset = "euc-jp", .uni2char = uni2char, .char2uni = char2uni, - .owner = THIS_MODULE, }; static int __init init_nls_euc_jp(void) diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c index 7b951bb5849..69ac020d43b 100644 --- a/fs/nls/nls_iso8859-1.c +++ b/fs/nls/nls_iso8859-1.c @@ -239,7 +239,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_1(void) diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c index c4d52ea9f09..afb3f8f275f 100644 --- a/fs/nls/nls_iso8859-13.c +++ b/fs/nls/nls_iso8859-13.c @@ -267,7 +267,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_13(void) diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c index dc02600c7fe..046370f0b6f 100644 --- a/fs/nls/nls_iso8859-14.c +++ b/fs/nls/nls_iso8859-14.c @@ -323,7 +323,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_14(void) diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c index 3c7dfc832ef..7e34a841a05 100644 --- a/fs/nls/nls_iso8859-15.c +++ b/fs/nls/nls_iso8859-15.c @@ -289,7 +289,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_15(void) diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c index a2d2197e4c7..7dd57118174 100644 --- a/fs/nls/nls_iso8859-2.c +++ b/fs/nls/nls_iso8859-2.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_2(void) diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c index a61e0daa3a8..740b75ec449 100644 --- a/fs/nls/nls_iso8859-3.c +++ b/fs/nls/nls_iso8859-3.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_3(void) diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c index e8ff555483b..8826021e32f 100644 --- a/fs/nls/nls_iso8859-4.c +++ b/fs/nls/nls_iso8859-4.c @@ -290,7 +290,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_4(void) diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c index 4721e893012..7c04057a1ad 100644 --- a/fs/nls/nls_iso8859-5.c +++ b/fs/nls/nls_iso8859-5.c @@ -254,7 +254,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_5(void) diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c index 01a517d6d30..d4a881400d7 100644 --- a/fs/nls/nls_iso8859-6.c +++ b/fs/nls/nls_iso8859-6.c @@ -245,7 +245,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_6(void) diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c index 2d27b93ef19..37b75d825a7 100644 --- a/fs/nls/nls_iso8859-7.c +++ b/fs/nls/nls_iso8859-7.c @@ -299,7 +299,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_7(void) diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c index 694bf070c72..557b98250d3 100644 --- a/fs/nls/nls_iso8859-9.c +++ b/fs/nls/nls_iso8859-9.c @@ -254,7 +254,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_iso8859_9(void) diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c index 43875310540..811f232fccf 100644 --- a/fs/nls/nls_koi8-r.c +++ b/fs/nls/nls_koi8-r.c @@ -305,7 +305,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_r(void) diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c index e7bc1d75c78..a80a741a867 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -55,7 +55,6 @@ static struct nls_table table = { .charset = "koi8-ru", .uni2char = uni2char, .char2uni = char2uni, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_ru(void) diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c index 8c9f0292b5a..7e029e4c188 100644 --- a/fs/nls/nls_koi8-u.c +++ b/fs/nls/nls_koi8-u.c @@ -312,7 +312,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, - .owner = THIS_MODULE, }; static int __init init_nls_koi8_u(void) diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index 0d60a44acac..afcfbc4a14d 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -46,7 +46,6 @@ static struct nls_table table = { .char2uni = char2uni, .charset2lower = identity, /* no conversion */ .charset2upper = identity, - .owner = THIS_MODULE, }; static int __init init_nls_utf8(void) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 1fedd5f7ccc..0b9ff4395e6 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) * events. */ static int dnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *file_name) { struct dnotify_mark *dn_mark; - struct inode *to_tell; struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; - __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; + __u32 test_mask = mask & ~FS_EVENT_ON_CHILD; - BUG_ON(vfsmount_mark); + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return 0; - to_tell = event->to_tell; + BUG_ON(vfsmount_mark); dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark); @@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group, return 0; } -/* - * Given an inode and mask determine if dnotify would be interested in sending - * userspace notification for that pair. - */ -static bool dnotify_should_send_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - /* not a dir, dnotify doesn't care */ - if (!S_ISDIR(inode->i_mode)) - return false; - - return true; -} - static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) { struct dnotify_mark *dn_mark = container_of(fsn_mark, @@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, - .should_send_event = dnotify_should_send_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, }; /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 0c2f9122b26..0e792f5e314 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -9,91 +9,59 @@ #include <linux/types.h> #include <linux/wait.h> -static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new) +#include "fanotify.h" + +static bool should_merge(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - pr_debug("%s: old=%p new=%p\n", __func__, old, new); + struct fanotify_event_info *old, *new; - if (old->to_tell == new->to_tell && - old->data_type == new->data_type && - old->tgid == new->tgid) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_PATH): -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - /* dont merge two permission events */ - if ((old->mask & FAN_ALL_PERM_EVENTS) && - (new->mask & FAN_ALL_PERM_EVENTS)) - return false; -#endif - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - return true; - default: - BUG(); - }; - } + pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); + old = FANOTIFY_E(old_fsn); + new = FANOTIFY_E(new_fsn); + + if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && + old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry) + return true; return false; } /* and the list better be locked by something too! */ -static struct fsnotify_event *fanotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) { - struct fsnotify_event_holder *test_holder; - struct fsnotify_event *test_event = NULL; - struct fsnotify_event *new_event; + struct fsnotify_event *test_event; + bool do_merge = false; pr_debug("%s: list=%p event=%p\n", __func__, list, event); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + /* + * Don't merge a permission event with any other event so that we know + * the event structure we have created in fanotify_handle_event() is the + * one we should check for permission response. + */ + if (event->mask & FAN_ALL_PERM_EVENTS) + return 0; +#endif - list_for_each_entry_reverse(test_holder, list, event_list) { - if (should_merge(test_holder->event, event)) { - test_event = test_holder->event; + list_for_each_entry_reverse(test_event, list, list) { + if (should_merge(test_event, event)) { + do_merge = true; break; } } - if (!test_event) - return NULL; - - fsnotify_get_event(test_event); - - /* if they are exactly the same we are done */ - if (test_event->mask == event->mask) - return test_event; - - /* - * if the refcnt == 2 this is the only queue - * for this event and so we can update the mask - * in place. - */ - if (atomic_read(&test_event->refcnt) == 2) { - test_event->mask |= event->mask; - return test_event; - } - - new_event = fsnotify_clone_event(test_event); - - /* done with test_event */ - fsnotify_put_event(test_event); - - /* couldn't allocate memory, merge was not possible */ - if (unlikely(!new_event)) - return ERR_PTR(-ENOMEM); - - /* build new event and replace it on the list */ - new_event->mask = (test_event->mask | event->mask); - fsnotify_replace_event(test_holder, new_event); + if (!do_merge) + return 0; - /* we hold a reference on new_event from clone_event */ - return new_event; + test_event->mask |= event->mask; + return 1; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static int fanotify_get_response_from_access(struct fsnotify_group *group, - struct fsnotify_event *event) + struct fanotify_event_info *event) { int ret; @@ -106,7 +74,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, return 0; /* userspace responded, convert to something usable */ - spin_lock(&event->lock); switch (event->response) { case FAN_ALLOW: ret = 0; @@ -116,7 +83,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, ret = -EPERM; } event->response = 0; - spin_unlock(&event->lock); pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__, group, event, ret); @@ -125,58 +91,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, } #endif -static int fanotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *fanotify_mark, - struct fsnotify_event *event) -{ - int ret = 0; - struct fsnotify_event *notify_event = NULL; - - BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); - BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); - BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); - BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); - BUILD_BUG_ON(FAN_OPEN != FS_OPEN); - BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); - BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); - BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); - BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); - BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - - notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge); - if (IS_ERR(notify_event)) - return PTR_ERR(notify_event); - -#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (event->mask & FAN_ALL_PERM_EVENTS) { - /* if we merged we need to wait on the new event */ - if (notify_event) - event = notify_event; - ret = fanotify_get_response_from_access(group, event); - } -#endif - - if (notify_event) - fsnotify_put_event(notify_event); - - return ret; -} - -static bool fanotify_should_send_event(struct fsnotify_group *group, - struct inode *to_tell, - struct fsnotify_mark *inode_mark, +static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, - __u32 event_mask, void *data, int data_type) + u32 event_mask, + void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; struct path *path = data; - pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " - "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, - inode_mark, vfsmnt_mark, event_mask, data, data_type); + pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" + " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, + event_mask, data, data_type); /* if we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) @@ -217,6 +142,71 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, return false; } +static int fanotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *fanotify_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) +{ + int ret = 0; + struct fanotify_event_info *event; + struct fsnotify_event *fsn_event; + + BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); + BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); + BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); + BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + + if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data, + data_type)) + return 0; + + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); + + event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); + if (unlikely(!event)) + return -ENOMEM; + + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->tgid = get_pid(task_tgid(current)); + if (data_type == FSNOTIFY_EVENT_PATH) { + struct path *path = data; + event->path = *path; + path_get(&event->path); + } else { + event->path.mnt = NULL; + event->path.dentry = NULL; + } +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + event->response = 0; +#endif + + ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); + if (ret) { + BUG_ON(mask & FAN_ALL_PERM_EVENTS); + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); + ret = 0; + } + +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (mask & FAN_ALL_PERM_EVENTS) { + ret = fanotify_get_response_from_access(group, event); + fsnotify_destroy_event(group, fsn_event); + } +#endif + return ret; +} + static void fanotify_free_group_priv(struct fsnotify_group *group) { struct user_struct *user; @@ -226,10 +216,18 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) free_uid(user); } +static void fanotify_free_event(struct fsnotify_event *fsn_event) +{ + struct fanotify_event_info *event; + + event = FANOTIFY_E(fsn_event); + path_put(&event->path); + put_pid(event->tgid); + kmem_cache_free(fanotify_event_cachep, event); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, - .should_send_event = fanotify_should_send_event, .free_group_priv = fanotify_free_group_priv, - .free_event_priv = NULL, - .freeing_mark = NULL, + .free_event = fanotify_free_event, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h new file mode 100644 index 00000000000..32a2f034fb9 --- /dev/null +++ b/fs/notify/fanotify/fanotify.h @@ -0,0 +1,30 @@ +#include <linux/fsnotify_backend.h> +#include <linux/path.h> +#include <linux/slab.h> + +extern struct kmem_cache *fanotify_event_cachep; + +/* + * Lifetime of the structure differs for normal and permission events. In both + * cases the structure is allocated in fanotify_handle_event(). For normal + * events the structure is freed immediately after reporting it to userspace. + * For permission events we free it only after we receive response from + * userspace. + */ +struct fanotify_event_info { + struct fsnotify_event fse; + /* + * We hold ref to this path so it may be dereferenced at any point + * during this object's lifetime + */ + struct path path; + struct pid *tgid; +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + u32 response; /* userspace answer to question */ +#endif +}; + +static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct fanotify_event_info, fse); +} diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e44cb6427df..b6175fa11bf 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -19,6 +19,7 @@ #include "../../mount.h" #include "../fdinfo.h" +#include "fanotify.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_DEFAULT_MAX_MARKS 8192 @@ -28,11 +29,12 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; static struct kmem_cache *fanotify_response_event_cache __read_mostly; +struct kmem_cache *fanotify_event_cachep __read_mostly; struct fanotify_response_event { struct list_head list; __s32 fd; - struct fsnotify_event *event; + struct fanotify_event_info *event; }; /* @@ -61,8 +63,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, } static int create_fd(struct fsnotify_group *group, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_info *event, + struct file **file) { int client_fd; struct file *new_file; @@ -73,12 +75,6 @@ static int create_fd(struct fsnotify_group *group, if (client_fd < 0) return client_fd; - if (event->data_type != FSNOTIFY_EVENT_PATH) { - WARN_ON(1); - put_unused_fd(client_fd); - return -EINVAL; - } - /* * we need a new file handle for the userspace program so it can read even if it was * originally opened O_WRONLY. @@ -109,23 +105,25 @@ static int create_fd(struct fsnotify_group *group, } static int fill_event_metadata(struct fsnotify_group *group, - struct fanotify_event_metadata *metadata, - struct fsnotify_event *event, - struct file **file) + struct fanotify_event_metadata *metadata, + struct fsnotify_event *fsn_event, + struct file **file) { int ret = 0; + struct fanotify_event_info *event; pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, - group, metadata, event); + group, metadata, fsn_event); *file = NULL; + event = container_of(fsn_event, struct fanotify_event_info, fse); metadata->event_len = FAN_EVENT_METADATA_LEN; metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->reserved = 0; - metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; + metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); - if (unlikely(event->mask & FAN_Q_OVERFLOW)) + if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) metadata->fd = FAN_NOFD; else { metadata->fd = create_fd(group, event, file); @@ -209,7 +207,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, if (!re) return -ENOMEM; - re->event = event; + re->event = FANOTIFY_E(event); re->fd = fd; mutex_lock(&group->fanotify_data.access_mutex); @@ -217,7 +215,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, if (atomic_read(&group->fanotify_data.bypass_perm)) { mutex_unlock(&group->fanotify_data.access_mutex); kmem_cache_free(fanotify_response_event_cache, re); - event->response = FAN_ALLOW; + FANOTIFY_E(event)->response = FAN_ALLOW; return 0; } @@ -273,7 +271,7 @@ out_close_fd: out: #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (event->mask & FAN_ALL_PERM_EVENTS) { - event->response = FAN_DENY; + FANOTIFY_E(event)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); } #endif @@ -321,7 +319,12 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + /* + * Permission events get destroyed after we + * receive response + */ + if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -409,7 +412,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -421,7 +424,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) + list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -888,9 +891,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, { return sys_fanotify_mark(fanotify_fd, flags, #ifdef __BIG_ENDIAN - ((__u64)mask1 << 32) | mask0, -#else ((__u64)mask0 << 32) | mask1, +#else + ((__u64)mask1 << 32) | mask0, #endif dfd, pathname); } @@ -906,6 +909,7 @@ static int __init fanotify_user_setup(void) fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event, SLAB_PANIC); + fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); return 0; } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4bb21d67d9b..1d4e1ea2f37 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_mark *vfsmount_mark, __u32 mask, void *data, int data_is, u32 cookie, - const unsigned char *file_name, - struct fsnotify_event **event) + const unsigned char *file_name) { struct fsnotify_group *group = NULL; __u32 inode_test_mask = 0; @@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell, pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p" " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" - " data=%p data_is=%d cookie=%d event=%p\n", + " data=%p data_is=%d cookie=%d\n", __func__, group, to_tell, mask, inode_mark, inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, - data_is, cookie, *event); + data_is, cookie); if (!inode_test_mask && !vfsmount_test_mask) return 0; - if (group->ops->should_send_event(group, to_tell, inode_mark, - vfsmount_mark, mask, data, - data_is) == false) - return 0; - - if (!*event) { - *event = fsnotify_create_event(to_tell, mask, data, - data_is, file_name, - cookie, GFP_KERNEL); - if (!*event) - return -ENOMEM; - } - return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event); + return group->ops->handle_event(group, to_tell, inode_mark, + vfsmount_mark, mask, data, data_is, + file_name); } /* @@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; - struct fsnotify_event *event = NULL; struct mount *mnt; int idx, ret = 0; /* global tests shouldn't care about events on child only the specific event */ @@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_group > vfsmount_group) { /* handle inode */ - ret = send_to_group(to_tell, inode_mark, NULL, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, inode_mark, NULL, mask, + data, data_is, cookie, file_name); /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, + data, data_is, cookie, file_name); inode_group = NULL; } else { ret = send_to_group(to_tell, inode_mark, vfsmount_mark, - mask, data, data_is, cookie, file_name, - &event); + mask, data, data_is, cookie, + file_name); } if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) @@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, ret = 0; out: srcu_read_unlock(&fsnotify_mark_srcu, idx); - /* - * fsnotify_create_event() took a reference so the event can't be cleaned - * up while we are still trying to add it to lists, drop that one. - */ - if (event) - fsnotify_put_event(event); return ret; } diff --git a/fs/notify/group.c b/fs/notify/group.c index bd2625bd88b..ee674fe2cec 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -99,6 +99,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) INIT_LIST_HEAD(&group->marks_list); group->ops = ops; + fsnotify_init_event(&group->overflow_event, NULL, FS_Q_OVERFLOW); return group; } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index b6642e4de4b..485eef3f440 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -2,11 +2,12 @@ #include <linux/inotify.h> #include <linux/slab.h> /* struct kmem_cache */ -extern struct kmem_cache *event_priv_cachep; - -struct inotify_event_private_data { - struct fsnotify_event_private_data fsnotify_event_priv_data; +struct inotify_event_info { + struct fsnotify_event fse; int wd; + u32 sync_cookie; + int name_len; + char name[]; }; struct inotify_inode_mark { @@ -14,8 +15,18 @@ struct inotify_inode_mark { int wd; }; +static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) +{ + return container_of(fse, struct inotify_event_info, fse); +} + extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); -extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); +extern int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 4216308b81b..d5ee56348bb 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -34,107 +34,89 @@ #include "inotify.h" /* - * Check if 2 events contain the same information. We do not compare private data - * but at this moment that isn't a problem for any know fsnotify listeners. + * Check if 2 events contain the same information. */ -static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) +static bool event_compare(struct fsnotify_event *old_fsn, + struct fsnotify_event *new_fsn) { - if ((old->mask == new->mask) && - (old->to_tell == new->to_tell) && - (old->data_type == new->data_type) && - (old->name_len == new->name_len)) { - switch (old->data_type) { - case (FSNOTIFY_EVENT_INODE): - /* remember, after old was put on the wait_q we aren't - * allowed to look at the inode any more, only thing - * left to check was if the file_name is the same */ - if (!old->name_len || - !strcmp(old->file_name, new->file_name)) - return true; - break; - case (FSNOTIFY_EVENT_PATH): - if ((old->path.mnt == new->path.mnt) && - (old->path.dentry == new->path.dentry)) - return true; - break; - case (FSNOTIFY_EVENT_NONE): - if (old->mask & FS_Q_OVERFLOW) - return true; - else if (old->mask & FS_IN_IGNORED) - return false; - return true; - }; - } + struct inotify_event_info *old, *new; + + if (old_fsn->mask & FS_IN_IGNORED) + return false; + old = INOTIFY_E(old_fsn); + new = INOTIFY_E(new_fsn); + if ((old_fsn->mask == new_fsn->mask) && + (old_fsn->inode == new_fsn->inode) && + (old->name_len == new->name_len) && + (!old->name_len || !strcmp(old->name, new->name))) + return true; return false; } -static struct fsnotify_event *inotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int inotify_merge(struct list_head *list, + struct fsnotify_event *event) { - struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; - /* and the list better be locked by something too */ - spin_lock(&event->lock); - - last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); - last_event = last_holder->event; - if (event_compare(last_event, event)) - fsnotify_get_event(last_event); - else - last_event = NULL; - - spin_unlock(&event->lock); - - return last_event; + last_event = list_entry(list->prev, struct fsnotify_event, list); + return event_compare(last_event, event); } -static int inotify_handle_event(struct fsnotify_group *group, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) +int inotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name) { struct inotify_inode_mark *i_mark; - struct inode *to_tell; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - struct fsnotify_event *added_event; - int wd, ret = 0; + struct inotify_event_info *event; + struct fsnotify_event *fsn_event; + int ret; + int len = 0; + int alloc_len = sizeof(struct inotify_event_info); BUG_ON(vfsmount_mark); - pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group, - event, event->to_tell, event->mask); + if ((inode_mark->mask & FS_EXCL_UNLINK) && + (data_type == FSNOTIFY_EVENT_PATH)) { + struct path *path = data; + + if (d_unlinked(path->dentry)) + return 0; + } + if (file_name) { + len = strlen(file_name); + alloc_len += len + 1; + } - to_tell = event->to_tell; + pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, + mask); i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - wd = i_mark->wd; - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); - if (unlikely(!event_priv)) + event = kmalloc(alloc_len, GFP_KERNEL); + if (unlikely(!event)) return -ENOMEM; - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = wd; - - added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge); - if (added_event) { - inotify_free_event_priv(fsn_event_priv); - if (!IS_ERR(added_event)) - fsnotify_put_event(added_event); - else - ret = PTR_ERR(added_event); + fsn_event = &event->fse; + fsnotify_init_event(fsn_event, inode, mask); + event->wd = i_mark->wd; + event->name_len = len; + if (len) + strcpy(event->name, file_name); + + ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); + if (ret) { + /* Our event wasn't used in the end. Free it. */ + fsnotify_destroy_event(group, fsn_event); } if (inode_mark->mask & IN_ONESHOT) fsnotify_destroy_mark(inode_mark, group); - return ret; + return 0; } static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) @@ -142,22 +124,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify inotify_ignored_and_remove_idr(fsn_mark, group); } -static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - if ((inode_mark->mask & FS_EXCL_UNLINK) && - (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; - - if (d_unlinked(path->dentry)) - return false; - } - - return true; -} - /* * This is NEVER supposed to be called. Inotify marks should either have been * removed from the idr when the watch was removed or in the @@ -202,22 +168,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group) free_uid(group->inotify_data.user); } -void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) +static void inotify_free_event(struct fsnotify_event *fsn_event) { - struct inotify_event_private_data *event_priv; - - - event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - - fsnotify_put_group(fsn_event_priv->group); - kmem_cache_free(event_priv_cachep, event_priv); + kfree(INOTIFY_E(fsn_event)); } const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, - .should_send_event = inotify_should_send_event, .free_group_priv = inotify_free_group_priv, - .free_event_priv = inotify_free_event_priv, + .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, }; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 60f954a891a..497395c8274 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly; static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; -struct kmem_cache *event_priv_cachep __read_mostly; #ifdef CONFIG_SYSCTL @@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) return ret; } +static int round_event_name_len(struct fsnotify_event *fsn_event) +{ + struct inotify_event_info *event; + + event = INOTIFY_E(fsn_event); + if (!event->name_len) + return 0; + return roundup(event->name_len + 1, sizeof(struct inotify_event)); +} + /* * Get an inotify_kernel_event if one exists and is small * enough to fit in "count". Return an error pointer if @@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - if (event->name_len) - event_size += roundup(event->name_len + 1, event_size); - + event_size += round_event_name_len(event); if (event_size > count) return ERR_PTR(-EINVAL); @@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, * buffer we had in "get_one_event()" above. */ static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *event, + struct fsnotify_event *fsn_event, char __user *buf) { struct inotify_event inotify_event; - struct fsnotify_event_private_data *fsn_priv; - struct inotify_event_private_data *priv; + struct inotify_event_info *event; size_t event_size = sizeof(struct inotify_event); - size_t name_len = 0; - - pr_debug("%s: group=%p event=%p\n", __func__, group, event); + size_t name_len; + size_t pad_name_len; - /* we get the inotify watch descriptor from the event private data */ - spin_lock(&event->lock); - fsn_priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - - if (!fsn_priv) - inotify_event.wd = -1; - else { - priv = container_of(fsn_priv, struct inotify_event_private_data, - fsnotify_event_priv_data); - inotify_event.wd = priv->wd; - inotify_free_event_priv(fsn_priv); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + event = INOTIFY_E(fsn_event); + name_len = event->name_len; /* - * round up event->name_len so it is a multiple of event_size + * round up name length so it is a multiple of event_size * plus an extra byte for the terminating '\0'. */ - if (event->name_len) - name_len = roundup(event->name_len + 1, event_size); - inotify_event.len = name_len; - - inotify_event.mask = inotify_mask_to_arg(event->mask); + pad_name_len = round_event_name_len(fsn_event); + inotify_event.len = pad_name_len; + inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); + inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; /* send the main event */ @@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, /* * fsnotify only stores the pathname, so here we have to send the pathname * and then pad that pathname out to a multiple of sizeof(inotify_event) - * with zeros. I get my zeros from the nul_inotify_event. + * with zeros. */ - if (name_len) { - unsigned int len_to_zero = name_len - event->name_len; + if (pad_name_len) { /* copy the path name */ - if (copy_to_user(buf, event->file_name, event->name_len)) + if (copy_to_user(buf, event->name, name_len)) return -EFAULT; - buf += event->name_len; + buf += name_len; /* fill userspace with 0's */ - if (clear_user(buf, len_to_zero)) + if (clear_user(buf, pad_name_len - name_len)) return -EFAULT; - buf += len_to_zero; - event_size += name_len; + event_size += pad_name_len; } return event_size; @@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); - fsnotify_put_event(kevent); + fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; @@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; - struct fsnotify_event_holder *holder; - struct fsnotify_event *event; + struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; @@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case FIONREAD: mutex_lock(&group->notification_mutex); - list_for_each_entry(holder, &group->notification_list, event_list) { - event = holder->event; + list_for_each_entry(fsn_event, &group->notification_list, + list) { send_len += sizeof(struct inotify_event); - if (event->name_len) - send_len += roundup(event->name_len + 1, - sizeof(struct inotify_event)); + send_len += round_event_name_len(fsn_event); } mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; - struct fsnotify_event *ignored_event, *notify_event; - struct inotify_event_private_data *event_priv; - struct fsnotify_event_private_data *fsn_event_priv; - int ret; - - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); - - ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_NOFS); - if (!ignored_event) - goto skip_send_ignore; - - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); - if (unlikely(!event_priv)) - goto skip_send_ignore; - - fsn_event_priv = &event_priv->fsnotify_event_priv_data; - - fsnotify_get_group(group); - fsn_event_priv->group = group; - event_priv->wd = i_mark->wd; - - notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL); - if (notify_event) { - if (IS_ERR(notify_event)) - ret = PTR_ERR(notify_event); - else - fsnotify_put_event(notify_event); - inotify_free_event_priv(fsn_event_priv); - } -skip_send_ignore: - /* matches the reference taken when the event was created */ - if (ignored_event) - fsnotify_put_event(ignored_event); + /* Queue ignore event for the watch */ + inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, + NULL, FSNOTIFY_EVENT_NONE, NULL); + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); @@ -836,7 +794,6 @@ static int __init inotify_user_setup(void) BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); - event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); inotify_max_queued_events = 16384; inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 7b51b05f160..18b3c4427dc 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -48,15 +48,6 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" -static struct kmem_cache *fsnotify_event_cachep; -static struct kmem_cache *fsnotify_event_holder_cachep; -/* - * This is a magic event we send when the q is too full. Since it doesn't - * hold real event information we just keep one system wide and use it any time - * it is needed. It's refcnt is set 1 at kernel init time and will never - * get set to 0 so it will never get 'freed' - */ -static struct fsnotify_event *q_overflow_event; static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); /** @@ -76,186 +67,72 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) return list_empty(&group->notification_list) ? true : false; } -void fsnotify_get_event(struct fsnotify_event *event) -{ - atomic_inc(&event->refcnt); -} - -void fsnotify_put_event(struct fsnotify_event *event) +void fsnotify_destroy_event(struct fsnotify_group *group, + struct fsnotify_event *event) { - if (!event) + /* Overflow events are per-group and we don't want to free them */ + if (!event || event->mask == FS_Q_OVERFLOW) return; - if (atomic_dec_and_test(&event->refcnt)) { - pr_debug("%s: event=%p\n", __func__, event); - - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_put(&event->path); - - BUG_ON(!list_empty(&event->private_data_list)); - - kfree(event->file_name); - put_pid(event->tgid); - kmem_cache_free(fsnotify_event_cachep, event); - } -} - -struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) -{ - return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); -} - -void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) -{ - if (holder) - kmem_cache_free(fsnotify_event_holder_cachep, holder); -} - -/* - * Find the private data that the group previously attached to this event when - * the group added the event to the notification queue (fsnotify_add_notify_event) - */ -struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) -{ - struct fsnotify_event_private_data *lpriv; - struct fsnotify_event_private_data *priv = NULL; - - assert_spin_locked(&event->lock); - - list_for_each_entry(lpriv, &event->private_data_list, event_list) { - if (lpriv->group == group) { - priv = lpriv; - list_del(&priv->event_list); - break; - } - } - return priv; + group->ops->free_event(event); } /* * Add an event to the group notification queue. The group can later pull this - * event off the queue to deal with. If the event is successfully added to the - * group's notification queue, a reference is taken on event. + * event off the queue to deal with. The function returns 0 if the event was + * added to the queue, 1 if the event was merged with some other queued event. */ -struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, - struct fsnotify_event_private_data *priv, - struct fsnotify_event *(*merge)(struct list_head *, - struct fsnotify_event *)) +int fsnotify_add_notify_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct list_head *, + struct fsnotify_event *)) { - struct fsnotify_event *return_event = NULL; - struct fsnotify_event_holder *holder = NULL; + int ret = 0; struct list_head *list = &group->notification_list; - pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv); - - /* - * There is one fsnotify_event_holder embedded inside each fsnotify_event. - * Check if we expect to be able to use that holder. If not alloc a new - * holder. - * For the overflow event it's possible that something will use the in - * event holder before we get the lock so we may need to jump back and - * alloc a new holder, this can't happen for most events... - */ - if (!list_empty(&event->holder.event_list)) { -alloc_holder: - holder = fsnotify_alloc_event_holder(); - if (!holder) - return ERR_PTR(-ENOMEM); - } + pr_debug("%s: group=%p event=%p\n", __func__, group, event); mutex_lock(&group->notification_mutex); if (group->q_len >= group->max_events) { - event = q_overflow_event; - - /* - * we need to return the overflow event - * which means we need a ref - */ - fsnotify_get_event(event); - return_event = event; - - /* sorry, no private data on the overflow event */ - priv = NULL; + /* Queue overflow event only if it isn't already queued */ + if (list_empty(&group->overflow_event.list)) + event = &group->overflow_event; + ret = 1; } if (!list_empty(list) && merge) { - struct fsnotify_event *tmp; - - tmp = merge(list, event); - if (tmp) { + ret = merge(list, event); + if (ret) { mutex_unlock(&group->notification_mutex); - - if (return_event) - fsnotify_put_event(return_event); - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - return tmp; + return ret; } } - spin_lock(&event->lock); - - if (list_empty(&event->holder.event_list)) { - if (unlikely(holder)) - fsnotify_destroy_event_holder(holder); - holder = &event->holder; - } else if (unlikely(!holder)) { - /* between the time we checked above and got the lock the in - * event holder was used, go back and get a new one */ - spin_unlock(&event->lock); - mutex_unlock(&group->notification_mutex); - - if (return_event) { - fsnotify_put_event(return_event); - return_event = NULL; - } - - goto alloc_holder; - } - group->q_len++; - holder->event = event; - - fsnotify_get_event(event); - list_add_tail(&holder->event_list, list); - if (priv) - list_add_tail(&priv->event_list, &event->private_data_list); - spin_unlock(&event->lock); + list_add_tail(&event->list, list); mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); kill_fasync(&group->fsn_fa, SIGIO, POLL_IN); - return return_event; + return ret; } /* - * Remove and return the first event from the notification list. There is a - * reference held on this event since it was on the list. It is the responsibility - * of the caller to drop this reference. + * Remove and return the first event from the notification list. It is the + * responsibility of the caller to destroy the obtained event */ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_holder *holder; BUG_ON(!mutex_is_locked(&group->notification_mutex)); pr_debug("%s: group=%p\n", __func__, group); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - - event = holder->event; - - spin_lock(&event->lock); - holder->event = NULL; - list_del_init(&holder->event_list); - spin_unlock(&event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (holder != &event->holder) - fsnotify_destroy_event_holder(holder); - + event = list_first_entry(&group->notification_list, + struct fsnotify_event, list); + list_del(&event->list); group->q_len--; return event; @@ -266,15 +143,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group */ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) { - struct fsnotify_event *event; - struct fsnotify_event_holder *holder; - BUG_ON(!mutex_is_locked(&group->notification_mutex)); - holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); - event = holder->event; - - return event; + return list_first_entry(&group->notification_list, + struct fsnotify_event, list); } /* @@ -284,181 +156,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; - struct fsnotify_event_private_data *priv; mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_notify_event(group); - /* if they don't implement free_event_priv they better not have attached any */ - if (group->ops->free_event_priv) { - spin_lock(&event->lock); - priv = fsnotify_remove_priv_from_event(group, event); - spin_unlock(&event->lock); - if (priv) - group->ops->free_event_priv(priv); - } - fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ + fsnotify_destroy_event(group, event); } mutex_unlock(&group->notification_mutex); } -static void initialize_event(struct fsnotify_event *event) -{ - INIT_LIST_HEAD(&event->holder.event_list); - atomic_set(&event->refcnt, 1); - - spin_lock_init(&event->lock); - - INIT_LIST_HEAD(&event->private_data_list); -} - -/* - * Caller damn well better be holding whatever mutex is protecting the - * old_holder->event_list and the new_event must be a clean event which - * cannot be found anywhere else in the kernel. - */ -int fsnotify_replace_event(struct fsnotify_event_holder *old_holder, - struct fsnotify_event *new_event) -{ - struct fsnotify_event *old_event = old_holder->event; - struct fsnotify_event_holder *new_holder = &new_event->holder; - - enum event_spinlock_class { - SPINLOCK_OLD, - SPINLOCK_NEW, - }; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event); - - /* - * if the new_event's embedded holder is in use someone - * screwed up and didn't give us a clean new event. - */ - BUG_ON(!list_empty(&new_holder->event_list)); - - spin_lock_nested(&old_event->lock, SPINLOCK_OLD); - spin_lock_nested(&new_event->lock, SPINLOCK_NEW); - - new_holder->event = new_event; - list_replace_init(&old_holder->event_list, &new_holder->event_list); - - spin_unlock(&new_event->lock); - spin_unlock(&old_event->lock); - - /* event == holder means we are referenced through the in event holder */ - if (old_holder != &old_event->holder) - fsnotify_destroy_event_holder(old_holder); - - fsnotify_get_event(new_event); /* on the list take reference */ - fsnotify_put_event(old_event); /* off the list, drop reference */ - - return 0; -} - -struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event) -{ - struct fsnotify_event *event; - - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); - if (!event) - return NULL; - - pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event); - - memcpy(event, old_event, sizeof(*event)); - initialize_event(event); - - if (event->name_len) { - event->file_name = kstrdup(old_event->file_name, GFP_KERNEL); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - } - event->tgid = get_pid(old_event->tgid); - if (event->data_type == FSNOTIFY_EVENT_PATH) - path_get(&event->path); - - return event; -} - /* * fsnotify_create_event - Allocate a new event which will be sent to each * group's handle_event function if the group was interested in this * particular event. * - * @to_tell the inode which is supposed to receive the event (sometimes a + * @inode the inode which is supposed to receive the event (sometimes a * parent of the inode to which the event happened. * @mask what actually happened. * @data pointer to the object which was actually affected * @data_type flag indication if the data is a file, path, inode, nothing... * @name the filename, if available */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const unsigned char *name, - u32 cookie, gfp_t gfp) +void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode, + u32 mask) { - struct fsnotify_event *event; - - event = kmem_cache_zalloc(fsnotify_event_cachep, gfp); - if (!event) - return NULL; - - pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n", - __func__, event, to_tell, mask, data, data_type); - - initialize_event(event); - - if (name) { - event->file_name = kstrdup(name, gfp); - if (!event->file_name) { - kmem_cache_free(fsnotify_event_cachep, event); - return NULL; - } - event->name_len = strlen(event->file_name); - } - - event->tgid = get_pid(task_tgid(current)); - event->sync_cookie = cookie; - event->to_tell = to_tell; - event->data_type = data_type; - - switch (data_type) { - case FSNOTIFY_EVENT_PATH: { - struct path *path = data; - event->path.dentry = path->dentry; - event->path.mnt = path->mnt; - path_get(&event->path); - break; - } - case FSNOTIFY_EVENT_INODE: - event->inode = data; - break; - case FSNOTIFY_EVENT_NONE: - event->inode = NULL; - event->path.dentry = NULL; - event->path.mnt = NULL; - break; - default: - BUG(); - } - + INIT_LIST_HEAD(&event->list); + event->inode = inode; event->mask = mask; - - return event; -} - -static __init int fsnotify_notification_init(void) -{ - fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); - fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); - - q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL, - FSNOTIFY_EVENT_NONE, NULL, 0, - GFP_KERNEL); - if (!q_overflow_event) - panic("unable to allocate fsnotify q_overflow_event\n"); - - return 0; } -subsys_initcall(fsnotify_notification_init); diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f17e58b3298..ce210d4951a 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -38,7 +38,6 @@ ocfs2-objs := \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o \ quota_local.o \ quota_global.o \ xattr.o \ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index b4f788e0ca3..555f4cddefe 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -160,36 +160,6 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, return acl; } - -/* - * Get posix acl. - */ -static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *di_bh = NULL; - struct posix_acl *acl; - int ret; - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return NULL; - - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - acl = ERR_PTR(ret); - return acl; - } - - acl = ocfs2_get_acl_nolock(inode, type, di_bh); - - ocfs2_inode_unlock(inode, 0); - - brelse(di_bh); - - return acl; -} - /* * Helper function to set i_mode in memory and disk. Some call paths * will not have di_bh or a journal handle to pass, in which case it @@ -250,7 +220,7 @@ out: /* * Set the access or default ACL of an inode. */ -static int ocfs2_set_acl(handle_t *handle, +int ocfs2_set_acl(handle_t *handle, struct inode *inode, struct buffer_head *di_bh, int type, @@ -313,6 +283,11 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); +} + struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) { struct ocfs2_super *osb; @@ -334,200 +309,3 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) return acl; } - -int ocfs2_acl_chmod(struct inode *inode) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl; - int ret; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (ret) - return ret; - ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, - acl, NULL, NULL); - posix_acl_release(acl); - return ret; -} - -/* - * Initialize the ACLs of a new inode. If parent directory has default ACL, - * then clone to new inode. Called from ocfs2_mknod. - */ -int ocfs2_init_acl(handle_t *handle, - struct inode *inode, - struct inode *dir, - struct buffer_head *di_bh, - struct buffer_head *dir_bh, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl = NULL; - int ret = 0, ret2; - umode_t mode; - - if (!S_ISLNK(inode->i_mode)) { - if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { - acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, - dir_bh); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) { - mode = inode->i_mode & ~current_umask(); - ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); - if (ret) { - mlog_errno(ret); - goto cleanup; - } - } - } - if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { - if (S_ISDIR(inode->i_mode)) { - ret = ocfs2_set_acl(handle, inode, di_bh, - ACL_TYPE_DEFAULT, acl, - meta_ac, data_ac); - if (ret) - goto cleanup; - } - mode = inode->i_mode; - ret = posix_acl_create(&acl, GFP_NOFS, &mode); - if (ret < 0) - return ret; - - ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); - if (ret2) { - mlog_errno(ret2); - ret = ret2; - goto cleanup; - } - if (ret > 0) { - ret = ocfs2_set_acl(handle, inode, - di_bh, ACL_TYPE_ACCESS, - acl, meta_ac, data_ac); - } - } -cleanup: - posix_acl_release(acl); - return ret; -} - -static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry, - char *list, - size_t list_len, - const char *name, - size_t name_len, - int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry, - char *list, - size_t list_len, - const char *name, - size_t name_len, - int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return 0; - - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - struct posix_acl *acl; - int ret; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ocfs2_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return ret; -} - -static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct posix_acl *acl; - int ret = 0; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return -EOPNOTSUPP; - - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto cleanup; - } - } else - acl = NULL; - - ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); - -cleanup: - posix_acl_release(acl); - return ret; -} - -const struct xattr_handler ocfs2_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = ocfs2_xattr_list_acl_access, - .get = ocfs2_xattr_get_acl, - .set = ocfs2_xattr_set_acl, -}; - -const struct xattr_handler ocfs2_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = ocfs2_xattr_list_acl_default, - .get = ocfs2_xattr_get_acl, - .set = ocfs2_xattr_set_acl, -}; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 071fbd380f2..3fce68d0862 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -27,10 +27,13 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); -extern int ocfs2_acl_chmod(struct inode *); -extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, - struct buffer_head *, struct buffer_head *, - struct ocfs2_alloc_context *, - struct ocfs2_alloc_context *); +int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ocfs2_set_acl(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int type, + struct posix_acl *acl, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac); #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index dc7411fe185..8750ae1b863 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7260,14 +7260,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; minlen = range->minlen >> osb->s_clustersize_bits; - trimmed = 0; - - if (!len) { - range->len = 0; - return 0; - } - if (minlen >= osb->bitmap_cpg) + if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) return -EINVAL; main_bm_inode = ocfs2_get_system_file_inode(osb, @@ -7293,6 +7287,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out_unlock; } + len = range->len >> osb->s_clustersize_bits; if (start + len > le32_to_cpu(main_bm->i_clusters)) len = le32_to_cpu(main_bm->i_clusters) - start; @@ -7307,6 +7302,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); last_bit = osb->bitmap_cpg; + trimmed = 0; for (group = first_group; group <= last_group;) { if (first_bit + len >= osb->bitmap_cpg) last_bit = osb->bitmap_cpg; diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index bc8c5e7d860..1aefc0350ec 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ - quorum.o tcp.o netdebug.o ver.o + quorum.o tcp.o netdebug.o diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb240647ca5..441c84e169e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -29,7 +29,6 @@ #include "heartbeat.h" #include "masklog.h" #include "sys.h" -#include "ver.h" /* for now we operate under the assertion that there can be only one * cluster active at a time. Changing this will require trickling @@ -945,8 +944,6 @@ static int __init init_o2nm(void) { int ret = -1; - cluster_print_version(); - ret = o2hb_init(); if (ret) goto out; @@ -984,6 +981,7 @@ out: MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster management"); module_init(init_o2nm) module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c deleted file mode 100644 index a56eee6abad..00000000000 --- a/fs/ocfs2/cluster/ver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define CLUSTER_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION - -void cluster_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h deleted file mode 100644 index 32554c3382c..00000000000 --- a/fs/ocfs2/cluster/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef O2CLUSTER_VER_H -#define O2CLUSTER_VER_H - -void cluster_print_version(void); - -#endif /* O2CLUSTER_VER_H */ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index c8a044efbb1..bd1aab1f49a 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ - dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8b3382abf84..33660a4a52f 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -43,8 +43,6 @@ #include "dlmdomain.h" #include "dlmdebug.h" -#include "dlmver.h" - #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" @@ -2328,8 +2326,6 @@ static int __init dlm_init(void) { int status; - dlm_print_version(); - status = dlm_init_mle_cache(); if (status) { mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); @@ -2379,6 +2375,7 @@ static void __exit dlm_exit (void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 Distributed Lock Management"); module_init(dlm_init); module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c deleted file mode 100644 index dfc0da4d158..00000000000 --- a/fs/ocfs2/dlm/dlmver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION - -void dlm_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h deleted file mode 100644 index f674aee77a1..00000000000 --- a/fs/ocfs2/dlm/dlmver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLM_VER_H -#define DLM_VER_H - -void dlm_print_version(void); - -#endif /* DLM_VER_H */ diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index f14be89a670..eed3db8c5b4 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o -ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o +ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index efa2b3d339e..09b7d9dac71 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -49,7 +49,6 @@ #include "stackglue.h" #include "userdlm.h" -#include "dlmfsver.h" #define MLOG_MASK_PREFIX ML_DLMFS #include "cluster/masklog.h" @@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void) int status; int cleanup_inode = 0, cleanup_worker = 0; - dlmfs_print_version(); - status = bdi_init(&dlmfs_backing_dev_info); if (status) return status; @@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void) MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 DLM-Filesystem"); module_init(init_dlmfs_fs) module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c deleted file mode 100644 index a733b3321f8..00000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmfsver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "dlmfsver.h" - -#define DLM_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION - -void dlmfs_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h deleted file mode 100644 index f35eadbed25..00000000000 --- a/fs/ocfs2/dlmfs/dlmfsver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef DLMFS_VER_H -#define DLMFS_VER_H - -void dlmfs_print_version(void); - -#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 3407b2c62b2..19986959d14 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2996,6 +2996,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) /* for now, uuid == domain */ status = ocfs2_cluster_connect(osb->osb_cluster_stack, + osb->osb_cluster_name, + strlen(osb->osb_cluster_name), osb->uuid_str, strlen(osb->uuid_str), &lproto, ocfs2_do_node_down, osb, @@ -3005,7 +3007,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto bail; } - status = ocfs2_cluster_this_node(&osb->node_num); + status = ocfs2_cluster_this_node(conn, &osb->node_num); if (status < 0) { mlog_errno(status); mlog(ML_ERROR, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6fff128cad1..d77d71ead8d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1236,7 +1236,7 @@ bail: dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { - status = ocfs2_acl_chmod(inode); + status = posix_acl_chmod(inode, inode->i_mode); if (status < 0) mlog_errno(status); } @@ -1869,7 +1869,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, } size = sr->l_start + sr->l_len; - if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { + if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 || + cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) { if (sr->l_len <= 0) { ret = -EINVAL; goto out_inode_unlock; @@ -2661,6 +2662,7 @@ const struct inode_operations ocfs2_file_iops = { .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; const struct inode_operations ocfs2_special_file_iops = { @@ -2668,6 +2670,7 @@ const struct inode_operations ocfs2_special_file_iops = { .getattr = ocfs2_getattr, .permission = ocfs2_permission, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; /* diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index fa32ce9b455..8ca3c29accb 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/mount.h> +#include <linux/blkdev.h> #include <linux/compat.h> #include <cluster/masklog.h> @@ -966,15 +967,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FITRIM: { struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; + range.minlen = max_t(u64, q->limits.discard_granularity, + range.minlen); ret = ocfs2_trim_fs(sb, &range); if (ret < 0) return ret; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 631a9821347..64c304d668f 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -561,83 +561,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, - handle_t *handle, - struct buffer_head *di_bh, - u32 num_bits, - u16 chain) -{ - int ret; - u32 tmp_used; - struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - struct ocfs2_chain_list *cl = - (struct ocfs2_chain_list *) &di->id2.i_chain; - - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - - tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); - di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - ocfs2_journal_dirty(handle, di_bh); - -out: - return ret; -} - -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits) -{ - int status; - void *bitmap = bg->bg_bitmap; - int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; - - /* All callers get the descriptor via - * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ - BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); - BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); - - mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, - num_bits); - - if (ocfs2_is_cluster_bitmap(alloc_inode)) - journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - - status = ocfs2_journal_access_gd(handle, - INODE_CACHE(alloc_inode), - group_bh, - journal_type); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - le16_add_cpu(&bg->bg_free_bits_count, -num_bits); - if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; - } - while (num_bits--) - ocfs2_set_bit(bit_off++, bitmap); - - ocfs2_journal_dirty(handle, group_bh); - -bail: - return status; -} - static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, u32 len, int ext_flags) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 4f791f6d27d..f4d609be940 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -230,6 +230,7 @@ static int ocfs2_mknod(struct inode *dir, struct ocfs2_dir_lookup_result lookup = { NULL, }; sigset_t oldset; int did_block_signals = 0; + struct posix_acl *default_acl = NULL, *acl = NULL; trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -331,6 +332,12 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + status = posix_acl_create(dir, &mode, &default_acl, &acl); + if (status) { + mlog_errno(status); + goto leave; + } + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, S_ISDIR(mode), xattr_credits)); @@ -379,8 +386,17 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } - status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, - meta_ac, data_ac); + if (default_acl) { + status = ocfs2_set_acl(handle, inode, new_fe_bh, + ACL_TYPE_DEFAULT, default_acl, + meta_ac, data_ac); + } + if (!status && acl) { + status = ocfs2_set_acl(handle, inode, new_fe_bh, + ACL_TYPE_ACCESS, acl, + meta_ac, data_ac); + } + if (status < 0) { mlog_errno(status); goto leave; @@ -419,6 +435,10 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); if (status < 0 && did_quota_inode) dquot_free_inode(inode); if (handle) @@ -948,7 +968,7 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status && (status != -ENOTEMPTY)) + if (status && (status != -ENOTEMPTY) && (status != -ENOENT)) mlog_errno(status); return status; @@ -2504,4 +2524,5 @@ const struct inode_operations ocfs2_dir_iops = { .removexattr = generic_removexattr, .fiemap = ocfs2_fiemap, .get_acl = ocfs2_iop_get_acl, + .set_acl = ocfs2_iop_set_acl, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3a903470c79..553f53cc73a 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -387,6 +387,7 @@ struct ocfs2_super u8 osb_stackflags; char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; + char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1]; struct ocfs2_cluster_connection *cconn; struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 55767e1ba72..6ba4bcbc479 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -46,6 +46,7 @@ #include <linux/quotaops.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/posix_acl.h> struct ocfs2_cow_context { struct inode *inode; @@ -4268,11 +4269,20 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct inode *inode = old_dentry->d_inode; struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; + struct posix_acl *default_acl, *acl; + umode_t mode; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + mode = inode->i_mode; + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) { + mlog_errno(error); + goto out; + } + + error = ocfs2_create_inode_in_orphan(dir, mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4303,11 +4313,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { error = ocfs2_init_security_and_acl(dir, new_orphan_inode, - &new_dentry->d_name); + &new_dentry->d_name, + default_acl, acl); if (error) mlog_errno(error); } out: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); if (!error) { error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, new_dentry); diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index bf1f8930456..1724d43d3da 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn) return 0; } -static int o2cb_cluster_this_node(unsigned int *node) +static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { int node_num; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 286edf1e231..13a8537d8e8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -23,6 +23,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/reboot.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include "stackglue.h" @@ -102,6 +103,12 @@ #define OCFS2_TEXT_UUID_LEN 32 #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 +#define VERSION_LOCK "version_lock" + +enum ocfs2_connection_type { + WITH_CONTROLD, + NO_CONTROLD +}; /* * ocfs2_live_connection is refcounted because the filesystem and @@ -110,6 +117,13 @@ struct ocfs2_live_connection { struct list_head oc_list; struct ocfs2_cluster_connection *oc_conn; + enum ocfs2_connection_type oc_type; + atomic_t oc_this_node; + int oc_our_slot; + struct dlm_lksb oc_version_lksb; + char oc_lvb[DLM_LVB_LEN]; + struct completion oc_sync_wait; + wait_queue_head_t oc_wait; }; struct ocfs2_control_private { @@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) * mount path. Since the VFS prevents multiple calls to * fill_super(), we can't get dupes here. */ -static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, - struct ocfs2_live_connection **c_ret) +static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn, + struct ocfs2_live_connection *c) { int rc = 0; - struct ocfs2_live_connection *c; - - c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); - if (!c) - return -ENOMEM; mutex_lock(&ocfs2_control_lock); c->oc_conn = conn; - if (atomic_read(&ocfs2_control_opened)) + if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened)) list_add(&c->oc_list, &ocfs2_live_connection_list); else { printk(KERN_ERR @@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, } mutex_unlock(&ocfs2_control_lock); - - if (!rc) - *c_ret = c; - else - kfree(c); - return rc; } @@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, return 0; } +static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + ver->pv_major = pv->pv_major; + ver->pv_minor = pv->pv_minor; +} + +static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb) +{ + struct ocfs2_protocol_version *pv = + (struct ocfs2_protocol_version *)lvb; + /* + * ocfs2_protocol_version has two u8 variables, so we don't + * need any endian conversion. + */ + pv->pv_major = ver->pv_major; + pv->pv_minor = ver->pv_minor; +} + +static void sync_wait_cb(void *arg) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + complete(&lc->oc_sync_wait); +} + +static int sync_unlock(struct ocfs2_cluster_connection *conn, + struct dlm_lksb *lksb, char *name) +{ + int error; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn); + if (error) { + printk(KERN_ERR "%s lkid %x error %d\n", + name, lksb->sb_lkid, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + if (lksb->sb_status != -DLM_EUNLOCK) { + printk(KERN_ERR "%s lkid %x status %d\n", + name, lksb->sb_lkid, lksb->sb_status); + return -1; + } + return 0; +} + +static int sync_lock(struct ocfs2_cluster_connection *conn, + int mode, uint32_t flags, + struct dlm_lksb *lksb, char *name) +{ + int error, status; + struct ocfs2_live_connection *lc = conn->cc_private; + + error = dlm_lock(conn->cc_lockspace, mode, lksb, flags, + name, strlen(name), + 0, sync_wait_cb, conn, NULL); + if (error) { + printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n", + name, lksb->sb_lkid, flags, mode, error); + return error; + } + + wait_for_completion(&lc->oc_sync_wait); + + status = lksb->sb_status; + + if (status && status != -EAGAIN) { + printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n", + name, lksb->sb_lkid, flags, mode, status); + } + + return status; +} + + +static int version_lock(struct ocfs2_cluster_connection *conn, int mode, + int flags) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_lock(conn, mode, flags, + &lc->oc_version_lksb, VERSION_LOCK); +} + +static int version_unlock(struct ocfs2_cluster_connection *conn) +{ + struct ocfs2_live_connection *lc = conn->cc_private; + return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK); +} + +/* get_protocol_version() + * + * To exchange ocfs2 versioning, we use the LVB of the version dlm lock. + * The algorithm is: + * 1. Attempt to take the lock in EX mode (non-blocking). + * 2. If successful (which means it is the first mount), write the + * version number and downconvert to PR lock. + * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after + * taking the PR lock. + */ + +static int get_protocol_version(struct ocfs2_cluster_connection *conn) +{ + int ret; + struct ocfs2_live_connection *lc = conn->cc_private; + struct ocfs2_protocol_version pv; + + running_proto.pv_major = + ocfs2_user_plugin.sp_max_proto.pv_major; + running_proto.pv_minor = + ocfs2_user_plugin.sp_max_proto.pv_minor; + + lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb; + ret = version_lock(conn, DLM_LOCK_EX, + DLM_LKF_VALBLK|DLM_LKF_NOQUEUE); + if (!ret) { + conn->cc_version.pv_major = running_proto.pv_major; + conn->cc_version.pv_minor = running_proto.pv_minor; + version_to_lvb(&running_proto, lc->oc_lvb); + version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + } else if (ret == -EAGAIN) { + ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK); + if (ret) + goto out; + lvb_to_version(lc->oc_lvb, &pv); + + if ((pv.pv_major != running_proto.pv_major) || + (pv.pv_minor > running_proto.pv_minor)) { + ret = -EINVAL; + goto out; + } + + conn->cc_version.pv_major = pv.pv_major; + conn->cc_version.pv_minor = pv.pv_minor; + } +out: + return ret; +} + +static void user_recover_prep(void *arg) +{ +} + +static void user_recover_slot(void *arg, struct dlm_slot *slot) +{ + struct ocfs2_cluster_connection *conn = arg; + printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n", + slot->nodeid, slot->slot); + conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data); + +} + +static void user_recover_done(void *arg, struct dlm_slot *slots, + int num_slots, int our_slot, + uint32_t generation) +{ + struct ocfs2_cluster_connection *conn = arg; + struct ocfs2_live_connection *lc = conn->cc_private; + int i; + + for (i = 0; i < num_slots; i++) + if (slots[i].slot == our_slot) { + atomic_set(&lc->oc_this_node, slots[i].nodeid); + break; + } + + lc->oc_our_slot = our_slot; + wake_up(&lc->oc_wait); +} + +static const struct dlm_lockspace_ops ocfs2_ls_ops = { + .recover_prep = user_recover_prep, + .recover_slot = user_recover_slot, + .recover_done = user_recover_done, +}; + +static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) +{ + version_unlock(conn); + dlm_release_lockspace(conn->cc_lockspace, 2); + conn->cc_lockspace = NULL; + ocfs2_live_connection_drop(conn->cc_private); + conn->cc_private = NULL; + return 0; +} + static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *uninitialized_var(control); - int rc = 0; + struct ocfs2_live_connection *lc; + int rc, ops_rv; BUG_ON(conn == NULL); - rc = ocfs2_live_connection_new(conn, &control); + lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); + if (!lc) { + rc = -ENOMEM; + goto out; + } + + init_waitqueue_head(&lc->oc_wait); + init_completion(&lc->oc_sync_wait); + atomic_set(&lc->oc_this_node, 0); + conn->cc_private = lc; + lc->oc_type = NO_CONTROLD; + + rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, + DLM_LSFL_FS, DLM_LVB_LEN, + &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); + if (rc) + goto out; + + if (ops_rv == -EOPNOTSUPP) { + lc->oc_type = WITH_CONTROLD; + printk(KERN_NOTICE "ocfs2: You seem to be using an older " + "version of dlm_controld and/or ocfs2-tools." + " Please consider upgrading.\n"); + } else if (ops_rv) { + rc = ops_rv; + goto out; + } + conn->cc_lockspace = fsdlm; + + rc = ocfs2_live_connection_attach(conn, lc); if (rc) goto out; + if (lc->oc_type == NO_CONTROLD) { + rc = get_protocol_version(conn); + if (rc) { + printk(KERN_ERR "ocfs2: Could not determine" + " locking version\n"); + user_cluster_disconnect(conn); + goto out; + } + wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0)); + } + /* * running_proto must have been set before we allowed any mounts * to proceed. @@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) if (fs_protocol_compare(&running_proto, &conn->cc_version)) { printk(KERN_ERR "Unable to mount with fs locking protocol version " - "%u.%u because the userspace control daemon has " - "negotiated %u.%u\n", + "%u.%u because negotiated protocol is %u.%u\n", conn->cc_version.pv_major, conn->cc_version.pv_minor, running_proto.pv_major, running_proto.pv_minor); rc = -EPROTO; - ocfs2_live_connection_drop(control); - goto out; - } - - rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN, - NULL, NULL, NULL, &fsdlm); - if (rc) { - ocfs2_live_connection_drop(control); - goto out; + ocfs2_live_connection_drop(lc); + lc = NULL; } - conn->cc_private = control; - conn->cc_lockspace = fsdlm; out: + if (rc && lc) + kfree(lc); return rc; } -static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) -{ - dlm_release_lockspace(conn->cc_lockspace, 2); - conn->cc_lockspace = NULL; - ocfs2_live_connection_drop(conn->cc_private); - conn->cc_private = NULL; - return 0; -} -static int user_cluster_this_node(unsigned int *this_node) +static int user_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *this_node) { int rc; + struct ocfs2_live_connection *lc = conn->cc_private; + + if (lc->oc_type == WITH_CONTROLD) + rc = ocfs2_control_get_this_node(); + else if (lc->oc_type == NO_CONTROLD) + rc = atomic_read(&lc->oc_this_node); + else + rc = -EINVAL; - rc = ocfs2_control_get_this_node(); if (rc < 0) return rc; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index cb7ec0b63dd..1324e6600e5 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -342,8 +344,10 @@ int ocfs2_cluster_connect(const char *stack_name, goto out; } - memcpy(new_conn->cc_name, group, grouplen); + strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; + strlcpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); + new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; @@ -386,8 +390,9 @@ int ocfs2_cluster_connect_agnostic(const char *group, if (cluster_stack_name[0]) stack_name = cluster_stack_name; - return ocfs2_cluster_connect(stack_name, group, grouplen, lproto, - recovery_handler, recovery_data, conn); + return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, + lproto, recovery_handler, recovery_data, + conn); } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); @@ -460,9 +465,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen) } EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); -int ocfs2_cluster_this_node(unsigned int *node) +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node) { - return active_stack->sp_ops->this_node(node); + return active_stack->sp_ops->this_node(conn, node); } EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 1ec56fdb8d0..66334a30cea 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -45,6 +45,9 @@ struct file_lock; */ #define GROUP_NAME_MAX 64 +/* This shadows OCFS2_CLUSTER_NAME_LEN */ +#define CLUSTER_NAME_MAX 16 + /* * ocfs2_protocol_version changes when ocfs2 does something different in @@ -97,8 +100,10 @@ struct ocfs2_locking_protocol { * locking compatibility. */ struct ocfs2_cluster_connection { - char cc_name[GROUP_NAME_MAX]; + char cc_name[GROUP_NAME_MAX + 1]; int cc_namelen; + char cc_cluster_name[CLUSTER_NAME_MAX + 1]; + int cc_cluster_name_len; struct ocfs2_protocol_version cc_version; struct ocfs2_locking_protocol *cc_proto; void (*cc_recovery_handler)(int node_num, void *recovery_data); @@ -152,7 +157,8 @@ struct ocfs2_stack_operations { * ->this_node() returns the cluster's unique identifier for the * local node. */ - int (*this_node)(unsigned int *node); + int (*this_node)(struct ocfs2_cluster_connection *conn, + unsigned int *node); /* * Call the underlying dlm lock function. The ->dlm_lock() @@ -239,6 +245,8 @@ struct ocfs2_stack_plugin { /* Used by the filesystem */ int ocfs2_cluster_connect(const char *stack_name, + const char *cluster_name, + int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, @@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group, int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending); void ocfs2_cluster_hangup(const char *group, int grouplen); -int ocfs2_cluster_this_node(unsigned int *node); +int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, + unsigned int *node); struct ocfs2_lock_res; int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 2c91452c404..47ae2663a6f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, struct ocfs2_suballoc_result *res); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits); static int ocfs2_relink_block_group(handle_t *handle, struct inode *alloc_inode, struct buffer_head *fe_bh, @@ -1343,7 +1337,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, return status; } -static inline int ocfs2_block_group_set_bits(handle_t *handle, +int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, @@ -1388,8 +1382,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, ocfs2_journal_dirty(handle, group_bh); bail: - if (status) - mlog_errno(status); return status; } @@ -1588,7 +1580,7 @@ static int ocfs2_block_group_search(struct inode *inode, return ret; } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, +int ocfs2_alloc_dinode_update_counts(struct inode *inode, handle_t *handle, struct buffer_head *di_bh, u32 num_bits, diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a36d0aa5091..218d8036b3e 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -86,6 +86,18 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, u32 bits_wanted, struct ocfs2_alloc_context **ac); +int ocfs2_alloc_dinode_update_counts(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain); +int ocfs2_block_group_set_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); + int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c41492957aa..49d84f80f36 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -68,7 +68,6 @@ #include "super.h" #include "sysfile.h" #include "uptodate.h" -#include "ver.h" #include "xattr.h" #include "quota.h" #include "refcounttree.h" @@ -90,6 +89,7 @@ static struct dentry *ocfs2_debugfs_root = NULL; MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster file system"); struct mount_options { @@ -1618,8 +1618,6 @@ static int __init ocfs2_init(void) { int status, i; - ocfs2_print_version(); - for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) init_waitqueue_head(&ocfs2__ioend_wq[i]); @@ -1947,11 +1945,15 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_shutdown_local_alloc(osb); - ocfs2_truncate_log_shutdown(osb); - /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); + /* + * During dismount, when it recovers another node it will call + * ocfs2_recover_orphans and queue delayed work osb_truncate_log_wq. + */ + ocfs2_truncate_log_shutdown(osb); + ocfs2_journal_shutdown(osb); ocfs2_sync_blockdev(sb); @@ -2225,10 +2227,9 @@ static int ocfs2_initialize_super(struct super_block *sb, if (ocfs2_clusterinfo_valid(osb)) { osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; - memcpy(osb->osb_cluster_stack, + strlcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, - OCFS2_STACK_LABEL_LEN); - osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + OCFS2_STACK_LABEL_LEN + 1); if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " @@ -2237,6 +2238,9 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } + strlcpy(osb->osb_cluster_name, + OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, + OCFS2_CLUSTER_NAME_LEN + 1); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c deleted file mode 100644 index e2488f4128a..00000000000 --- a/fs/ocfs2/ver.c +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define OCFS2_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION - -void ocfs2_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h deleted file mode 100644 index d7395cb91d2..00000000000 --- a/fs/ocfs2/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef OCFS2_VER_H -#define OCFS2_VER_H - -void ocfs2_print_version(void); - -#endif /* OCFS2_VER_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f0a1326d9bb..185fa3b7f96 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -99,8 +99,8 @@ static struct ocfs2_xattr_def_value_root def_xv = { const struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, - &ocfs2_xattr_acl_access_handler, - &ocfs2_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL @@ -109,9 +109,9 @@ const struct xattr_handler *ocfs2_xattr_handlers[] = { static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] - = &ocfs2_xattr_acl_access_handler, + = &posix_acl_access_xattr_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] - = &ocfs2_xattr_acl_default_handler, + = &posix_acl_default_xattr_handler, [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; @@ -7190,10 +7190,12 @@ out: */ int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr) + const struct qstr *qstr, + struct posix_acl *default_acl, + struct posix_acl *acl) { - int ret = 0; struct buffer_head *dir_bh = NULL; + int ret = 0; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7207,9 +7209,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); - if (ret) - mlog_errno(ret); + if (!ret && default_acl) + ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (!ret && acl) + ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS); ocfs2_inode_unlock(dir, 0); brelse(dir_bh); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 19f134e896a..f10d5b93c36 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -40,8 +40,6 @@ struct ocfs2_security_xattr_info { extern const struct xattr_handler ocfs2_xattr_user_handler; extern const struct xattr_handler ocfs2_xattr_trusted_handler; extern const struct xattr_handler ocfs2_xattr_security_handler; -extern const struct xattr_handler ocfs2_xattr_acl_access_handler; -extern const struct xattr_handler ocfs2_xattr_acl_default_handler; extern const struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); @@ -96,5 +94,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, struct inode *inode, - const struct qstr *qstr); + const struct qstr *qstr, + struct posix_acl *default_acl, + struct posix_acl *acl); #endif /* OCFS2_XATTR_H */ diff --git a/fs/pipe.c b/fs/pipe.c index 0e0752ef271..78fd0d0788d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -663,10 +663,11 @@ out: wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } - if (ret > 0) { + if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { int err = file_update_time(filp); if (err) ret = err; + sb_end_write(file_inode(filp)->i_sb); } return ret; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 8bd2135b7f8..38bae5a0ea2 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1,10 +1,8 @@ /* - * linux/fs/posix_acl.c + * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org> * - * Copyright (C) 2002 by Andreas Gruenbacher <a.gruenbacher@computer.org> - * - * Fixes from William Schumacher incorporated on 15 March 2001. - * (Reported by Charles Bertsch, <CBertsch@microtest.com>). + * Fixes from William Schumacher incorporated on 15 March 2001. + * (Reported by Charles Bertsch, <CBertsch@microtest.com>). */ /* @@ -18,15 +16,112 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> #include <linux/export.h> +#include <linux/user_namespace.h> -#include <linux/errno.h> +struct posix_acl **acl_by_type(struct inode *inode, int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return &inode->i_acl; + case ACL_TYPE_DEFAULT: + return &inode->i_default_acl; + default: + BUG(); + } +} +EXPORT_SYMBOL(acl_by_type); -EXPORT_SYMBOL(posix_acl_init); -EXPORT_SYMBOL(posix_acl_alloc); -EXPORT_SYMBOL(posix_acl_valid); -EXPORT_SYMBOL(posix_acl_equiv_mode); -EXPORT_SYMBOL(posix_acl_from_mode); +struct posix_acl *get_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *acl = ACCESS_ONCE(*p); + if (acl) { + spin_lock(&inode->i_lock); + acl = *p; + if (acl != ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } + return acl; +} +EXPORT_SYMBOL(get_cached_acl); + +struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) +{ + return rcu_dereference(*acl_by_type(inode, type)); +} +EXPORT_SYMBOL(get_cached_acl_rcu); + +void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + rcu_assign_pointer(*p, posix_acl_dup(acl)); + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(set_cached_acl); + +void forget_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p = acl_by_type(inode, type); + struct posix_acl *old; + spin_lock(&inode->i_lock); + old = *p; + *p = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old != ACL_NOT_CACHED) + posix_acl_release(old); +} +EXPORT_SYMBOL(forget_cached_acl); + +void forget_all_cached_acls(struct inode *inode) +{ + struct posix_acl *old_access, *old_default; + spin_lock(&inode->i_lock); + old_access = inode->i_acl; + old_default = inode->i_default_acl; + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; + spin_unlock(&inode->i_lock); + if (old_access != ACL_NOT_CACHED) + posix_acl_release(old_access); + if (old_default != ACL_NOT_CACHED) + posix_acl_release(old_default); +} +EXPORT_SYMBOL(forget_all_cached_acls); + +struct posix_acl *get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + if (!IS_POSIXACL(inode)) + return NULL; + + /* + * A filesystem can force a ACL callback by just never filling the + * ACL cache. But normally you'd fill the cache either at inode + * instantiation time, or on the first ->get_acl call. + * + * If the filesystem doesn't have a get_acl() function at all, we'll + * just create the negative cache entry. + */ + if (!inode->i_op->get_acl) { + set_cached_acl(inode, type, NULL); + return NULL; + } + return inode->i_op->get_acl(inode, type); +} +EXPORT_SYMBOL(get_acl); /* * Init a fresh posix_acl @@ -37,6 +132,7 @@ posix_acl_init(struct posix_acl *acl, int count) atomic_set(&acl->a_refcount, 1); acl->a_count = count; } +EXPORT_SYMBOL(posix_acl_init); /* * Allocate a new ACL with the specified number of entries. @@ -51,6 +147,7 @@ posix_acl_alloc(int count, gfp_t flags) posix_acl_init(acl, count); return acl; } +EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. @@ -78,8 +175,6 @@ posix_acl_valid(const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; - kuid_t prev_uid = INVALID_UID; - kgid_t prev_gid = INVALID_GID; int needs_mask = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { @@ -98,10 +193,6 @@ posix_acl_valid(const struct posix_acl *acl) return -EINVAL; if (!uid_valid(pa->e_uid)) return -EINVAL; - if (uid_valid(prev_uid) && - uid_lte(pa->e_uid, prev_uid)) - return -EINVAL; - prev_uid = pa->e_uid; needs_mask = 1; break; @@ -117,10 +208,6 @@ posix_acl_valid(const struct posix_acl *acl) return -EINVAL; if (!gid_valid(pa->e_gid)) return -EINVAL; - if (gid_valid(prev_gid) && - gid_lte(pa->e_gid, prev_gid)) - return -EINVAL; - prev_gid = pa->e_gid; needs_mask = 1; break; @@ -146,6 +233,7 @@ posix_acl_valid(const struct posix_acl *acl) return 0; return -EINVAL; } +EXPORT_SYMBOL(posix_acl_valid); /* * Returns 0 if the acl can be exactly represented in the traditional @@ -186,6 +274,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } +EXPORT_SYMBOL(posix_acl_equiv_mode); /* * Create an ACL representing the file mode permission bits of an inode. @@ -207,6 +296,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags) acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } +EXPORT_SYMBOL(posix_acl_from_mode); /* * Return 0 if current is granted want access to the inode @@ -338,7 +428,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) /* * Modify the ACL for the chmod syscall. */ -static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) +static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; @@ -384,7 +474,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) } int -posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) +__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; @@ -399,15 +489,15 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) *acl = clone; return err; } -EXPORT_SYMBOL(posix_acl_create); +EXPORT_SYMBOL(__posix_acl_create); int -posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) +__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { - err = posix_acl_chmod_masq(clone, mode); + err = __posix_acl_chmod_masq(clone, mode); if (err) { posix_acl_release(clone); clone = NULL; @@ -417,4 +507,382 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) *acl = clone; return err; } +EXPORT_SYMBOL(__posix_acl_chmod); + +int +posix_acl_chmod(struct inode *inode, umode_t mode) +{ + struct posix_acl *acl; + int ret = 0; + + if (!IS_POSIXACL(inode)) + return 0; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR(acl); + + ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); + if (ret) + return ret; + ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + return ret; +} EXPORT_SYMBOL(posix_acl_chmod); + +int +posix_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl) +{ + struct posix_acl *p; + int ret; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) + goto no_acl; + + p = get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(p)) + return PTR_ERR(p); + + if (!p) { + *mode &= ~current_umask(); + goto no_acl; + } + + *acl = posix_acl_clone(p, GFP_NOFS); + if (!*acl) + return -ENOMEM; + + ret = posix_acl_create_masq(*acl, mode); + if (ret < 0) { + posix_acl_release(*acl); + return -ENOMEM; + } + + if (ret == 0) { + posix_acl_release(*acl); + *acl = NULL; + } + + if (!S_ISDIR(*mode)) { + posix_acl_release(p); + *default_acl = NULL; + } else { + *default_acl = p; + } + return 0; + +no_acl: + *default_acl = NULL; + *acl = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(posix_acl_create); + +/* + * Fix up the uids and gids in posix acl extended attributes in place. + */ +static void posix_acl_fix_xattr_userns( + struct user_namespace *to, struct user_namespace *from, + void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + kuid_t uid; + kgid_t gid; + + if (!value) + return; + if (size < sizeof(posix_acl_xattr_header)) + return; + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return; + + count = posix_acl_xattr_count(size); + if (count < 0) + return; + if (count == 0) + return; + + for (end = entry + count; entry != end; entry++) { + switch(le16_to_cpu(entry->e_tag)) { + case ACL_USER: + uid = make_kuid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kuid(to, uid)); + break; + case ACL_GROUP: + gid = make_kgid(from, le32_to_cpu(entry->e_id)); + entry->e_id = cpu_to_le32(from_kgid(to, gid)); + break; + default: + break; + } + } +} + +void posix_acl_fix_xattr_from_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); +} + +void posix_acl_fix_xattr_to_user(void *value, size_t size) +{ + struct user_namespace *user_ns = current_user_ns(); + if (user_ns == &init_user_ns) + return; + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); +} + +/* + * Convert from extended attribute to in-memory representation. + */ +struct posix_acl * +posix_acl_from_xattr(struct user_namespace *user_ns, + const void *value, size_t size) +{ + posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; + posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; + int count; + struct posix_acl *acl; + struct posix_acl_entry *acl_e; + + if (!value) + return NULL; + if (size < sizeof(posix_acl_xattr_header)) + return ERR_PTR(-EINVAL); + if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) + return ERR_PTR(-EOPNOTSUPP); + + count = posix_acl_xattr_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + acl_e = acl->a_entries; + + for (end = entry + count; entry != end; acl_e++, entry++) { + acl_e->e_tag = le16_to_cpu(entry->e_tag); + acl_e->e_perm = le16_to_cpu(entry->e_perm); + + switch(acl_e->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + break; + + case ACL_USER: + acl_e->e_uid = + make_kuid(user_ns, + le32_to_cpu(entry->e_id)); + if (!uid_valid(acl_e->e_uid)) + goto fail; + break; + case ACL_GROUP: + acl_e->e_gid = + make_kgid(user_ns, + le32_to_cpu(entry->e_id)); + if (!gid_valid(acl_e->e_gid)) + goto fail; + break; + + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL (posix_acl_from_xattr); + +/* + * Convert from in-memory to extended attribute representation. + */ +int +posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + void *buffer, size_t size) +{ + posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; + posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; + int real_size, n; + + real_size = posix_acl_xattr_size(acl->a_count); + if (!buffer) + return real_size; + if (real_size > size) + return -ERANGE; + + ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); + + for (n=0; n < acl->a_count; n++, ext_entry++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; + ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); + ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch(acl_e->e_tag) { + case ACL_USER: + ext_entry->e_id = + cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); + break; + case ACL_GROUP: + ext_entry->e_id = + cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); + break; + default: + ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + break; + } + } + return real_size; +} +EXPORT_SYMBOL (posix_acl_to_xattr); + +static int +posix_acl_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size, int type) +{ + struct posix_acl *acl; + int error; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (S_ISLNK(dentry->d_inode->i_mode)) + return -EOPNOTSUPP; + + acl = get_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + error = posix_acl_to_xattr(&init_user_ns, acl, value, size); + posix_acl_release(acl); + + return error; +} + +static int +posix_acl_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + int ret; + + if (!IS_POSIXACL(inode)) + return -EOPNOTSUPP; + if (!inode->i_op->set_acl) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return value ? -EACCES : 0; + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out; + } + } + + ret = inode->i_op->set_acl(inode, acl, type); +out: + posix_acl_release(acl); + return ret; +} + +static size_t +posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) +{ + const char *xname; + size_t size; + + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + if (S_ISLNK(dentry->d_inode->i_mode)) + return -EOPNOTSUPP; + + if (type == ACL_TYPE_ACCESS) + xname = POSIX_ACL_XATTR_ACCESS; + else + xname = POSIX_ACL_XATTR_DEFAULT; + + size = strlen(xname) + 1; + if (list && size <= list_size) + memcpy(list, xname, size); + return size; +} + +const struct xattr_handler posix_acl_access_xattr_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); + +const struct xattr_handler posix_acl_default_xattr_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = posix_acl_xattr_list, + .get = posix_acl_xattr_get, + .set = posix_acl_xattr_set, +}; +EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); + +int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + + if (type == ACL_TYPE_ACCESS) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return 0; + if (error == 0) + acl = NULL; + } + + inode->i_ctime = CURRENT_TIME; + set_cached_acl(inode, type, acl); + return 0; +} + +int simple_acl_create(struct inode *dir, struct inode *inode) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); + set_cached_acl(inode, ACL_TYPE_ACCESS, acl); + + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + return 0; +} diff --git a/fs/proc/array.c b/fs/proc/array.c index 1bd2077187f..656e401794d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -140,24 +140,15 @@ static const char * const task_state_array[] = { "t (tracing stop)", /* 8 */ "Z (zombie)", /* 16 */ "X (dead)", /* 32 */ - "x (dead)", /* 64 */ - "K (wakekill)", /* 128 */ - "W (waking)", /* 256 */ - "P (parked)", /* 512 */ }; static inline const char *get_task_state(struct task_struct *tsk) { - unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; - const char * const *p = &task_state_array[0]; + unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; - BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); - while (state) { - p++; - state >>= 1; - } - return *p; + return task_state_array[fls(state)]; } static inline void task_state(struct seq_file *m, struct pid_namespace *ns, @@ -453,8 +444,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt += t->min_flt; maj_flt += t->maj_flt; gtime += task_gtime(t); - t = next_thread(t); - } while (t != task); + } while_each_thread(task, t); min_flt += sig->min_flt; maj_flt += sig->maj_flt; diff --git a/fs/proc/base.c b/fs/proc/base.c index 03c8d747be4..51507065263 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1658,13 +1658,18 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) return 0; } +static inline bool proc_inode_is_dead(struct inode *inode) +{ + return !proc_pid(inode)->tasks[PIDTYPE_PID].first; +} + int pid_delete_dentry(const struct dentry *dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; + return proc_inode_is_dead(dentry->d_inode); } const struct dentry_operations pid_dentry_operations = @@ -3092,34 +3097,42 @@ out_no_task: * In the case of a seek we start with the leader and walk nr * threads past it. */ -static struct task_struct *first_tid(struct task_struct *leader, - int tid, int nr, struct pid_namespace *ns) +static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, + struct pid_namespace *ns) { - struct task_struct *pos; + struct task_struct *pos, *task; + unsigned long nr = f_pos; + + if (nr != f_pos) /* 32bit overflow? */ + return NULL; rcu_read_lock(); - /* Attempt to start with the pid of a thread */ - if (tid && (nr > 0)) { + task = pid_task(pid, PIDTYPE_PID); + if (!task) + goto fail; + + /* Attempt to start with the tid of a thread */ + if (tid && nr) { pos = find_task_by_pid_ns(tid, ns); - if (pos && (pos->group_leader == leader)) + if (pos && same_thread_group(pos, task)) goto found; } /* If nr exceeds the number of threads there is nothing todo */ - pos = NULL; - if (nr && nr >= get_nr_threads(leader)) - goto out; + if (nr >= get_nr_threads(task)) + goto fail; /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ - for (pos = leader; nr > 0; --nr) { - pos = next_thread(pos); - if (pos == leader) { - pos = NULL; - goto out; - } - } + pos = task = task->group_leader; + do { + if (!nr--) + goto found; + } while_each_thread(task, pos); +fail: + pos = NULL; + goto out; found: get_task_struct(pos); out: @@ -3152,25 +3165,16 @@ static struct task_struct *next_tid(struct task_struct *start) /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file *file, struct dir_context *ctx) { - struct task_struct *leader = NULL; - struct task_struct *task = get_proc_task(file_inode(file)); + struct inode *inode = file_inode(file); + struct task_struct *task; struct pid_namespace *ns; int tid; - if (!task) - return -ENOENT; - rcu_read_lock(); - if (pid_alive(task)) { - leader = task->group_leader; - get_task_struct(leader); - } - rcu_read_unlock(); - put_task_struct(task); - if (!leader) + if (proc_inode_is_dead(inode)) return -ENOENT; if (!dir_emit_dots(file, ctx)) - goto out; + return 0; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. @@ -3178,7 +3182,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) ns = file->f_dentry->d_sb->s_fs_info; tid = (int)file->f_version; file->f_version = 0; - for (task = first_tid(leader, tid, ctx->pos - 2, ns); + for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { char name[PROC_NUMBUF]; @@ -3194,8 +3198,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) break; } } -out: - put_task_struct(leader); + return 0; } diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c index 82676e3fcd1..cbd82dff7e8 100644 --- a/fs/proc/cmdline.c +++ b/fs/proc/cmdline.c @@ -26,4 +26,4 @@ static int __init proc_cmdline_init(void) proc_create("cmdline", 0, NULL, &cmdline_proc_fops); return 0; } -module_init(proc_cmdline_init); +fs_initcall(proc_cmdline_init); diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index 51942d5abce..290ba85cb90 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -109,4 +109,4 @@ static int __init proc_consoles_init(void) proc_create("consoles", 0, NULL, &proc_consoles_operations); return 0; } -module_init(proc_consoles_init); +fs_initcall(proc_consoles_init); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 5a1e539a234..06f4d31e039 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -21,4 +21,4 @@ static int __init proc_cpuinfo_init(void) proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); return 0; } -module_init(proc_cpuinfo_init); +fs_initcall(proc_cpuinfo_init); diff --git a/fs/proc/devices.c b/fs/proc/devices.c index b14347167c3..50493edc30e 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -67,4 +67,4 @@ static int __init proc_devices_init(void) proc_create("devices", 0, NULL, &proc_devinfo_operations); return 0; } -module_init(proc_devices_init); +fs_initcall(proc_devices_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index cca93b6fb9a..b7f268eb5f4 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -49,8 +49,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) setattr_copy(inode, iattr); mark_inode_dirty(inode); - de->uid = inode->i_uid; - de->gid = inode->i_gid; + proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; return 0; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 28955d4b721..124fc43c709 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -292,16 +292,20 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long) = NULL; + if (use_pde(pde)) { + typeof(proc_reg_get_unmapped_area) *get_area; + + get_area = pde->proc_fops->get_unmapped_area; #ifdef CONFIG_MMU - get_area = current->mm->get_unmapped_area; + if (!get_area) + get_area = current->mm->get_unmapped_area; #endif - if (pde->proc_fops->get_unmapped_area) - get_area = pde->proc_fops->get_unmapped_area; + if (get_area) rv = get_area(file, orig_addr, len, pgoff, flags); + else + rv = orig_addr; unuse_pde(pde); } return rv; diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c index 05029c0e2f2..a352d5703b4 100644 --- a/fs/proc/interrupts.c +++ b/fs/proc/interrupts.c @@ -50,4 +50,4 @@ static int __init proc_interrupts_init(void) proc_create("interrupts", 0, NULL, &proc_interrupts_operations); return 0; } -module_init(proc_interrupts_init); +fs_initcall(proc_interrupts_init); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 5ed0e52d6aa..39e6ef32f0b 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -639,4 +639,4 @@ static int __init proc_kcore_init(void) return 0; } -module_init(proc_kcore_init); +fs_initcall(proc_kcore_init); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bdfabdaefdc..05f8dcdb086 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -61,4 +61,4 @@ static int __init proc_kmsg_init(void) proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); return 0; } -module_init(proc_kmsg_init); +fs_initcall(proc_kmsg_init); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 1afa4dd4cae..aec66e6c206 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -42,4 +42,4 @@ static int __init proc_loadavg_init(void) proc_create("loadavg", 0, NULL, &loadavg_proc_fops); return 0; } -module_init(proc_loadavg_init); +fs_initcall(proc_loadavg_init); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a77d2b29919..136e548d956 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -26,7 +26,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) unsigned long committed; struct vmalloc_info vmi; long cached; + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; int lru; /* @@ -47,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v) for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_page_state(NR_LRU_BASE + lru); + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + * + * Free memory cannot be taken below the low watermark, before the + * system starts swapping. + */ + available = i.freeram - wmark_low; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable swap consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + /* * Tagged format, for easy grepping and expansion. */ seq_printf(m, "MemTotal: %8lu kB\n" "MemFree: %8lu kB\n" + "MemAvailable: %8lu kB\n" "Buffers: %8lu kB\n" "Cached: %8lu kB\n" "SwapCached: %8lu kB\n" @@ -105,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) , K(i.totalram), K(i.freeram), + K(available), K(i.bufferram), K(cached), K(total_swapcache_pages()), @@ -183,4 +220,4 @@ static int __init proc_meminfo_init(void) proc_create("meminfo", 0, NULL, &meminfo_proc_fops); return 0; } -module_init(proc_meminfo_init); +fs_initcall(proc_meminfo_init); diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 5f9bc8a746c..d4a35746cab 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -131,4 +131,4 @@ static int __init proc_nommu_init(void) return 0; } -module_init(proc_nommu_init); +fs_initcall(proc_nommu_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index b8730d9ebae..02174a61031 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -118,10 +118,12 @@ u64 stable_page_flags(struct page *page) /* * PageTransCompound can be true for non-huge compound pages (slab * pages or pages allocated by drivers with __GFP_COMP) because it - * just checks PG_head/PG_tail, so we need to check PageLRU to make - * sure a given page is a thp, not a non-huge compound page. + * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon + * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page) && PageLRU(compound_trans_head(page))) + else if (PageTransCompound(page) && + (PageLRU(compound_trans_head(page)) || + PageAnon(compound_trans_head(page)))) u |= 1 << KPF_THP; /* @@ -217,4 +219,4 @@ static int __init proc_page_init(void) proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); return 0; } -module_init(proc_page_init); +fs_initcall(proc_page_init); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 70779b2fc20..c82dd514784 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -74,9 +74,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, return NULL; if (!strncmp(name, "security-", 9)) - ent->size = 0; /* don't leak number of password chars */ + proc_set_size(ent, 0); /* don't leak number of password chars */ else - ent->size = pp->length; + proc_set_size(ent, pp->length); return ent; } @@ -232,6 +232,7 @@ void __init proc_device_tree_init(void) return; root = of_find_node_by_path("/"); if (root == NULL) { + remove_proc_entry("device-tree", NULL); pr_debug("/proc/device-tree: can't find root\n"); return; } diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c index 62604be9f58..ad8a77f94be 100644 --- a/fs/proc/softirqs.c +++ b/fs/proc/softirqs.c @@ -41,4 +41,4 @@ static int __init proc_softirqs_init(void) proc_create("softirqs", 0, NULL, &proc_softirqs_operations); return 0; } -module_init(proc_softirqs_init); +fs_initcall(proc_softirqs_init); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 1cf86c0e868..6f599c62f0c 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -221,4 +221,4 @@ static int __init proc_stat_init(void) proc_create("stat", 0, NULL, &proc_stat_operations); return 0; } -module_init(proc_stat_init); +fs_initcall(proc_stat_init); diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 06189462590..7141b8d0ca9 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -49,4 +49,4 @@ static int __init proc_uptime_init(void) proc_create("uptime", 0, NULL, &uptime_proc_fops); return 0; } -module_init(proc_uptime_init); +fs_initcall(proc_uptime_init); diff --git a/fs/proc/version.c b/fs/proc/version.c index 76817a60678..d2154eb6d78 100644 --- a/fs/proc/version.c +++ b/fs/proc/version.c @@ -31,4 +31,4 @@ static int __init proc_version_init(void) proc_create("version", 0, NULL, &version_proc_fops); return 0; } -module_init(proc_version_init); +fs_initcall(proc_version_init); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9100d695988..2ca7ba047f0 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1082,7 +1082,7 @@ static int __init vmcore_init(void) proc_vmcore->size = vmcore_size; return 0; } -module_init(vmcore_init) +fs_initcall(vmcore_init); /* Cleanup function for vmcore module. */ void vmcore_cleanup(void) diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 439406e081a..7be26f03a3f 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -234,17 +234,12 @@ static int mounts_open_common(struct inode *inode, struct file *file, rcu_read_lock(); nsp = task_nsproxy(task); - if (!nsp) { + if (!nsp || !nsp->mnt_ns) { rcu_read_unlock(); put_task_struct(task); goto err; } ns = nsp->mnt_ns; - if (!ns) { - rcu_read_unlock(); - put_task_struct(task); - goto err; - } get_mnt_ns(ns); rcu_read_unlock(); task_lock(task); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b8e93a40a5d..78c3c209778 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -443,8 +443,11 @@ int pstore_register(struct pstore_info *psi) pstore_get_records(0); kmsg_dump_register(&pstore_dumper); - pstore_register_console(); - pstore_register_ftrace(); + + if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { + pstore_register_console(); + pstore_register_ftrace(); + } if (pstore_update_ms >= 0) { pstore_timer.expires = jiffies + diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2e8caa62da7..89558810381 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -27,7 +27,6 @@ static const struct super_operations qnx4_sops; -static void qnx4_put_super(struct super_block *sb); static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_destroy_inode(struct inode *inode); static int qnx4_remount(struct super_block *sb, int *flags, char *data); @@ -37,7 +36,6 @@ static const struct super_operations qnx4_sops = { .alloc_inode = qnx4_alloc_inode, .destroy_inode = qnx4_destroy_inode, - .put_super = qnx4_put_super, .statfs = qnx4_statfs, .remount_fs = qnx4_remount, }; @@ -148,18 +146,19 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) * it really _is_ a qnx4 filesystem, and to check the size * of the directory entry. */ -static const char *qnx4_checkroot(struct super_block *sb) +static const char *qnx4_checkroot(struct super_block *sb, + struct qnx4_super_block *s) { struct buffer_head *bh; struct qnx4_inode_entry *rootdir; int rd, rl; int i, j; - if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') + if (s->RootDir.di_fname[0] != '/' || s->RootDir.di_fname[1] != '\0') return "no qnx4 filesystem (no root dir)."; QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); - rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; - rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); + rd = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_blk) - 1; + rl = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_size); for (j = 0; j < rl; j++) { bh = sb_bread(sb, rd + j); /* root dir, first block */ if (bh == NULL) @@ -189,7 +188,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) struct inode *root; const char *errmsg; struct qnx4_sb_info *qs; - int ret = -EINVAL; qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); if (!qs) @@ -198,67 +196,50 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) sb_set_blocksize(s, QNX4_BLOCK_SIZE); + s->s_op = &qnx4_sops; + s->s_magic = QNX4_SUPER_MAGIC; + s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + /* Check the superblock signature. Since the qnx4 code is dangerous, we should leave as quickly as possible if we don't belong here... */ bh = sb_bread(s, 1); if (!bh) { printk(KERN_ERR "qnx4: unable to read the superblock\n"); - goto outnobh; + return -EINVAL; } - if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) { - if (!silent) - printk(KERN_ERR "qnx4: wrong fsid in superblock.\n"); - goto out; - } - s->s_op = &qnx4_sops; - s->s_magic = QNX4_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ - qnx4_sb(s)->sb_buf = bh; - qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data; - /* check before allocating dentries, inodes, .. */ - errmsg = qnx4_checkroot(s); + errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data); + brelse(bh); if (errmsg != NULL) { if (!silent) printk(KERN_ERR "qnx4: %s\n", errmsg); - goto out; + return -EINVAL; } /* does root not have inode number QNX4_ROOT_INO ?? */ root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); if (IS_ERR(root)) { printk(KERN_ERR "qnx4: get inode failed\n"); - ret = PTR_ERR(root); - goto outb; + return PTR_ERR(root); } - ret = -ENOMEM; s->s_root = d_make_root(root); if (s->s_root == NULL) - goto outb; + return -ENOMEM; - brelse(bh); return 0; - - outb: - kfree(qs->BitMap); - out: - brelse(bh); - outnobh: - kfree(qs); - s->s_fs_info = NULL; - return ret; } -static void qnx4_put_super(struct super_block *sb) +static void qnx4_kill_sb(struct super_block *sb) { struct qnx4_sb_info *qs = qnx4_sb(sb); - kfree( qs->BitMap ); - kfree( qs ); - sb->s_fs_info = NULL; - return; + kill_block_super(sb); + if (qs) { + kfree(qs->BitMap); + kfree(qs); + } } static int qnx4_readpage(struct file *file, struct page *page) @@ -409,7 +390,7 @@ static struct file_system_type qnx4_fs_type = { .owner = THIS_MODULE, .name = "qnx4", .mount = qnx4_mount, - .kill_sb = kill_block_super, + .kill_sb = qnx4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("qnx4"); diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h index 34e2d329c97..c9b1be2c164 100644 --- a/fs/qnx4/qnx4.h +++ b/fs/qnx4/qnx4.h @@ -10,8 +10,6 @@ #endif struct qnx4_sb_info { - struct buffer_head *sb_buf; /* superblock buffer */ - struct qnx4_super_block *sb; /* our superblock */ unsigned int Version; /* may be useful */ struct qnx4_inode_entry *BitMap; /* useful */ }; diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 4884ac5ae9b..1e56a4e8cf7 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -30,13 +30,6 @@ #include "internal.h" -const struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, -}; - const struct file_operations ramfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 8d5b438cc18..0b3d8e4cb2f 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -27,13 +27,12 @@ #include "internal.h" static int ramfs_nommu_setattr(struct dentry *, struct iattr *); - -const struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, -}; +static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags); +static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); const struct file_operations ramfs_file_operations = { .mmap = ramfs_nommu_mmap, @@ -197,7 +196,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) * - the pages to be mapped must exist * - the pages be physically contiguous in sequence */ -unsigned long ramfs_nommu_get_unmapped_area(struct file *file, +static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { @@ -256,7 +255,7 @@ out: /* * set up a mapping for shared memory segments */ -int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) +static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { if (!(vma->vm_flags & VM_SHARED)) return -ENOSYS; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 39d14659a8d..d365b1c4eb3 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -43,6 +43,13 @@ static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; +static const struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; + static struct backing_dev_info ramfs_backing_dev_info = { .name = "ramfs", .ra_pages = 0, /* No readahead */ @@ -275,4 +282,4 @@ int __init init_ramfs_fs(void) return err; } -module_init(init_ramfs_fs) +fs_initcall(init_ramfs_fs); diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h index 6b330639b51..a9d8ae88fa1 100644 --- a/fs/ramfs/internal.h +++ b/fs/ramfs/internal.h @@ -10,5 +10,4 @@ */ -extern const struct address_space_operations ramfs_aops; extern const struct inode_operations ramfs_file_inode_operations; diff --git a/fs/read_write.c b/fs/read_write.c index 58e440df1bc..edc5746a902 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -901,10 +901,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, io_fn_t fn; iov_fn_t fnv; - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; - ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, UIO_FASTIOV, iovstack, &iov); if (ret <= 0) @@ -968,9 +964,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen) + compat_ulong_t, vlen) { struct fd f = fdget(fd); ssize_t ret; @@ -1005,9 +1001,9 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return compat_sys_preadv64(fd, vec, vlen, pos); @@ -1035,9 +1031,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, const struct compat_iovec __user *, vec, - unsigned long, vlen) + compat_ulong_t, vlen) { struct fd f = fdget(fd); ssize_t ret; @@ -1072,9 +1068,9 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd, +COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return compat_sys_pwritev64(fd, vec, vlen, pos); diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index f096b80e73d..4a211f5b34b 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -48,18 +48,18 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); +int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int reiserfs_acl_chmod(struct inode *inode); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode); int reiserfs_cache_default_acl(struct inode *dir); -extern const struct xattr_handler reiserfs_posix_acl_default_handler; -extern const struct xattr_handler reiserfs_posix_acl_access_handler; #else #define reiserfs_cache_default_acl(inode) 0 #define reiserfs_get_acl NULL +#define reiserfs_set_acl NULL static inline int reiserfs_acl_chmod(struct inode *inode) { diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index dcaafcfc23b..ed58d843d57 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -260,4 +260,5 @@ const struct inode_operations reiserfs_file_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index dc5236f6de1..e825f8b63e6 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1522,6 +1522,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; /* @@ -1538,8 +1539,6 @@ const struct inode_operations reiserfs_symlink_inode_operations = { .listxattr = reiserfs_listxattr, .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, - .get_acl = reiserfs_get_acl, - }; /* @@ -1553,4 +1552,5 @@ const struct inode_operations reiserfs_special_inode_operations = { .removexattr = reiserfs_removexattr, .permission = reiserfs_permission, .get_acl = reiserfs_get_acl, + .set_acl = reiserfs_set_acl, }; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index a958444a75f..02b0b7d0f7d 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -419,7 +419,7 @@ int reiserfs_proc_info_init(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + strlcpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; @@ -449,7 +449,7 @@ int reiserfs_proc_info_done(struct super_block *sb) char *s; /* Some block devices use /'s */ - strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE); + strlcpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index f8adaee537c..8d06adf8994 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -608,14 +608,6 @@ int reiserfs_resize(struct super_block *, unsigned long); #define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->) -/* A safe version of the "bdevname", which returns the "s_id" field of - * a superblock or else "Null superblock" if the super block is NULL. - */ -static inline char *reiserfs_bdevname(struct super_block *s) -{ - return (s == NULL) ? "Null superblock" : s->s_id; -} - #define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal))) static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal *journal) @@ -1958,8 +1950,6 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,} #define MAX_US_INT 0xffff // reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset -#define U32_MAX (~(__u32)0) - static inline loff_t max_reiserfs_offset(struct inode *inode) { if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3ead145dadc..2c803353f8a 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1479,7 +1479,7 @@ static int read_super_block(struct super_block *s, int offset) if (!bh) { reiserfs_warning(s, "sh-2006", "bread failed (dev %s, block %lu, size %lu)", - reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_id, offset / s->s_blocksize, s->s_blocksize); return 1; } @@ -1500,7 +1500,7 @@ static int read_super_block(struct super_block *s, int offset) if (!bh) { reiserfs_warning(s, "sh-2007", "bread failed (dev %s, block %lu, size %lu)", - reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_id, offset / s->s_blocksize, s->s_blocksize); return 1; } @@ -1509,7 +1509,7 @@ static int read_super_block(struct super_block *s, int offset) if (sb_blocksize(rs) != s->s_blocksize) { reiserfs_warning(s, "sh-2011", "can't find a reiserfs " "filesystem on (dev %s, block %Lu, size %lu)", - reiserfs_bdevname(s), + s->s_id, (unsigned long long)bh->b_blocknr, s->s_blocksize); brelse(bh); @@ -1825,7 +1825,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) /* try new format (64-th 1k block), which can contain reiserfs super block */ else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", - reiserfs_bdevname(s)); + s->s_id); goto error_unlocked; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8a9e2dcfe00..5cdfbd638b5 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -50,6 +50,7 @@ #include <linux/stat.h> #include <linux/quotaops.h> #include <linux/security.h> +#include <linux/posix_acl_xattr.h> #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" @@ -904,8 +905,8 @@ static const struct xattr_handler *reiserfs_xattr_handlers[] = { &reiserfs_xattr_security_handler, #endif #ifdef CONFIG_REISERFS_FS_POSIX_ACL - &reiserfs_posix_acl_access_handler, - &reiserfs_posix_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 06c04f73da6..a6ce532402d 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -11,35 +11,19 @@ #include "acl.h" #include <asm/uaccess.h> -static int reiserfs_set_acl(struct reiserfs_transaction_handle *th, +static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, struct posix_acl *acl); -static int -posix_acl_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags, int type) + +int +reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct inode *inode = dentry->d_inode; - struct posix_acl *acl; int error, error2; struct reiserfs_transaction_handle th; size_t jcreate_blocks; - if (!reiserfs_posixacl(inode->i_sb)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) { - return PTR_ERR(acl); - } else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; + int size = acl ? posix_acl_xattr_size(acl->a_count) : 0; + /* Pessimism: We can't assume that anything from the xattr root up * has been created. */ @@ -51,7 +35,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, error = journal_begin(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); if (error == 0) { - error = reiserfs_set_acl(&th, inode, type, acl); + error = __reiserfs_set_acl(&th, inode, type, acl); reiserfs_write_lock(inode->i_sb); error2 = journal_end(&th, inode->i_sb, jcreate_blocks); reiserfs_write_unlock(inode->i_sb); @@ -59,36 +43,13 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value, error = error2; } - release_and_out: - posix_acl_release(acl); - return error; -} - -static int -posix_acl_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - struct posix_acl *acl; - int error; - - if (!reiserfs_posixacl(dentry->d_sb)) - return -EOPNOTSUPP; - - acl = reiserfs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - return error; } /* * Convert from filesystem to in-memory representation. */ -static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) +static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; @@ -158,7 +119,7 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) /* * Convert from in-memory to filesystem representation. */ -static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) +static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size) { reiserfs_acl_header *ext_acl; char *e; @@ -221,10 +182,6 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) int size; int retval; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -257,7 +214,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) } else if (retval < 0) { acl = ERR_PTR(retval); } else { - acl = posix_acl_from_disk(value, retval); + acl = reiserfs_posix_acl_from_disk(value, retval); } if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); @@ -273,7 +230,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) * BKL held [before 2.5.x] */ static int -reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, +__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, struct posix_acl *acl) { char *name; @@ -281,9 +238,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: name = POSIX_ACL_XATTR_ACCESS; @@ -307,7 +261,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, } if (acl) { - value = posix_acl_to_disk(acl, &size); + value = reiserfs_posix_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } @@ -343,7 +297,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct posix_acl *acl; + struct posix_acl *default_acl, *acl; int err = 0; /* ACLs only get applied to files and directories */ @@ -363,37 +317,28 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, goto apply_umask; } - acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; + if (default_acl) { + err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } if (acl) { - /* Copy the default ACL to the default ACL of a new directory */ - if (S_ISDIR(inode->i_mode)) { - err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, - acl); - if (err) - goto cleanup; - } - - /* Now we reconcile the new ACL and the mode, - potentially modifying both */ - err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (err < 0) - return err; - - /* If we need an ACL.. */ - if (err > 0) - err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); - cleanup: + if (!err) + err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, + acl); posix_acl_release(acl); - } else { - apply_umask: - /* no ACL, apply umask */ - inode->i_mode &= ~current_umask(); } return err; + + apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current_umask(); + return err; } /* This is used to cache the default acl before a new object is created. @@ -442,84 +387,11 @@ int reiserfs_cache_default_acl(struct inode *inode) */ int reiserfs_acl_chmod(struct inode *inode) { - struct reiserfs_transaction_handle th; - struct posix_acl *acl; - size_t size; - int error; - if (IS_PRIVATE(inode)) return 0; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (get_inode_sd_version(inode) == STAT_DATA_V1 || - !reiserfs_posixacl(inode->i_sb)) { + !reiserfs_posixacl(inode->i_sb)) return 0; - } - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - if (!acl) - return 0; - if (IS_ERR(acl)) - return PTR_ERR(acl); - error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode); - if (error) - return error; - - size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count)); - reiserfs_write_lock(inode->i_sb); - error = journal_begin(&th, inode->i_sb, size * 2); - reiserfs_write_unlock(inode->i_sb); - if (!error) { - int error2; - error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl); - reiserfs_write_lock(inode->i_sb); - error2 = journal_end(&th, inode->i_sb, size * 2); - reiserfs_write_unlock(inode->i_sb); - if (error2) - error = error2; - } - posix_acl_release(acl); - return error; -} - -static size_t posix_acl_access_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - if (!reiserfs_posixacl(dentry->d_sb)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; + return posix_acl_chmod(inode, inode->i_mode); } - -const struct xattr_handler reiserfs_posix_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = posix_acl_get, - .set = posix_acl_set, - .list = posix_acl_access_list, -}; - -static size_t posix_acl_default_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len, int type) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - if (!reiserfs_posixacl(dentry->d_sb)) - return 0; - if (list && size <= list_size) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -const struct xattr_handler reiserfs_posix_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = posix_acl_get, - .set = posix_acl_set, - .list = posix_acl_default_list, -}; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index ff1d3d42e72..d8418782862 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -533,16 +533,14 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) root = romfs_iget(sb, pos); if (IS_ERR(root)) - goto error; + return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) - goto error; + return -ENOMEM; return 0; -error: - return -EINVAL; error_rsb_inval: ret = -EINVAL; error_rsb: diff --git a/fs/splice.c b/fs/splice.c index 46a08f772d7..12028fa41de 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -555,6 +555,24 @@ static const struct pipe_buf_operations default_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +/* Pipe buffer operations for a socket and similar. */ +const struct pipe_buf_operations nosteal_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_nosteal, + .get = generic_pipe_buf_get, +}; +EXPORT_SYMBOL(nosteal_pipe_buf_ops); + static ssize_t kernel_readv(struct file *file, const struct iovec *vec, unsigned long vlen, loff_t offset) { diff --git a/fs/super.c b/fs/super.c index e5f6c2cfac3..cecd780e0f4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -166,6 +166,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (!s) return NULL; + INIT_LIST_HEAD(&s->s_mounts); + if (security_sb_alloc(s)) goto fail; @@ -188,7 +190,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (list_lru_init(&s->s_inode_lru)) goto fail; - INIT_LIST_HEAD(&s->s_mounts); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile index 8876ac18337..6eff6e1205a 100644 --- a/fs/sysfs/Makefile +++ b/fs/sysfs/Makefile @@ -2,4 +2,4 @@ # Makefile for the sysfs virtual filesystem # -obj-y := inode.o file.o dir.o symlink.o mount.o group.o +obj-y := file.o dir.o symlink.o mount.o group.o diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 5e73d6626e5..ee0d761c317 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -13,465 +13,31 @@ #undef DEBUG #include <linux/fs.h> -#include <linux/mount.h> -#include <linux/module.h> #include <linux/kobject.h> -#include <linux/namei.h> -#include <linux/idr.h> -#include <linux/completion.h> -#include <linux/mutex.h> #include <linux/slab.h> -#include <linux/security.h> -#include <linux/hash.h> #include "sysfs.h" -DEFINE_MUTEX(sysfs_mutex); DEFINE_SPINLOCK(sysfs_symlink_target_lock); -#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb) - -static DEFINE_SPINLOCK(sysfs_ino_lock); -static DEFINE_IDA(sysfs_ino_ida); - -/** - * sysfs_name_hash - * @name: Null terminated string to hash - * @ns: Namespace tag to hash - * - * Returns 31 bit hash of ns + name (so it fits in an off_t ) - */ -static unsigned int sysfs_name_hash(const char *name, const void *ns) -{ - unsigned long hash = init_name_hash(); - unsigned int len = strlen(name); - while (len--) - hash = partial_name_hash(*name++, hash); - hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); - hash &= 0x7fffffffU; - /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ - if (hash < 1) - hash += 2; - if (hash >= INT_MAX) - hash = INT_MAX - 1; - return hash; -} - -static int sysfs_name_compare(unsigned int hash, const char *name, - const void *ns, const struct sysfs_dirent *sd) -{ - if (hash != sd->s_hash) - return hash - sd->s_hash; - if (ns != sd->s_ns) - return ns - sd->s_ns; - return strcmp(name, sd->s_name); -} - -static int sysfs_sd_compare(const struct sysfs_dirent *left, - const struct sysfs_dirent *right) -{ - return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns, - right); -} - -/** - * sysfs_link_sibling - link sysfs_dirent into sibling rbtree - * @sd: sysfs_dirent of interest - * - * Link @sd into its sibling rbtree which starts from - * sd->s_parent->s_dir.children. - * - * Locking: - * mutex_lock(sysfs_mutex) - * - * RETURNS: - * 0 on susccess -EEXIST on failure. - */ -static int sysfs_link_sibling(struct sysfs_dirent *sd) -{ - struct rb_node **node = &sd->s_parent->s_dir.children.rb_node; - struct rb_node *parent = NULL; - - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs++; - - while (*node) { - struct sysfs_dirent *pos; - int result; - - pos = to_sysfs_dirent(*node); - parent = *node; - result = sysfs_sd_compare(sd, pos); - if (result < 0) - node = &pos->s_rb.rb_left; - else if (result > 0) - node = &pos->s_rb.rb_right; - else - return -EEXIST; - } - /* add new node and rebalance the tree */ - rb_link_node(&sd->s_rb, parent, node); - rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children); - return 0; -} - -/** - * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree - * @sd: sysfs_dirent of interest - * - * Unlink @sd from its sibling rbtree which starts from - * sd->s_parent->s_dir.children. - * - * Locking: - * mutex_lock(sysfs_mutex) - */ -static void sysfs_unlink_sibling(struct sysfs_dirent *sd) -{ - if (sysfs_type(sd) == SYSFS_DIR) - sd->s_parent->s_dir.subdirs--; - - rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children); -} - -/** - * sysfs_get_active - get an active reference to sysfs_dirent - * @sd: sysfs_dirent to get an active reference to - * - * Get an active reference of @sd. This function is noop if @sd - * is NULL. - * - * RETURNS: - * Pointer to @sd on success, NULL on failure. - */ -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) -{ - if (unlikely(!sd)) - return NULL; - - if (!atomic_inc_unless_negative(&sd->s_active)) - return NULL; - - if (likely(!sysfs_ignore_lockdep(sd))) - rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); - return sd; -} - -/** - * sysfs_put_active - put an active reference to sysfs_dirent - * @sd: sysfs_dirent to put an active reference to - * - * Put an active reference to @sd. This function is noop if @sd - * is NULL. - */ -void sysfs_put_active(struct sysfs_dirent *sd) -{ - int v; - - if (unlikely(!sd)) - return; - - if (likely(!sysfs_ignore_lockdep(sd))) - rwsem_release(&sd->dep_map, 1, _RET_IP_); - v = atomic_dec_return(&sd->s_active); - if (likely(v != SD_DEACTIVATED_BIAS)) - return; - - /* atomic_dec_return() is a mb(), we'll always see the updated - * sd->u.completion. - */ - complete(sd->u.completion); -} - -/** - * sysfs_deactivate - deactivate sysfs_dirent - * @sd: sysfs_dirent to deactivate - * - * Deny new active references and drain existing ones. - */ -static void sysfs_deactivate(struct sysfs_dirent *sd) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int v; - - BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED)); - - if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF)) - return; - - sd->u.completion = (void *)&wait; - - rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); - /* atomic_add_return() is a mb(), put_active() will always see - * the updated sd->u.completion. - */ - v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); - - if (v != SD_DEACTIVATED_BIAS) { - lock_contended(&sd->dep_map, _RET_IP_); - wait_for_completion(&wait); - } - - lock_acquired(&sd->dep_map, _RET_IP_); - rwsem_release(&sd->dep_map, 1, _RET_IP_); -} - -static int sysfs_alloc_ino(unsigned int *pino) -{ - int ino, rc; - - retry: - spin_lock(&sysfs_ino_lock); - rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino); - spin_unlock(&sysfs_ino_lock); - - if (rc == -EAGAIN) { - if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL)) - goto retry; - rc = -ENOMEM; - } - - *pino = ino; - return rc; -} - -static void sysfs_free_ino(unsigned int ino) -{ - spin_lock(&sysfs_ino_lock); - ida_remove(&sysfs_ino_ida, ino); - spin_unlock(&sysfs_ino_lock); -} - -void release_sysfs_dirent(struct sysfs_dirent *sd) -{ - struct sysfs_dirent *parent_sd; - - repeat: - /* Moving/renaming is always done while holding reference. - * sd->s_parent won't change beneath us. - */ - parent_sd = sd->s_parent; - - WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), - "sysfs: free using entry: %s/%s\n", - parent_sd ? parent_sd->s_name : "", sd->s_name); - - if (sysfs_type(sd) == SYSFS_KOBJ_LINK) - sysfs_put(sd->s_symlink.target_sd); - if (sysfs_type(sd) & SYSFS_COPY_NAME) - kfree(sd->s_name); - if (sd->s_iattr && sd->s_iattr->ia_secdata) - security_release_secctx(sd->s_iattr->ia_secdata, - sd->s_iattr->ia_secdata_len); - kfree(sd->s_iattr); - sysfs_free_ino(sd->s_ino); - kmem_cache_free(sysfs_dir_cachep, sd); - - sd = parent_sd; - if (sd && atomic_dec_and_test(&sd->s_count)) - goto repeat; -} - -static int sysfs_dentry_delete(const struct dentry *dentry) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED)); -} - -static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct sysfs_dirent *sd; - int type; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - sd = dentry->d_fsdata; - mutex_lock(&sysfs_mutex); - - /* The sysfs dirent has been deleted */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) - goto out_bad; - - /* The sysfs dirent has been moved? */ - if (dentry->d_parent->d_fsdata != sd->s_parent) - goto out_bad; - - /* The sysfs dirent has been renamed */ - if (strcmp(dentry->d_name.name, sd->s_name) != 0) - goto out_bad; - - /* The sysfs dirent has been moved to a different namespace */ - type = KOBJ_NS_TYPE_NONE; - if (sd->s_parent) { - type = sysfs_ns_type(sd->s_parent); - if (type != KOBJ_NS_TYPE_NONE && - sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns) - goto out_bad; - } - - mutex_unlock(&sysfs_mutex); -out_valid: - return 1; -out_bad: - /* Remove the dentry from the dcache hashes. - * If this is a deleted dentry we use d_drop instead of d_delete - * so sysfs doesn't need to cope with negative dentries. - * - * If this is a dentry that has simply been renamed we - * use d_drop to remove it from the dcache lookup on its - * old parent. If this dentry persists later when a lookup - * is performed at its new name the dentry will be readded - * to the dcache hashes. - */ - mutex_unlock(&sysfs_mutex); - - /* If we have submounts we must allow the vfs caches - * to lie about the state of the filesystem to prevent - * leaks and other nasty things. - */ - if (check_submounts_and_drop(dentry) != 0) - goto out_valid; - - return 0; -} - -static void sysfs_dentry_release(struct dentry *dentry) -{ - sysfs_put(dentry->d_fsdata); -} - -const struct dentry_operations sysfs_dentry_ops = { - .d_revalidate = sysfs_dentry_revalidate, - .d_delete = sysfs_dentry_delete, - .d_release = sysfs_dentry_release, -}; - -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) -{ - char *dup_name = NULL; - struct sysfs_dirent *sd; - - if (type & SYSFS_COPY_NAME) { - name = dup_name = kstrdup(name, GFP_KERNEL); - if (!name) - return NULL; - } - - sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL); - if (!sd) - goto err_out1; - - if (sysfs_alloc_ino(&sd->s_ino)) - goto err_out2; - - atomic_set(&sd->s_count, 1); - atomic_set(&sd->s_active, 0); - - sd->s_name = name; - sd->s_mode = mode; - sd->s_flags = type | SYSFS_FLAG_REMOVED; - - return sd; - - err_out2: - kmem_cache_free(sysfs_dir_cachep, sd); - err_out1: - kfree(dup_name); - return NULL; -} - -/** - * sysfs_addrm_start - prepare for sysfs_dirent add/remove - * @acxt: pointer to sysfs_addrm_cxt to be used - * - * This function is called when the caller is about to add or remove - * sysfs_dirent. This function acquires sysfs_mutex. @acxt is used - * to keep and pass context to other addrm functions. - * - * LOCKING: - * Kernel thread context (may sleep). sysfs_mutex is locked on - * return. - */ -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt) - __acquires(sysfs_mutex) -{ - memset(acxt, 0, sizeof(*acxt)); - - mutex_lock(&sysfs_mutex); -} - -/** - * __sysfs_add_one - add sysfs_dirent to parent without warning - * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * @parent_sd: the parent sysfs_dirent to add @sd to - * - * Get @parent_sd and set @sd->s_parent to it and increment nlink of - * the parent inode if @sd is a directory and link into the children - * list of the parent. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already - * exists. - */ -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd) -{ - struct sysfs_inode_attrs *ps_iattr; - int ret; - - if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd) ? "required" : "invalid", - parent_sd->s_name, sd->s_name); - return -EINVAL; - } - - sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = sysfs_get(parent_sd); - - ret = sysfs_link_sibling(sd); - if (ret) - return ret; - - /* Update timestamps on the parent */ - ps_iattr = parent_sd->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; - } - - /* Mark the entry added into directory tree */ - sd->s_flags &= ~SYSFS_FLAG_REMOVED; - - return 0; -} - /** * sysfs_pathname - return full path to sysfs dirent - * @sd: sysfs_dirent whose path we want + * @kn: kernfs_node whose path we want * @path: caller allocated buffer of size PATH_MAX * * Gives the name "/" to the sysfs_root entry; any path returned * is relative to wherever sysfs is mounted. */ -static char *sysfs_pathname(struct sysfs_dirent *sd, char *path) +static char *sysfs_pathname(struct kernfs_node *kn, char *path) { - if (sd->s_parent) { - sysfs_pathname(sd->s_parent, path); + if (kn->parent) { + sysfs_pathname(kn->parent, path); strlcat(path, "/", PATH_MAX); } - strlcat(path, sd->s_name, PATH_MAX); + strlcat(path, kn->name, PATH_MAX); return path; } -void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) +void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { char *path; @@ -489,445 +55,34 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name) } /** - * sysfs_add_one - add sysfs_dirent to parent - * @acxt: addrm context to use - * @sd: sysfs_dirent to be added - * @parent_sd: the parent sysfs_dirent to add @sd to - * - * Get @parent_sd and set @sd->s_parent to it and increment nlink of - * the parent inode if @sd is a directory and link into the children - * list of the parent. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - * - * RETURNS: - * 0 on success, -EEXIST if entry with the given name already - * exists. - */ -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd) -{ - int ret; - - ret = __sysfs_add_one(acxt, sd, parent_sd); - - if (ret == -EEXIST) - sysfs_warn_dup(parent_sd, sd->s_name); - return ret; -} - -/** - * sysfs_remove_one - remove sysfs_dirent from parent - * @acxt: addrm context to use - * @sd: sysfs_dirent to be removed - * - * Mark @sd removed and drop nlink of parent inode if @sd is a - * directory. @sd is unlinked from the children list. - * - * This function should be called between calls to - * sysfs_addrm_start() and sysfs_addrm_finish() and should be - * passed the same @acxt as passed to sysfs_addrm_start(). - * - * LOCKING: - * Determined by sysfs_addrm_start(). - */ -static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *ps_iattr; - - /* - * Removal can be called multiple times on the same node. Only the - * first invocation is effective and puts the base ref. - */ - if (sd->s_flags & SYSFS_FLAG_REMOVED) - return; - - sysfs_unlink_sibling(sd); - - /* Update timestamps on the parent */ - ps_iattr = sd->s_parent->s_iattr; - if (ps_iattr) { - struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; - } - - sd->s_flags |= SYSFS_FLAG_REMOVED; - sd->u.removed_list = acxt->removed; - acxt->removed = sd; -} - -/** - * sysfs_addrm_finish - finish up sysfs_dirent add/remove - * @acxt: addrm context to finish up - * - * Finish up sysfs_dirent add/remove. Resources acquired by - * sysfs_addrm_start() are released and removed sysfs_dirents are - * cleaned up. - * - * LOCKING: - * sysfs_mutex is released. - */ -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt) - __releases(sysfs_mutex) -{ - /* release resources acquired by sysfs_addrm_start() */ - mutex_unlock(&sysfs_mutex); - - /* kill removed sysfs_dirents */ - while (acxt->removed) { - struct sysfs_dirent *sd = acxt->removed; - - acxt->removed = sd->u.removed_list; - - sysfs_deactivate(sd); - sysfs_unmap_bin_file(sd); - sysfs_put(sd); - } -} - -/** - * sysfs_find_dirent - find sysfs_dirent with the given name - * @parent_sd: sysfs_dirent to search under - * @name: name to look for - * @ns: the namespace tag to use - * - * Look for sysfs_dirent with name @name under @parent_sd. - * - * LOCKING: - * mutex_lock(sysfs_mutex) - * - * RETURNS: - * Pointer to sysfs_dirent if found, NULL if not. - */ -struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns) -{ - struct rb_node *node = parent_sd->s_dir.children.rb_node; - unsigned int hash; - - if (!!sysfs_ns_type(parent_sd) != !!ns) { - WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n", - sysfs_ns_type(parent_sd) ? "required" : "invalid", - parent_sd->s_name, name); - return NULL; - } - - hash = sysfs_name_hash(name, ns); - while (node) { - struct sysfs_dirent *sd; - int result; - - sd = to_sysfs_dirent(node); - result = sysfs_name_compare(hash, name, ns, sd); - if (result < 0) - node = node->rb_left; - else if (result > 0) - node = node->rb_right; - else - return sd; - } - return NULL; -} - -/** - * sysfs_get_dirent_ns - find and get sysfs_dirent with the given name - * @parent_sd: sysfs_dirent to search under - * @name: name to look for - * @ns: the namespace tag to use - * - * Look for sysfs_dirent with name @name under @parent_sd and get - * it if found. - * - * LOCKING: - * Kernel thread context (may sleep). Grabs sysfs_mutex. - * - * RETURNS: - * Pointer to sysfs_dirent if found, NULL if not. - */ -struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns) -{ - struct sysfs_dirent *sd; - - mutex_lock(&sysfs_mutex); - sd = sysfs_find_dirent(parent_sd, name, ns); - sysfs_get(sd); - mutex_unlock(&sysfs_mutex); - - return sd; -} -EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns); - -static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, - enum kobj_ns_type type, - const char *name, const void *ns, - struct sysfs_dirent **p_sd) -{ - umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - int rc; - - /* allocate */ - sd = sysfs_new_dirent(name, mode, SYSFS_DIR); - if (!sd) - return -ENOMEM; - - sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT); - sd->s_ns = ns; - sd->s_dir.kobj = kobj; - - /* link in */ - sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, parent_sd); - sysfs_addrm_finish(&acxt); - - if (rc == 0) - *p_sd = sd; - else - sysfs_put(sd); - - return rc; -} - -int sysfs_create_subdir(struct kobject *kobj, const char *name, - struct sysfs_dirent **p_sd) -{ - return create_dir(kobj, kobj->sd, - KOBJ_NS_TYPE_NONE, name, NULL, p_sd); -} - -/** - * sysfs_read_ns_type: return associated ns_type - * @kobj: the kobject being queried - * - * Each kobject can be tagged with exactly one namespace type - * (i.e. network or user). Return the ns_type associated with - * this object if any - */ -static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj) -{ - const struct kobj_ns_type_operations *ops; - enum kobj_ns_type type; - - ops = kobj_child_ns_ops(kobj); - if (!ops) - return KOBJ_NS_TYPE_NONE; - - type = ops->type; - BUG_ON(type <= KOBJ_NS_TYPE_NONE); - BUG_ON(type >= KOBJ_NS_TYPES); - BUG_ON(!kobj_ns_type_registered(type)); - - return type; -} - -/** * sysfs_create_dir_ns - create a directory for an object with a namespace tag * @kobj: object we're creating directory for * @ns: the namespace tag to use */ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { - enum kobj_ns_type type; - struct sysfs_dirent *parent_sd, *sd; - int error = 0; + struct kernfs_node *parent, *kn; BUG_ON(!kobj); if (kobj->parent) - parent_sd = kobj->parent->sd; + parent = kobj->parent->sd; else - parent_sd = &sysfs_root; + parent = sysfs_root_kn; - if (!parent_sd) + if (!parent) return -ENOENT; - type = sysfs_read_ns_type(kobj); - - error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd); - if (!error) - kobj->sd = sd; - return error; -} - -static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *ret = NULL; - struct dentry *parent = dentry->d_parent; - struct sysfs_dirent *parent_sd = parent->d_fsdata; - struct sysfs_dirent *sd; - struct inode *inode; - enum kobj_ns_type type; - const void *ns; - - mutex_lock(&sysfs_mutex); - - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dir->i_sb)->ns[type]; - - sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns); - - /* no such entry */ - if (!sd) { - ret = ERR_PTR(-ENOENT); - goto out_unlock; - } - dentry->d_fsdata = sysfs_get(sd); - - /* attach dentry and inode */ - inode = sysfs_get_inode(dir->i_sb, sd); - if (!inode) { - ret = ERR_PTR(-ENOMEM); - goto out_unlock; - } - - /* instantiate and hash dentry */ - ret = d_materialise_unique(dentry, inode); - out_unlock: - mutex_unlock(&sysfs_mutex); - return ret; -} - -const struct inode_operations sysfs_dir_inode_operations = { - .lookup = sysfs_lookup, - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, -}; - -static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos) -{ - struct sysfs_dirent *last; - - while (true) { - struct rb_node *rbn; - - last = pos; - - if (sysfs_type(pos) != SYSFS_DIR) - break; - - rbn = rb_first(&pos->s_dir.children); - if (!rbn) - break; - - pos = to_sysfs_dirent(rbn); - } - - return last; -} - -/** - * sysfs_next_descendant_post - find the next descendant for post-order walk - * @pos: the current position (%NULL to initiate traversal) - * @root: sysfs_dirent whose descendants to walk - * - * Find the next descendant to visit for post-order traversal of @root's - * descendants. @root is included in the iteration and the last node to be - * visited. - */ -static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos, - struct sysfs_dirent *root) -{ - struct rb_node *rbn; - - lockdep_assert_held(&sysfs_mutex); - - /* if first iteration, visit leftmost descendant which may be root */ - if (!pos) - return sysfs_leftmost_descendant(root); - - /* if we visited @root, we're done */ - if (pos == root) - return NULL; - - /* if there's an unvisited sibling, visit its leftmost descendant */ - rbn = rb_next(&pos->s_rb); - if (rbn) - return sysfs_leftmost_descendant(to_sysfs_dirent(rbn)); - - /* no sibling left, visit parent */ - return pos->s_parent; -} - -static void __sysfs_remove(struct sysfs_addrm_cxt *acxt, - struct sysfs_dirent *sd) -{ - struct sysfs_dirent *pos, *next; - - if (!sd) - return; - - pr_debug("sysfs %s: removing\n", sd->s_name); - - next = NULL; - do { - pos = next; - next = sysfs_next_descendant_post(pos, sd); - if (pos) - sysfs_remove_one(acxt, pos); - } while (next); -} - -/** - * sysfs_remove - remove a sysfs_dirent recursively - * @sd: the sysfs_dirent to remove - * - * Remove @sd along with all its subdirectories and files. - */ -void sysfs_remove(struct sysfs_dirent *sd) -{ - struct sysfs_addrm_cxt acxt; - - sysfs_addrm_start(&acxt); - __sysfs_remove(&acxt, sd); - sysfs_addrm_finish(&acxt); -} - -/** - * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it - * @dir_sd: parent of the target - * @name: name of the sysfs_dirent to remove - * @ns: namespace tag of the sysfs_dirent to remove - * - * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove - * it. Returns 0 on success, -ENOENT if such entry doesn't exist. - */ -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, - const void *ns) -{ - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - - if (!dir_sd) { - WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", - name); - return -ENOENT; + kn = kernfs_create_dir_ns(parent, kobject_name(kobj), + S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, kobject_name(kobj)); + return PTR_ERR(kn); } - sysfs_addrm_start(&acxt); - - sd = sysfs_find_dirent(dir_sd, name, ns); - if (sd) - __sysfs_remove(&acxt, sd); - - sysfs_addrm_finish(&acxt); - - if (sd) - return 0; - else - return -ENOENT; + kobj->sd = kn; + return 0; } /** @@ -940,207 +95,47 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, */ void sysfs_remove_dir(struct kobject *kobj) { - struct sysfs_dirent *sd = kobj->sd; + struct kernfs_node *kn = kobj->sd; /* * In general, kboject owner is responsible for ensuring removal * doesn't race with other operations and sysfs doesn't provide any * protection; however, when @kobj is used as a symlink target, the * symlinking entity usually doesn't own @kobj and thus has no - * control over removal. @kobj->sd may be removed anytime and - * symlink code may end up dereferencing an already freed sd. + * control over removal. @kobj->sd may be removed anytime + * and symlink code may end up dereferencing an already freed node. * - * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation - * against symlink operations so that symlink code can safely - * dereference @kobj->sd. + * sysfs_symlink_target_lock synchronizes @kobj->sd + * disassociation against symlink operations so that symlink code + * can safely dereference @kobj->sd. */ spin_lock(&sysfs_symlink_target_lock); kobj->sd = NULL; spin_unlock(&sysfs_symlink_target_lock); - if (sd) { - WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR); - sysfs_remove(sd); + if (kn) { + WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); + kernfs_remove(kn); } } -int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, - const char *new_name, const void *new_ns) -{ - int error; - - mutex_lock(&sysfs_mutex); - - error = 0; - if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) && - (strcmp(sd->s_name, new_name) == 0)) - goto out; /* nothing to rename */ - - error = -EEXIST; - if (sysfs_find_dirent(new_parent_sd, new_name, new_ns)) - goto out; - - /* rename sysfs_dirent */ - if (strcmp(sd->s_name, new_name) != 0) { - error = -ENOMEM; - new_name = kstrdup(new_name, GFP_KERNEL); - if (!new_name) - goto out; - - kfree(sd->s_name); - sd->s_name = new_name; - } - - /* - * Move to the appropriate place in the appropriate directories rbtree. - */ - sysfs_unlink_sibling(sd); - sysfs_get(new_parent_sd); - sysfs_put(sd->s_parent); - sd->s_ns = new_ns; - sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns); - sd->s_parent = new_parent_sd; - sysfs_link_sibling(sd); - - error = 0; - out: - mutex_unlock(&sysfs_mutex); - return error; -} - int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { - struct sysfs_dirent *parent_sd = kobj->sd->s_parent; + struct kernfs_node *parent = kobj->sd->parent; - return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns); + return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { - struct sysfs_dirent *sd = kobj->sd; - struct sysfs_dirent *new_parent_sd; + struct kernfs_node *kn = kobj->sd; + struct kernfs_node *new_parent; - BUG_ON(!sd->s_parent); - new_parent_sd = new_parent_kobj && new_parent_kobj->sd ? - new_parent_kobj->sd : &sysfs_root; + BUG_ON(!kn->parent); + new_parent = new_parent_kobj && new_parent_kobj->sd ? + new_parent_kobj->sd : sysfs_root_kn; - return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns); + return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); } - -/* Relationship between s_mode and the DT_xxx types */ -static inline unsigned char dt_type(struct sysfs_dirent *sd) -{ - return (sd->s_mode >> 12) & 15; -} - -static int sysfs_dir_release(struct inode *inode, struct file *filp) -{ - sysfs_put(filp->private_data); - return 0; -} - -static struct sysfs_dirent *sysfs_dir_pos(const void *ns, - struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos) -{ - if (pos) { - int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) && - pos->s_parent == parent_sd && - hash == pos->s_hash; - sysfs_put(pos); - if (!valid) - pos = NULL; - } - if (!pos && (hash > 1) && (hash < INT_MAX)) { - struct rb_node *node = parent_sd->s_dir.children.rb_node; - while (node) { - pos = to_sysfs_dirent(node); - - if (hash < pos->s_hash) - node = node->rb_left; - else if (hash > pos->s_hash) - node = node->rb_right; - else - break; - } - } - /* Skip over entries in the wrong namespace */ - while (pos && pos->s_ns != ns) { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } - return pos; -} - -static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns, - struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos) -{ - pos = sysfs_dir_pos(ns, parent_sd, ino, pos); - if (pos) - do { - struct rb_node *node = rb_next(&pos->s_rb); - if (!node) - pos = NULL; - else - pos = to_sysfs_dirent(node); - } while (pos && pos->s_ns != ns); - return pos; -} - -static int sysfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct dentry *dentry = file->f_path.dentry; - struct sysfs_dirent *parent_sd = dentry->d_fsdata; - struct sysfs_dirent *pos = file->private_data; - enum kobj_ns_type type; - const void *ns; - - type = sysfs_ns_type(parent_sd); - ns = sysfs_info(dentry->d_sb)->ns[type]; - - if (!dir_emit_dots(file, ctx)) - return 0; - mutex_lock(&sysfs_mutex); - for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos); - pos; - pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) { - const char *name = pos->s_name; - unsigned int type = dt_type(pos); - int len = strlen(name); - ino_t ino = pos->s_ino; - ctx->pos = pos->s_hash; - file->private_data = sysfs_get(pos); - - mutex_unlock(&sysfs_mutex); - if (!dir_emit(ctx, name, len, ino, type)) - return 0; - mutex_lock(&sysfs_mutex); - } - mutex_unlock(&sysfs_mutex); - file->private_data = NULL; - ctx->pos = INT_MAX; - return 0; -} - -static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) -{ - struct inode *inode = file_inode(file); - loff_t ret; - - mutex_lock(&inode->i_mutex); - ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); - - return ret; -} - -const struct file_operations sysfs_dir_operations = { - .read = generic_read_dir, - .iterate = sysfs_readdir, - .release = sysfs_dir_release, - .llseek = sysfs_dir_llseek, -}; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index b94f9368509..810cf6e613e 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -14,70 +14,23 @@ #include <linux/kobject.h> #include <linux/kallsyms.h> #include <linux/slab.h> -#include <linux/fsnotify.h> -#include <linux/namei.h> -#include <linux/poll.h> #include <linux/list.h> #include <linux/mutex.h> -#include <linux/limits.h> -#include <linux/uaccess.h> #include <linux/seq_file.h> -#include <linux/mm.h> #include "sysfs.h" +#include "../kernfs/kernfs-internal.h" /* - * There's one sysfs_open_file for each open file and one sysfs_open_dirent - * for each sysfs_dirent with one or more open files. - * - * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open is - * protected by sysfs_open_dirent_lock. - * - * filp->private_data points to seq_file whose ->private points to - * sysfs_open_file. sysfs_open_files are chained at - * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex. - */ -static DEFINE_SPINLOCK(sysfs_open_dirent_lock); -static DEFINE_MUTEX(sysfs_open_file_mutex); - -struct sysfs_open_dirent { - atomic_t refcnt; - atomic_t event; - wait_queue_head_t poll; - struct list_head files; /* goes through sysfs_open_file.list */ -}; - -struct sysfs_open_file { - struct sysfs_dirent *sd; - struct file *file; - struct mutex mutex; - int event; - struct list_head list; - - bool mmapped; - const struct vm_operations_struct *vm_ops; -}; - -static bool sysfs_is_bin(struct sysfs_dirent *sd) -{ - return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR; -} - -static struct sysfs_open_file *sysfs_of(struct file *file) -{ - return ((struct seq_file *)file->private_data)->private; -} - -/* - * Determine ktype->sysfs_ops for the given sysfs_dirent. This function + * Determine ktype->sysfs_ops for the given kernfs_node. This function * must be called while holding an active reference. */ -static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) +static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { - struct kobject *kobj = sd->s_parent->s_dir.kobj; + struct kobject *kobj = kn->parent->priv; - if (!sysfs_ignore_lockdep(sd)) - lockdep_assert_held(sd); + if (kn->flags & KERNFS_LOCKDEP) + lockdep_assert_held(kn); return kobj->ktype ? kobj->ktype->sysfs_ops : NULL; } @@ -86,13 +39,13 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd) * details like buffering and seeking. The following function pipes * sysfs_ops->show() result through seq_file. */ -static int sysfs_seq_show(struct seq_file *sf, void *v) +static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { - struct sysfs_open_file *of = sf->private; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - const struct sysfs_ops *ops; - char *buf; + struct kernfs_open_file *of = sf->private; + struct kobject *kobj = of->kn->parent->priv; + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; + char *buf; /* acquire buffer and ensure that it's >= PAGE_SIZE */ count = seq_get_buf(sf, &buf); @@ -102,34 +55,15 @@ static int sysfs_seq_show(struct seq_file *sf, void *v) } /* - * Need @of->sd for attr and ops, its parent for kobj. @of->mutex - * nests outside active ref and is just to ensure that the ops - * aren't called concurrently for the same open file. + * Invoke show(). Control may reach here via seq file lseek even + * if @ops->show() isn't implemented. */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - mutex_unlock(&of->mutex); - return -ENODEV; + if (ops->show) { + count = ops->show(kobj, of->kn->priv, buf); + if (count < 0) + return count; } - of->event = atomic_read(&of->sd->s_attr.open->event); - - /* - * Lookup @ops and invoke show(). Control may reach here via seq - * file lseek even if @ops->show() isn't implemented. - */ - ops = sysfs_file_ops(of->sd); - if (ops->show) - count = ops->show(kobj, of->sd->s_attr.attr, buf); - else - count = 0; - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); - - if (count < 0) - return count; - /* * The code works fine with PAGE_SIZE return but it's likely to * indicate truncated result or overflow in normal use cases. @@ -144,728 +78,194 @@ static int sysfs_seq_show(struct seq_file *sf, void *v) return 0; } -/* - * Read method for bin files. As reading a bin file can have side-effects, - * the exact offset and bytes specified in read(2) call should be passed to - * the read callback making it difficult to use seq_file. Implement - * simplistic custom buffering for bin files. - */ -static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf, - size_t bytes, loff_t *off) +static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct sysfs_open_file *of = sysfs_of(file); - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - loff_t size = file_inode(file)->i_size; - int count = min_t(size_t, bytes, PAGE_SIZE); - loff_t offs = *off; - char *buf; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + loff_t size = file_inode(of->file)->i_size; - if (!bytes) + if (!count) return 0; if (size) { - if (offs > size) + if (pos > size) return 0; - if (offs + count > size) - count = size - offs; - } - - buf = kmalloc(count, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - /* need of->sd for battr, its parent for kobj */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - count = -ENODEV; - mutex_unlock(&of->mutex); - goto out_free; - } - - if (battr->read) - count = battr->read(file, kobj, battr, buf, offs, count); - else - count = -EIO; - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); - - if (count < 0) - goto out_free; - - if (copy_to_user(userbuf, buf, count)) { - count = -EFAULT; - goto out_free; + if (pos + count > size) + count = size - pos; } - pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); - - *off = offs + count; + if (!battr->read) + return -EIO; - out_free: - kfree(buf); - return count; + return battr->read(of->file, kobj, battr, buf, pos, count); } -/** - * flush_write_buffer - push buffer to kobject - * @of: open file - * @buf: data buffer for file - * @off: file offset to write to - * @count: number of bytes - * - * Get the correct pointers for the kobject and the attribute we're dealing - * with, then call the store() method for it with @buf. - */ -static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off, - size_t count) +/* kernfs write callback for regular sysfs files */ +static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - int rc = 0; - - /* - * Need @of->sd for attr and ops, its parent for kobj. @of->mutex - * nests outside active ref and is just to ensure that the ops - * aren't called concurrently for the same open file. - */ - mutex_lock(&of->mutex); - if (!sysfs_get_active(of->sd)) { - mutex_unlock(&of->mutex); - return -ENODEV; - } + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + struct kobject *kobj = of->kn->parent->priv; - if (sysfs_is_bin(of->sd)) { - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - - rc = -EIO; - if (battr->write) - rc = battr->write(of->file, kobj, battr, buf, off, - count); - } else { - const struct sysfs_ops *ops = sysfs_file_ops(of->sd); - - rc = ops->store(kobj, of->sd->s_attr.attr, buf, count); - } - - sysfs_put_active(of->sd); - mutex_unlock(&of->mutex); + if (!count) + return 0; - return rc; + return ops->store(kobj, of->kn->priv, buf, count); } -/** - * sysfs_write_file - write an attribute - * @file: file pointer - * @user_buf: data to write - * @count: number of bytes - * @ppos: starting offset - * - * Copy data in from userland and pass it to the matching - * sysfs_ops->store() by invoking flush_write_buffer(). - * - * There is no easy way for us to know if userspace is only doing a partial - * write, so we don't support them. We expect the entire buffer to come on - * the first write. Hint: if you're writing a value, first read the file, - * modify only the the value you're changing, then write entire buffer - * back. - */ -static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) +/* kernfs write callback for bin sysfs files */ +static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) { - struct sysfs_open_file *of = sysfs_of(file); - ssize_t len = min_t(size_t, count, PAGE_SIZE); - loff_t size = file_inode(file)->i_size; - char *buf; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + loff_t size = file_inode(of->file)->i_size; - if (sysfs_is_bin(of->sd) && size) { - if (size <= *ppos) + if (size) { + if (size <= pos) return 0; - len = min_t(ssize_t, len, size - *ppos); + count = min_t(ssize_t, count, size - pos); } - - if (!len) + if (!count) return 0; - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; + if (!battr->write) + return -EIO; - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_free; - } - buf[len] = '\0'; /* guarantee string termination */ - - len = flush_write_buffer(of, buf, *ppos, len); - if (len > 0) - *ppos += len; -out_free: - kfree(buf); - return len; -} - -static void sysfs_bin_vma_open(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - - if (!of->vm_ops) - return; - - if (!sysfs_get_active(of->sd)) - return; - - if (of->vm_ops->open) - of->vm_ops->open(vma); - - sysfs_put_active(of->sd); + return battr->write(of->file, kobj, battr, buf, pos, count); } -static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, + struct vm_area_struct *vma) { - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; - if (!of->vm_ops) - return VM_FAULT_SIGBUS; - - if (!sysfs_get_active(of->sd)) - return VM_FAULT_SIGBUS; - - ret = VM_FAULT_SIGBUS; - if (of->vm_ops->fault) - ret = of->vm_ops->fault(vma, vmf); - - sysfs_put_active(of->sd); - return ret; + return battr->mmap(of->file, kobj, battr, vma); } -static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return VM_FAULT_SIGBUS; + struct kernfs_node *kn = kobj->sd, *tmp; - if (!sysfs_get_active(of->sd)) - return VM_FAULT_SIGBUS; - - ret = 0; - if (of->vm_ops->page_mkwrite) - ret = of->vm_ops->page_mkwrite(vma, vmf); + if (kn && dir) + kn = kernfs_find_and_get(kn, dir); else - file_update_time(file); - - sysfs_put_active(of->sd); - return ret; -} - -static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr, - void *buf, int len, int write) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return -EINVAL; - - if (!sysfs_get_active(of->sd)) - return -EINVAL; - - ret = -EINVAL; - if (of->vm_ops->access) - ret = of->vm_ops->access(vma, addr, buf, len, write); - - sysfs_put_active(of->sd); - return ret; -} - -#ifdef CONFIG_NUMA -static int sysfs_bin_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!sysfs_get_active(of->sd)) - return -EINVAL; - - ret = 0; - if (of->vm_ops->set_policy) - ret = of->vm_ops->set_policy(vma, new); - - sysfs_put_active(of->sd); - return ret; -} - -static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma, - unsigned long addr) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - struct mempolicy *pol; - - if (!of->vm_ops) - return vma->vm_policy; - - if (!sysfs_get_active(of->sd)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (of->vm_ops->get_policy) - pol = of->vm_ops->get_policy(vma, addr); - - sysfs_put_active(of->sd); - return pol; -} - -static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from, - const nodemask_t *to, unsigned long flags) -{ - struct file *file = vma->vm_file; - struct sysfs_open_file *of = sysfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!sysfs_get_active(of->sd)) - return 0; - - ret = 0; - if (of->vm_ops->migrate) - ret = of->vm_ops->migrate(vma, from, to, flags); - - sysfs_put_active(of->sd); - return ret; -} -#endif - -static const struct vm_operations_struct sysfs_bin_vm_ops = { - .open = sysfs_bin_vma_open, - .fault = sysfs_bin_fault, - .page_mkwrite = sysfs_bin_page_mkwrite, - .access = sysfs_bin_access, -#ifdef CONFIG_NUMA - .set_policy = sysfs_bin_set_policy, - .get_policy = sysfs_bin_get_policy, - .migrate = sysfs_bin_migrate, -#endif -}; - -static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct sysfs_open_file *of = sysfs_of(file); - struct bin_attribute *battr = of->sd->s_attr.bin_attr; - struct kobject *kobj = of->sd->s_parent->s_dir.kobj; - int rc; - - mutex_lock(&of->mutex); - - /* need of->sd for battr, its parent for kobj */ - rc = -ENODEV; - if (!sysfs_get_active(of->sd)) - goto out_unlock; - - if (!battr->mmap) - goto out_put; - - rc = battr->mmap(file, kobj, battr, vma); - if (rc) - goto out_put; - - /* - * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() - * to satisfy versions of X which crash if the mmap fails: that - * substitutes a new vm_file, and we don't then want bin_vm_ops. - */ - if (vma->vm_file != file) - goto out_put; - - rc = -EINVAL; - if (of->mmapped && of->vm_ops != vma->vm_ops) - goto out_put; + kernfs_get(kn); - /* - * It is not possible to successfully wrap close. - * So error if someone is trying to use close. - */ - rc = -EINVAL; - if (vma->vm_ops && vma->vm_ops->close) - goto out_put; - - rc = 0; - of->mmapped = 1; - of->vm_ops = vma->vm_ops; - vma->vm_ops = &sysfs_bin_vm_ops; -out_put: - sysfs_put_active(of->sd); -out_unlock: - mutex_unlock(&of->mutex); - - return rc; -} - -/** - * sysfs_get_open_dirent - get or create sysfs_open_dirent - * @sd: target sysfs_dirent - * @of: sysfs_open_file for this instance of open - * - * If @sd->s_attr.open exists, increment its reference count; - * otherwise, create one. @of is chained to the files list. - * - * LOCKING: - * Kernel thread context (may sleep). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -static int sysfs_get_open_dirent(struct sysfs_dirent *sd, - struct sysfs_open_file *of) -{ - struct sysfs_open_dirent *od, *new_od = NULL; - - retry: - mutex_lock(&sysfs_open_file_mutex); - spin_lock_irq(&sysfs_open_dirent_lock); - - if (!sd->s_attr.open && new_od) { - sd->s_attr.open = new_od; - new_od = NULL; + if (kn && attr) { + tmp = kernfs_find_and_get(kn, attr); + kernfs_put(kn); + kn = tmp; } - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->refcnt); - list_add_tail(&of->list, &od->files); - } - - spin_unlock_irq(&sysfs_open_dirent_lock); - mutex_unlock(&sysfs_open_file_mutex); - - if (od) { - kfree(new_od); - return 0; + if (kn) { + kernfs_notify(kn); + kernfs_put(kn); } +} +EXPORT_SYMBOL_GPL(sysfs_notify); - /* not there, initialize a new one and retry */ - new_od = kmalloc(sizeof(*new_od), GFP_KERNEL); - if (!new_od) - return -ENOMEM; +static const struct kernfs_ops sysfs_file_kfops_empty = { +}; - atomic_set(&new_od->refcnt, 0); - atomic_set(&new_od->event, 1); - init_waitqueue_head(&new_od->poll); - INIT_LIST_HEAD(&new_od->files); - goto retry; -} +static const struct kernfs_ops sysfs_file_kfops_ro = { + .seq_show = sysfs_kf_seq_show, +}; -/** - * sysfs_put_open_dirent - put sysfs_open_dirent - * @sd: target sysfs_dirent - * @of: associated sysfs_open_file - * - * Put @sd->s_attr.open and unlink @of from the files list. If - * reference count reaches zero, disassociate and free it. - * - * LOCKING: - * None. - */ -static void sysfs_put_open_dirent(struct sysfs_dirent *sd, - struct sysfs_open_file *of) -{ - struct sysfs_open_dirent *od = sd->s_attr.open; - unsigned long flags; +static const struct kernfs_ops sysfs_file_kfops_wo = { + .write = sysfs_kf_write, +}; - mutex_lock(&sysfs_open_file_mutex); - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); +static const struct kernfs_ops sysfs_file_kfops_rw = { + .seq_show = sysfs_kf_seq_show, + .write = sysfs_kf_write, +}; - if (of) - list_del(&of->list); +static const struct kernfs_ops sysfs_bin_kfops_ro = { + .read = sysfs_kf_bin_read, +}; - if (atomic_dec_and_test(&od->refcnt)) - sd->s_attr.open = NULL; - else - od = NULL; +static const struct kernfs_ops sysfs_bin_kfops_wo = { + .write = sysfs_kf_bin_write, +}; - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); - mutex_unlock(&sysfs_open_file_mutex); +static const struct kernfs_ops sysfs_bin_kfops_rw = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, +}; - kfree(od); -} +static const struct kernfs_ops sysfs_bin_kfops_mmap = { + .read = sysfs_kf_bin_read, + .write = sysfs_kf_bin_write, + .mmap = sysfs_kf_bin_mmap, +}; -static int sysfs_open_file(struct inode *inode, struct file *file) +int sysfs_add_file_mode_ns(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin, + umode_t mode, const void *ns) { - struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; - struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct sysfs_open_file *of; - bool has_read, has_write, has_mmap; - int error = -EACCES; - - /* need attr_sd for attr and ops, its parent for kobj */ - if (!sysfs_get_active(attr_sd)) - return -ENODEV; + struct lock_class_key *key = NULL; + const struct kernfs_ops *ops; + struct kernfs_node *kn; + loff_t size; - if (sysfs_is_bin(attr_sd)) { - struct bin_attribute *battr = attr_sd->s_attr.bin_attr; - - has_read = battr->read || battr->mmap; - has_write = battr->write || battr->mmap; - has_mmap = battr->mmap; - } else { - const struct sysfs_ops *ops = sysfs_file_ops(attr_sd); + if (!is_bin) { + struct kobject *kobj = parent->priv; + const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; /* every kobject with an attribute needs a ktype assigned */ - if (WARN(!ops, KERN_ERR + if (WARN(!sysfs_ops, KERN_ERR "missing sysfs attribute operations for kobject: %s\n", kobject_name(kobj))) - goto err_out; - - has_read = ops->show; - has_write = ops->store; - has_mmap = false; - } - - /* check perms and supported operations */ - if ((file->f_mode & FMODE_WRITE) && - (!(inode->i_mode & S_IWUGO) || !has_write)) - goto err_out; - - if ((file->f_mode & FMODE_READ) && - (!(inode->i_mode & S_IRUGO) || !has_read)) - goto err_out; - - /* allocate a sysfs_open_file for the file */ - error = -ENOMEM; - of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL); - if (!of) - goto err_out; - - /* - * The following is done to give a different lockdep key to - * @of->mutex for files which implement mmap. This is a rather - * crude way to avoid false positive lockdep warning around - * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and - * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under - * which mm->mmap_sem nests, while holding @of->mutex. As each - * open file has a separate mutex, it's okay as long as those don't - * happen on the same file. At this point, we can't easily give - * each file a separate locking class. Let's differentiate on - * whether the file has mmap or not for now. - */ - if (has_mmap) - mutex_init(&of->mutex); - else - mutex_init(&of->mutex); - - of->sd = attr_sd; - of->file = file; - - /* - * Always instantiate seq_file even if read access doesn't use - * seq_file or is not requested. This unifies private data access - * and readable regular files are the vast majority anyway. - */ - if (sysfs_is_bin(attr_sd)) - error = single_open(file, NULL, of); - else - error = single_open(file, sysfs_seq_show, of); - if (error) - goto err_free; - - /* seq_file clears PWRITE unconditionally, restore it if WRITE */ - if (file->f_mode & FMODE_WRITE) - file->f_mode |= FMODE_PWRITE; - - /* make sure we have open dirent struct */ - error = sysfs_get_open_dirent(attr_sd, of); - if (error) - goto err_close; - - /* open succeeded, put active references */ - sysfs_put_active(attr_sd); - return 0; - -err_close: - single_release(inode, file); -err_free: - kfree(of); -err_out: - sysfs_put_active(attr_sd); - return error; -} - -static int sysfs_release(struct inode *inode, struct file *filp) -{ - struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata; - struct sysfs_open_file *of = sysfs_of(filp); - - sysfs_put_open_dirent(sd, of); - single_release(inode, filp); - kfree(of); - - return 0; -} - -void sysfs_unmap_bin_file(struct sysfs_dirent *sd) -{ - struct sysfs_open_dirent *od; - struct sysfs_open_file *of; - - if (!sysfs_is_bin(sd)) - return; - - spin_lock_irq(&sysfs_open_dirent_lock); - od = sd->s_attr.open; - if (od) - atomic_inc(&od->refcnt); - spin_unlock_irq(&sysfs_open_dirent_lock); - if (!od) - return; - - mutex_lock(&sysfs_open_file_mutex); - list_for_each_entry(of, &od->files, list) { - struct inode *inode = file_inode(of->file); - unmap_mapping_range(inode->i_mapping, 0, 0, 1); + return -EINVAL; + + if (sysfs_ops->show && sysfs_ops->store) + ops = &sysfs_file_kfops_rw; + else if (sysfs_ops->show) + ops = &sysfs_file_kfops_ro; + else if (sysfs_ops->store) + ops = &sysfs_file_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + + size = PAGE_SIZE; + } else { + struct bin_attribute *battr = (void *)attr; + + if (battr->mmap) + ops = &sysfs_bin_kfops_mmap; + else if (battr->read && battr->write) + ops = &sysfs_bin_kfops_rw; + else if (battr->read) + ops = &sysfs_bin_kfops_ro; + else if (battr->write) + ops = &sysfs_bin_kfops_wo; + else + ops = &sysfs_file_kfops_empty; + + size = battr->size; } - mutex_unlock(&sysfs_open_file_mutex); - - sysfs_put_open_dirent(sd, NULL); -} - -/* Sysfs attribute files are pollable. The idea is that you read - * the content and then you use 'poll' or 'select' to wait for - * the content to change. When the content changes (assuming the - * manager for the kobject supports notification), poll will - * return POLLERR|POLLPRI, and select will return the fd whether - * it is waiting for read, write, or exceptions. - * Once poll/select indicates that the value has changed, you - * need to close and re-open the file, or seek to 0 and read again. - * Reminder: this only works for attributes which actively support - * it, and it is not possible to test an attribute from userspace - * to see if it supports poll (Neither 'poll' nor 'select' return - * an appropriate error code). When in doubt, set a suitable timeout value. - */ -static unsigned int sysfs_poll(struct file *filp, poll_table *wait) -{ - struct sysfs_open_file *of = sysfs_of(filp); - struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata; - struct sysfs_open_dirent *od = attr_sd->s_attr.open; - - /* need parent for the kobj, grab both */ - if (!sysfs_get_active(attr_sd)) - goto trigger; - - poll_wait(filp, &od->poll, wait); - sysfs_put_active(attr_sd); - - if (of->event != atomic_read(&od->event)) - goto trigger; - - return DEFAULT_POLLMASK; - - trigger: - return DEFAULT_POLLMASK|POLLERR|POLLPRI; -} - -void sysfs_notify_dirent(struct sysfs_dirent *sd) -{ - struct sysfs_open_dirent *od; - unsigned long flags; - - spin_lock_irqsave(&sysfs_open_dirent_lock, flags); - - if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) { - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); - } +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!attr->ignore_lockdep) + key = attr->key ?: (struct lock_class_key *)&attr->skey; +#endif + kn = __kernfs_create_file(parent, attr->name, mode, size, ops, + (void *)attr, ns, true, key); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, attr->name); + return PTR_ERR(kn); } - - spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); -} -EXPORT_SYMBOL_GPL(sysfs_notify_dirent); - -void sysfs_notify(struct kobject *k, const char *dir, const char *attr) -{ - struct sysfs_dirent *sd = k->sd; - - mutex_lock(&sysfs_mutex); - - if (sd && dir) - sd = sysfs_find_dirent(sd, dir, NULL); - if (sd && attr) - sd = sysfs_find_dirent(sd, attr, NULL); - if (sd) - sysfs_notify_dirent(sd); - - mutex_unlock(&sysfs_mutex); -} -EXPORT_SYMBOL_GPL(sysfs_notify); - -const struct file_operations sysfs_file_operations = { - .read = seq_read, - .write = sysfs_write_file, - .llseek = generic_file_llseek, - .open = sysfs_open_file, - .release = sysfs_release, - .poll = sysfs_poll, -}; - -const struct file_operations sysfs_bin_operations = { - .read = sysfs_bin_read, - .write = sysfs_write_file, - .llseek = generic_file_llseek, - .mmap = sysfs_bin_mmap, - .open = sysfs_open_file, - .release = sysfs_release, - .poll = sysfs_poll, -}; - -int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, - umode_t amode, const void *ns) -{ - umode_t mode = (amode & S_IALLUGO) | S_IFREG; - struct sysfs_addrm_cxt acxt; - struct sysfs_dirent *sd; - int rc; - - sd = sysfs_new_dirent(attr->name, mode, type); - if (!sd) - return -ENOMEM; - - sd->s_ns = ns; - sd->s_attr.attr = (void *)attr; - sysfs_dirent_init_lockdep(sd); - - sysfs_addrm_start(&acxt); - rc = sysfs_add_one(&acxt, sd, dir_sd); - sysfs_addrm_finish(&acxt); - - if (rc) - sysfs_put(sd); - - return rc; + return 0; } - -int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, - int type) +int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr, + bool is_bin) { - return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL); + return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL); } /** @@ -879,8 +279,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, { BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR, - attr->mode, ns); + return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns); } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); @@ -908,19 +307,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error; - if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); - else - dir_sd = sysfs_get(kobj->sd); + if (group) { + parent = kernfs_find_and_get(kobj->sd, group); + } else { + parent = kobj->sd; + kernfs_get(parent); + } - if (!dir_sd) + if (!parent) return -ENOENT; - error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR); - sysfs_put(dir_sd); + error = sysfs_add_file(parent, attr, false); + kernfs_put(parent); return error; } @@ -936,23 +337,20 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; struct iattr newattrs; int rc; - mutex_lock(&sysfs_mutex); - - rc = -ENOENT; - sd = sysfs_find_dirent(kobj->sd, attr->name, NULL); - if (!sd) - goto out; + kn = kernfs_find_and_get(kobj->sd, attr->name); + if (!kn) + return -ENOENT; - newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO); + newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE; - rc = sysfs_sd_setattr(sd, &newattrs); - out: - mutex_unlock(&sysfs_mutex); + rc = kernfs_setattr(kn, &newattrs); + + kernfs_put(kn); return rc; } EXPORT_SYMBOL_GPL(sysfs_chmod_file); @@ -968,9 +366,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { - struct sysfs_dirent *dir_sd = kobj->sd; + struct kernfs_node *parent = kobj->sd; - sysfs_hash_and_remove(dir_sd, attr->name, ns); + kernfs_remove_by_name_ns(parent, attr->name, ns); } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); @@ -991,15 +389,18 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; - if (group) - dir_sd = sysfs_get_dirent(kobj->sd, group); - else - dir_sd = sysfs_get(kobj->sd); - if (dir_sd) { - sysfs_hash_and_remove(dir_sd, attr->name, NULL); - sysfs_put(dir_sd); + if (group) { + parent = kernfs_find_and_get(kobj->sd, group); + } else { + parent = kobj->sd; + kernfs_get(parent); + } + + if (parent) { + kernfs_remove_by_name(parent, attr->name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group); @@ -1014,7 +415,7 @@ int sysfs_create_bin_file(struct kobject *kobj, { BUG_ON(!kobj || !kobj->sd || !attr); - return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); + return sysfs_add_file(kobj->sd, &attr->attr, true); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); @@ -1026,7 +427,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { - sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL); + kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 1898a10e38c..6b579387c67 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -18,7 +18,7 @@ #include "sysfs.h" -static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, +static void remove_files(struct kernfs_node *parent, struct kobject *kobj, const struct attribute_group *grp) { struct attribute *const *attr; @@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, if (grp->attrs) for (attr = grp->attrs; *attr; attr++) - sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); + kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) sysfs_remove_bin_file(kobj, *bin_attr); } -static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, +static int create_files(struct kernfs_node *parent, struct kobject *kobj, const struct attribute_group *grp, int update) { struct attribute *const *attr; @@ -49,22 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, * re-adding (if required) the file. */ if (update) - sysfs_hash_and_remove(dir_sd, (*attr)->name, - NULL); + kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode) continue; } - error = sysfs_add_file_mode_ns(dir_sd, *attr, - SYSFS_KOBJ_ATTR, + error = sysfs_add_file_mode_ns(parent, *attr, false, (*attr)->mode | mode, NULL); if (unlikely(error)) break; } if (error) { - remove_files(dir_sd, kobj, grp); + remove_files(parent, kobj, grp); goto exit; } } @@ -78,7 +76,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, break; } if (error) - remove_files(dir_sd, kobj, grp); + remove_files(parent, kobj, grp); } exit: return error; @@ -88,7 +86,7 @@ exit: static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { - struct sysfs_dirent *sd; + struct kernfs_node *kn; int error; BUG_ON(!kobj || (!update && !kobj->sd)); @@ -102,18 +100,22 @@ static int internal_create_group(struct kobject *kobj, int update, return -EINVAL; } if (grp->name) { - error = sysfs_create_subdir(kobj, grp->name, &sd); - if (error) - return error; + kn = kernfs_create_dir(kobj->sd, grp->name, + S_IRWXU | S_IRUGO | S_IXUGO, kobj); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(kobj->sd, grp->name); + return PTR_ERR(kn); + } } else - sd = kobj->sd; - sysfs_get(sd); - error = create_files(sd, kobj, grp, update); + kn = kobj->sd; + kernfs_get(kn); + error = create_files(kn, kobj, grp, update); if (error) { if (grp->name) - sysfs_remove(sd); + kernfs_remove(kn); } - sysfs_put(sd); + kernfs_put(kn); return error; } @@ -203,25 +205,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd = kobj->sd; - struct sysfs_dirent *sd; + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; if (grp->name) { - sd = sysfs_get_dirent(dir_sd, grp->name); - if (!sd) { - WARN(!sd, KERN_WARNING + kn = kernfs_find_and_get(parent, grp->name); + if (!kn) { + WARN(!kn, KERN_WARNING "sysfs group %p not found for kobject '%s'\n", grp, kobject_name(kobj)); return; } - } else - sd = sysfs_get(dir_sd); + } else { + kn = parent; + kernfs_get(kn); + } - remove_files(sd, kobj, grp); + remove_files(kn, kobj, grp); if (grp->name) - sysfs_remove(sd); + kernfs_remove(kn); - sysfs_put(sd); + kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); @@ -257,22 +261,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error = 0; struct attribute *const *attr; int i; - dir_sd = sysfs_get_dirent(kobj->sd, grp->name); - if (!dir_sd) + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (!parent) return -ENOENT; for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) - error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); + error = sysfs_add_file(parent, *attr, false); if (error) { while (--i >= 0) - sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL); + kernfs_remove_by_name(parent, (*--attr)->name); } - sysfs_put(dir_sd); + kernfs_put(parent); return error; } @@ -286,14 +290,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; struct attribute *const *attr; - dir_sd = sysfs_get_dirent(kobj->sd, grp->name); - if (dir_sd) { + parent = kernfs_find_and_get(kobj->sd, grp->name); + if (parent) { for (attr = grp->attrs; *attr; ++attr) - sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL); - sysfs_put(dir_sd); + kernfs_remove_by_name(parent, (*attr)->name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); @@ -308,15 +312,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; int error = 0; - dir_sd = sysfs_get_dirent(kobj->sd, group_name); - if (!dir_sd) + parent = kernfs_find_and_get(kobj->sd, group_name); + if (!parent) return -ENOENT; - error = sysfs_create_link_sd(dir_sd, target, link_name); - sysfs_put(dir_sd); + error = sysfs_create_link_sd(parent, target, link_name); + kernfs_put(parent); return error; } @@ -331,12 +335,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { - struct sysfs_dirent *dir_sd; + struct kernfs_node *parent; - dir_sd = sysfs_get_dirent(kobj->sd, group_name); - if (dir_sd) { - sysfs_hash_and_remove(dir_sd, link_name, NULL); - sysfs_put(dir_sd); + parent = kernfs_find_and_get(kobj->sd, group_name); + if (parent) { + kernfs_remove_by_name(parent, link_name); + kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c deleted file mode 100644 index 1750f790af3..00000000000 --- a/fs/sysfs/inode.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * fs/sysfs/inode.c - basic sysfs inode and dentry operations - * - * Copyright (c) 2001-3 Patrick Mochel - * Copyright (c) 2007 SUSE Linux Products GmbH - * Copyright (c) 2007 Tejun Heo <teheo@suse.de> - * - * This file is released under the GPLv2. - * - * Please see Documentation/filesystems/sysfs.txt for more information. - */ - -#undef DEBUG - -#include <linux/pagemap.h> -#include <linux/namei.h> -#include <linux/backing-dev.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/sysfs.h> -#include <linux/xattr.h> -#include <linux/security.h> -#include "sysfs.h" - -static const struct address_space_operations sysfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, -}; - -static struct backing_dev_info sysfs_backing_dev_info = { - .name = "sysfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static const struct inode_operations sysfs_inode_operations = { - .permission = sysfs_permission, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .setxattr = sysfs_setxattr, -}; - -int __init sysfs_inode_init(void) -{ - return bdi_init(&sysfs_backing_dev_info); -} - -static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) -{ - struct sysfs_inode_attrs *attrs; - struct iattr *iattrs; - - attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); - if (!attrs) - return NULL; - iattrs = &attrs->ia_iattr; - - /* assign default attributes */ - iattrs->ia_mode = sd->s_mode; - iattrs->ia_uid = GLOBAL_ROOT_UID; - iattrs->ia_gid = GLOBAL_ROOT_GID; - iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; - - return attrs; -} - -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr) -{ - struct sysfs_inode_attrs *sd_attrs; - struct iattr *iattrs; - unsigned int ia_valid = iattr->ia_valid; - - sd_attrs = sd->s_iattr; - - if (!sd_attrs) { - /* setting attributes for the first time, allocate now */ - sd_attrs = sysfs_init_inode_attrs(sd); - if (!sd_attrs) - return -ENOMEM; - sd->s_iattr = sd_attrs; - } - /* attributes were changed at least once in past */ - iattrs = &sd_attrs->ia_iattr; - - if (ia_valid & ATTR_UID) - iattrs->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - iattrs->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - iattrs->ia_atime = iattr->ia_atime; - if (ia_valid & ATTR_MTIME) - iattrs->ia_mtime = iattr->ia_mtime; - if (ia_valid & ATTR_CTIME) - iattrs->ia_ctime = iattr->ia_ctime; - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - iattrs->ia_mode = sd->s_mode = mode; - } - return 0; -} - -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr) -{ - struct inode *inode = dentry->d_inode; - struct sysfs_dirent *sd = dentry->d_fsdata; - int error; - - if (!sd) - return -EINVAL; - - mutex_lock(&sysfs_mutex); - error = inode_change_ok(inode, iattr); - if (error) - goto out; - - error = sysfs_sd_setattr(sd, iattr); - if (error) - goto out; - - /* this ignores size changes */ - setattr_copy(inode, iattr); - -out: - mutex_unlock(&sysfs_mutex); - return error; -} - -static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata, - u32 *secdata_len) -{ - struct sysfs_inode_attrs *iattrs; - void *old_secdata; - size_t old_secdata_len; - - if (!sd->s_iattr) { - sd->s_iattr = sysfs_init_inode_attrs(sd); - if (!sd->s_iattr) - return -ENOMEM; - } - - iattrs = sd->s_iattr; - old_secdata = iattrs->ia_secdata; - old_secdata_len = iattrs->ia_secdata_len; - - iattrs->ia_secdata = *secdata; - iattrs->ia_secdata_len = *secdata_len; - - *secdata = old_secdata; - *secdata_len = old_secdata_len; - return 0; -} - -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - void *secdata; - int error; - u32 secdata_len = 0; - - if (!sd) - return -EINVAL; - - if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { - const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(dentry->d_inode, suffix, - value, size, flags); - if (error) - goto out; - error = security_inode_getsecctx(dentry->d_inode, - &secdata, &secdata_len); - if (error) - goto out; - - mutex_lock(&sysfs_mutex); - error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len); - mutex_unlock(&sysfs_mutex); - - if (secdata) - security_release_secctx(secdata, secdata_len); - } else - return -EINVAL; -out: - return error; -} - -static inline void set_default_inode_attr(struct inode *inode, umode_t mode) -{ - inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; -} - -static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) -{ - inode->i_uid = iattr->ia_uid; - inode->i_gid = iattr->ia_gid; - inode->i_atime = iattr->ia_atime; - inode->i_mtime = iattr->ia_mtime; - inode->i_ctime = iattr->ia_ctime; -} - -static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode) -{ - struct sysfs_inode_attrs *iattrs = sd->s_iattr; - - inode->i_mode = sd->s_mode; - if (iattrs) { - /* sysfs_dirent has non-default attributes - * get them from persistent copy in sysfs_dirent - */ - set_inode_attr(inode, &iattrs->ia_iattr); - security_inode_notifysecctx(inode, - iattrs->ia_secdata, - iattrs->ia_secdata_len); - } - - if (sysfs_type(sd) == SYSFS_DIR) - set_nlink(inode, sd->s_dir.subdirs + 2); -} - -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - struct inode *inode = dentry->d_inode; - - mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); - mutex_unlock(&sysfs_mutex); - - generic_fillattr(inode, stat); - return 0; -} - -static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) -{ - struct bin_attribute *bin_attr; - - inode->i_private = sysfs_get(sd); - inode->i_mapping->a_ops = &sysfs_aops; - inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; - inode->i_op = &sysfs_inode_operations; - - set_default_inode_attr(inode, sd->s_mode); - sysfs_refresh_inode(sd, inode); - - /* initialize inode according to type */ - switch (sysfs_type(sd)) { - case SYSFS_DIR: - inode->i_op = &sysfs_dir_inode_operations; - inode->i_fop = &sysfs_dir_operations; - break; - case SYSFS_KOBJ_ATTR: - inode->i_size = PAGE_SIZE; - inode->i_fop = &sysfs_file_operations; - break; - case SYSFS_KOBJ_BIN_ATTR: - bin_attr = sd->s_attr.bin_attr; - inode->i_size = bin_attr->size; - inode->i_fop = &sysfs_bin_operations; - break; - case SYSFS_KOBJ_LINK: - inode->i_op = &sysfs_symlink_inode_operations; - break; - default: - BUG(); - } - - unlock_new_inode(inode); -} - -/** - * sysfs_get_inode - get inode for sysfs_dirent - * @sb: super block - * @sd: sysfs_dirent to allocate inode for - * - * Get inode for @sd. If such inode doesn't exist, a new inode - * is allocated and basics are initialized. New inode is - * returned locked. - * - * LOCKING: - * Kernel thread context (may sleep). - * - * RETURNS: - * Pointer to allocated inode on success, NULL on failure. - */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd) -{ - struct inode *inode; - - inode = iget_locked(sb, sd->s_ino); - if (inode && (inode->i_state & I_NEW)) - sysfs_init_inode(sd, inode); - - return inode; -} - -/* - * The sysfs_dirent serves as both an inode and a directory entry for sysfs. - * To prevent the sysfs inode numbers from being freed prematurely we take a - * reference to sysfs_dirent from the sysfs inode. A - * super_operations.evict_inode() implementation is needed to drop that - * reference upon inode destruction. - */ -void sysfs_evict_inode(struct inode *inode) -{ - struct sysfs_dirent *sd = inode->i_private; - - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); - sysfs_put(sd); -} - -int sysfs_permission(struct inode *inode, int mask) -{ - struct sysfs_dirent *sd; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - sd = inode->i_private; - - mutex_lock(&sysfs_mutex); - sysfs_refresh_inode(sd, inode); - mutex_unlock(&sysfs_mutex); - - return generic_permission(inode, mask); -} diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 834ec2cdb7a..6211230814f 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -14,146 +14,41 @@ #include <linux/fs.h> #include <linux/mount.h> -#include <linux/pagemap.h> #include <linux/init.h> -#include <linux/module.h> -#include <linux/magic.h> -#include <linux/slab.h> #include <linux/user_namespace.h> #include "sysfs.h" - -static struct vfsmount *sysfs_mnt; -struct kmem_cache *sysfs_dir_cachep; - -static const struct super_operations sysfs_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .evict_inode = sysfs_evict_inode, -}; - -struct sysfs_dirent sysfs_root = { - .s_name = "", - .s_count = ATOMIC_INIT(1), - .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT), - .s_mode = S_IFDIR | S_IRUGO | S_IXUGO, - .s_ino = 1, -}; - -static int sysfs_fill_super(struct super_block *sb, void *data, int silent) -{ - struct inode *inode; - struct dentry *root; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = SYSFS_MAGIC; - sb->s_op = &sysfs_ops; - sb->s_time_gran = 1; - - /* get root inode, initialize and unlock it */ - mutex_lock(&sysfs_mutex); - inode = sysfs_get_inode(sb, &sysfs_root); - mutex_unlock(&sysfs_mutex); - if (!inode) { - pr_debug("sysfs: could not get root inode\n"); - return -ENOMEM; - } - - /* instantiate and link root dentry */ - root = d_make_root(inode); - if (!root) { - pr_debug("%s: could not get root dentry!\n", __func__); - return -ENOMEM; - } - root->d_fsdata = &sysfs_root; - sb->s_root = root; - sb->s_d_op = &sysfs_dentry_ops; - return 0; -} - -static int sysfs_test_super(struct super_block *sb, void *data) -{ - struct sysfs_super_info *sb_info = sysfs_info(sb); - struct sysfs_super_info *info = data; - enum kobj_ns_type type; - int found = 1; - - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (sb_info->ns[type] != info->ns[type]) - found = 0; - } - return found; -} - -static int sysfs_set_super(struct super_block *sb, void *data) -{ - int error; - error = set_anon_super(sb, data); - if (!error) - sb->s_fs_info = data; - return error; -} - -static void free_sysfs_super_info(struct sysfs_super_info *info) -{ - int type; - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - kobj_ns_drop(type, info->ns[type]); - kfree(info); -} +static struct kernfs_root *sysfs_root; +struct kernfs_node *sysfs_root_kn; static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct sysfs_super_info *info; - enum kobj_ns_type type; - struct super_block *sb; - int error; + struct dentry *root; + void *ns; if (!(flags & MS_KERNMOUNT)) { if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) return ERR_PTR(-EPERM); - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) { - if (!kobj_ns_current_may_mount(type)) - return ERR_PTR(-EPERM); - } - } - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return ERR_PTR(-ENOMEM); - - for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - info->ns[type] = kobj_ns_grab_current(type); - - sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info); - if (IS_ERR(sb) || sb->s_fs_info != info) - free_sysfs_super_info(info); - if (IS_ERR(sb)) - return ERR_CAST(sb); - if (!sb->s_root) { - error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(sb); - return ERR_PTR(error); - } - sb->s_flags |= MS_ACTIVE; + if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) + return ERR_PTR(-EPERM); } - return dget(sb->s_root); + ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + root = kernfs_mount_ns(fs_type, flags, sysfs_root, ns); + if (IS_ERR(root)) + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); + return root; } static void sysfs_kill_sb(struct super_block *sb) { - struct sysfs_super_info *info = sysfs_info(sb); - /* Remove the superblock from fs_supers/s_instances - * so we can't find it, before freeing sysfs_super_info. - */ - kill_anon_super(sb); - free_sysfs_super_info(info); + void *ns = (void *)kernfs_super_ns(sb); + + kernfs_kill_sb(sb); + kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); } static struct file_system_type sysfs_fs_type = { @@ -165,48 +60,19 @@ static struct file_system_type sysfs_fs_type = { int __init sysfs_init(void) { - int err = -ENOMEM; + int err; - sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", - sizeof(struct sysfs_dirent), - 0, 0, NULL); - if (!sysfs_dir_cachep) - goto out; + sysfs_root = kernfs_create_root(NULL, NULL); + if (IS_ERR(sysfs_root)) + return PTR_ERR(sysfs_root); - err = sysfs_inode_init(); - if (err) - goto out_err; + sysfs_root_kn = sysfs_root->kn; err = register_filesystem(&sysfs_fs_type); - if (!err) { - sysfs_mnt = kern_mount(&sysfs_fs_type); - if (IS_ERR(sysfs_mnt)) { - printk(KERN_ERR "sysfs: could not mount!\n"); - err = PTR_ERR(sysfs_mnt); - sysfs_mnt = NULL; - unregister_filesystem(&sysfs_fs_type); - goto out_err; - } - } else - goto out_err; -out: - return err; -out_err: - kmem_cache_destroy(sysfs_dir_cachep); - sysfs_dir_cachep = NULL; - goto out; -} - -#undef sysfs_get -struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) -{ - return __sysfs_get(sd); -} -EXPORT_SYMBOL_GPL(sysfs_get); + if (err) { + kernfs_destroy_root(sysfs_root); + return err; + } -#undef sysfs_put -void sysfs_put(struct sysfs_dirent *sd) -{ - __sysfs_put(sd); + return 0; } -EXPORT_SYMBOL_GPL(sysfs_put); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3ae3f1bf1a0..aecb15f8455 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -11,109 +11,73 @@ */ #include <linux/fs.h> -#include <linux/gfp.h> -#include <linux/mount.h> #include <linux/module.h> #include <linux/kobject.h> -#include <linux/namei.h> #include <linux/mutex.h> #include <linux/security.h> #include "sysfs.h" -static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, - struct kobject *target, +static int sysfs_do_create_link_sd(struct kernfs_node *parent, + struct kobject *target_kobj, const char *name, int warn) { - struct sysfs_dirent *target_sd = NULL; - struct sysfs_dirent *sd = NULL; - struct sysfs_addrm_cxt acxt; - enum kobj_ns_type ns_type; - int error; + struct kernfs_node *kn, *target = NULL; - BUG_ON(!name || !parent_sd); + BUG_ON(!name || !parent); /* - * We don't own @target and it may be removed at any time. + * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (target->sd) - target_sd = sysfs_get(target->sd); + if (target_kobj->sd) { + target = target_kobj->sd; + kernfs_get(target); + } spin_unlock(&sysfs_symlink_target_lock); - error = -ENOENT; - if (!target_sd) - goto out_put; - - error = -ENOMEM; - sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK); - if (!sd) - goto out_put; + if (!target) + return -ENOENT; - ns_type = sysfs_ns_type(parent_sd); - if (ns_type) - sd->s_ns = target_sd->s_ns; - sd->s_symlink.target_sd = target_sd; - target_sd = NULL; /* reference is now owned by the symlink */ - - sysfs_addrm_start(&acxt); - /* Symlinks must be between directories with the same ns_type */ - if (!ns_type || - (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) { - if (warn) - error = sysfs_add_one(&acxt, sd, parent_sd); - else - error = __sysfs_add_one(&acxt, sd, parent_sd); - } else { - error = -EINVAL; - WARN(1, KERN_WARNING - "sysfs: symlink across ns_types %s/%s -> %s/%s\n", - parent_sd->s_name, - sd->s_name, - sd->s_symlink.target_sd->s_parent->s_name, - sd->s_symlink.target_sd->s_name); - } - sysfs_addrm_finish(&acxt); + kn = kernfs_create_link(parent, name, target); + kernfs_put(target); - if (error) - goto out_put; + if (!IS_ERR(kn)) + return 0; - return 0; - - out_put: - sysfs_put(target_sd); - sysfs_put(sd); - return error; + if (warn && PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, name); + return PTR_ERR(kn); } /** * sysfs_create_link_sd - create symlink to a given object. - * @sd: directory we're creating the link in. + * @kn: directory we're creating the link in. * @target: object we're pointing to. * @name: name of the symlink. */ -int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name) { - return sysfs_do_create_link_sd(sd, target, name, 1); + return sysfs_do_create_link_sd(kn, target, name, 1); } static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, const char *name, int warn) { - struct sysfs_dirent *parent_sd = NULL; + struct kernfs_node *parent = NULL; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; - if (!parent_sd) + if (!parent) return -EFAULT; - return sysfs_do_create_link_sd(parent_sd, target, name, warn); + return sysfs_do_create_link_sd(parent, target, name, warn); } /** @@ -164,10 +128,10 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, * sysfs_remove_dir() for details. */ spin_lock(&sysfs_symlink_target_lock); - if (targ->sd && sysfs_ns_type(kobj->sd)) - ns = targ->sd->s_ns; + if (targ->sd && kernfs_ns_enabled(kobj->sd)) + ns = targ->sd->ns; spin_unlock(&sysfs_symlink_target_lock); - sysfs_hash_and_remove(kobj->sd, name, ns); + kernfs_remove_by_name_ns(kobj->sd, name, ns); } /** @@ -177,14 +141,14 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ, */ void sysfs_remove_link(struct kobject *kobj, const char *name) { - struct sysfs_dirent *parent_sd = NULL; + struct kernfs_node *parent = NULL; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; - sysfs_hash_and_remove(parent_sd, name, NULL); + kernfs_remove_by_name(parent, name); } EXPORT_SYMBOL_GPL(sysfs_remove_link); @@ -201,130 +165,33 @@ EXPORT_SYMBOL_GPL(sysfs_remove_link); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ, const char *old, const char *new, const void *new_ns) { - struct sysfs_dirent *parent_sd, *sd = NULL; + struct kernfs_node *parent, *kn = NULL; const void *old_ns = NULL; int result; if (!kobj) - parent_sd = &sysfs_root; + parent = sysfs_root_kn; else - parent_sd = kobj->sd; + parent = kobj->sd; if (targ->sd) - old_ns = targ->sd->s_ns; + old_ns = targ->sd->ns; result = -ENOENT; - sd = sysfs_get_dirent_ns(parent_sd, old, old_ns); - if (!sd) + kn = kernfs_find_and_get_ns(parent, old, old_ns); + if (!kn) goto out; result = -EINVAL; - if (sysfs_type(sd) != SYSFS_KOBJ_LINK) + if (kernfs_type(kn) != KERNFS_LINK) goto out; - if (sd->s_symlink.target_sd->s_dir.kobj != targ) + if (kn->symlink.target_kn->priv != targ) goto out; - result = sysfs_rename(sd, parent_sd, new, new_ns); + result = kernfs_rename_ns(kn, parent, new, new_ns); out: - sysfs_put(sd); + kernfs_put(kn); return result; } EXPORT_SYMBOL_GPL(sysfs_rename_link_ns); - -static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, - struct sysfs_dirent *target_sd, char *path) -{ - struct sysfs_dirent *base, *sd; - char *s = path; - int len = 0; - - /* go up to the root, stop at the base */ - base = parent_sd; - while (base->s_parent) { - sd = target_sd->s_parent; - while (sd->s_parent && base != sd) - sd = sd->s_parent; - - if (base == sd) - break; - - strcpy(s, "../"); - s += 3; - base = base->s_parent; - } - - /* determine end of target string for reverse fillup */ - sd = target_sd; - while (sd->s_parent && sd != base) { - len += strlen(sd->s_name) + 1; - sd = sd->s_parent; - } - - /* check limits */ - if (len < 2) - return -EINVAL; - len--; - if ((s - path) + len > PATH_MAX) - return -ENAMETOOLONG; - - /* reverse fillup of target string from target to base */ - sd = target_sd; - while (sd->s_parent && sd != base) { - int slen = strlen(sd->s_name); - - len -= slen; - strncpy(s + len, sd->s_name, slen); - if (len) - s[--len] = '/'; - - sd = sd->s_parent; - } - - return 0; -} - -static int sysfs_getlink(struct dentry *dentry, char *path) -{ - struct sysfs_dirent *sd = dentry->d_fsdata; - struct sysfs_dirent *parent_sd = sd->s_parent; - struct sysfs_dirent *target_sd = sd->s_symlink.target_sd; - int error; - - mutex_lock(&sysfs_mutex); - error = sysfs_get_target_path(parent_sd, target_sd, path); - mutex_unlock(&sysfs_mutex); - - return error; -} - -static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - int error = -ENOMEM; - unsigned long page = get_zeroed_page(GFP_KERNEL); - if (page) { - error = sysfs_getlink(dentry, (char *) page); - if (error < 0) - free_page((unsigned long)page); - } - nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); - return NULL; -} - -static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *page = nd_get_link(nd); - if (!IS_ERR(page)) - free_page((unsigned long)page); -} - -const struct inode_operations sysfs_symlink_inode_operations = { - .setxattr = sysfs_setxattr, - .readlink = generic_readlink, - .follow_link = sysfs_follow_link, - .put_link = sysfs_put_link, - .setattr = sysfs_setattr, - .getattr = sysfs_getattr, - .permission = sysfs_permission, -}; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 0af09fbfb3f..0e2f1cccb81 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,248 +8,36 @@ * This file is released under the GPLv2. */ -#include <linux/lockdep.h> -#include <linux/kobject_ns.h> -#include <linux/fs.h> -#include <linux/rbtree.h> +#ifndef __SYSFS_INTERNAL_H +#define __SYSFS_INTERNAL_H -struct sysfs_open_dirent; - -/* type-specific structures for sysfs_dirent->s_* union members */ -struct sysfs_elem_dir { - struct kobject *kobj; - - unsigned long subdirs; - /* children rbtree starts here and goes through sd->s_rb */ - struct rb_root children; -}; - -struct sysfs_elem_symlink { - struct sysfs_dirent *target_sd; -}; - -struct sysfs_elem_attr { - union { - struct attribute *attr; - struct bin_attribute *bin_attr; - }; - struct sysfs_open_dirent *open; -}; - -struct sysfs_inode_attrs { - struct iattr ia_iattr; - void *ia_secdata; - u32 ia_secdata_len; -}; - -/* - * sysfs_dirent - the building block of sysfs hierarchy. Each and - * every sysfs node is represented by single sysfs_dirent. - * - * As long as s_count reference is held, the sysfs_dirent itself is - * accessible. Dereferencing s_elem or any other outer entity - * requires s_active reference. - */ -struct sysfs_dirent { - atomic_t s_count; - atomic_t s_active; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif - struct sysfs_dirent *s_parent; - const char *s_name; - - struct rb_node s_rb; - - union { - struct completion *completion; - struct sysfs_dirent *removed_list; - } u; - - const void *s_ns; /* namespace tag */ - unsigned int s_hash; /* ns + name hash */ - union { - struct sysfs_elem_dir s_dir; - struct sysfs_elem_symlink s_symlink; - struct sysfs_elem_attr s_attr; - }; - - unsigned short s_flags; - umode_t s_mode; - unsigned int s_ino; - struct sysfs_inode_attrs *s_iattr; -}; - -#define SD_DEACTIVATED_BIAS INT_MIN - -#define SYSFS_TYPE_MASK 0x00ff -#define SYSFS_DIR 0x0001 -#define SYSFS_KOBJ_ATTR 0x0002 -#define SYSFS_KOBJ_BIN_ATTR 0x0004 -#define SYSFS_KOBJ_LINK 0x0008 -#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) -#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR) - -/* identify any namespace tag on sysfs_dirents */ -#define SYSFS_NS_TYPE_MASK 0xf00 -#define SYSFS_NS_TYPE_SHIFT 8 - -#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK) -#define SYSFS_FLAG_REMOVED 0x02000 - -static inline unsigned int sysfs_type(struct sysfs_dirent *sd) -{ - return sd->s_flags & SYSFS_TYPE_MASK; -} - -/* - * Return any namespace tags on this dirent. - * enum kobj_ns_type is defined in linux/kobject.h - */ -static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd) -{ - return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -#define sysfs_dirent_init_lockdep(sd) \ -do { \ - struct attribute *attr = sd->s_attr.attr; \ - struct lock_class_key *key = attr->key; \ - if (!key) \ - key = &attr->skey; \ - \ - lockdep_init_map(&sd->dep_map, "s_active", key, 0); \ -} while (0) - -/* Test for attributes that want to ignore lockdep for read-locking */ -static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) -{ - int type = sysfs_type(sd); - - return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) && - sd->s_attr.attr->ignore_lockdep; -} - -#else - -#define sysfs_dirent_init_lockdep(sd) do {} while (0) - -static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd) -{ - return true; -} - -#endif - -/* - * Context structure to be used while adding/removing nodes. - */ -struct sysfs_addrm_cxt { - struct sysfs_dirent *removed; -}; +#include <linux/sysfs.h> /* * mount.c */ - -/* - * Each sb is associated with a set of namespace tags (i.e. - * the network namespace of the task which mounted this sysfs - * instance). - */ -struct sysfs_super_info { - void *ns[KOBJ_NS_TYPES]; -}; -#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) -extern struct sysfs_dirent sysfs_root; -extern struct kmem_cache *sysfs_dir_cachep; +extern struct kernfs_node *sysfs_root_kn; /* * dir.c */ -extern struct mutex sysfs_mutex; extern spinlock_t sysfs_symlink_target_lock; -extern const struct dentry_operations sysfs_dentry_ops; - -extern const struct file_operations sysfs_dir_operations; -extern const struct inode_operations sysfs_dir_inode_operations; -struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd); -void sysfs_put_active(struct sysfs_dirent *sd); -void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt); -void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name); -int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd); -int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd, - struct sysfs_dirent *parent_sd); -void sysfs_remove(struct sysfs_dirent *sd); -int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name, - const void *ns); -void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt); - -struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd, - const unsigned char *name, - const void *ns); -struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type); - -void release_sysfs_dirent(struct sysfs_dirent *sd); - -int sysfs_create_subdir(struct kobject *kobj, const char *name, - struct sysfs_dirent **p_sd); - -int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd, - const char *new_name, const void *new_ns); - -static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) -{ - if (sd) { - WARN_ON(!atomic_read(&sd->s_count)); - atomic_inc(&sd->s_count); - } - return sd; -} -#define sysfs_get(sd) __sysfs_get(sd) - -static inline void __sysfs_put(struct sysfs_dirent *sd) -{ - if (sd && atomic_dec_and_test(&sd->s_count)) - release_sysfs_dirent(sd); -} -#define sysfs_put(sd) __sysfs_put(sd) - -/* - * inode.c - */ -struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd); -void sysfs_evict_inode(struct inode *inode); -int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr); -int sysfs_permission(struct inode *inode, int mask); -int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); -int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat); -int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int sysfs_inode_init(void); +void sysfs_warn_dup(struct kernfs_node *parent, const char *name); /* * file.c */ -extern const struct file_operations sysfs_file_operations; -extern const struct file_operations sysfs_bin_operations; - -int sysfs_add_file(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type); - -int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd, - const struct attribute *attr, int type, +int sysfs_add_file(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin); +int sysfs_add_file_mode_ns(struct kernfs_node *parent, + const struct attribute *attr, bool is_bin, umode_t amode, const void *ns); -void sysfs_unmap_bin_file(struct sysfs_dirent *sd); /* * symlink.c */ -extern const struct inode_operations sysfs_symlink_inode_operations; -int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, +int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target, const char *name); + +#endif /* __SYSFS_INTERNAL_H */ diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index cc1febd8fad..5157b866a85 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2118,26 +2118,10 @@ out_free: */ static void free_inodes(struct fsck_data *fsckd) { - struct rb_node *this = fsckd->inodes.rb_node; - struct fsck_inode *fscki; + struct fsck_inode *fscki, *n; - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - fscki = rb_entry(this, struct fsck_inode, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &fscki->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - kfree(fscki); - } - } + rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb) + kfree(fscki); } /** diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 36bd4efd081..a902c5919e4 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -574,27 +574,10 @@ static int done_already(struct rb_root *done_tree, int lnum) */ static void destroy_done_tree(struct rb_root *done_tree) { - struct rb_node *this = done_tree->rb_node; - struct done_ref *dr; + struct done_ref *dr, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - dr = rb_entry(this, struct done_ref, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &dr->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb) kfree(dr); - } } /** diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index ba32da3fe08..f1c3e5a1b31 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -815,27 +815,10 @@ static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) static void dbg_free_check_tree(struct rb_root *root) { - struct rb_node *this = root->rb_node; - struct check_orphan *o; + struct check_orphan *o, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - o = rb_entry(this, struct check_orphan, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &o->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(o, n, root, rb) kfree(o); - } } static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 065096e36ed..c14adb2f420 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -1335,29 +1335,14 @@ static void remove_ino(struct ubifs_info *c, ino_t inum) */ void ubifs_destroy_size_tree(struct ubifs_info *c) { - struct rb_node *this = c->size_tree.rb_node; - struct size_entry *e; + struct size_entry *e, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - e = rb_entry(this, struct size_entry, rb); + rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) { if (e->inode) iput(e->inode); - this = rb_parent(this); - if (this) { - if (this->rb_left == &e->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } kfree(e); } + c->size_tree = RB_ROOT; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f69daa514a5..5ded8490c0c 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -873,26 +873,10 @@ static void free_orphans(struct ubifs_info *c) */ static void free_buds(struct ubifs_info *c) { - struct rb_node *this = c->buds.rb_node; - struct ubifs_bud *bud; - - while (this) { - if (this->rb_left) - this = this->rb_left; - else if (this->rb_right) - this = this->rb_right; - else { - bud = rb_entry(this, struct ubifs_bud, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &bud->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - kfree(bud); - } - } + struct ubifs_bud *bud, *n; + + rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) + kfree(bud); } /** diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 349f31a30f4..9083bc7ed4a 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -178,27 +178,11 @@ static int ins_clr_old_idx_znode(struct ubifs_info *c, */ void destroy_old_idx(struct ubifs_info *c) { - struct rb_node *this = c->old_idx.rb_node; - struct ubifs_old_idx *old_idx; + struct ubifs_old_idx *old_idx, *n; - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - old_idx = rb_entry(this, struct ubifs_old_idx, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &old_idx->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } + rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb) kfree(old_idx); - } + c->old_idx = RB_ROOT; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 5f6fc17d6bc..9737cba1357 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1010,6 +1010,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, else udf_truncate_tail_extent(inode); mark_inode_dirty(inode); + up_write(&iinfo->i_data_sem); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) @@ -1023,7 +1024,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) mark_inode_dirty(dir); - up_write(&iinfo->i_data_sem); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c deleted file mode 100644 index 9fbea87fdb6..00000000000 --- a/fs/xattr_acl.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * linux/fs/xattr_acl.c - * - * Almost all from linux/fs/ext2/acl.c: - * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> - */ - -#include <linux/export.h> -#include <linux/fs.h> -#include <linux/posix_acl_xattr.h> -#include <linux/gfp.h> -#include <linux/user_namespace.h> - -/* - * Fix up the uids and gids in posix acl extended attributes in place. - */ -static void posix_acl_fix_xattr_userns( - struct user_namespace *to, struct user_namespace *from, - void *value, size_t size) -{ - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; - int count; - kuid_t uid; - kgid_t gid; - - if (!value) - return; - if (size < sizeof(posix_acl_xattr_header)) - return; - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return; - - count = posix_acl_xattr_count(size); - if (count < 0) - return; - if (count == 0) - return; - - for (end = entry + count; entry != end; entry++) { - switch(le16_to_cpu(entry->e_tag)) { - case ACL_USER: - uid = make_kuid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kuid(to, uid)); - break; - case ACL_GROUP: - gid = make_kgid(from, le32_to_cpu(entry->e_id)); - entry->e_id = cpu_to_le32(from_kgid(to, gid)); - break; - default: - break; - } - } -} - -void posix_acl_fix_xattr_from_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); -} - -void posix_acl_fix_xattr_to_user(void *value, size_t size) -{ - struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) - return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); -} - -/* - * Convert from extended attribute to in-memory representation. - */ -struct posix_acl * -posix_acl_from_xattr(struct user_namespace *user_ns, - const void *value, size_t size) -{ - posix_acl_xattr_header *header = (posix_acl_xattr_header *)value; - posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end; - int count; - struct posix_acl *acl; - struct posix_acl_entry *acl_e; - - if (!value) - return NULL; - if (size < sizeof(posix_acl_xattr_header)) - return ERR_PTR(-EINVAL); - if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return ERR_PTR(-EOPNOTSUPP); - - count = posix_acl_xattr_count(size); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; - - acl = posix_acl_alloc(count, GFP_NOFS); - if (!acl) - return ERR_PTR(-ENOMEM); - acl_e = acl->a_entries; - - for (end = entry + count; entry != end; acl_e++, entry++) { - acl_e->e_tag = le16_to_cpu(entry->e_tag); - acl_e->e_perm = le16_to_cpu(entry->e_perm); - - switch(acl_e->e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - break; - - case ACL_USER: - acl_e->e_uid = - make_kuid(user_ns, - le32_to_cpu(entry->e_id)); - if (!uid_valid(acl_e->e_uid)) - goto fail; - break; - case ACL_GROUP: - acl_e->e_gid = - make_kgid(user_ns, - le32_to_cpu(entry->e_id)); - if (!gid_valid(acl_e->e_gid)) - goto fail; - break; - - default: - goto fail; - } - } - return acl; - -fail: - posix_acl_release(acl); - return ERR_PTR(-EINVAL); -} -EXPORT_SYMBOL (posix_acl_from_xattr); - -/* - * Convert from in-memory to extended attribute representation. - */ -int -posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, - void *buffer, size_t size) -{ - posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; - posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; - int real_size, n; - - real_size = posix_acl_xattr_size(acl->a_count); - if (!buffer) - return real_size; - if (real_size > size) - return -ERANGE; - - ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - - for (n=0; n < acl->a_count; n++, ext_entry++) { - const struct posix_acl_entry *acl_e = &acl->a_entries[n]; - ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); - ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); - switch(acl_e->e_tag) { - case ACL_USER: - ext_entry->e_id = - cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); - break; - case ACL_GROUP: - ext_entry->e_id = - cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); - break; - default: - ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); - break; - } - } - return real_size; -} -EXPORT_SYMBOL (posix_acl_to_xattr); diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 370eb3e121d..0ecec1896f2 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -124,16 +124,12 @@ struct posix_acl * xfs_get_acl(struct inode *inode, int type) { struct xfs_inode *ip = XFS_I(inode); - struct posix_acl *acl; + struct posix_acl *acl = NULL; struct xfs_acl *xfs_acl; unsigned char *ea_name; int error; int len; - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - trace_xfs_get_acl(ip); switch (type) { @@ -164,10 +160,8 @@ xfs_get_acl(struct inode *inode, int type) * cache entry, for any other error assume it is transient and * leave the cache entry as ACL_NOT_CACHED. */ - if (error == -ENOATTR) { - acl = NULL; + if (error == -ENOATTR) goto out_update_cache; - } goto out; } @@ -183,15 +177,12 @@ out: } STATIC int -xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) { struct xfs_inode *ip = XFS_I(inode); unsigned char *ea_name; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - switch (type) { case ACL_TYPE_ACCESS: ea_name = SGI_ACL_FILE; @@ -282,131 +273,23 @@ posix_acl_default_exists(struct inode *inode) return xfs_acl_exists(inode, SGI_ACL_DEFAULT); } -/* - * No need for i_mutex because the inode is not yet exposed to the VFS. - */ int -xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) +xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - umode_t mode = inode->i_mode; - int error = 0, inherit = 0; - - if (S_ISDIR(inode->i_mode)) { - error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto out; - } - - error = posix_acl_create(&acl, GFP_KERNEL, &mode); - if (error < 0) - return error; - - /* - * If posix_acl_create returns a positive value we need to - * inherit a permission that can't be represented using the Unix - * mode bits and we actually need to set an ACL. - */ - if (error > 0) - inherit = 1; - - error = xfs_set_mode(inode, mode); - if (error) - goto out; - - if (inherit) - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - -out: - posix_acl_release(acl); - return error; -} - -int -xfs_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl; - int error; - - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - acl = xfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - - error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); - if (error) - return error; - - error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} - -static int -xfs_xattr_acl_get(struct dentry *dentry, const char *name, - void *value, size_t size, int type) -{ - struct posix_acl *acl; - int error; - - acl = xfs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - posix_acl_release(acl); - - return error; -} - -static int -xfs_xattr_acl_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; int error = 0; - if (flags & XATTR_CREATE) - return -EINVAL; - if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (!value) + if (!acl) goto set_acl; - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (!acl) { - /* - * acl_set_file(3) may request that we set default ACLs with - * zero length -- defend (gracefully) against that here. - */ - goto out; - } - if (IS_ERR(acl)) { - error = PTR_ERR(acl); - goto out; - } - - error = posix_acl_valid(acl); - if (error) - goto out_release; - error = -EINVAL; if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) - goto out_release; + return error; if (type == ACL_TYPE_ACCESS) { umode_t mode = inode->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error <= 0) { - posix_acl_release(acl); acl = NULL; if (error < 0) @@ -415,27 +298,9 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, error = xfs_set_mode(inode, mode); if (error) - goto out_release; + return error; } set_acl: - error = xfs_set_acl(inode, type, acl); - out_release: - posix_acl_release(acl); - out: - return error; + return __xfs_set_acl(inode, type, acl); } - -const struct xattr_handler xfs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .get = xfs_xattr_acl_get, - .set = xfs_xattr_acl_set, -}; - -const struct xattr_handler xfs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .get = xfs_xattr_acl_get, - .set = xfs_xattr_acl_set, -}; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 4016a567b83..5dc16374451 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -60,20 +60,15 @@ struct xfs_acl { #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); -extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl); -extern int xfs_acl_chmod(struct inode *inode); +extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); - -extern const struct xattr_handler xfs_xattr_acl_access_handler; -extern const struct xattr_handler xfs_xattr_acl_default_handler; #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } -# define xfs_inherit_acl(inode, default_acl) 0 -# define xfs_acl_chmod(inode) 0 +# define xfs_set_acl NULL # define posix_acl_access_exists(inode) 0 # define posix_acl_default_exists(inode) 0 #endif /* CONFIG_XFS_POSIX_ACL */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 71c8c9d2b88..a26739451b5 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1217,7 +1217,7 @@ __xfs_get_blocks( lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); } else { - lockmode = xfs_ilock_map_shared(ip); + lockmode = xfs_ilock_data_map_shared(ip); } ASSERT(offset <= mp->m_super->s_maxbytes); diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index b86127072ac..01b6a0102fb 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -164,6 +164,7 @@ xfs_attr_get( { int error; struct xfs_name xname; + uint lock_mode; XFS_STATS_INC(xs_attr_get); @@ -174,9 +175,9 @@ xfs_attr_get( if (error) return error; - xfs_ilock(ip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_attr_map_shared(ip); error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags); - xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_iunlock(ip, lock_mode); return(error); } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 2d174b12815..01db96f60cf 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -507,17 +507,17 @@ xfs_attr_list_int( { int error; xfs_inode_t *dp = context->dp; + uint lock_mode; XFS_STATS_INC(xs_attr_list); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return EIO; - xfs_ilock(dp, XFS_ILOCK_SHARED); - /* * Decide on what work routines to call based on the inode size. */ + lock_mode = xfs_ilock_attr_map_shared(dp); if (!xfs_inode_hasattr(dp)) { error = 0; } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { @@ -527,9 +527,7 @@ xfs_attr_list_int( } else { error = xfs_attr_node_list(context); } - - xfs_iunlock(dp, XFS_ILOCK_SHARED); - + xfs_iunlock(dp, lock_mode); return error; } diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 739e0a52ded..5549d69ddb4 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -110,7 +110,7 @@ xfs_attr3_rmt_verify( if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + - be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) + be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) return false; if (rmt->rm_owner == 0) return false; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 3ef11b22e75..152543c4ca7 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -1635,7 +1635,7 @@ xfs_bmap_last_extent( * blocks at the end of the file which do not start at the previous data block, * we will try to align the new blocks at stripe unit boundaries. * - * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be + * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be * at, or past the EOF. */ STATIC int @@ -1650,9 +1650,14 @@ xfs_bmap_isaeof( bma->aeof = 0; error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, &is_empty); - if (error || is_empty) + if (error) return error; + if (is_empty) { + bma->aeof = 1; + return 0; + } + /* * Check if we are allocation or past the last extent, or at least into * the last delayed allocated extent. @@ -3643,10 +3648,19 @@ xfs_bmap_btalloc( int isaligned; int tryagain; int error; + int stripe_align; ASSERT(ap->length); mp = ap->ip->i_mount; + + /* stripe alignment for allocation is determined by mount parameters */ + stripe_align = 0; + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + stripe_align = mp->m_swidth; + else if (mp->m_dalign) + stripe_align = mp->m_dalign; + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, @@ -3655,6 +3669,8 @@ xfs_bmap_btalloc( ASSERT(!error); ASSERT(ap->length); } + + nullfb = *ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { @@ -3730,7 +3746,7 @@ xfs_bmap_btalloc( */ if (!ap->flist->xbf_low && ap->aeof) { if (!ap->offset) { - args.alignment = mp->m_dalign; + args.alignment = stripe_align; atype = args.type; isaligned = 1; /* @@ -3755,13 +3771,13 @@ xfs_bmap_btalloc( * of minlen+alignment+slop doesn't go up * between the calls. */ - if (blen > mp->m_dalign && blen <= args.maxlen) - nextminlen = blen - mp->m_dalign; + if (blen > stripe_align && blen <= args.maxlen) + nextminlen = blen - stripe_align; else nextminlen = args.minlen; - if (nextminlen + mp->m_dalign > args.minlen + 1) + if (nextminlen + stripe_align > args.minlen + 1) args.minalignslop = - nextminlen + mp->m_dalign - + nextminlen + stripe_align - args.minlen - 1; else args.minalignslop = 0; @@ -3783,7 +3799,7 @@ xfs_bmap_btalloc( */ args.type = atype; args.fsbno = ap->blkno; - args.alignment = mp->m_dalign; + args.alignment = stripe_align; args.minlen = nextminlen; args.minalignslop = 0; isaligned = 1; @@ -3997,6 +4013,7 @@ xfs_bmapi_read( ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| XFS_BMAPI_IGSTATE))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -4191,6 +4208,7 @@ xfs_bmapi_delay( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && @@ -4484,6 +4502,7 @@ xfs_bmapi_write( ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5035,6 +5054,7 @@ xfs_bunmapi( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(len > 0); ASSERT(nexts >= 0); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 5887e41c032..f264616080c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -287,6 +287,7 @@ xfs_bmapi_allocate( INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker); queue_work(xfs_alloc_wq, &args->work); wait_for_completion(&done); + destroy_work_on_stack(&args->work); return args->result; } @@ -617,22 +618,27 @@ xfs_getbmap( return XFS_ERROR(ENOMEM); xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { - if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { + if (whichfork == XFS_DATA_FORK) { + if (!(iflags & BMV_IF_DELALLOC) && + (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) goto out_unlock_iolock; + + /* + * Even after flushing the inode, there can still be + * delalloc blocks on the inode beyond EOF due to + * speculative preallocation. These are not removed + * until the release function is called or the inode + * is inactivated. Hence we cannot assert here that + * ip->i_delayed_blks == 0. + */ } - /* - * even after flushing the inode, there can still be delalloc - * blocks on the inode beyond EOF due to speculative - * preallocation. These are not removed until the release - * function is called or the inode is inactivated. Hence we - * cannot assert here that ip->i_delayed_blks == 0. - */ - } - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); + } else { + lock = xfs_ilock_attr_map_shared(ip); + } /* * Don't let nex be bigger than the number of extents @@ -737,7 +743,7 @@ xfs_getbmap( out_free_map: kmem_free(map); out_unlock_ilock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); out_unlock_iolock: xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -1168,9 +1174,15 @@ xfs_zero_remaining_bytes( xfs_buf_unlock(bp); for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + uint lock_mode; + offset_fsb = XFS_B_TO_FSBT(mp, offset); nimap = 1; + + lock_mode = xfs_ilock_data_map_shared(ip); error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); + xfs_iunlock(ip, lock_mode); + if (error || nimap < 1) break; ASSERT(imap.br_blockcount >= 1); @@ -1187,7 +1199,12 @@ xfs_zero_remaining_bytes( XFS_BUF_UNWRITE(bp); XFS_BUF_READ(bp); XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); - xfsbdstrat(mp, bp); + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + break; + } + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, @@ -1200,7 +1217,12 @@ xfs_zero_remaining_bytes( XFS_BUF_UNDONE(bp); XFS_BUF_UNREAD(bp); XFS_BUF_WRITE(bp); - xfsbdstrat(mp, bp); + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + break; + } + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c7f0b77dcb0..51757113a82 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -445,8 +445,8 @@ _xfs_buf_find( numbytes = BBTOB(numblks); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(numbytes < (1 << btp->bt_sshift))); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we @@ -698,7 +698,11 @@ xfs_buf_read_uncached( bp->b_flags |= XBF_READ; bp->b_ops = ops; - xfsbdstrat(target->bt_mount, bp); + if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { + xfs_buf_relse(bp); + return NULL; + } + xfs_buf_iorequest(bp); xfs_buf_iowait(bp); return bp; } @@ -1089,7 +1093,7 @@ xfs_bioerror( * This is meant for userdata errors; metadata bufs come with * iodone functions attached, so that we can track down errors. */ -STATIC int +int xfs_bioerror_relse( struct xfs_buf *bp) { @@ -1152,7 +1156,7 @@ xfs_bwrite( ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); xfs_bdstrat_cb(bp); @@ -1164,25 +1168,6 @@ xfs_bwrite( return error; } -/* - * Wrapper around bdstrat so that we can stop data from going to disk in case - * we are shutting down the filesystem. Typically user data goes thru this - * path; one of the exceptions is the superblock. - */ -void -xfsbdstrat( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - if (XFS_FORCED_SHUTDOWN(mp)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - xfs_bioerror_relse(bp); - return; - } - - xfs_buf_iorequest(bp); -} - STATIC void _xfs_buf_ioend( xfs_buf_t *bp, @@ -1516,6 +1501,12 @@ xfs_wait_buftarg( struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_alert(btp->bt_mount, +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" +"Please run xfs_repair to determine the extent of the problem.", + (long long)bp->b_bn); + } xfs_buf_rele(bp); } if (loop++ != 0) @@ -1602,16 +1593,15 @@ xfs_free_buftarg( kmem_free(btp); } -STATIC int -xfs_setsize_buftarg_flags( +int +xfs_setsize_buftarg( xfs_buftarg_t *btp, unsigned int blocksize, - unsigned int sectorsize, - int verbose) + unsigned int sectorsize) { - btp->bt_bsize = blocksize; - btp->bt_sshift = ffs(sectorsize) - 1; - btp->bt_smask = sectorsize - 1; + /* Set up metadata sector size info */ + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { char name[BDEVNAME_SIZE]; @@ -1624,30 +1614,25 @@ xfs_setsize_buftarg_flags( return EINVAL; } + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + return 0; } /* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used at this early stage. Play safe. + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used at this early stage. Play safe. */ STATIC int xfs_setsize_buftarg_early( xfs_buftarg_t *btp, struct block_device *bdev) { - return xfs_setsize_buftarg_flags(btp, - PAGE_SIZE, bdev_logical_block_size(bdev), 0); -} - -int -xfs_setsize_buftarg( - xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize) -{ - return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); + return xfs_setsize_buftarg(btp, PAGE_SIZE, + bdev_logical_block_size(bdev)); } xfs_buftarg_t * @@ -1799,7 +1784,7 @@ __xfs_buf_delwri_submit( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); bp->b_flags |= XBF_WRITE; if (!wait) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index e6568336101..995339534db 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -45,6 +45,7 @@ typedef enum { #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */ /* I/O hints for the BIO layer */ #define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ @@ -70,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ + { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { XBF_SYNCIO, "SYNCIO" }, \ { XBF_FUA, "FUA" }, \ { XBF_FLUSH, "FLUSH" }, \ @@ -80,19 +82,34 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_DELWRI_Q, "DELWRI_Q" }, \ { _XBF_COMPOUND, "COMPOUND" } + /* * Internal state flags. */ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ +/* + * The xfs_buftarg contains 2 notions of "sector size" - + * + * 1) The metadata sector size, which is the minimum unit and + * alignment of IO which will be performed by metadata operations. + * 2) The device logical sector size + * + * The first is specified at mkfs time, and is stored on-disk in the + * superblock's sb_sectsize. + * + * The latter is derived from the underlying device, and controls direct IO + * alignment constraints. + */ typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct backing_dev_info *bt_bdi; struct xfs_mount *bt_mount; - unsigned int bt_bsize; - unsigned int bt_sshift; - size_t bt_smask; + unsigned int bt_meta_sectorsize; + size_t bt_meta_sectormask; + size_t bt_logical_sectorsize; + size_t bt_logical_sectormask; /* LRU control structures */ struct shrinker bt_shrinker; @@ -269,9 +286,6 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); - -extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *); - extern void xfs_buf_ioend(xfs_buf_t *, int); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); @@ -282,6 +296,8 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, #define xfs_buf_zero(bp, off, len) \ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) +extern int xfs_bioerror_relse(struct xfs_buf *); + static inline int xfs_buf_geterror(xfs_buf_t *bp) { return bp ? bp->b_error : ENOMEM; @@ -301,7 +317,8 @@ extern void xfs_buf_terminate(void); #define XFS_BUF_ZEROFLAGS(bp) \ ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ - XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) + XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \ + XBF_WRITE_FAIL)) void xfs_buf_stale(struct xfs_buf *bp); #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a64f67ba25d..33149113e33 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -182,21 +182,47 @@ xfs_buf_item_size( trace_xfs_buf_item_size(bip); } -static struct xfs_log_iovec * +static inline void +xfs_buf_item_copy_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + struct xfs_buf *bp, + uint offset, + int first_bit, + uint nbits) +{ + offset += first_bit * XFS_BLF_CHUNK; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, + xfs_buf_offset(bp, offset), + nbits * XFS_BLF_CHUNK); +} + +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int next_bit, + int last_bit) +{ + return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + + XFS_BLF_CHUNK); +} + +static void xfs_buf_item_format_segment( struct xfs_buf_log_item *bip, - struct xfs_log_iovec *vecp, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, uint offset, struct xfs_buf_log_format *blfp) { struct xfs_buf *bp = bip->bli_buf; uint base_size; - uint nvecs; int first_bit; int last_bit; int next_bit; uint nbits; - uint buffer_offset; /* copy the flags across from the base format item */ blfp->blf_flags = bip->__bli_format.blf_flags; @@ -208,21 +234,17 @@ xfs_buf_item_format_segment( */ base_size = xfs_buf_log_format_size(blfp); - nvecs = 0; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { /* * If the map is not be dirty in the transaction, mark * the size as zero and do not advance the vector pointer. */ - goto out; + return; } - vecp->i_addr = blfp; - vecp->i_len = base_size; - vecp->i_type = XLOG_REG_TYPE_BFORMAT; - vecp++; - nvecs = 1; + blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); + blfp->blf_size = 1; if (bip->bli_flags & XFS_BLI_STALE) { /* @@ -232,14 +254,13 @@ xfs_buf_item_format_segment( */ trace_xfs_buf_item_format_stale(bip); ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); - goto out; + return; } /* * Fill in an iovec for each set of contiguous chunks. */ - last_bit = first_bit; nbits = 1; for (;;) { @@ -252,42 +273,22 @@ xfs_buf_item_format_segment( next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, (uint)last_bit + 1); /* - * If we run out of bits fill in the last iovec and get - * out of the loop. - * Else if we start a new set of bits then fill in the - * iovec for the series we were looking at and start - * counting the bits in the new one. - * Else we're still in the same set of bits so just - * keep counting and scanning. + * If we run out of bits fill in the last iovec and get out of + * the loop. Else if we start a new set of bits then fill in + * the iovec for the series we were looking at and start + * counting the bits in the new one. Else we're still in the + * same set of bits so just keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; break; - } else if (next_bit != last_bit + 1) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else if (xfs_buf_offset(bp, offset + - (next_bit << XFS_BLF_SHIFT)) != - (xfs_buf_offset(bp, offset + - (last_bit << XFS_BLF_SHIFT)) + - XFS_BLF_CHUNK)) { - buffer_offset = offset + first_bit * XFS_BLF_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLF_CHUNK; - vecp->i_type = XLOG_REG_TYPE_BCHUNK; - nvecs++; - vecp++; + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; first_bit = next_bit; last_bit = next_bit; nbits = 1; @@ -296,9 +297,6 @@ xfs_buf_item_format_segment( nbits++; } } -out: - blfp->blf_size = nvecs; - return vecp; } /* @@ -310,10 +308,11 @@ out: STATIC void xfs_buf_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) + struct xfs_log_vec *lv) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_iovec *vecp = NULL; uint offset = 0; int i; @@ -354,8 +353,8 @@ xfs_buf_item_format( } for (i = 0; i < bip->bli_format_count; i++) { - vecp = xfs_buf_item_format_segment(bip, vecp, offset, - &bip->bli_formats[i]); + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); offset += bp->b_maps[i].bm_len; } @@ -496,6 +495,14 @@ xfs_buf_item_unpin( } } +/* + * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 + * seconds so as to not spam logs too much on repeated detection of the same + * buffer being bad.. + */ + +DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); + STATIC uint xfs_buf_item_push( struct xfs_log_item *lip, @@ -524,6 +531,14 @@ xfs_buf_item_push( trace_xfs_buf_item_push(bip); + /* has a previous flush failed due to IO errors? */ + if ((bp->b_flags & XBF_WRITE_FAIL) && + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { + xfs_warn(bp->b_target->bt_mount, +"Detected failing async write on buffer block 0x%llx. Retrying async write.\n", + (long long)bp->b_bn); + } + if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_unlock(bp); @@ -1096,8 +1111,9 @@ xfs_buf_iodone_callbacks( xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ - if (!XFS_BUF_ISSTALE(bp)) { - bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE; + if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { + bp->b_flags |= XBF_WRITE | XBF_ASYNC | + XBF_DONE | XBF_WRITE_FAIL; xfs_buf_iorequest(bp); } else { xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 56369d4509d..48c7d18f68c 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -2067,12 +2067,12 @@ xfs_dir2_node_lookup( */ int /* error */ xfs_dir2_node_removename( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { - xfs_da_state_blk_t *blk; /* leaf block */ + struct xfs_da_state_blk *blk; /* leaf block */ int error; /* error return value */ int rval; /* operation return value */ - xfs_da_state_t *state; /* btree cursor */ + struct xfs_da_state *state; /* btree cursor */ trace_xfs_dir2_node_removename(args); @@ -2084,19 +2084,18 @@ xfs_dir2_node_removename( state->mp = args->dp->i_mount; state->blocksize = state->mp->m_dirblksize; state->node_ents = state->mp->m_dir_node_ents; - /* - * Look up the entry we're deleting, set up the cursor. - */ + + /* Look up the entry we're deleting, set up the cursor. */ error = xfs_da3_node_lookup_int(state, &rval); if (error) - rval = error; - /* - * Didn't find it, upper layer screwed up. - */ + goto out_free; + + /* Didn't find it, upper layer screwed up. */ if (rval != EEXIST) { - xfs_da_state_free(state); - return rval; + error = rval; + goto out_free; } + blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(state->extravalid); @@ -2107,7 +2106,7 @@ xfs_dir2_node_removename( error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, &state->extrablk, &rval); if (error) - return error; + goto out_free; /* * Fix the hash values up the btree. */ @@ -2122,6 +2121,7 @@ xfs_dir2_node_removename( */ if (!error) error = xfs_dir2_node_to_leaf(state); +out_free: xfs_da_state_free(state); return error; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index c4e50c6ed58..aead369e1c3 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -674,6 +674,7 @@ xfs_readdir( { int rval; /* return value */ int v; /* type-checking value */ + uint lock_mode; trace_xfs_readdir(dp); @@ -683,6 +684,7 @@ xfs_readdir( ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_getdents); + lock_mode = xfs_ilock_data_map_shared(dp); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(dp, ctx); else if ((rval = xfs_dir2_isblock(NULL, dp, &v))) @@ -691,5 +693,7 @@ xfs_readdir( rval = xfs_dir2_block_getdents(dp, ctx); else rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize); + xfs_iunlock(dp, lock_mode); + return rval; } diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index aafc6e46cb5..3725fb1b902 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -170,6 +170,7 @@ xfs_dir2_block_to_sf( char *ptr; /* current data pointer */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ + xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */ trace_xfs_dir2_block_to_sf(args); @@ -177,35 +178,20 @@ xfs_dir2_block_to_sf( mp = dp->i_mount; /* - * Make a copy of the block data, so we can shrink the inode - * and add local data. + * allocate a temporary destination buffer the size of the inode + * to format the data into. Once we have formatted the data, we + * can free the block and copy the formatted data into the inode literal + * area. */ - hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP); - memcpy(hdr, bp->b_addr, mp->m_dirblksize); - logflags = XFS_ILOG_CORE; - if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { - ASSERT(error != ENOSPC); - goto out; - } + dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); + hdr = bp->b_addr; /* - * The buffer is now unconditionally gone, whether - * xfs_dir2_shrink_inode worked or not. - * - * Convert the inode to local format. - */ - dp->i_df.if_flags &= ~XFS_IFEXTENTS; - dp->i_df.if_flags |= XFS_IFINLINE; - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; - ASSERT(dp->i_df.if_bytes == 0); - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - logflags |= XFS_ILOG_DDATA; - /* * Copy the header into the newly allocate local space. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dst; memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); - dp->i_d.di_size = size; + /* * Set up to loop over the block's entries. */ @@ -258,10 +244,34 @@ xfs_dir2_block_to_sf( ptr += dp->d_ops->data_entsize(dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); + + /* now we are done with the block, we can shrink the inode */ + logflags = XFS_ILOG_CORE; + error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp); + if (error) { + ASSERT(error != ENOSPC); + goto out; + } + + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format and copy the data in. + */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + + logflags |= XFS_ILOG_DDATA; + memcpy(dp->i_df.if_u1.if_data, dst, size); + dp->i_d.di_size = size; xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(hdr); + kmem_free(dst); return error; } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8367d6dc18c..4f11ef01113 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -157,7 +157,7 @@ xfs_ioc_trim( struct xfs_mount *mp, struct fstrim_range __user *urange) { - struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); unsigned int granularity = q->limits.discard_granularity; struct fstrim_range range; xfs_daddr_t start, end, minlen; @@ -180,7 +180,8 @@ xfs_ioc_trim( * matter as trimming blocks is an advisory interface. */ if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || - range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.len < mp->m_sb.sb_blocksize) return -XFS_ERROR(EINVAL); start = BTOBB(range.start); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 6b1e695caf0..7aeb4c895b3 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -469,16 +469,17 @@ xfs_qm_dqtobp( struct xfs_mount *mp = dqp->q_mount; xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); struct xfs_trans *tp = (tpp ? *tpp : NULL); + uint lock_mode; dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; - xfs_ilock(quotip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(quotip); if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. */ - xfs_iunlock(quotip, XFS_ILOCK_SHARED); + xfs_iunlock(quotip, lock_mode); return ESRCH; } @@ -488,7 +489,7 @@ xfs_qm_dqtobp( error = xfs_bmapi_read(quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); - xfs_iunlock(quotip, XFS_ILOCK_SHARED); + xfs_iunlock(quotip, lock_mode); if (error) return error; diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 92e5f62eefc..f33fbaaa4d8 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -57,20 +57,24 @@ xfs_qm_dquot_logitem_size( STATIC void xfs_qm_dquot_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *logvec) + struct xfs_log_vec *lv) { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); - - logvec->i_addr = &qlip->qli_format; - logvec->i_len = sizeof(xfs_dq_logformat_t); - logvec->i_type = XLOG_REG_TYPE_QFORMAT; - logvec++; - logvec->i_addr = &qlip->qli_dquot->q_core; - logvec->i_len = sizeof(xfs_disk_dquot_t); - logvec->i_type = XLOG_REG_TYPE_DQUOT; - - qlip->qli_format.qlf_size = 2; - + struct xfs_log_iovec *vecp = NULL; + struct xfs_dq_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); + qlf->qlf_type = XFS_LI_DQUOT; + qlf->qlf_size = 2; + qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); + qlf->qlf_blkno = qlip->qli_dquot->q_blkno; + qlf->qlf_len = 1; + qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, + &qlip->qli_dquot->q_core, + sizeof(struct xfs_disk_dquot)); } /* @@ -257,18 +261,6 @@ xfs_qm_dquot_logitem_init( xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, &xfs_dquot_item_ops); lp->qli_dquot = dqp; - lp->qli_format.qlf_type = XFS_LI_DQUOT; - lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); - lp->qli_format.qlf_blkno = dqp->q_blkno; - lp->qli_format.qlf_len = 1; - /* - * This is just the offset of this dquot within its buffer - * (which is currently 1 FSB and probably won't change). - * Hence 32 bits for this offset should be just fine. - * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t)) - * here, and recompute it at recovery time. - */ - lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset; } /*------------------ QUOTAOFF LOG ITEMS -------------------*/ @@ -294,26 +286,20 @@ xfs_qm_qoff_logitem_size( *nbytes += sizeof(struct xfs_qoff_logitem); } -/* - * This is called to fill in the vector of log iovecs for the - * given quotaoff log item. We use only 1 iovec, and we point that - * at the quotaoff_log_format structure embedded in the quotaoff item. - * It is at this point that we assert that all of the extent - * slots in the quotaoff item have been filled. - */ STATIC void xfs_qm_qoff_logitem_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); - - ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF); - - log_vector->i_addr = &qflip->qql_format; - log_vector->i_len = sizeof(xfs_qoff_logitem_t); - log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF; - qflip->qql_format.qf_size = 1; + struct xfs_log_iovec *vecp = NULL; + struct xfs_qoff_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); + qlf->qf_type = XFS_LI_QUOTAOFF; + qlf->qf_size = 1; + qlf->qf_flags = qflip->qql_flags; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); } /* @@ -453,8 +439,7 @@ xfs_qm_qoff_logitem_init( xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); qf->qql_item.li_mountp = mp; - qf->qql_format.qf_type = XFS_LI_QUOTAOFF; - qf->qql_format.qf_flags = flags; qf->qql_start_lip = start; + qf->qql_flags = flags; return qf; } diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 5acae2ada70..502e9464634 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -27,13 +27,12 @@ typedef struct xfs_dq_logitem { xfs_log_item_t qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ - xfs_dq_logformat_t qli_format; /* logged structure */ } xfs_dq_logitem_t; typedef struct xfs_qoff_logitem { xfs_log_item_t qql_item; /* common portion */ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ - xfs_qoff_logformat_t qql_format; /* logged structure */ + unsigned int qql_flags; } xfs_qoff_logitem_t; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3680d04f973..fb7a4c1ce1c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -26,6 +26,7 @@ #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_extfree_item.h" +#include "xfs_log.h" kmem_zone_t *xfs_efi_zone; @@ -101,9 +102,10 @@ xfs_efi_item_size( STATIC void xfs_efi_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; ASSERT(atomic_read(&efip->efi_next_extent) == efip->efi_format.efi_nextents); @@ -111,10 +113,9 @@ xfs_efi_item_format( efip->efi_format.efi_type = XFS_LI_EFI; efip->efi_format.efi_size = 1; - log_vector->i_addr = &efip->efi_format; - log_vector->i_len = xfs_efi_item_sizeof(efip); - log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; - ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, + &efip->efi_format, + xfs_efi_item_sizeof(efip)); } @@ -368,19 +369,19 @@ xfs_efd_item_size( STATIC void xfs_efd_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); efdp->efd_format.efd_type = XFS_LI_EFD; efdp->efd_format.efd_size = 1; - log_vector->i_addr = &efdp->efd_format; - log_vector->i_len = xfs_efd_item_sizeof(efdp); - log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; - ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, + &efdp->efd_format, + xfs_efd_item_sizeof(efdp)); } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 52c91e14372..2e7989e3a2d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -261,7 +261,8 @@ xfs_file_aio_read( xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (size & target->bt_smask)) { + /* DIO must be aligned to device logical sector size */ + if ((pos | size) & target->bt_logical_sectormask) { if (pos == i_size_read(inode)) return 0; return -XFS_ERROR(EINVAL); @@ -641,9 +642,11 @@ xfs_file_dio_aio_write( struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->bt_smask) || (count & target->bt_smask)) + /* DIO must be aligned to device logical sector size */ + if ((pos | count) & target->bt_logical_sectormask) return -XFS_ERROR(EINVAL); + /* "unaligned" here means not aligned to a filesystem block */ if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) unaligned_io = 1; @@ -912,7 +915,7 @@ xfs_dir_open( * If there are any blocks, read-ahead block 0 as we're almost * certain to have the next operation be a read there. */ - mode = xfs_ilock_map_shared(ip); + mode = xfs_ilock_data_map_shared(ip); if (ip->i_d.di_nextents > 0) xfs_dir3_data_readahead(NULL, ip, 0, -1); xfs_iunlock(ip, mode); @@ -1215,7 +1218,7 @@ xfs_seek_data( uint lock; int error; - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { @@ -1294,7 +1297,7 @@ out: offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); if (error) return -error; @@ -1319,7 +1322,7 @@ xfs_seek_hole( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - lock = xfs_ilock_map_shared(ip); + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); if (start >= isize) { @@ -1402,7 +1405,7 @@ out: offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: - xfs_iunlock_map_shared(ip, lock); + xfs_iunlock(ip, lock); if (error) return -error; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a6e54b3319b..02fb943cbf2 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -220,6 +220,8 @@ xfs_growfs_data_private( */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + __be32 *agfl_bno; + /* * AG freespace header block */ @@ -279,8 +281,10 @@ xfs_growfs_data_private( agfl->agfl_seqno = cpu_to_be32(agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) - agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index e87719c5beb..5d7f105a1c8 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -52,7 +52,7 @@ xfs_ialloc_cluster_alignment( { if (xfs_sb_version_hasalign(&args->mp->m_sb) && args->mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) + XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size)) return args->mp->m_sb.sb_inoalignmt; return 1; } @@ -170,27 +170,20 @@ xfs_ialloc_inode_init( { struct xfs_buf *fbuf; struct xfs_dinode *free; - int blks_per_cluster, nbufs, ninodes; + int nbufs, blks_per_cluster, inodes_per_cluster; int version; int i, j; xfs_daddr_t d; xfs_ino_t ino = 0; /* - * Loop over the new block(s), filling in the inodes. - * For small block sizes, manipulate the inodes in buffers - * which are multiples of the blocks size. + * Loop over the new block(s), filling in the inodes. For small block + * sizes, manipulate the inodes in buffers which are multiples of the + * blocks size. */ - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { - blks_per_cluster = 1; - nbufs = length; - ninodes = mp->m_sb.sb_inopblock; - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; - nbufs = length / blks_per_cluster; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = length / blks_per_cluster; /* * Figure out what version number to use in the inodes we create. If @@ -225,7 +218,7 @@ xfs_ialloc_inode_init( * they track in the AIL as if they were physically logged. */ if (tp) - xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), + xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, mp->m_sb.sb_inodesize, length, gen); } else if (xfs_sb_version_hasnlink(&mp->m_sb)) version = 2; @@ -246,7 +239,7 @@ xfs_ialloc_inode_init( /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); - for (i = 0; i < ninodes; i++) { + for (i = 0; i < inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; uint isize = xfs_dinode_size(version); @@ -329,11 +322,11 @@ xfs_ialloc_ag_alloc( * Locking will ensure that we don't have two callers in here * at one time. */ - newlen = XFS_IALLOC_INODES(args.mp); + newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) return XFS_ERROR(ENOSPC); - args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); + args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. If the filesystem is striped, this will fill @@ -343,7 +336,7 @@ xfs_ialloc_ag_alloc( newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.mp->m_ialloc_blks; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); @@ -585,7 +578,7 @@ xfs_ialloc_ag_select( * Is there enough free space for the file plus a block of * inodes? (if we need to allocate some)? */ - ineed = XFS_IALLOC_BLOCKS(mp); + ineed = mp->m_ialloc_blks; longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; @@ -999,7 +992,7 @@ xfs_dialloc( * inode. */ if (mp->m_maxicount && - mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { + mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { noroom = 1; okalloc = 0; } @@ -1202,7 +1195,7 @@ xfs_difree( * When an inode cluster is free, it becomes eligible for removal */ if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { + (rec.ir_freecount == mp->m_ialloc_inos)) { *delete = 1; *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); @@ -1212,7 +1205,7 @@ xfs_difree( * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ - ilen = XFS_IALLOC_INODES(mp); + ilen = mp->m_ialloc_inos; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1228,9 +1221,9 @@ xfs_difree( goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, - agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), - XFS_IALLOC_BLOCKS(mp), flist, mp); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), + mp->m_ialloc_blks, flist, mp); } else { *delete = 0; @@ -1311,7 +1304,7 @@ xfs_imap_lookup( /* check that the returned record contains the required inode */ if (rec.ir_startino > agino || - rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + rec.ir_startino + mp->m_ialloc_inos <= agino) return EINVAL; /* for untrusted inodes check it is allocated first */ @@ -1384,7 +1377,7 @@ xfs_imap( return XFS_ERROR(EINVAL); } - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; + blks_per_cluster = xfs_icluster_size_fsb(mp); /* * For bulkstat and handle lookups, we have an untrusted inode number @@ -1405,7 +1398,7 @@ xfs_imap( * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. */ - if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { + if (blks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index a8f76a5ff41..812365d17e6 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -25,17 +25,18 @@ struct xfs_mount; struct xfs_trans; struct xfs_btree_cur; -/* - * Allocation parameters for inode allocation. - */ -#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos -#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks - -/* - * Move inodes in clusters of this size. - */ +/* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 -#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size + +/* Calculate and return the number of filesystem blocks per inode cluster */ +static inline int +xfs_icluster_size_fsb( + struct xfs_mount *mp) +{ + if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) + return 1; + return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; +} /* * Make an inode pointer out of the buffer/offset. diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d2eaccfa73f..7e454923325 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -28,6 +28,7 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" #include "xfs_icreate_item.h" +#include "xfs_log.h" kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ @@ -58,13 +59,14 @@ xfs_icreate_item_size( STATIC void xfs_icreate_item_format( struct xfs_log_item *lip, - struct xfs_log_iovec *log_vector) + struct xfs_log_vec *lv) { struct xfs_icreate_item *icp = ICR_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; - log_vector->i_addr = (xfs_caddr_t)&icp->ic_format; - log_vector->i_len = sizeof(struct xfs_icreate_log); - log_vector->i_type = XLOG_REG_TYPE_ICREATE; + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE, + &icp->ic_format, + sizeof(struct xfs_icreate_log)); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 001aa893ed5..3a137e9f9a7 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -77,48 +77,44 @@ xfs_get_extsz_hint( } /* - * This is a wrapper routine around the xfs_ilock() routine used to centralize - * some grungy code. It is used in places that wish to lock the inode solely - * for reading the extents. The reason these places can't just call - * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode is in b-tree - * format, then we need to lock the inode exclusively until the extents are read - * in. Locking it exclusively all the time would limit our parallelism - * unnecessarily, though. What we do instead is check to see if the extents - * have been read in yet, and only lock the inode exclusively if they have not. + * These two are wrapper routines around the xfs_ilock() routine used to + * centralize some grungy code. They are used in places that wish to lock the + * inode solely for reading the extents. The reason these places can't just + * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to + * bringing in of the extents from disk for a file in b-tree format. If the + * inode is in b-tree format, then we need to lock the inode exclusively until + * the extents are read in. Locking it exclusively all the time would limit + * our parallelism unnecessarily, though. What we do instead is check to see + * if the extents have been read in yet, and only lock the inode exclusively + * if they have not. * - * The function returns a value which should be given to the corresponding - * xfs_iunlock_map_shared(). This value is the mode in which the lock was - * actually taken. + * The functions return a value which should be given to the corresponding + * xfs_iunlock() call. */ uint -xfs_ilock_map_shared( - xfs_inode_t *ip) +xfs_ilock_data_map_shared( + struct xfs_inode *ip) { - uint lock_mode; + uint lock_mode = XFS_ILOCK_SHARED; - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - xfs_ilock(ip, lock_mode); - return lock_mode; } -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) +uint +xfs_ilock_attr_map_shared( + struct xfs_inode *ip) { - xfs_iunlock(ip, lock_mode); + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; } /* @@ -588,9 +584,9 @@ xfs_lookup( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return XFS_ERROR(EIO); - lock_mode = xfs_ilock_map_shared(dp); + lock_mode = xfs_ilock_data_map_shared(dp); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); - xfs_iunlock_map_shared(dp, lock_mode); + xfs_iunlock(dp, lock_mode); if (error) goto out; @@ -2141,8 +2137,8 @@ xfs_ifree_cluster( { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; + int inodes_per_cluster; int nbufs; - int ninodes; int i, j; xfs_daddr_t blkno; xfs_buf_t *bp; @@ -2152,18 +2148,11 @@ xfs_ifree_cluster( struct xfs_perag *pag; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { - blks_per_cluster = 1; - ninodes = mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp); - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; - } + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = mp->m_ialloc_blks / blks_per_cluster; - for (j = 0; j < nbufs; j++, inum += ninodes) { + for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -2225,7 +2214,7 @@ xfs_ifree_cluster( * transaction stale above, which means there is no point in * even trying to lock them. */ - for (i = 0; i < ninodes; i++) { + for (i = 0; i < inodes_per_cluster; i++) { retry: rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, @@ -2906,13 +2895,13 @@ xfs_iflush_cluster( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; + inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); if (!ilist) goto out_put; - mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 9e6efccbae0..65e2350f449 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -337,8 +337,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); -uint xfs_ilock_map_shared(xfs_inode_t *); -void xfs_iunlock_map_shared(xfs_inode_t *, uint); +uint xfs_ilock_data_map_shared(struct xfs_inode *); +uint xfs_ilock_attr_map_shared(struct xfs_inode *); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_buf **, xfs_inode_t **); diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c index cfee14a83cf..73514c0486b 100644 --- a/fs/xfs/xfs_inode_fork.c +++ b/fs/xfs/xfs_inode_fork.c @@ -431,6 +431,8 @@ xfs_iread_extents( xfs_ifork_t *ifp; xfs_extnum_t nextents; + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, ip->i_mount); @@ -721,15 +723,16 @@ xfs_idestroy_fork( } /* - * xfs_iextents_copy() + * Convert in-core extents to on-disk form * - * This is called to copy the REAL extents (as opposed to the delayed - * allocation extents) from the inode into the given buffer. It - * returns the number of bytes copied into the buffer. + * For either the data or attr fork in extent format, we need to endian convert + * the in-core extent as we place them into the on-disk inode. * - * If there are no delayed allocation extents, then we can just - * memcpy() the extents into the buffer. Otherwise, we need to - * examine each extent in turn and skip those which are delayed. + * In the case of the data fork, the in-core and on-disk fork sizes can be + * different due to delayed allocation extents. We only copy on-disk extents + * here, so callers must always use the physical fork size to determine the + * size of the buffer passed to this routine. We will return the size actually + * used. */ int xfs_iextents_copy( diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7c0d391f9a6..686889b4a1e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -30,6 +30,7 @@ #include "xfs_trace.h" #include "xfs_trans_priv.h" #include "xfs_dinode.h" +#include "xfs_log.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ @@ -39,27 +40,14 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); } - -/* - * This returns the number of iovecs needed to log the given inode item. - * - * We need one iovec for the inode log format structure, one for the - * inode core, and possibly one for the inode data/extents/b-tree root - * and one for the inode attribute data/extents/b-tree root. - */ STATIC void -xfs_inode_item_size( - struct xfs_log_item *lip, +xfs_inode_item_data_fork_size( + struct xfs_inode_log_item *iip, int *nvecs, int *nbytes) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - *nvecs += 2; - *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_icdinode_size(ip->i_d.di_version); - switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_DEXT) && @@ -70,7 +58,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { @@ -78,7 +65,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { @@ -90,19 +76,20 @@ xfs_inode_item_size( case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_UUID: break; - default: ASSERT(0); break; } +} - if (!XFS_IFORK_Q(ip)) - return; - +STATIC void +xfs_inode_item_attr_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; - /* - * Log any necessary attribute data. - */ switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && @@ -113,7 +100,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_ABROOT) && ip->i_afp->if_broot_bytes > 0) { @@ -121,7 +107,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { @@ -129,7 +114,6 @@ xfs_inode_item_size( *nvecs += 1; } break; - default: ASSERT(0); break; @@ -137,98 +121,67 @@ xfs_inode_item_size( } /* - * xfs_inode_item_format_extents - convert in-core extents to on-disk form - * - * For either the data or attr fork in extent format, we need to endian convert - * the in-core extent as we place them into the on-disk inode. In this case, we - * need to do this conversion before we write the extents into the log. Because - * we don't have the disk inode to write into here, we allocate a buffer and - * format the extents into it via xfs_iextents_copy(). We free the buffer in - * the unlock routine after the copy for the log has been made. + * This returns the number of iovecs needed to log the given inode item. * - * In the case of the data fork, the in-core and on-disk fork sizes can be - * different due to delayed allocation extents. We only log on-disk extents - * here, so always use the physical fork size to determine the size of the - * buffer we need to allocate. + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. */ STATIC void -xfs_inode_item_format_extents( - struct xfs_inode *ip, - struct xfs_log_iovec *vecp, - int whichfork, - int type) +xfs_inode_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - xfs_bmbt_rec_t *ext_buffer; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; - ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP); - if (whichfork == XFS_DATA_FORK) - ip->i_itemp->ili_extents_buf = ext_buffer; - else - ip->i_itemp->ili_aextents_buf = ext_buffer; + *nvecs += 2; + *nbytes += sizeof(struct xfs_inode_log_format) + + xfs_icdinode_size(ip->i_d.di_version); - vecp->i_addr = ext_buffer; - vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork); - vecp->i_type = type; + xfs_inode_item_data_fork_size(iip, nvecs, nbytes); + if (XFS_IFORK_Q(ip)) + xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); } /* - * This is called to fill in the vector of log iovecs for the - * given inode log item. It fills the first item with an inode - * log format structure, the second with the on-disk inode structure, - * and a possible third and/or fourth with the inode data/extents/b-tree - * root and inode attributes data/extents/b-tree root. + * If this is a v1 format inode, then we need to log it as such. This means + * that we have to copy the link count from the new field to the old. We + * don't have to worry about the new fields, because nothing trusts them as + * long as the old inode version number is there. */ STATIC void -xfs_inode_item_format( - struct xfs_log_item *lip, - struct xfs_log_iovec *vecp) +xfs_inode_item_format_v1_inode( + struct xfs_inode *ip) +{ + if (!xfs_sb_version_hasnlink(&ip->i_mount->m_sb)) { + /* + * Convert it back. + */ + ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); + ip->i_d.di_onlink = ip->i_d.di_nlink; + } else { + /* + * The superblock version has already been bumped, + * so just make the conversion to the new inode + * format permanent. + */ + ip->i_d.di_version = 2; + ip->i_d.di_onlink = 0; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + } +} + +STATIC void +xfs_inode_item_format_data_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) { - struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - uint nvecs; size_t data_bytes; - xfs_mount_t *mp; - - vecp->i_addr = &iip->ili_format; - vecp->i_len = sizeof(xfs_inode_log_format_t); - vecp->i_type = XLOG_REG_TYPE_IFORMAT; - vecp++; - nvecs = 1; - - vecp->i_addr = &ip->i_d; - vecp->i_len = xfs_icdinode_size(ip->i_d.di_version); - vecp->i_type = XLOG_REG_TYPE_ICORE; - vecp++; - nvecs++; - - /* - * If this is really an old format inode, then we need to - * log it as such. This means that we have to copy the link - * count from the new field to the old. We don't have to worry - * about the new fields, because nothing trusts them as long as - * the old inode version number is there. If the superblock already - * has a new version number, then we don't bother converting back. - */ - mp = ip->i_mount; - ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb)); - if (ip->i_d.di_version == 1) { - if (!xfs_sb_version_hasnlink(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - ip->i_d.di_onlink = ip->i_d.di_nlink; - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = 2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - } - } switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: @@ -239,36 +192,23 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_DEXT) && ip->i_d.di_nextents > 0 && ip->i_df.if_bytes > 0) { + struct xfs_bmbt_rec *p; + ASSERT(ip->i_df.if_u1.if_extents != NULL); ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); - ASSERT(iip->ili_extents_buf == NULL); - -#ifdef XFS_NATIVE_HOST - if (ip->i_d.di_nextents == ip->i_df.if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)) { - /* - * There are no delayed allocation - * extents, so just point to the - * real extents array. - */ - vecp->i_addr = ip->i_df.if_u1.if_extents; - vecp->i_len = ip->i_df.if_bytes; - vecp->i_type = XLOG_REG_TYPE_IEXT; - } else -#endif - { - xfs_inode_item_format_extents(ip, vecp, - XFS_DATA_FORK, XLOG_REG_TYPE_IEXT); - } - ASSERT(vecp->i_len <= ip->i_df.if_bytes); - iip->ili_format.ilf_dsize = vecp->i_len; - vecp++; - nvecs++; + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ASSERT(data_bytes <= ip->i_df.if_bytes); + + ilf->ilf_dsize = data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DEXT; } break; - case XFS_DINODE_FMT_BTREE: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | @@ -277,80 +217,70 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_DBROOT) && ip->i_df.if_broot_bytes > 0) { ASSERT(ip->i_df.if_broot != NULL); - vecp->i_addr = ip->i_df.if_broot; - vecp->i_len = ip->i_df.if_broot_bytes; - vecp->i_type = XLOG_REG_TYPE_IBROOT; - vecp++; - nvecs++; - iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT, + ip->i_df.if_broot, + ip->i_df.if_broot_bytes); + ilf->ilf_dsize = ip->i_df.if_broot_bytes; + ilf->ilf_size++; } else { ASSERT(!(iip->ili_fields & XFS_ILOG_DBROOT)); iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; - case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV | XFS_ILOG_UUID); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); - ASSERT(ip->i_d.di_size > 0); - - vecp->i_addr = ip->i_df.if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_df.if_bytes, 4); - ASSERT((ip->i_df.if_real_bytes == 0) || - (ip->i_df.if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - vecp->i_type = XLOG_REG_TYPE_ILOCAL; - vecp++; - nvecs++; - iip->ili_format.ilf_dsize = (unsigned)data_bytes; + ASSERT(ip->i_df.if_real_bytes == 0 || + ip->i_df.if_real_bytes == data_bytes); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, + ip->i_df.if_u1.if_data, data_bytes); + ilf->ilf_dsize = (unsigned)data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_DDATA; } break; - case XFS_DINODE_FMT_DEV: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT | XFS_ILOG_UUID); - if (iip->ili_fields & XFS_ILOG_DEV) { - iip->ili_format.ilf_u.ilfu_rdev = - ip->i_df.if_u2.if_rdev; - } + if (iip->ili_fields & XFS_ILOG_DEV) + ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; break; - case XFS_DINODE_FMT_UUID: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT | XFS_ILOG_DEV); - if (iip->ili_fields & XFS_ILOG_UUID) { - iip->ili_format.ilf_u.ilfu_uuid = - ip->i_df.if_u2.if_uuid; - } + if (iip->ili_fields & XFS_ILOG_UUID) + ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; break; - default: ASSERT(0); break; } +} - /* - * If there are no attributes associated with the file, then we're done. - */ - if (!XFS_IFORK_Q(ip)) { - iip->ili_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); - goto out; - } +STATIC void +xfs_inode_item_format_attr_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: @@ -360,30 +290,22 @@ xfs_inode_item_format( if ((iip->ili_fields & XFS_ILOG_AEXT) && ip->i_d.di_anextents > 0 && ip->i_afp->if_bytes > 0) { + struct xfs_bmbt_rec *p; + ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); -#ifdef XFS_NATIVE_HOST - /* - * There are not delayed allocation extents - * for attributes, so just point at the array. - */ - vecp->i_addr = ip->i_afp->if_u1.if_extents; - vecp->i_len = ip->i_afp->if_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_EXT; -#else - ASSERT(iip->ili_aextents_buf == NULL); - xfs_inode_item_format_extents(ip, vecp, - XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT); -#endif - iip->ili_format.ilf_asize = vecp->i_len; - vecp++; - nvecs++; + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ilf->ilf_asize = data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_AEXT; } break; - case XFS_DINODE_FMT_BTREE: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); @@ -392,61 +314,89 @@ xfs_inode_item_format( ip->i_afp->if_broot_bytes > 0) { ASSERT(ip->i_afp->if_broot != NULL); - vecp->i_addr = ip->i_afp->if_broot; - vecp->i_len = ip->i_afp->if_broot_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT; - vecp++; - nvecs++; - iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, + ip->i_afp->if_broot, + ip->i_afp->if_broot_bytes); + ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ABROOT; } break; - case XFS_DINODE_FMT_LOCAL: iip->ili_fields &= ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_afp->if_bytes > 0) { - ASSERT(ip->i_afp->if_u1.if_data != NULL); - - vecp->i_addr = ip->i_afp->if_u1.if_data; /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_afp->if_bytes, 4); - ASSERT((ip->i_afp->if_real_bytes == 0) || - (ip->i_afp->if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL; - vecp++; - nvecs++; - iip->ili_format.ilf_asize = (unsigned)data_bytes; + ASSERT(ip->i_afp->if_real_bytes == 0 || + ip->i_afp->if_real_bytes == data_bytes); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, + ip->i_afp->if_u1.if_data, + data_bytes); + ilf->ilf_asize = (unsigned)data_bytes; + ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; } break; - default: ASSERT(0); break; } - -out: - /* - * Now update the log format that goes out to disk from the in-core - * values. We always write the inode core to make the arithmetic - * games in recovery easier, which isn't a big deal as just about any - * transaction would dirty it anyway. - */ - iip->ili_format.ilf_fields = XFS_ILOG_CORE | - (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); - iip->ili_format.ilf_size = nvecs; } +/* + * This is called to fill in the vector of log iovecs for the given inode + * log item. It fills the first item with an inode log format structure, + * the second with the on-disk inode structure, and a possible third and/or + * fourth with the inode data/extents/b-tree root and inode attributes + * data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_inode_log_format *ilf; + struct xfs_log_iovec *vecp = NULL; + + ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); + ilf->ilf_type = XFS_LI_INODE; + ilf->ilf_ino = ip->i_ino; + ilf->ilf_blkno = ip->i_imap.im_blkno; + ilf->ilf_len = ip->i_imap.im_len; + ilf->ilf_boffset = ip->i_imap.im_boffset; + ilf->ilf_fields = XFS_ILOG_CORE; + ilf->ilf_size = 2; /* format + core */ + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + if (ip->i_d.di_version == 1) + xfs_inode_item_format_v1_inode(ip); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, + &ip->i_d, + xfs_icdinode_size(ip->i_d.di_version)); + + xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); + if (XFS_IFORK_Q(ip)) { + xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); + } else { + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + } + + /* update the format with the exact fields we actually logged */ + ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); +} /* * This is called to pin the inode associated with the inode log @@ -563,27 +513,6 @@ xfs_inode_item_unlock( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - /* - * If the inode needed a separate buffer with which to log - * its extents, then free it now. - */ - if (iip->ili_extents_buf != NULL) { - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_nextents > 0); - ASSERT(iip->ili_fields & XFS_ILOG_DEXT); - ASSERT(ip->i_df.if_bytes > 0); - kmem_free(iip->ili_extents_buf); - iip->ili_extents_buf = NULL; - } - if (iip->ili_aextents_buf != NULL) { - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_anextents > 0); - ASSERT(iip->ili_fields & XFS_ILOG_AEXT); - ASSERT(ip->i_afp->if_bytes > 0); - kmem_free(iip->ili_aextents_buf); - iip->ili_aextents_buf = NULL; - } - lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; if (lock_flags) @@ -670,11 +599,6 @@ xfs_inode_item_init( iip->ili_inode = ip; xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, &xfs_inode_item_ops); - iip->ili_format.ilf_type = XFS_LI_INODE; - iip->ili_format.ilf_ino = ip->i_ino; - iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; - iip->ili_format.ilf_len = ip->i_imap.im_len; - iip->ili_format.ilf_boffset = ip->i_imap.im_boffset; } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index dce4d656768..488d81254e2 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -34,11 +34,6 @@ typedef struct xfs_inode_log_item { unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - struct xfs_bmbt_rec *ili_extents_buf; /* array of logged - data exts */ - struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged - attr exts */ - xfs_inode_log_format_t ili_format; /* logged structure */ } xfs_inode_log_item_t; static inline int xfs_inode_clean(xfs_inode_t *ip) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4d613401a5e..bcfe6120211 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -112,15 +112,11 @@ xfs_find_handle( memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); hsize = sizeof(xfs_fsid_t); } else { - int lock_mode; - - lock_mode = xfs_ilock_map_shared(ip); handle.ha_fid.fid_len = sizeof(xfs_fid_t) - sizeof(handle.ha_fid.fid_len); handle.ha_fid.fid_pad = 0; handle.ha_fid.fid_gen = ip->i_d.di_gen; handle.ha_fid.fid_ino = ip->i_ino; - xfs_iunlock_map_shared(ip, lock_mode); hsize = XFS_HSIZE(handle); } @@ -442,7 +438,8 @@ xfs_attrlist_by_handle( return -XFS_ERROR(EPERM); if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* @@ -1586,7 +1583,7 @@ xfs_file_ioctl( XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; - da.d_mem = da.d_miniosz = 1 << target->bt_sshift; + da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index e8fb1231db8..a7992f8de9d 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -356,7 +356,8 @@ xfs_compat_attrlist_by_handle( if (copy_from_user(&al_hreq, arg, sizeof(compat_xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 27e0e544e96..f35d5c953ff 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -123,7 +123,7 @@ xfs_vn_mknod( { struct inode *inode; struct xfs_inode *ip = NULL; - struct posix_acl *default_acl = NULL; + struct posix_acl *default_acl, *acl; struct xfs_name name; int error; @@ -139,14 +139,9 @@ xfs_vn_mknod( rdev = 0; } - if (IS_POSIXACL(dir)) { - default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(default_acl)) - return PTR_ERR(default_acl); - - if (!default_acl) - mode &= ~current_umask(); - } + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + return error; xfs_dentry_to_name(&name, dentry, mode); error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); @@ -159,22 +154,30 @@ xfs_vn_mknod( if (unlikely(error)) goto out_cleanup_inode; +#ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = -xfs_inherit_acl(inode, default_acl); - default_acl = NULL; - if (unlikely(error)) + error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) goto out_cleanup_inode; } - + if (acl) { + error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto out_cleanup_inode; + } +#endif d_instantiate(dentry, inode); + out_free_acl: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); return -error; out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); - out_free_acl: - posix_acl_release(default_acl); - return -error; + goto out_free_acl; } STATIC int @@ -391,18 +394,6 @@ xfs_vn_follow_link( return NULL; } -STATIC void -xfs_vn_put_link( - struct dentry *dentry, - struct nameidata *nd, - void *p) -{ - char *s = nd_get_link(nd); - - if (!IS_ERR(s)) - kfree(s); -} - STATIC int xfs_vn_getattr( struct vfsmount *mnt, @@ -459,14 +450,12 @@ xfs_vn_getattr( static void xfs_setattr_mode( - struct xfs_trans *tp, struct xfs_inode *ip, struct iattr *iattr) { - struct inode *inode = VFS_I(ip); - umode_t mode = iattr->ia_mode; + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; - ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ip->i_d.di_mode &= S_IFMT; @@ -476,6 +465,32 @@ xfs_setattr_mode( inode->i_mode |= mode & ~S_IFMT; } +static void +xfs_setattr_time( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (iattr->ia_valid & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + } +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -618,7 +633,8 @@ xfs_setattr_nonsize( } if (!gid_eq(igid, gid)) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { - ASSERT(!XFS_IS_PQUOTA_ON(mp)); + ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + !XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); olddquot2 = xfs_qm_vop_chown(tp, ip, @@ -629,30 +645,10 @@ xfs_setattr_nonsize( } } - /* - * Change file access modes. - */ if (mask & ATTR_MODE) - xfs_setattr_mode(tp, ip, iattr); - - /* - * Change file access or modified times. - */ - if (mask & ATTR_ATIME) { - inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - } - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } + xfs_setattr_mode(ip, iattr); + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -683,7 +679,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { - error = -xfs_acl_chmod(inode); + error = -posix_acl_chmod(inode, inode->i_mode); if (error) return XFS_ERROR(error); } @@ -867,22 +863,10 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - /* - * Change file access modes. - */ if (mask & ATTR_MODE) - xfs_setattr_mode(tp, ip, iattr); - - if (mask & ATTR_CTIME) { - inode->i_ctime = iattr->ia_ctime; - ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - } + xfs_setattr_mode(ip, iattr); + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1052,6 +1036,7 @@ xfs_vn_fiemap( static const struct inode_operations xfs_inode_operations = { .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1079,6 +1064,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1105,6 +1091,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1117,8 +1104,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, - .put_link = xfs_vn_put_link, - .get_acl = xfs_get_acl, + .put_link = kfree_put_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index d2c5057b5cc..1c34e433592 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -30,7 +30,7 @@ extern void xfs_setup_inode(struct xfs_inode *); /* * Internal setattr interfaces. */ -#define XFS_ATTR_NOACL 0x01 /* Don't call xfs_acl_chmod */ +#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c237ad15d50..f4633828515 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -209,9 +209,8 @@ xfs_bulkstat( xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ xfs_ino_t lastino; /* last inode number returned */ - int nbcluster; /* # of blocks in a cluster */ - int nicluster; /* # of inodes in a cluster */ - int nimask; /* mask for inode clusters */ + int blks_per_cluster; /* # of blocks per cluster */ + int inodes_per_cluster;/* # of inodes per cluster */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int tmp; /* result value from btree calls */ @@ -243,11 +242,8 @@ xfs_bulkstat( *done = 0; fmterror = 0; ubufp = ubuffer; - nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? - mp->m_sb.sb_inopblock : - (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); - nimask = ~(nicluster - 1); - nbcluster = nicluster >> mp->m_sb.sb_inopblog; + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); if (!irbuf) return ENOMEM; @@ -390,12 +386,12 @@ xfs_bulkstat( agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; - chunkidx += nicluster, - agbno += nbcluster) { - if (xfs_inobt_maskn(chunkidx, nicluster) - & ~r.ir_free) + chunkidx += inodes_per_cluster, + agbno += blks_per_cluster) { + if (xfs_inobt_maskn(chunkidx, + inodes_per_cluster) & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster, + agbno, blks_per_cluster, &xfs_inode_buf_ops); } blk_finish_plug(&plug); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index e148719e0a5..b0f4ef77fa7 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -30,6 +30,52 @@ struct xfs_log_vec { #define XFS_LOG_VEC_ORDERED (-1) +static inline void * +xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); + + *vecp = vec; + return vec->i_addr; +} + +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) +{ + /* + * We need to make sure the next buffer is naturally aligned for the + * biggest basic data type we put into it. We already accounted for + * this when sizing the buffer. + */ + lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + vec->i_len = len; +} + +static inline void * +xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type, void *data, int len) +{ + void *buf; + + buf = xlog_prepare_iovec(lv, vecp, type); + memcpy(buf, data, len); + xlog_finish_iovec(lv, *vecp, len); + return buf; +} + /* * Structure used to pass callback function and the function's argument * to the log manager. diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 5eb51fc5eb8..cdebd832c3d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -82,36 +82,6 @@ xlog_cil_init_post_recovery( log->l_curr_block); } -STATIC int -xlog_cil_lv_item_format( - struct xfs_log_item *lip, - struct xfs_log_vec *lv) -{ - int index; - char *ptr; - - /* format new vectors into array */ - lip->li_ops->iop_format(lip, lv->lv_iovecp); - - /* copy data into existing array */ - ptr = lv->lv_buf; - for (index = 0; index < lv->lv_niovecs; index++) { - struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; - - memcpy(ptr, vec->i_addr, vec->i_len); - vec->i_addr = ptr; - ptr += vec->i_len; - } - - /* - * some size calculations for log vectors over-estimate, so the caller - * doesn't know the amount of space actually used by the item. Return - * the byte count to the caller so they can check and store it - * appropriately. - */ - return ptr - lv->lv_buf; -} - /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space and vectors it will consume, and if it is a new item pin it as @@ -232,6 +202,13 @@ xlog_cil_insert_format_items( nbytes = 0; } + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. + */ + nbytes += niovecs * sizeof(uint64_t); + /* grab the old item if it exists for reservation accounting */ old_lv = lip->li_lv; @@ -254,34 +231,27 @@ xlog_cil_insert_format_items( */ *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_buf_len; - - /* Ensure the lv is set up according to ->iop_size */ - lv->lv_niovecs = niovecs; - lv->lv_buf = (char *)lv + buf_size - nbytes; - - lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); - goto insert; + } else { + /* allocate new data chunk */ + lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) { + /* track as an ordered logvec */ + ASSERT(lip->li_lv == NULL); + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto insert; + } + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; } - /* allocate new data chunk */ - lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); - lv->lv_item = lip; - lv->lv_size = buf_size; + /* Ensure the lv is set up according to ->iop_size */ lv->lv_niovecs = niovecs; - if (ordered) { - /* track as an ordered logvec */ - ASSERT(lip->li_lv == NULL); - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; - goto insert; - } - - /* The allocated iovec region lies beyond the log vector. */ - lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; /* The allocated data region lies beyond the iovec region */ + lv->lv_buf_len = 0; lv->lv_buf = (char *)lv + buf_size - nbytes; - - lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv); + lip->li_ops->iop_format(lip, lv); insert: ASSERT(lv->lv_buf_len <= nbytes); xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b6b669df40f..bce53ac8109 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -193,7 +193,10 @@ xlog_bread_noalign( bp->b_io_length = nbblks; bp->b_error = 0; - xfsbdstrat(log->l_mp, bp); + if (XFS_FORCED_SHUTDOWN(log->l_mp)) + return XFS_ERROR(EIO); + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) xfs_buf_ioerror_alert(bp, __func__); @@ -1651,6 +1654,7 @@ xlog_recover_reorder_trans( int pass) { xlog_recover_item_t *item, *n; + int error = 0; LIST_HEAD(sort_list); LIST_HEAD(cancel_list); LIST_HEAD(buffer_list); @@ -1692,9 +1696,17 @@ xlog_recover_reorder_trans( "%s: unrecognized type of log operation", __func__); ASSERT(0); - return XFS_ERROR(EIO); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. + */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = XFS_ERROR(EIO); + goto out; } } +out: ASSERT(list_empty(&sort_list)); if (!list_empty(&buffer_list)) list_splice(&buffer_list, &trans->r_itemq); @@ -1704,7 +1716,7 @@ xlog_recover_reorder_trans( list_splice_tail(&inode_buffer_list, &trans->r_itemq); if (!list_empty(&cancel_list)) list_splice_tail(&cancel_list, &trans->r_itemq); - return 0; + return error; } /* @@ -2514,19 +2526,19 @@ xlog_recover_buffer_pass2( * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block - * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) - * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { + (__uint32_t)log->l_mp->m_inode_cluster_size))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { @@ -3199,10 +3211,10 @@ xlog_recover_do_icreate_pass2( } /* existing allocation is fixed value */ - ASSERT(count == XFS_IALLOC_INODES(mp)); - ASSERT(length == XFS_IALLOC_BLOCKS(mp)); - if (count != XFS_IALLOC_INODES(mp) || - length != XFS_IALLOC_BLOCKS(mp)) { + ASSERT(count == mp->m_ialloc_inos); + ASSERT(length == mp->m_ialloc_blks); + if (count != mp->m_ialloc_inos || + length != mp->m_ialloc_blks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); return EINVAL; } @@ -3608,8 +3620,10 @@ xlog_recover_process_data( error = XFS_ERROR(EIO); break; } - if (error) + if (error) { + xlog_recover_free_trans(trans); return error; + } } dp += be32_to_cpu(ohead->oh_len); num_logops--; @@ -4397,7 +4411,13 @@ xlog_do_recover( XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); bp->b_ops = &xfs_sb_buf_ops; - xfsbdstrat(log->l_mp, bp); + + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 14a4996cfec..348e4d2ed6e 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -134,8 +134,6 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_dquot *gdqp = NULL; - struct xfs_dquot *pdqp = NULL; xfs_dqlock(dqp); if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { @@ -143,21 +141,6 @@ xfs_qm_dqpurge( return EAGAIN; } - /* - * If this quota has a hint attached, prepare for releasing it now. - */ - gdqp = dqp->q_gdquot; - if (gdqp) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - - pdqp = dqp->q_pdquot; - if (pdqp) { - xfs_dqlock(pdqp); - dqp->q_pdquot = NULL; - } - dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqflock(dqp); @@ -206,11 +189,47 @@ xfs_qm_dqpurge( XFS_STATS_DEC(xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); + return 0; +} + +/* + * Release the group or project dquot pointers the user dquots maybe carrying + * around as a hint, and proceed to purge the user dquot cache if requested. +*/ +STATIC int +xfs_qm_dqpurge_hints( + struct xfs_dquot *dqp, + void *data) +{ + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + uint flags = *((uint *)data); + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + return EAGAIN; + } + /* If this quota has a hint attached, prepare for releasing it now */ + gdqp = dqp->q_gdquot; if (gdqp) - xfs_qm_dqput(gdqp); + dqp->q_gdquot = NULL; + + pdqp = dqp->q_pdquot; if (pdqp) - xfs_qm_dqput(pdqp); + dqp->q_pdquot = NULL; + + xfs_dqunlock(dqp); + + if (gdqp) + xfs_qm_dqrele(gdqp); + if (pdqp) + xfs_qm_dqrele(pdqp); + + if (flags & XFS_QMOPT_UQUOTA) + return xfs_qm_dqpurge(dqp, NULL); + return 0; } @@ -222,8 +241,18 @@ xfs_qm_dqpurge_all( struct xfs_mount *mp, uint flags) { - if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + /* + * We have to release group/project dquot hint(s) from the user dquot + * at first if they are there, otherwise we would run into an infinite + * loop while walking through radix tree to purge other type of dquots + * since their refcount is not zero if the user dquot refers to them + * as hint. + * + * Call the special xfs_qm_dqpurge_hints() will end up go through the + * general xfs_qm_dqpurge() against user dquot cache if requested. + */ + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge_hints, &flags); + if (flags & XFS_QMOPT_GQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) @@ -1193,16 +1222,18 @@ xfs_qm_dqiterate( lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); do { + uint lock_mode; + nmaps = XFS_DQITER_MAP_SIZE; /* * We aren't changing the inode itself. Just changing * some of its data. No new blocks are added here, and * the inode is never added to the transaction. */ - xfs_ilock(qip, XFS_ILOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(qip); error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, map, &nmaps, 0); - xfs_iunlock(qip, XFS_ILOCK_SHARED); + xfs_iunlock(qip, lock_mode); if (error) break; @@ -2082,24 +2113,21 @@ xfs_qm_vop_create_dqattach( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - if (udqp) { + if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(XFS_IS_UQUOTA_ON(mp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (gdqp) { + if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(XFS_IS_GQUOTA_ON(mp)); ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } - if (pdqp) { + if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); - ASSERT(XFS_IS_PQUOTA_ON(mp)); ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); ip->i_pdquot = xfs_qm_dqhold(pdqp); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index a788b66a5cb..797fd463627 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -20,13 +20,29 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" -#include "xfs_quota_priv.h" struct xfs_inode; extern struct kmem_zone *xfs_qm_dqtrxzone; /* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + +/* * This defines the unit of allocation of dquots. * Currently, it is just one file system block, and a 4K blk contains 30 * (136 * 30 = 4080) dquots. It's probably not worth trying to make diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 437c9198031..3daf5ea1eb8 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -278,7 +278,7 @@ xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) { - int error = 0, error2 = 0; + int error; if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { xfs_debug(mp, "%s: flags=%x m_qflags=%x", @@ -286,14 +286,20 @@ xfs_qm_scall_trunc_qfiles( return XFS_ERROR(EINVAL); } - if (flags & XFS_DQ_USER) + if (flags & XFS_DQ_USER) { error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); - if (flags & XFS_DQ_GROUP) - error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } + if (flags & XFS_DQ_GROUP) { + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } if (flags & XFS_DQ_PROJ) - error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); - return error ? error : error2; + return error; } /* diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h deleted file mode 100644 index 6d86219d93d..00000000000 --- a/fs/xfs/xfs_quota_priv.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_QUOTA_PRIV_H__ -#define __XFS_QUOTA_PRIV_H__ - -/* - * Number of bmaps that we ask from bmapi when doing a quotacheck. - * We make this restriction to keep the memory usage to a minimum. - */ -#define XFS_DQITER_MAP_SIZE 10 - -#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ - !dqp->q_core.d_blk_hardlimit && \ - !dqp->q_core.d_blk_softlimit && \ - !dqp->q_core.d_rtb_hardlimit && \ - !dqp->q_core.d_rtb_softlimit && \ - !dqp->q_core.d_ino_hardlimit && \ - !dqp->q_core.d_ino_softlimit && \ - !dqp->q_core.d_bcount && \ - !dqp->q_core.d_rtbcount && \ - !dqp->q_core.d_icount) - -#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ - (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ - (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) - -#endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9b96d35e483..b5bc1ab3c4d 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -64,7 +64,7 @@ typedef struct xfs_log_item { struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); - void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *); void (*iop_pin)(xfs_log_item_t *); void (*iop_unpin)(xfs_log_item_t *, int remove); uint (*iop_push)(struct xfs_log_item *, struct list_head *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index c035d11b773..647b6f1d892 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -314,7 +314,18 @@ xfs_trans_read_buf_map( ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); bp->b_ops = ops; - xfsbdstrat(tp->t_mountp, bp); + + /* + * XXX(hch): clean up the error handling here to be less + * of a mess.. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + xfs_bioerror_relse(bp); + } else { + xfs_buf_iorequest(bp); + } + error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index cd2a10e15d3..41172861e85 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -295,8 +295,8 @@ xfs_trans_mod_dquot( /* * Given an array of dqtrx structures, lock all the dquots associated and join * them to the transaction, provided they have been modified. We know that the - * highest number of dquots of one type - usr, grp OR prj - involved in a - * transaction is 2 so we don't need to make this very generic. + * highest number of dquots of one type - usr, grp and prj - involved in a + * transaction is 3 so we don't need to make this very generic. */ STATIC void xfs_trans_dqlockedjoin( diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c index 2fd59c0dae6..2ffd3e331b4 100644 --- a/fs/xfs/xfs_trans_resv.c +++ b/fs/xfs/xfs_trans_resv.c @@ -174,7 +174,7 @@ xfs_calc_itruncate_reservation( xfs_calc_buf_res(5, 0) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0))); } @@ -282,7 +282,7 @@ xfs_calc_create_resv_modify( * For create we can allocate some inodes giving: * the agi and agf of the ag getting the new inodes: 2 * sectorsize * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize + * the inode blocks allocated: mp->m_ialloc_blks * blocksize * the inode btree: max depth * blocksize * the allocation btrees: 2 trees * (max depth - 1) * block size */ @@ -292,7 +292,7 @@ xfs_calc_create_resv_alloc( { return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + mp->m_sb.sb_sectsize + - xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)); @@ -385,9 +385,9 @@ xfs_calc_ifree_reservation( xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + - max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) + + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) + xfs_calc_buf_res(1, 0) + - xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + mp->m_in_maxlevels, 0) + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), XFS_FSB_TO_B(mp, 1)); diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h index 7d2c920dfb9..af5dbe06cb6 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/xfs_trans_space.h @@ -47,7 +47,7 @@ #define XFS_DIRREMOVE_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ - (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1) + ((mp)->m_ialloc_blks + (mp)->m_in_maxlevels - 1) /* * Space reservation values for various transactions. diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h index 3e8e797c6d1..e8a77383c0d 100644 --- a/fs/xfs/xfs_vnode.h +++ b/fs/xfs/xfs_vnode.h @@ -35,15 +35,6 @@ struct attrlist_cursor_kern; { IO_INVIS, "INVIS"} /* - * Flush/Invalidate options for vop_toss/flush/flushinval_pages. - */ -#define FI_NONE 0 /* none */ -#define FI_REMAPF 1 /* Do a remapf prior to the operation */ -#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. - Prevent VM access to the pages until - the operation completes. */ - -/* * Some useful predicates. */ #define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 9d479073ba4..78ed92a46fd 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -102,8 +102,8 @@ const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, #ifdef CONFIG_XFS_POSIX_ACL - &xfs_xattr_acl_access_handler, - &xfs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif NULL }; |