diff options
Diffstat (limited to 'fs/open.c')
| -rw-r--r-- | fs/open.c | 1107 |
1 files changed, 502 insertions, 605 deletions
diff --git a/fs/open.c b/fs/open.c index 040cef72bc0..d6fd3acde13 100644 --- a/fs/open.c +++ b/fs/open.c @@ -8,10 +8,8 @@ #include <linux/mm.h> #include <linux/file.h> #include <linux/fdtable.h> -#include <linux/quotaops.h> #include <linux/fsnotify.h> #include <linux/module.h> -#include <linux/slab.h> #include <linux/tty.h> #include <linux/namei.h> #include <linux/backing-dev.h> @@ -19,8 +17,8 @@ #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> -#include <linux/vfs.h> #include <linux/fcntl.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> @@ -31,174 +29,11 @@ #include <linux/falloc.h> #include <linux/fs_struct.h> #include <linux/ima.h> +#include <linux/dnotify.h> +#include <linux/compat.h> #include "internal.h" -int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - int retval = -ENODEV; - - if (dentry) { - retval = -ENOSYS; - if (dentry->d_sb->s_op->statfs) { - memset(buf, 0, sizeof(*buf)); - retval = security_sb_statfs(dentry); - if (retval) - return retval; - retval = dentry->d_sb->s_op->statfs(dentry, buf); - if (retval == 0 && buf->f_frsize == 0) - buf->f_frsize = buf->f_bsize; - } - } - return retval; -} - -EXPORT_SYMBOL(vfs_statfs); - -static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) -{ - struct kstatfs st; - int retval; - - retval = vfs_statfs(dentry, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); - else { - if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail | - st.f_bsize | st.f_frsize) & - 0xffffffff00000000ULL) - return -EOVERFLOW; - /* - * f_files and f_ffree may be -1; it's okay to stuff - * that into 32 bits - */ - if (st.f_files != -1 && - (st.f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (st.f_ffree != -1 && - (st.f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } - - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); - } - return 0; -} - -static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) -{ - struct kstatfs st; - int retval; - - retval = vfs_statfs(dentry, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); - else { - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); - } - return 0; -} - -SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) -{ - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct statfs tmp; - error = vfs_statfs_native(path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } - return error; -} - -SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) -{ - struct path path; - long error; - - if (sz != sizeof(*buf)) - return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct statfs64 tmp; - error = vfs_statfs64(path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } - return error; -} - -SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) -{ - struct file * file; - struct statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs_native(file->f_path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: - return error; -} - -SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) -{ - struct file * file; - struct statfs64 tmp; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs64(file->f_path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: - return error; -} - int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { @@ -222,38 +57,28 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, newattrs.ia_valid |= ret | ATTR_FORCE; mutex_lock(&dentry->d_inode->i_mutex); - ret = notify_change(dentry, &newattrs); + /* Note any delegations or leases have already been broken: */ + ret = notify_change(dentry, &newattrs, NULL); mutex_unlock(&dentry->d_inode->i_mutex); return ret; } -static long do_sys_truncate(const char __user *pathname, loff_t length) +long vfs_truncate(struct path *path, loff_t length) { - struct path path; struct inode *inode; - int error; - - error = -EINVAL; - if (length < 0) /* sorry, but loff_t says... */ - goto out; + long error; - error = user_path(pathname, &path); - if (error) - goto out; - inode = path.dentry->d_inode; + inode = path->dentry->d_inode; /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ - error = -EISDIR; if (S_ISDIR(inode->i_mode)) - goto dput_and_out; - - error = -EINVAL; + return -EISDIR; if (!S_ISREG(inode->i_mode)) - goto dput_and_out; + return -EINVAL; - error = mnt_want_write(path.mnt); + error = mnt_want_write(path->mnt); if (error) - goto dput_and_out; + goto out; error = inode_permission(inode, MAY_WRITE); if (error) @@ -271,56 +96,82 @@ static long do_sys_truncate(const char __user *pathname, loff_t length) * Make sure that there are no leases. get_write_access() protects * against the truncate racing with a lease-granting setlease(). */ - error = break_lease(inode, FMODE_WRITE); + error = break_lease(inode, O_WRONLY); if (error) goto put_write_and_out; error = locks_verify_truncate(inode, NULL, length); if (!error) - error = security_path_truncate(&path, length, 0); - if (!error) { - vfs_dq_init(inode); - error = do_truncate(path.dentry, length, 0, NULL); - } + error = security_path_truncate(path); + if (!error) + error = do_truncate(path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); mnt_drop_write_and_out: - mnt_drop_write(path.mnt); -dput_and_out: - path_put(&path); + mnt_drop_write(path->mnt); out: return error; } +EXPORT_SYMBOL_GPL(vfs_truncate); + +static long do_sys_truncate(const char __user *pathname, loff_t length) +{ + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int error; + + if (length < 0) /* sorry, but loff_t says... */ + return -EINVAL; + +retry: + error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + if (!error) { + error = vfs_truncate(&path, length); + path_put(&path); + } + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + return error; +} SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) { return do_sys_truncate(path, length); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length) +{ + return do_sys_truncate(path, length); +} +#endif + static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { - struct inode * inode; + struct inode *inode; struct dentry *dentry; - struct file * file; + struct fd f; int error; error = -EINVAL; if (length < 0) goto out; error = -EBADF; - file = fget(fd); - if (!file) + f = fdget(fd); + if (!f.file) goto out; /* explicitly opened as large or we are on 64-bit box */ - if (file->f_flags & O_LARGEFILE) + if (f.file->f_flags & O_LARGEFILE) small = 0; - dentry = file->f_path.dentry; + dentry = f.file->f_path.dentry; inode = dentry->d_inode; error = -EINVAL; - if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE)) goto out_putf; error = -EINVAL; @@ -332,71 +183,91 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (IS_APPEND(inode)) goto out_putf; - error = locks_verify_truncate(inode, file, length); + sb_start_write(inode->i_sb); + error = locks_verify_truncate(inode, f.file, length); if (!error) - error = security_path_truncate(&file->f_path, length, - ATTR_MTIME|ATTR_CTIME); + error = security_path_truncate(&f.file->f_path); if (!error) - error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); + error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); + sb_end_write(inode->i_sb); out_putf: - fput(file); + fdput(f); out: return error; } SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) { - long ret = do_sys_ftruncate(fd, length, 1); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, fd, length); - return ret; + return do_sys_ftruncate(fd, length, 1); } -/* LFS versions of truncate are only needed on 32 bit machines */ -#if BITS_PER_LONG == 32 -SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length) +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) { - return do_sys_truncate(path, length); + return do_sys_ftruncate(fd, length, 1); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_truncate64(long path, loff_t length) -{ - return SYSC_truncate64((const char __user *) path, length); -} -SYSCALL_ALIAS(sys_truncate64, SyS_truncate64); #endif -SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length) +/* LFS versions of truncate are only needed on 32 bit machines */ +#if BITS_PER_LONG == 32 +SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { - long ret = do_sys_ftruncate(fd, length, 0); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, fd, length); - return ret; + return do_sys_truncate(path, length); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_ftruncate64(long fd, loff_t length) + +SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { - return SYSC_ftruncate64((unsigned int) fd, length); + return do_sys_ftruncate(fd, length, 0); } -SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64); -#endif #endif /* BITS_PER_LONG == 32 */ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); long ret; if (offset < 0 || len <= 0) return -EINVAL; /* Return error if mode is not supported */ - if (mode && !(mode & FALLOC_FL_KEEP_SIZE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + /* Punch hole and zero range are mutually exclusive */ + if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == + (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + /* Punch hole must have keep size set */ + if ((mode & FALLOC_FL_PUNCH_HOLE) && + !(mode & FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; + /* Collapse range should only be used exclusively. */ + if ((mode & FALLOC_FL_COLLAPSE_RANGE) && + (mode & ~FALLOC_FL_COLLAPSE_RANGE)) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; + + /* + * We can only allow pure fallocate on append only files + */ + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) + return -EPERM; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + + /* + * We cannot allow any fallocate operation on an active swapfile + */ + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. @@ -419,34 +290,27 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - if (!inode->i_op->fallocate) + if (!file->f_op->fallocate) return -EOPNOTSUPP; - return inode->i_op->fallocate(inode, mode, offset, len); + sb_start_write(inode->i_sb); + ret = file->f_op->fallocate(file, mode, offset, len); + sb_end_write(inode->i_sb); + return ret; } -SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) +SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { - struct file *file; + struct fd f = fdget(fd); int error = -EBADF; - file = fget(fd); - if (file) { - error = do_fallocate(file, mode, offset, len); - fput(file); + if (f.file) { + error = do_fallocate(f.file, mode, offset, len); + fdput(f); } - return error; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len) -{ - return SYSC_fallocate((int)fd, (int)mode, offset, len); -} -SYSCALL_ALIAS(sys_fallocate, SyS_fallocate); -#endif - /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and @@ -459,6 +323,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) struct path path; struct inode *inode; int res; + unsigned int lookup_flags = LOOKUP_FOLLOW; if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; @@ -472,7 +337,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) if (!issecure(SECURE_NO_SETUID_FIXUP)) { /* Clear the capabilities if we switch to a non-root user */ - if (override_cred->uid) + kuid_t root_uid = make_kuid(override_cred->user_ns, 0); + if (!uid_eq(override_cred->uid, root_uid)) cap_clear(override_cred->cap_effective); else override_cred->cap_effective = @@ -480,8 +346,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) } old_cred = override_creds(override_cred); - - res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); +retry: + res = user_path_at(dfd, filename, lookup_flags, &path); if (res) goto out; @@ -516,6 +382,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) out_path_release: path_put(&path); + if (retry_estale(res, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out: revert_creds(old_cred); put_cred(override_cred); @@ -531,12 +401,13 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename) { struct path path; int error; - - error = user_path_dir(filename, &path); + unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; +retry: + error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -544,32 +415,35 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename) dput_and_out: path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out: return error; } SYSCALL_DEFINE1(fchdir, unsigned int, fd) { - struct file *file; + struct fd f = fdget_raw(fd); struct inode *inode; - int error; + int error = -EBADF; error = -EBADF; - file = fget(fd); - if (!file) + if (!f.file) goto out; - inode = file->f_path.dentry->d_inode; + inode = file_inode(f.file); error = -ENOTDIR; if (!S_ISDIR(inode->i_mode)) goto out_putf; - error = inode_permission(inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); if (!error) - set_fs_pwd(current->fs, &file->f_path); + set_fs_pwd(current->fs, &f.file->f_path); out_putf: - fput(file); + fdput(f); out: return error; } @@ -578,17 +452,18 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) { struct path path; int error; - - error = user_path_dir(filename, &path); + unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; +retry: + error = user_path_at(AT_FDCWD, filename, lookup_flags, &path); if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); + error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; - if (!capable(CAP_SYS_CHROOT)) + if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) @@ -598,82 +473,75 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename) error = 0; dput_and_out: path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out: return error; } -SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) +static int chmod_common(struct path *path, umode_t mode) { - struct inode * inode; - struct dentry * dentry; - struct file * file; - int err = -EBADF; + struct inode *inode = path->dentry->d_inode; + struct inode *delegated_inode = NULL; struct iattr newattrs; + int error; - file = fget(fd); - if (!file) - goto out; - - dentry = file->f_path.dentry; - inode = dentry->d_inode; - - audit_inode(NULL, dentry); - - err = mnt_want_write_file(file); - if (err) - goto out_putf; + error = mnt_want_write(path->mnt); + if (error) + return error; +retry_deleg: mutex_lock(&inode->i_mutex); - err = security_path_chmod(dentry, file->f_vfsmnt, mode); - if (err) + error = security_path_chmod(path, mode); + if (error) goto out_unlock; - if (mode == (mode_t) -1) - mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - err = notify_change(dentry, &newattrs); + error = notify_change(path->dentry, &newattrs, &delegated_inode); out_unlock: mutex_unlock(&inode->i_mutex); - mnt_drop_write(file->f_path.mnt); -out_putf: - fput(file); -out: + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + mnt_drop_write(path->mnt); + return error; +} + +SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) +{ + struct fd f = fdget(fd); + int err = -EBADF; + + if (f.file) { + audit_inode(NULL, f.file->f_path.dentry, 0); + err = chmod_common(&f.file->f_path, mode); + fdput(f); + } return err; } -SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode) +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { struct path path; - struct inode *inode; int error; - struct iattr newattrs; - - error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); - if (error) - goto out; - inode = path.dentry->d_inode; - - error = mnt_want_write(path.mnt); - if (error) - goto dput_and_out; - mutex_lock(&inode->i_mutex); - error = security_path_chmod(path.dentry, path.mnt, mode); - if (error) - goto out_unlock; - if (mode == (mode_t) -1) - mode = inode->i_mode; - newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); - newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path.dentry, &newattrs); -out_unlock: - mutex_unlock(&inode->i_mutex); - mnt_drop_write(path.mnt); -dput_and_out: - path_put(&path); -out: + unsigned int lookup_flags = LOOKUP_FOLLOW; +retry: + error = user_path_at(dfd, filename, lookup_flags, &path); + if (!error) { + error = chmod_common(&path, mode); + path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } return error; } -SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode) +SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { return sys_fchmodat(AT_FDCWD, filename, mode); } @@ -681,46 +549,42 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode) static int chown_common(struct path *path, uid_t user, gid_t group) { struct inode *inode = path->dentry->d_inode; + struct inode *delegated_inode = NULL; int error; struct iattr newattrs; + kuid_t uid; + kgid_t gid; + + uid = make_kuid(current_user_ns(), user); + gid = make_kgid(current_user_ns(), group); newattrs.ia_valid = ATTR_CTIME; if (user != (uid_t) -1) { + if (!uid_valid(uid)) + return -EINVAL; newattrs.ia_valid |= ATTR_UID; - newattrs.ia_uid = user; + newattrs.ia_uid = uid; } if (group != (gid_t) -1) { + if (!gid_valid(gid)) + return -EINVAL; newattrs.ia_valid |= ATTR_GID; - newattrs.ia_gid = group; + newattrs.ia_gid = gid; } if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; +retry_deleg: mutex_lock(&inode->i_mutex); - error = security_path_chown(path, user, group); + error = security_path_chown(path, uid, gid); if (!error) - error = notify_change(path->dentry, &newattrs); + error = notify_change(path->dentry, &newattrs, &delegated_inode); mutex_unlock(&inode->i_mutex); - - return error; -} - -SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) -{ - struct path path; - int error; - - error = user_path(filename, &path); - if (error) - goto out; - error = mnt_want_write(path.mnt); - if (error) - goto out_release; - error = chown_common(&path, user, group); - mnt_drop_write(path.mnt); -out_release: - path_put(&path); -out: + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } return error; } @@ -729,13 +593,16 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, { struct path path; int error = -EINVAL; - int follow; + int lookup_flags; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; - follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - error = user_path_at(dfd, filename, follow, &path); + lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; +retry: + error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); @@ -745,229 +612,204 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, mnt_drop_write(path.mnt); out_release: path_put(&path); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out: return error; } -SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) +SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { - struct path path; - int error; + return sys_fchownat(AT_FDCWD, filename, user, group, 0); +} - error = user_lpath(filename, &path); - if (error) - goto out; - error = mnt_want_write(path.mnt); - if (error) - goto out_release; - error = chown_common(&path, user, group); - mnt_drop_write(path.mnt); -out_release: - path_put(&path); -out: - return error; +SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) +{ + return sys_fchownat(AT_FDCWD, filename, user, group, + AT_SYMLINK_NOFOLLOW); } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) { - struct file * file; + struct fd f = fdget(fd); int error = -EBADF; - struct dentry * dentry; - file = fget(fd); - if (!file) + if (!f.file) goto out; - error = mnt_want_write_file(file); + error = mnt_want_write_file(f.file); if (error) goto out_fput; - dentry = file->f_path.dentry; - audit_inode(NULL, dentry); - error = chown_common(&file->f_path, user, group); - mnt_drop_write(file->f_path.mnt); + audit_inode(NULL, f.file->f_path.dentry, 0); + error = chown_common(&f.file->f_path, user, group); + mnt_drop_write_file(f.file); out_fput: - fput(file); + fdput(f); out: return error; } -/* - * You have to be very careful that these write - * counts get cleaned up in error cases and - * upon __fput(). This should probably never - * be called outside of __dentry_open(). - */ -static inline int __get_file_write_access(struct inode *inode, - struct vfsmount *mnt) +int open_check_o_direct(struct file *f) { - int error; - error = get_write_access(inode); - if (error) - return error; - /* - * Do not take mount writer counts on - * special files since no writes to - * the mount itself will occur. - */ - if (!special_file(inode->i_mode)) { - /* - * Balanced in __fput() - */ - error = mnt_want_write(mnt); - if (error) - put_write_access(inode); + /* NB: we're sure to have correct a_ops only after f_op->open */ + if (f->f_flags & O_DIRECT) { + if (!f->f_mapping->a_ops || + ((!f->f_mapping->a_ops->direct_IO) && + (!f->f_mapping->a_ops->get_xip_mem))) { + return -EINVAL; + } } - return error; + return 0; } -static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, - struct file *f, - int (*open)(struct inode *, struct file *), - const struct cred *cred) +static int do_dentry_open(struct file *f, + int (*open)(struct inode *, struct file *), + const struct cred *cred) { + static const struct file_operations empty_fops = {}; struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - inode = dentry->d_inode; - if (f->f_mode & FMODE_WRITE) { - error = __get_file_write_access(inode, mnt); - if (error) + + path_get(&f->f_path); + inode = f->f_inode = f->f_path.dentry->d_inode; + f->f_mapping = inode->i_mapping; + + if (unlikely(f->f_flags & O_PATH)) { + f->f_mode = FMODE_PATH; + f->f_op = &empty_fops; + return 0; + } + + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { + error = get_write_access(inode); + if (unlikely(error)) goto cleanup_file; - if (!special_file(inode->i_mode)) - file_take_write(f); + error = __mnt_want_write(f->f_path.mnt); + if (unlikely(error)) { + put_write_access(inode); + goto cleanup_file; + } + f->f_mode |= FMODE_WRITER; } - f->f_mapping = inode->i_mapping; - f->f_path.dentry = dentry; - f->f_path.mnt = mnt; - f->f_pos = 0; + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ + if (S_ISREG(inode->i_mode)) + f->f_mode |= FMODE_ATOMIC_POS; + f->f_op = fops_get(inode->i_fop); - file_move(f, &inode->i_sb->s_files); + if (unlikely(WARN_ON(!f->f_op))) { + error = -ENODEV; + goto cleanup_all; + } + + error = security_file_open(f, cred); + if (error) + goto cleanup_all; - error = security_dentry_open(f, cred); + error = break_lease(inode, f->f_flags); if (error) goto cleanup_all; - if (!open && f->f_op) + if (!open) open = f->f_op->open; if (open) { error = open(inode, f); if (error) goto cleanup_all; } - ima_counts_get(f); + if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + i_readcount_inc(inode); + if ((f->f_mode & FMODE_READ) && + likely(f->f_op->read || f->f_op->aio_read || f->f_op->read_iter)) + f->f_mode |= FMODE_CAN_READ; + if ((f->f_mode & FMODE_WRITE) && + likely(f->f_op->write || f->f_op->aio_write || f->f_op->write_iter)) + f->f_mode |= FMODE_CAN_WRITE; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); - /* NB: we're sure to have correct a_ops only after f_op->open */ - if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || - ((!f->f_mapping->a_ops->direct_IO) && - (!f->f_mapping->a_ops->get_xip_mem))) { - fput(f); - f = ERR_PTR(-EINVAL); - } - } - - return f; + return 0; cleanup_all: fops_put(f->f_op); - if (f->f_mode & FMODE_WRITE) { + if (f->f_mode & FMODE_WRITER) { put_write_access(inode); - if (!special_file(inode->i_mode)) { - /* - * We don't consider this a real - * mnt_want/drop_write() pair - * because it all happenend right - * here, so just reset the state. - */ - file_reset_write(f); - mnt_drop_write(mnt); - } + __mnt_drop_write(f->f_path.mnt); } - file_kill(f); - f->f_path.dentry = NULL; - f->f_path.mnt = NULL; cleanup_file: - put_filp(f); - dput(dentry); - mntput(mnt); - return ERR_PTR(error); + path_put(&f->f_path); + f->f_path.mnt = NULL; + f->f_path.dentry = NULL; + f->f_inode = NULL; + return error; } /** - * lookup_instantiate_filp - instantiates the open intent filp - * @nd: pointer to nameidata + * finish_open - finish opening a file + * @file: file pointer * @dentry: pointer to dentry * @open: open callback + * @opened: state of open + * + * This can be used to finish opening a file passed to i_op->atomic_open(). * - * Helper for filesystems that want to use lookup open intents and pass back - * a fully instantiated struct file to the caller. - * This function is meant to be called from within a filesystem's - * lookup method. - * Beware of calling it for non-regular files! Those ->open methods might block - * (e.g. in fifo_open), leaving you with parent locked (and in case of fifo, - * leading to a deadlock, as nobody can open that fifo anymore, because - * another process to open fifo will block on locked parent when doing lookup). - * Note that in case of error, nd->intent.open.file is destroyed, but the - * path information remains valid. * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. + * + * NB: the dentry reference is _not_ consumed. If, for example, the dentry is + * the return value of d_splice_alias(), then the caller needs to perform dput() + * on it after finish_open(). + * + * On successful return @file is a fully instantiated open file. After this, if + * an error occurs in ->atomic_open(), it needs to clean up with fput(). + * + * Returns zero on success or -errno if the open failed. */ -struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, - int (*open)(struct inode *, struct file *)) +int finish_open(struct file *file, struct dentry *dentry, + int (*open)(struct inode *, struct file *), + int *opened) { - const struct cred *cred = current_cred(); + int error; + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ - if (IS_ERR(nd->intent.open.file)) - goto out; - if (IS_ERR(dentry)) - goto out_err; - nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), - nd->intent.open.file, - open, cred); -out: - return nd->intent.open.file; -out_err: - release_open_intent(nd); - nd->intent.open.file = (struct file *)dentry; - goto out; + file->f_path.dentry = dentry; + error = do_dentry_open(file, open, current_cred()); + if (!error) + *opened |= FILE_OPENED; + + return error; } -EXPORT_SYMBOL_GPL(lookup_instantiate_filp); +EXPORT_SYMBOL(finish_open); /** - * nameidata_to_filp - convert a nameidata to an open filp. - * @nd: pointer to nameidata - * @flags: open flags + * finish_no_open - finish ->atomic_open() without opening the file + * + * @file: file pointer + * @dentry: dentry or NULL (as returned from ->lookup()) * - * Note that this function destroys the original nameidata + * This can be used to set the result of a successful lookup in ->atomic_open(). + * + * NB: unlike finish_open() this function does consume the dentry reference and + * the caller need not dput() it. + * + * Returns "1" which must be the return value of ->atomic_open() after having + * called this function. */ -struct file *nameidata_to_filp(struct nameidata *nd) +int finish_no_open(struct file *file, struct dentry *dentry) { - const struct cred *cred = current_cred(); - struct file *filp; - - /* Pick up the filp from the open intent */ - filp = nd->intent.open.file; - /* Has the filesystem initialised the file for us? */ - if (filp->f_path.dentry == NULL) - filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, - NULL, cred); - else - path_put(&nd->path); - return filp; + file->f_path.dentry = dentry; + return 1; } +EXPORT_SYMBOL(finish_no_open); -/* - * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an - * error. - */ -struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, +struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { int error; @@ -975,122 +817,193 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, validate_creds(cred); - /* - * We must always pass in a valid mount pointer. Historically - * callers got away with not passing it, but we must enforce this at - * the earliest possible point now to avoid strange problems deep in the - * filesystem stack. - */ - if (!mnt) { - printk(KERN_WARNING "%s called with NULL vfsmount\n", __func__); - dump_stack(); - return ERR_PTR(-EINVAL); - } + /* We must always pass in a valid mount pointer. */ + BUG_ON(!path->mnt); - error = -ENFILE; f = get_empty_filp(); - if (f == NULL) { - dput(dentry); - mntput(mnt); - return ERR_PTR(error); + if (!IS_ERR(f)) { + f->f_flags = flags; + f->f_path = *path; + error = do_dentry_open(f, NULL, cred); + if (!error) { + /* from now on we need fput() to dispose of f */ + error = open_check_o_direct(f); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } else { + put_filp(f); + f = ERR_PTR(error); + } } - - f->f_flags = flags; - return __dentry_open(dentry, mnt, f, NULL, cred); + return f; } EXPORT_SYMBOL(dentry_open); -static void __put_unused_fd(struct files_struct *files, unsigned int fd) +static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { - struct fdtable *fdt = files_fdtable(files); - __FD_CLR(fd, fdt->open_fds); - if (fd < files->next_fd) - files->next_fd = fd; + int lookup_flags = 0; + int acc_mode; + + if (flags & (O_CREAT | __O_TMPFILE)) + op->mode = (mode & S_IALLUGO) | S_IFREG; + else + op->mode = 0; + + /* Must never be set by userspace */ + flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC; + + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + + if (flags & __O_TMPFILE) { + if ((flags & O_TMPFILE_MASK) != O_TMPFILE) + return -EINVAL; + acc_mode = MAY_OPEN | ACC_MODE(flags); + if (!(acc_mode & MAY_WRITE)) + return -EINVAL; + } else if (flags & O_PATH) { + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + acc_mode = 0; + } else { + acc_mode = MAY_OPEN | ACC_MODE(flags); + } + + op->open_flag = flags; + + /* O_TRUNC implies we need access checks for write permissions */ + if (flags & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. */ + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; + + op->acc_mode = acc_mode; + + op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; + + if (flags & O_CREAT) { + op->intent |= LOOKUP_CREATE; + if (flags & O_EXCL) + op->intent |= LOOKUP_EXCL; + } + + if (flags & O_DIRECTORY) + lookup_flags |= LOOKUP_DIRECTORY; + if (!(flags & O_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + op->lookup_flags = lookup_flags; + return 0; } -void put_unused_fd(unsigned int fd) +/** + * file_open_name - open file and return file pointer + * + * @name: struct filename containing path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *file_open_name(struct filename *name, int flags, umode_t mode) { - struct files_struct *files = current->files; - spin_lock(&files->file_lock); - __put_unused_fd(files, fd); - spin_unlock(&files->file_lock); + struct open_flags op; + int err = build_open_flags(flags, mode, &op); + return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op); } -EXPORT_SYMBOL(put_unused_fd); - -/* - * Install a file pointer in the fd array. +/** + * filp_open - open file and return file pointer * - * The VFS is full of places where we drop the files lock between - * setting the open_fds bitmap and installing the file in the file - * array. At any such point, we are vulnerable to a dup2() race - * installing a file in the array before us. We need to detect this and - * fput() the struct file we are about to overwrite in this case. + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored * - * It should never happen - if we allow dup2() do it, _really_ bad things - * will follow. + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. */ - -void fd_install(unsigned int fd, struct file *file) +struct file *filp_open(const char *filename, int flags, umode_t mode) { - struct files_struct *files = current->files; - struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - BUG_ON(fdt->fd[fd] != NULL); - rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + struct filename name = {.name = filename}; + return file_open_name(&name, flags, mode); } +EXPORT_SYMBOL(filp_open); -EXPORT_SYMBOL(fd_install); +struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *filename, int flags) +{ + struct open_flags op; + int err = build_open_flags(flags, 0, &op); + if (err) + return ERR_PTR(err); + if (flags & O_CREAT) + return ERR_PTR(-EINVAL); + if (!filename && (flags & O_DIRECTORY)) + if (!dentry->d_inode->i_op->lookup) + return ERR_PTR(-ENOTDIR); + return do_file_open_root(dentry, mnt, filename, &op); +} +EXPORT_SYMBOL(file_open_root); -long do_sys_open(int dfd, const char __user *filename, int flags, int mode) +long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { - char *tmp = getname(filename); - int fd = PTR_ERR(tmp); - - if (!IS_ERR(tmp)) { - fd = get_unused_fd_flags(flags); - if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); - if (IS_ERR(f)) { - put_unused_fd(fd); - fd = PTR_ERR(f); - } else { - fsnotify_open(f->f_path.dentry); - fd_install(fd, f); - } + struct open_flags op; + int fd = build_open_flags(flags, mode, &op); + struct filename *tmp; + + if (fd) + return fd; + + tmp = getname(filename); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + fd = get_unused_fd_flags(flags); + if (fd >= 0) { + struct file *f = do_filp_open(dfd, tmp, &op); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else { + fsnotify_open(f); + fd_install(fd, f); } - putname(tmp); } + putname(tmp); return fd; } -SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode) +SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { - long ret; - if (force_o_largefile()) flags |= O_LARGEFILE; - ret = do_sys_open(AT_FDCWD, filename, flags, mode); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, flags, mode); - return ret; + return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, - int, mode) + umode_t, mode) { - long ret; - if (force_o_largefile()) flags |= O_LARGEFILE; - ret = do_sys_open(dfd, filename, flags, mode); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, dfd, filename, flags, mode); - return ret; + return do_sys_open(dfd, filename, flags, mode); } #ifndef __alpha__ @@ -1099,7 +1012,7 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, * For backward compatibility? Maybe this should be moved * into arch/i386 instead? */ -SYSCALL_DEFINE2(creat, const char __user *, pathname, int, mode) +SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); } @@ -1119,11 +1032,13 @@ int filp_close(struct file *filp, fl_owner_t id) return 0; } - if (filp->f_op && filp->f_op->flush) + if (filp->f_op->flush) retval = filp->f_op->flush(filp, id); - dnotify_flush(filp, id); - locks_remove_posix(filp, id); + if (likely(!(filp->f_mode & FMODE_PATH))) { + dnotify_flush(filp, id); + locks_remove_posix(filp, id); + } fput(filp); return retval; } @@ -1137,23 +1052,7 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - struct file * filp; - struct files_struct *files = current->files; - struct fdtable *fdt; - int retval; - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (fd >= fdt->max_fds) - goto out_unlock; - filp = fdt->fd[fd]; - if (!filp) - goto out_unlock; - rcu_assign_pointer(fdt->fd[fd], NULL); - FD_CLR(fd, fdt->close_on_exec); - __put_unused_fd(files, fd); - spin_unlock(&files->file_lock); - retval = filp_close(filp, files); + int retval = __close_fd(current->files, fd); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || @@ -1163,10 +1062,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd) retval = -EINTR; return retval; - -out_unlock: - spin_unlock(&files->file_lock); - return -EBADF; } EXPORT_SYMBOL(sys_close); @@ -1200,7 +1095,9 @@ EXPORT_SYMBOL(generic_file_open); /* * This is used by subsystems that don't want seekable - * file descriptors + * file descriptors. The function is not supposed to ever fail, the only + * reason it returns an 'int' and not 'void' is so that it can be plugged + * directly into file_operations structure. */ int nonseekable_open(struct inode *inode, struct file *filp) { |
