diff options
Diffstat (limited to 'fs/namei.c')
| -rw-r--r-- | fs/namei.c | 1584 |
1 files changed, 1024 insertions, 560 deletions
diff --git a/fs/namei.c b/fs/namei.c index dd1ed1b8e98..9eb787e5c16 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -117,18 +117,70 @@ * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ -static char *getname_flags(const char __user *filename, int flags, int *empty) +void final_putname(struct filename *name) { - char *result = __getname(), *err; + if (name->separate) { + __putname(name->name); + kfree(name); + } else { + __putname(name); + } +} + +#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) + +static struct filename * +getname_flags(const char __user *filename, int flags, int *empty) +{ + struct filename *result, *err; int len; + long max; + char *kname; + result = audit_reusename(filename); + if (result) + return result; + + result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); - len = strncpy_from_user(result, filename, PATH_MAX); - err = ERR_PTR(len); - if (unlikely(len < 0)) + /* + * First, try to embed the struct filename inside the names_cache + * allocation + */ + kname = (char *)result + sizeof(*result); + result->name = kname; + result->separate = false; + max = EMBEDDED_NAME_MAX; + +recopy: + len = strncpy_from_user(kname, filename, max); + if (unlikely(len < 0)) { + err = ERR_PTR(len); goto error; + } + + /* + * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a + * separate struct filename so we can dedicate the entire + * names_cache allocation for the pathname, and re-do the copy from + * userland. + */ + if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) { + kname = (char *)result; + + result = kzalloc(sizeof(*result), GFP_KERNEL); + if (!result) { + err = ERR_PTR(-ENOMEM); + result = (struct filename *)kname; + goto error; + } + result->name = kname; + result->separate = true; + max = PATH_MAX; + goto recopy; + } /* The empty path is special. */ if (unlikely(!len)) { @@ -140,30 +192,61 @@ static char *getname_flags(const char __user *filename, int flags, int *empty) } err = ERR_PTR(-ENAMETOOLONG); - if (likely(len < PATH_MAX)) { - audit_getname(result); - return result; - } + if (unlikely(len >= PATH_MAX)) + goto error; + + result->uptr = filename; + result->aname = NULL; + audit_getname(result); + return result; error: - __putname(result); + final_putname(result); return err; } -char *getname(const char __user * filename) +struct filename * +getname(const char __user * filename) { return getname_flags(filename, 0, NULL); } +/* + * The "getname_kernel()" interface doesn't do pathnames longer + * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user. + */ +struct filename * +getname_kernel(const char * filename) +{ + struct filename *result; + char *kname; + int len; + + len = strlen(filename); + if (len >= EMBEDDED_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + result = __getname(); + if (unlikely(!result)) + return ERR_PTR(-ENOMEM); + + kname = (char *)result + sizeof(*result); + result->name = kname; + result->uptr = NULL; + result->aname = NULL; + result->separate = false; + + strlcpy(kname, filename, EMBEDDED_NAME_MAX); + return result; +} + #ifdef CONFIG_AUDITSYSCALL -void putname(const char *name) +void putname(struct filename *name) { if (unlikely(!audit_dummy_context())) - audit_putname(name); - else - __putname(name); + return audit_putname(name); + final_putname(name); } -EXPORT_SYMBOL(putname); #endif static int check_acl(struct inode *inode, int mask) @@ -181,27 +264,9 @@ static int check_acl(struct inode *inode, int mask) return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); } - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - - /* - * A filesystem can force a ACL callback by just never filling the - * ACL cache. But normally you'd fill the cache either at inode - * instantiation time, or on the first ->get_acl call. - * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. - */ - if (acl == ACL_NOT_CACHED) { - if (inode->i_op->get_acl) { - acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } else { - set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); - return -EAGAIN; - } - } - + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); @@ -267,10 +332,11 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; if (!(mask & MAY_WRITE)) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, + CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } @@ -280,7 +346,7 @@ int generic_permission(struct inode *inode, int mask) * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (inode_capable(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; /* @@ -288,11 +354,12 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (inode_capable(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; return -EACCES; } +EXPORT_SYMBOL(generic_permission); /* * We _really_ want to just do "generic_permission()" without @@ -390,6 +457,7 @@ int inode_permission(struct inode *inode, int mask) return retval; return __inode_permission(inode, mask); } +EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path @@ -397,7 +465,7 @@ int inode_permission(struct inode *inode, int mask) * * Given a path increment the reference count to the dentry and the vfsmount. */ -void path_get(struct path *path) +void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); @@ -410,7 +478,7 @@ EXPORT_SYMBOL(path_get); * * Given a path decrement the reference count to the dentry and the vfsmount. */ -void path_put(struct path *path) +void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); @@ -428,18 +496,6 @@ EXPORT_SYMBOL(path_put); * to restart the path walk from the beginning in ref-walk mode. */ -static inline void lock_rcu_walk(void) -{ - br_read_lock(&vfsmount_lock); - rcu_read_lock(); -} - -static inline void unlock_rcu_walk(void) -{ - rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); -} - /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data @@ -454,56 +510,74 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; - int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - want_root = 1; - spin_lock(&fs->lock); - if (nd->root.mnt != fs->root.mnt || - nd->root.dentry != fs->root.dentry) - goto err_root; + + /* + * After legitimizing the bastards, terminate_walk() + * will do the right thing for non-RCU mode, and all our + * subsequent exit cases should rcu_read_unlock() + * before returning. Do vfsmount first; if dentry + * can't be legitimized, just set nd->path.dentry to NULL + * and rely on dput(NULL) being a no-op. + */ + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) + return -ECHILD; + nd->flags &= ~LOOKUP_RCU; + + if (!lockref_get_not_dead(&parent->d_lockref)) { + nd->path.dentry = NULL; + goto out; } - spin_lock(&parent->d_lock); + + /* + * For a negative lookup, the lookup sequence point is the parents + * sequence point, and it only needs to revalidate the parent dentry. + * + * For a positive lookup, we need to move both the parent and the + * dentry from the RCU domain to be properly refcounted. And the + * sequence number in the dentry validates *both* dentry counters, + * since we checked the sequence number of the parent after we got + * the child sequence number. So we know the parent must still + * be valid if the child sequence number is still valid. + */ if (!dentry) { - if (!__d_rcu_to_refcount(parent, nd->seq)) - goto err_parent; + if (read_seqcount_retry(&parent->d_seq, nd->seq)) + goto out; BUG_ON(nd->inode != parent->d_inode); } else { - if (dentry->d_parent != parent) - goto err_parent; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err_child; - /* - * If the sequence check on the child dentry passed, then - * the child has not been removed from its parent. This - * means the parent dentry must be valid and able to take - * a reference at this point. - */ - BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; - spin_unlock(&dentry->d_lock); + if (!lockref_get_not_dead(&dentry->d_lockref)) + goto out; + if (read_seqcount_retry(&dentry->d_seq, nd->seq)) + goto drop_dentry; } - spin_unlock(&parent->d_lock); - if (want_root) { + + /* + * Sequence counts matched. Now make sure that the root is + * still valid and get it if required. + */ + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + spin_lock(&fs->lock); + if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) + goto unlock_and_drop_dentry; path_get(&nd->root); spin_unlock(&fs->lock); } - mntget(nd->path.mnt); - unlock_rcu_walk(); - nd->flags &= ~LOOKUP_RCU; + rcu_read_unlock(); return 0; -err_child: - spin_unlock(&dentry->d_lock); -err_parent: - spin_unlock(&parent->d_lock); -err_root: - if (want_root) - spin_unlock(&fs->lock); +unlock_and_drop_dentry: + spin_unlock(&fs->lock); +drop_dentry: + rcu_read_unlock(); + dput(dentry); + goto drop_root_mnt; +out: + rcu_read_unlock(); +drop_root_mnt: + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; return -ECHILD; } @@ -531,29 +605,32 @@ static int complete_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - spin_lock(&dentry->d_lock); - if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { - spin_unlock(&dentry->d_lock); - unlock_rcu_walk(); + + if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) { + rcu_read_unlock(); + return -ECHILD; + } + if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { + rcu_read_unlock(); + mntput(nd->path.mnt); return -ECHILD; } - BUG_ON(nd->inode != dentry->d_inode); - spin_unlock(&dentry->d_lock); - mntget(nd->path.mnt); - unlock_rcu_walk(); + if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { + rcu_read_unlock(); + dput(dentry); + mntput(nd->path.mnt); + return -ECHILD; + } + rcu_read_unlock(); } if (likely(!(nd->flags & LOOKUP_JUMPED))) return 0; - if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) - return 0; - - if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) + if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; - /* Note: we do not d_invalidate() */ - status = d_revalidate(dentry, nd->flags); + status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; @@ -586,29 +663,6 @@ static __always_inline void set_root_rcu(struct nameidata *nd) } } -static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) -{ - int ret; - - if (IS_ERR(link)) - goto fail; - - if (*link == '/') { - set_root(nd); - path_put(&nd->path); - nd->path = nd->root; - path_get(&nd->root); - nd->flags |= LOOKUP_JUMPED; - } - nd->inode = nd->path.dentry->d_inode; - - ret = link_path_walk(link, nd); - return ret; -fail: - path_put(&nd->path); - return PTR_ERR(link); -} - static void path_put_conditional(struct path *path, struct nameidata *nd) { dput(path->dentry); @@ -639,8 +693,6 @@ void nd_jump_link(struct nameidata *nd, struct path *path) nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->flags |= LOOKUP_JUMPED; - - BUG_ON(nd->inode->i_op->follow_link); } static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) @@ -651,8 +703,8 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki path_put(link); } -int sysctl_protected_symlinks __read_mostly = 1; -int sysctl_protected_hardlinks __read_mostly = 1; +int sysctl_protected_symlinks __read_mostly = 0; +int sysctl_protected_hardlinks __read_mostly = 0; /** * may_follow_link - Check symlink following for unsafe situations @@ -680,7 +732,7 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd) /* Allowed if owner and follower match. */ inode = link->dentry->d_inode; - if (current_cred()->fsuid == inode->i_uid) + if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; /* Allowed if parent directory not sticky and world-writable. */ @@ -689,12 +741,12 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd) return 0; /* Allowed if parent directory and link owner match. */ - if (parent->i_uid == inode->i_uid) + if (uid_eq(parent->i_uid, inode->i_uid)) return 0; + audit_log_link_denied("follow_link", link); path_put_conditional(link, nd); path_put(&nd->path); - audit_log_link_denied("follow_link", link); return -EACCES; } @@ -759,7 +811,7 @@ static int may_linkat(struct path *link) /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) || + if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) || capable(CAP_FOWNER)) return 0; @@ -802,7 +854,20 @@ follow_link(struct path *link, struct nameidata *nd, void **p) error = 0; s = nd_get_link(nd); if (s) { - error = __vfs_follow_link(nd, s); + if (unlikely(IS_ERR(s))) { + path_put(&nd->path); + put_link(nd, link, *p); + return PTR_ERR(s); + } + if (*s == '/') { + set_root(nd); + path_put(&nd->path); + nd->path = nd->root; + path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; + } + nd->inode = nd->path.dentry->d_inode; + error = link_path_walk(s, nd); if (unlikely(error)) put_link(nd, link, *p); } @@ -810,6 +875,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p) return error; out_put_nd_path: + *p = NULL; path_put(&nd->path); path_put(link); return error; @@ -846,21 +912,22 @@ int follow_up(struct path *path) struct mount *parent; struct dentry *mountpoint; - br_read_lock(&vfsmount_lock); + read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } +EXPORT_SYMBOL(follow_up); /* * Perform an automount @@ -985,8 +1052,8 @@ static int follow_managed(struct path *path, unsigned flags) /* Something is mounted on this dentry in another * namespace and/or whatever was mounted there in this - * namespace got unmounted before we managed to get the - * vfsmount_lock */ + * namespace got unmounted before lookup_mnt() could + * get it */ } /* Handle an automount point */ @@ -1022,6 +1089,7 @@ int follow_down_one(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down_one); static inline bool managed_dentry_might_block(struct dentry *dentry) { @@ -1046,9 +1114,9 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, return false; if (!d_mountpoint(path->dentry)) - break; + return true; - mounted = __lookup_mnt(path->mnt, path->dentry, 1); + mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) break; path->mnt = &mounted->mnt; @@ -1062,20 +1130,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, */ *inode = path->dentry->d_inode; } - return true; -} - -static void follow_mount_rcu(struct nameidata *nd) -{ - while (d_mountpoint(nd->path.dentry)) { - struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); - if (!mounted) - break; - nd->path.mnt = &mounted->mnt; - nd->path.dentry = mounted->mnt.mnt_root; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - } + return read_seqretry(&mount_lock, nd->m_seq); } static int follow_dotdot_rcu(struct nameidata *nd) @@ -1103,7 +1158,17 @@ static int follow_dotdot_rcu(struct nameidata *nd) break; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } - follow_mount_rcu(nd); + while (d_mountpoint(nd->path.dentry)) { + struct mount *mounted; + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); + if (!mounted) + break; + nd->path.mnt = &mounted->mnt; + nd->path.dentry = mounted->mnt.mnt_root; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); + if (!read_seqretry(&mount_lock, nd->m_seq)) + goto failed; + } nd->inode = nd->path.dentry->d_inode; return 0; @@ -1111,7 +1176,7 @@ failed: nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - unlock_rcu_walk(); + rcu_read_unlock(); return -ECHILD; } @@ -1163,6 +1228,7 @@ int follow_down(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down); /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() @@ -1220,9 +1286,7 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, *need_lookup = false; dentry = d_lookup(dir, name); if (dentry) { - if (d_need_lookup(dentry)) { - *need_lookup = true; - } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) { + if (dentry->d_flags & DCACHE_OP_REVALIDATE) { error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (error < 0) { @@ -1247,8 +1311,8 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, } /* - * Call i_op->lookup on the dentry. The dentry must be negative but may be - * hashed if it was pouplated with DCACHE_NEED_LOOKUP. + * Call i_op->lookup on the dentry. The dentry must be negative and + * unhashed. * * dir->d_inode->i_mutex must be held */ @@ -1289,7 +1353,7 @@ static struct dentry *__lookup_hash(struct qstr *name, * small and for now I'd prefer to have fast path as straight as possible. * It _is_ time-critical. */ -static int lookup_fast(struct nameidata *nd, struct qstr *name, +static int lookup_fast(struct nameidata *nd, struct path *path, struct inode **inode) { struct vfsmount *mnt = nd->path.mnt; @@ -1305,7 +1369,7 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - dentry = __d_lookup_rcu(parent, name, &seq, nd->inode); + dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (!dentry) goto unlazy; @@ -1328,8 +1392,6 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name, return -ECHILD; nd->seq = seq; - if (unlikely(d_need_lookup(dentry))) - goto unlazy; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { @@ -1349,17 +1411,12 @@ unlazy: if (unlazy_walk(nd, dentry)) return -ECHILD; } else { - dentry = __d_lookup(parent, name); + dentry = __d_lookup(parent, &nd->last); } if (unlikely(!dentry)) goto need_lookup; - if (unlikely(d_need_lookup(dentry))) { - dput(dentry); - goto need_lookup; - } - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { @@ -1390,8 +1447,7 @@ need_lookup: } /* Fast lookup failed, do it the slow way */ -static int lookup_slow(struct nameidata *nd, struct qstr *name, - struct path *path) +static int lookup_slow(struct nameidata *nd, struct path *path) { struct dentry *dentry, *parent; int err; @@ -1400,7 +1456,7 @@ static int lookup_slow(struct nameidata *nd, struct qstr *name, BUG_ON(nd->inode != parent->d_inode); mutex_lock(&parent->d_inode->i_mutex); - dentry = __lookup_hash(name, parent, nd->flags); + dentry = __lookup_hash(&nd->last, parent, nd->flags); mutex_unlock(&parent->d_inode->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -1448,7 +1504,7 @@ static void terminate_walk(struct nameidata *nd) nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - unlock_rcu_walk(); + rcu_read_unlock(); } } @@ -1458,22 +1514,13 @@ static void terminate_walk(struct nameidata *nd) * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int should_follow_link(struct inode *inode, int follow) +static inline int should_follow_link(struct dentry *dentry, int follow) { - if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (likely(inode->i_op->follow_link)) - return follow; - - /* This gets set once for the inode lifetime */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_NOFOLLOW; - spin_unlock(&inode->i_lock); - } - return 0; + return unlikely(d_is_symlink(dentry)) ? follow : 0; } static inline int walk_component(struct nameidata *nd, struct path *path, - struct qstr *name, int type, int follow) + int follow) { struct inode *inode; int err; @@ -1482,24 +1529,24 @@ static inline int walk_component(struct nameidata *nd, struct path *path, * to be able to know about the current root directory and * parent relationships. */ - if (unlikely(type != LAST_NORM)) - return handle_dots(nd, type); - err = lookup_fast(nd, name, path, &inode); + if (unlikely(nd->last_type != LAST_NORM)) + return handle_dots(nd, nd->last_type); + err = lookup_fast(nd, path, &inode); if (unlikely(err)) { if (err < 0) goto out_err; - err = lookup_slow(nd, name, path); + err = lookup_slow(nd, path); if (err < 0) goto out_err; inode = path->dentry->d_inode; } err = -ENOENT; - if (!inode) + if (!inode || d_is_negative(path->dentry)) goto out_path_put; - if (should_follow_link(inode, follow)) { + if (should_follow_link(path->dentry, follow)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { err = -ECHILD; @@ -1548,8 +1595,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) res = follow_link(&link, nd, &cookie); if (res) break; - res = walk_component(nd, path, &nd->last, - nd->last_type, LOOKUP_FOLLOW); + res = walk_component(nd, path, LOOKUP_FOLLOW); put_link(nd, &link, cookie); } while (res > 0); @@ -1559,26 +1605,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) } /* - * We really don't want to look at inode->i_op->lookup - * when we don't have to. So we keep a cache bit in - * the inode ->i_opflags field that says "yes, we can - * do lookup on this inode". - */ -static inline int can_lookup(struct inode *inode) -{ - if (likely(inode->i_opflags & IOP_LOOKUP)) - return 1; - if (likely(!inode->i_op->lookup)) - return 0; - - /* We do this once for the lifetime of the inode */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_LOOKUP; - spin_unlock(&inode->i_lock); - return 1; -} - -/* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * @@ -1586,11 +1612,6 @@ static inline int can_lookup(struct inode *inode) * do a "get_unaligned()" if this helps and is sufficiently * fast. * - * - Little-endian machines (so that we can generate the mask - * of low bytes efficiently). Again, we *could* do a byte - * swapping load on big-endian architectures if that is not - * expensive enough to make the optimization worthless. - * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. @@ -1634,7 +1655,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) if (!len) goto done; } - mask = ~(~0ul << len*8); + mask = bytemask_from_count(len); hash += mask & a; done: return fold_hash(hash); @@ -1749,15 +1770,17 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - err = parent->d_op->d_hash(parent, nd->inode, - &this); + err = parent->d_op->d_hash(parent, &this); if (err < 0) break; } } + nd->last = this; + nd->last_type = type; + if (!name[len]) - goto last_component; + return 0; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. @@ -1766,10 +1789,11 @@ static int link_path_walk(const char *name, struct nameidata *nd) len++; } while (unlikely(name[len] == '/')); if (!name[len]) - goto last_component; + return 0; + name += len; - err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); + err = walk_component(nd, &next, LOOKUP_FOLLOW); if (err < 0) return err; @@ -1778,16 +1802,10 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) return err; } - if (can_lookup(nd->inode)) - continue; - err = -ENOTDIR; - break; - /* here ends the main loop */ - -last_component: - nd->last = this; - nd->last_type = type; - return 0; + if (!d_can_lookup(nd->path.dentry)) { + err = -ENOTDIR; + break; + } } terminate_walk(nd); return err; @@ -1797,16 +1815,15 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd, struct file **fp) { int retval = 0; - int fput_needed; - struct file *file; nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; if (flags & LOOKUP_ROOT) { - struct inode *inode = nd->root.dentry->d_inode; + struct dentry *root = nd->root.dentry; + struct inode *inode = root->d_inode; if (*name) { - if (!inode->i_op->lookup) + if (!d_can_lookup(root)) return -ENOTDIR; retval = inode_permission(inode, MAY_EXEC); if (retval) @@ -1815,8 +1832,9 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - lock_rcu_walk(); + rcu_read_lock(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->m_seq = read_seqbegin(&mount_lock); } else { path_get(&nd->path); } @@ -1825,9 +1843,10 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->root.mnt = NULL; + nd->m_seq = read_seqbegin(&mount_lock); if (*name=='/') { if (flags & LOOKUP_RCU) { - lock_rcu_walk(); + rcu_read_lock(); set_root_rcu(nd); } else { set_root(nd); @@ -1839,7 +1858,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct fs_struct *fs = current->fs; unsigned seq; - lock_rcu_walk(); + rcu_read_lock(); do { seq = read_seqcount_begin(&fs->seq); @@ -1850,44 +1869,36 @@ static int path_init(int dfd, const char *name, unsigned int flags, get_fs_pwd(current->fs, &nd->path); } } else { + /* Caller must check execute permissions on the starting path component */ + struct fd f = fdget_raw(dfd); struct dentry *dentry; - file = fget_raw_light(dfd, &fput_needed); - retval = -EBADF; - if (!file) - goto out_fail; + if (!f.file) + return -EBADF; - dentry = file->f_path.dentry; + dentry = f.file->f_path.dentry; if (*name) { - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; - - retval = inode_permission(dentry->d_inode, MAY_EXEC); - if (retval) - goto fput_fail; + if (!d_can_lookup(dentry)) { + fdput(f); + return -ENOTDIR; + } } - nd->path = file->f_path; + nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { - if (fput_needed) - *fp = file; + if (f.flags & FDPUT_FPUT) + *fp = f.file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - lock_rcu_walk(); + rcu_read_lock(); } else { - path_get(&file->f_path); - fput_light(file, fput_needed); + path_get(&nd->path); + fdput(f); } } nd->inode = nd->path.dentry->d_inode; return 0; - -fput_fail: - fput_light(file, fput_needed); -out_fail: - return retval; } static inline int lookup_last(struct nameidata *nd, struct path *path) @@ -1896,8 +1907,7 @@ static inline int lookup_last(struct nameidata *nd, struct path *path) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, path, &nd->last, nd->last_type, - nd->flags & LOOKUP_FOLLOW); + return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ @@ -1951,7 +1961,7 @@ static int path_lookupat(int dfd, const char *name, err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!nd->inode->i_op->lookup) { + if (!d_can_lookup(nd->path.dentry)) { path_put(&nd->path); err = -ENOTDIR; } @@ -1967,24 +1977,29 @@ static int path_lookupat(int dfd, const char *name, return err; } -static int do_path_lookup(int dfd, const char *name, +static int filename_lookup(int dfd, struct filename *name, unsigned int flags, struct nameidata *nd) { - int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd); if (unlikely(retval == -ECHILD)) - retval = path_lookupat(dfd, name, flags, nd); + retval = path_lookupat(dfd, name->name, flags, nd); if (unlikely(retval == -ESTALE)) - retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); + retval = path_lookupat(dfd, name->name, + flags | LOOKUP_REVAL, nd); - if (likely(!retval)) { - if (unlikely(!audit_dummy_context())) { - if (nd->path.dentry && nd->inode) - audit_inode(name, nd->path.dentry); - } - } + if (likely(!retval)) + audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT); return retval; } +static int do_path_lookup(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + struct filename filename = { .name = name }; + + return filename_lookup(dfd, &filename, flags, nd); +} + /* does lookup, returns the object with parent locked */ struct dentry *kern_path_locked(const char *name, struct path *path) { @@ -2016,6 +2031,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path) *path = nd.path; return res; } +EXPORT_SYMBOL(kern_path); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair @@ -2040,6 +2056,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, *path = nd.path; return err; } +EXPORT_SYMBOL(vfs_path_lookup); /* * Restricted form of lookup. Doesn't follow links, single-component only, @@ -2076,6 +2093,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (!len) return ERR_PTR(-EACCES); + if (unlikely(name[0] == '.')) { + if (len < 2 || (len == 2 && name[1] == '.')) + return ERR_PTR(-EACCES); + } + while (len--) { c = *(const unsigned char *)name++; if (c == '/' || c == '\0') @@ -2086,7 +2108,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { - int err = base->d_op->d_hash(base, base->d_inode, &this); + int err = base->d_op->d_hash(base, &this); if (err < 0) return ERR_PTR(err); } @@ -2097,18 +2119,19 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, 0); } +EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { struct nameidata nd; - char *tmp = getname_flags(name, flags, empty); + struct filename *tmp = getname_flags(name, flags, empty); int err = PTR_ERR(tmp); if (!IS_ERR(tmp)) { BUG_ON(flags & LOOKUP_PARENT); - err = do_path_lookup(dfd, tmp, flags, &nd); + err = filename_lookup(dfd, tmp, flags, &nd); putname(tmp); if (!err) *path = nd.path; @@ -2121,25 +2144,229 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, { return user_path_at_empty(dfd, name, flags, path, NULL); } +EXPORT_SYMBOL(user_path_at); -static int user_path_parent(int dfd, const char __user *path, - struct nameidata *nd, char **name) +/* + * NB: most callers don't do anything directly with the reference to the + * to struct filename, but the nd->last pointer points into the name string + * allocated by getname. So we must hold the reference to it until all + * path-walking is complete. + */ +static struct filename * +user_path_parent(int dfd, const char __user *path, struct nameidata *nd, + unsigned int flags) { - char *s = getname(path); + struct filename *s = getname(path); int error; + /* only LOOKUP_REVAL is allowed in extra flags */ + flags &= LOOKUP_REVAL; + if (IS_ERR(s)) - return PTR_ERR(s); + return s; - error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd); - if (error) + error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd); + if (error) { putname(s); - else - *name = s; + return ERR_PTR(error); + } + return s; +} + +/** + * mountpoint_last - look up last component for umount + * @nd: pathwalk nameidata - currently pointing at parent directory of "last" + * @path: pointer to container for result + * + * This is a special lookup_last function just for umount. In this case, we + * need to resolve the path without doing any revalidation. + * + * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since + * mountpoints are always pinned in the dcache, their ancestors are too. Thus, + * in almost all cases, this lookup will be served out of the dcache. The only + * cases where it won't are if nd->last refers to a symlink or the path is + * bogus and it doesn't exist. + * + * Returns: + * -error: if there was an error during lookup. This includes -ENOENT if the + * lookup found a negative dentry. The nd->path reference will also be + * put in this case. + * + * 0: if we successfully resolved nd->path and found it to not to be a + * symlink that needs to be followed. "path" will also be populated. + * The nd->path reference will also be put. + * + * 1: if we successfully resolved nd->last and found it to be a symlink + * that needs to be followed. "path" will be populated with the path + * to the link, and nd->path will *not* be put. + */ +static int +mountpoint_last(struct nameidata *nd, struct path *path) +{ + int error = 0; + struct dentry *dentry; + struct dentry *dir = nd->path.dentry; + + /* If we're in rcuwalk, drop out of it to handle last component */ + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL)) { + error = -ECHILD; + goto out; + } + } + + nd->flags &= ~LOOKUP_PARENT; + + if (unlikely(nd->last_type != LAST_NORM)) { + error = handle_dots(nd, nd->last_type); + if (error) + goto out; + dentry = dget(nd->path.dentry); + goto done; + } + + mutex_lock(&dir->d_inode->i_mutex); + dentry = d_lookup(dir, &nd->last); + if (!dentry) { + /* + * No cached dentry. Mounted dentries are pinned in the cache, + * so that means that this dentry is probably a symlink or the + * path doesn't actually point to a mounted dentry. + */ + dentry = d_alloc(dir, &nd->last); + if (!dentry) { + error = -ENOMEM; + mutex_unlock(&dir->d_inode->i_mutex); + goto out; + } + dentry = lookup_real(dir->d_inode, dentry, nd->flags); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->d_inode->i_mutex); + goto out; + } + } + mutex_unlock(&dir->d_inode->i_mutex); + +done: + if (!dentry->d_inode || d_is_negative(dentry)) { + error = -ENOENT; + dput(dentry); + goto out; + } + path->dentry = dentry; + path->mnt = nd->path.mnt; + if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW)) + return 1; + mntget(path->mnt); + follow_mount(path); + error = 0; +out: + terminate_walk(nd); + return error; +} + +/** + * path_mountpoint - look up a path to be umounted + * @dfd: directory file descriptor to start walk from + * @name: full pathname to walk + * @path: pointer to container for result + * @flags: lookup flags + * + * Look up the given name, but don't attempt to revalidate the last component. + * Returns 0 and "path" will be valid on success; Returns error otherwise. + */ +static int +path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) +{ + struct file *base = NULL; + struct nameidata nd; + int err; + + err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base); + if (unlikely(err)) + return err; + + current->total_link_count = 0; + err = link_path_walk(name, &nd); + if (err) + goto out; + + err = mountpoint_last(&nd, path); + while (err > 0) { + void *cookie; + struct path link = *path; + err = may_follow_link(&link, &nd); + if (unlikely(err)) + break; + nd.flags |= LOOKUP_PARENT; + err = follow_link(&link, &nd, &cookie); + if (err) + break; + err = mountpoint_last(&nd, path); + put_link(&nd, &link, cookie); + } +out: + if (base) + fput(base); + + if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT)) + path_put(&nd.root); + + return err; +} + +static int +filename_mountpoint(int dfd, struct filename *s, struct path *path, + unsigned int flags) +{ + int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); + if (unlikely(error == -ECHILD)) + error = path_mountpoint(dfd, s->name, path, flags); + if (unlikely(error == -ESTALE)) + error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); + if (likely(!error)) + audit_inode(s, path->dentry, 0); return error; } +/** + * user_path_mountpoint_at - lookup a path from userland in order to umount it + * @dfd: directory file descriptor + * @name: pathname from userland + * @flags: lookup flags + * @path: pointer to container to hold result + * + * A umount is a special case for path walking. We're not actually interested + * in the inode in this situation, and ESTALE errors can be a problem. We + * simply want track down the dentry and vfsmount attached at the mountpoint + * and avoid revalidating the last component. + * + * Returns 0 and populates "path" on success. + */ +int +user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, + struct path *path) +{ + struct filename *s = getname(name); + int error; + if (IS_ERR(s)) + return PTR_ERR(s); + error = filename_mountpoint(dfd, s, path, flags); + putname(s); + return error; +} + +int +kern_path_mountpoint(int dfd, const char *name, struct path *path, + unsigned int flags) +{ + struct filename s = {.name = name}; + return filename_mountpoint(dfd, &s, path, flags); +} +EXPORT_SYMBOL(kern_path_mountpoint); + /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. @@ -2154,7 +2381,7 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; - return !inode_capable(inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } /* @@ -2176,30 +2403,33 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct inode *dir,struct dentry *victim,int isdir) +static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) { + struct inode *inode = victim->d_inode; int error; - if (!victim->d_inode) + if (d_is_negative(victim)) return -ENOENT; + BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); - audit_inode_child(victim, dir); + audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + + if (check_sticky(dir, inode) || IS_APPEND(inode) || + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) return -EPERM; if (isdir) { - if (!S_ISDIR(victim->d_inode->i_mode)) + if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; - } else if (S_ISDIR(victim->d_inode->i_mode)) + } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; @@ -2218,6 +2448,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) */ static inline int may_create(struct inode *dir, struct dentry *child) { + audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) @@ -2257,6 +2488,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); return NULL; } +EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { @@ -2266,6 +2498,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } +EXPORT_SYMBOL(unlock_rename); int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -2286,6 +2519,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_create); static int may_open(struct path *path, int acc_mode, int flag) { @@ -2349,7 +2583,7 @@ static int handle_truncate(struct file *filp) /* * Refuse to truncate files with mandatory locks held on them. */ - error = locks_verify_locked(inode); + error = locks_verify_locked(filp); if (!error) error = security_path_truncate(path); if (!error) { @@ -2407,6 +2641,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, int acc_mode; int create_error = 0; struct dentry *const DENTRY_NOT_SET = (void *) -1UL; + bool excl; BUG_ON(dentry->d_inode); @@ -2420,10 +2655,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) mode &= ~current_umask(); - if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) { + excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT); + if (excl) open_flag &= ~O_TRUNC; - *opened |= FILE_CREATED; - } /* * Checking write permission is tricky, bacuse we don't know if we are @@ -2476,12 +2710,6 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, goto out; } - acc_mode = op->acc_mode; - if (*opened & FILE_CREATED) { - fsnotify_create(dir, dentry); - acc_mode = MAY_OPEN; - } - if (error) { /* returned 1, that is */ if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; @@ -2491,9 +2719,19 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, dput(dentry); dentry = file->f_path.dentry; } - if (create_error && dentry->d_inode == NULL) { - error = create_error; - goto out; + if (*opened & FILE_CREATED) + fsnotify_create(dir, dentry); + if (!dentry->d_inode) { + WARN_ON(*opened & FILE_CREATED); + if (create_error) { + error = create_error; + goto out; + } + } else { + if (excl && !(*opened & FILE_CREATED)) { + error = -EEXIST; + goto out; + } } goto looked_up; } @@ -2502,6 +2740,12 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, * We didn't have the inode before the open, so check open permission * here. */ + acc_mode = op->acc_mode; + if (*opened & FILE_CREATED) { + WARN_ON(!(open_flag & O_CREAT)); + fsnotify_create(dir, dentry); + acc_mode = MAY_OPEN; + } error = may_open(&file->f_path, acc_mode, open_flag); if (error) fput(file); @@ -2629,7 +2873,7 @@ out_dput: */ static int do_last(struct nameidata *nd, struct path *path, struct file *file, const struct open_flags *op, - int *opened, const char *pathname) + int *opened, struct filename *name) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; @@ -2645,28 +2889,10 @@ static int do_last(struct nameidata *nd, struct path *path, nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; - switch (nd->last_type) { - case LAST_DOTDOT: - case LAST_DOT: + if (nd->last_type != LAST_NORM) { error = handle_dots(nd, nd->last_type); if (error) return error; - /* fallthrough */ - case LAST_ROOT: - error = complete_walk(nd); - if (error) - return error; - audit_inode(pathname, nd->path.dentry); - if (open_flag & O_CREAT) { - error = -EISDIR; - goto out; - } - goto finish_open; - case LAST_BIND: - error = complete_walk(nd); - if (error) - return error; - audit_inode(pathname, dir); goto finish_open; } @@ -2676,7 +2902,7 @@ static int do_last(struct nameidata *nd, struct path *path, if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) symlink_ok = true; /* we _can_ be in RCU mode here */ - error = lookup_fast(nd, &nd->last, path, &inode); + error = lookup_fast(nd, path, &inode); if (likely(!error)) goto finish_lookup; @@ -2695,7 +2921,7 @@ static int do_last(struct nameidata *nd, struct path *path, if (error) return error; - audit_inode(pathname, dir); + audit_inode(name, dir, LOOKUP_PARENT); error = -EISDIR; /* trailing slashes? */ if (nd->last.name[nd->last.len]) @@ -2722,10 +2948,10 @@ retry_lookup: goto out; if ((*opened & FILE_CREATED) || - !S_ISREG(file->f_path.dentry->d_inode->i_mode)) + !S_ISREG(file_inode(file)->i_mode)) will_truncate = false; - audit_inode(pathname, file->f_path.dentry); + audit_inode(name, file->f_path.dentry, 0); goto opened; } @@ -2741,8 +2967,8 @@ retry_lookup: /* * create/update audit record if it already exists. */ - if (path->dentry->d_inode) - audit_inode(pathname, path->dentry); + if (d_is_positive(path->dentry)) + audit_inode(name, path->dentry, 0); /* * If atomic_open() acquired write access it is dropped now due to @@ -2770,12 +2996,12 @@ retry_lookup: finish_lookup: /* we _can_ be in RCU mode here */ error = -ENOENT; - if (!inode) { + if (!inode || d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } - if (should_follow_link(inode, !symlink_ok)) { + if (should_follow_link(path->dentry, !symlink_ok)) { if (nd->flags & LOOKUP_RCU) { if (unlikely(unlazy_walk(nd, path->dentry))) { error = -ECHILD; @@ -2796,19 +3022,19 @@ finish_lookup: } nd->inode = inode; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ +finish_open: error = complete_walk(nd); if (error) { path_put(&save_parent); return error; } + audit_inode(name, nd->path.dentry, 0); error = -EISDIR; - if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) + if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) goto out; error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) + if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; - audit_inode(pathname, nd->path.dentry); -finish_open: if (!S_ISREG(nd->inode->i_mode)) will_truncate = false; @@ -2875,7 +3101,68 @@ stale_open: goto retry_lookup; } -static struct file *path_openat(int dfd, const char *pathname, +static int do_tmpfile(int dfd, struct filename *pathname, + struct nameidata *nd, int flags, + const struct open_flags *op, + struct file *file, int *opened) +{ + static const struct qstr name = QSTR_INIT("/", 1); + struct dentry *dentry, *child; + struct inode *dir; + int error = path_lookupat(dfd, pathname->name, + flags | LOOKUP_DIRECTORY, nd); + if (unlikely(error)) + return error; + error = mnt_want_write(nd->path.mnt); + if (unlikely(error)) + goto out; + /* we want directory to be writable */ + error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC); + if (error) + goto out2; + dentry = nd->path.dentry; + dir = dentry->d_inode; + if (!dir->i_op->tmpfile) { + error = -EOPNOTSUPP; + goto out2; + } + child = d_alloc(dentry, &name); + if (unlikely(!child)) { + error = -ENOMEM; + goto out2; + } + nd->flags &= ~LOOKUP_DIRECTORY; + nd->flags |= op->intent; + dput(nd->path.dentry); + nd->path.dentry = child; + error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode); + if (error) + goto out2; + audit_inode(pathname, nd->path.dentry, 0); + error = may_open(&nd->path, op->acc_mode, op->open_flag); + if (error) + goto out2; + file->f_path.mnt = nd->path.mnt; + error = finish_open(file, nd->path.dentry, NULL, opened); + if (error) + goto out2; + error = open_check_o_direct(file); + if (error) { + fput(file); + } else if (!(op->open_flag & O_EXCL)) { + struct inode *inode = file_inode(file); + spin_lock(&inode->i_lock); + inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + } +out2: + mnt_drop_write(nd->path.mnt); +out: + path_put(&nd->path); + return error; +} + +static struct file *path_openat(int dfd, struct filename *pathname, struct nameidata *nd, const struct open_flags *op, int flags) { struct file *base = NULL; @@ -2885,17 +3172,22 @@ static struct file *path_openat(int dfd, const char *pathname, int error; file = get_empty_filp(); - if (!file) - return ERR_PTR(-ENFILE); + if (IS_ERR(file)) + return file; file->f_flags = op->open_flag; - error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); + if (unlikely(file->f_flags & __O_TMPFILE)) { + error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); + goto out; + } + + error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); if (unlikely(error)) goto out; current->total_link_count = 0; - error = link_path_walk(pathname, nd); + error = link_path_walk(pathname->name, nd); if (unlikely(error)) goto out; @@ -2941,10 +3233,11 @@ out: return file; } -struct file *do_filp_open(int dfd, const char *pathname, - const struct open_flags *op, int flags) +struct file *do_filp_open(int dfd, struct filename *pathname, + const struct open_flags *op) { struct nameidata nd; + int flags = op->lookup_flags; struct file *filp; filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); @@ -2956,33 +3249,43 @@ struct file *do_filp_open(int dfd, const char *pathname, } struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, - const char *name, const struct open_flags *op, int flags) + const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; + struct filename filename = { .name = name }; + int flags = op->lookup_flags | LOOKUP_ROOT; nd.root.mnt = mnt; nd.root.dentry = dentry; - flags |= LOOKUP_ROOT; - - if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); - file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); + file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) - file = path_openat(-1, name, &nd, op, flags); + file = path_openat(-1, &filename, &nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) - file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); + file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL); return file; } -struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir) +struct dentry *kern_path_create(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); struct nameidata nd; int err2; - int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); + int error; + bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); + + /* + * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any + * other flags passed in are ignored! + */ + lookup_flags &= LOOKUP_REVAL; + + error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd); if (error) return ERR_PTR(error); @@ -3006,8 +3309,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path goto unlock; error = -EEXIST; - if (dentry->d_inode) + if (d_is_positive(dentry)) goto fail; + /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - @@ -3046,13 +3350,14 @@ void done_path_create(struct path *path, struct dentry *dentry) } EXPORT_SYMBOL(done_path_create); -struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) +struct dentry *user_path_create(int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { - char *tmp = getname(pathname); + struct filename *tmp = getname(pathname); struct dentry *res; if (IS_ERR(tmp)) return ERR_CAST(tmp); - res = kern_path_create(dfd, tmp, path, is_dir); + res = kern_path_create(dfd, tmp->name, path, lookup_flags); putname(tmp); return res; } @@ -3084,6 +3389,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { @@ -3108,12 +3414,13 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, struct dentry *dentry; struct path path; int error; + unsigned int lookup_flags = 0; error = may_mknod(mode); if (error) return error; - - dentry = user_path_create(dfd, filename, &path, 0); +retry: + dentry = user_path_create(dfd, filename, &path, lookup_flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -3136,6 +3443,10 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, } out: done_path_create(&path, dentry); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } @@ -3168,14 +3479,17 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fsnotify_mkdir(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mkdir); SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { struct dentry *dentry; struct path path; int error; + unsigned int lookup_flags = LOOKUP_DIRECTORY; - dentry = user_path_create(dfd, pathname, &path, 1); +retry: + dentry = user_path_create(dfd, pathname, &path, lookup_flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -3185,6 +3499,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) if (!error) error = vfs_mkdir(path.dentry->d_inode, dentry, mode); done_path_create(&path, dentry); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } @@ -3212,10 +3530,11 @@ void dentry_unhash(struct dentry *dentry) { shrink_dcache_parent(dentry); spin_lock(&dentry->d_lock); - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) __d_drop(dentry); spin_unlock(&dentry->d_lock); } +EXPORT_SYMBOL(dentry_unhash); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -3253,17 +3572,19 @@ out: d_delete(dentry); return error; } +EXPORT_SYMBOL(vfs_rmdir); static long do_rmdir(int dfd, const char __user *pathname) { int error = 0; - char * name; + struct filename *name; struct dentry *dentry; struct nameidata nd; - - error = user_path_parent(dfd, pathname, &nd, &name); - if (error) - return error; + unsigned int lookup_flags = 0; +retry: + name = user_path_parent(dfd, pathname, &nd, lookup_flags); + if (IS_ERR(name)) + return PTR_ERR(name); switch(nd.last_type) { case LAST_DOTDOT: @@ -3303,6 +3624,10 @@ exit2: exit1: path_put(&nd.path); putname(name); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } return error; } @@ -3311,8 +3636,27 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) return do_rmdir(AT_FDCWD, pathname); } -int vfs_unlink(struct inode *dir, struct dentry *dentry) +/** + * vfs_unlink - unlink a filesystem object + * @dir: parent directory + * @dentry: victim + * @delegated_inode: returns victim inode, if the inode is delegated. + * + * The caller must hold dir->i_mutex. + * + * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and + * return a reference to the inode in delegated_inode. The caller + * should then break the delegation on that inode and retry. Because + * breaking a delegation may take a long time, the caller should drop + * dir->i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { + struct inode *target = dentry->d_inode; int error = may_delete(dir, dentry, 0); if (error) @@ -3321,27 +3665,32 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&target->i_mutex); if (d_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; error = dir->i_op->unlink(dir, dentry); if (!error) dont_mount(dentry); } } - mutex_unlock(&dentry->d_inode->i_mutex); +out: + mutex_unlock(&target->i_mutex); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { - fsnotify_link_count(dentry->d_inode); + fsnotify_link_count(target); d_delete(dentry); } return error; } +EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its @@ -3352,14 +3701,16 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) static long do_unlinkat(int dfd, const char __user *pathname) { int error; - char *name; + struct filename *name; struct dentry *dentry; struct nameidata nd; struct inode *inode = NULL; - - error = user_path_parent(dfd, pathname, &nd, &name); - if (error) - return error; + struct inode *delegated_inode = NULL; + unsigned int lookup_flags = 0; +retry: + name = user_path_parent(dfd, pathname, &nd, lookup_flags); + if (IS_ERR(name)) + return PTR_ERR(name); error = -EISDIR; if (nd.last_type != LAST_NORM) @@ -3369,7 +3720,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) error = mnt_want_write(nd.path.mnt); if (error) goto exit1; - +retry_deleg: mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); @@ -3378,28 +3729,43 @@ static long do_unlinkat(int dfd, const char __user *pathname) if (nd.last.name[nd.last.len]) goto slashes; inode = dentry->d_inode; - if (!inode) + if (d_is_negative(dentry)) goto slashes; ihold(inode); error = security_path_unlink(&nd.path, dentry); if (error) goto exit2; - error = vfs_unlink(nd.path.dentry->d_inode, dentry); + error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode); exit2: dput(dentry); } mutex_unlock(&nd.path.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ + inode = NULL; + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } mnt_drop_write(nd.path.mnt); exit1: path_put(&nd.path); putname(name); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + inode = NULL; + goto retry; + } return error; slashes: - error = !dentry->d_inode ? -ENOENT : - S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; + if (d_is_negative(dentry)) + error = -ENOENT; + else if (d_is_dir(dentry)) + error = -EISDIR; + else + error = -ENOTDIR; goto exit2; } @@ -3438,28 +3804,34 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_symlink); SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) { int error; - char *from; + struct filename *from; struct dentry *dentry; struct path path; + unsigned int lookup_flags = 0; from = getname(oldname); if (IS_ERR(from)) return PTR_ERR(from); - - dentry = user_path_create(newdfd, newname, &path, 0); +retry: + dentry = user_path_create(newdfd, newname, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putname; - error = security_path_symlink(&path, dentry, from); + error = security_path_symlink(&path, dentry, from->name); if (!error) - error = vfs_symlink(path.dentry->d_inode, dentry, from); + error = vfs_symlink(path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } out_putname: putname(from); return error; @@ -3470,7 +3842,26 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn return sys_symlinkat(oldname, AT_FDCWD, newname); } -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) +/** + * vfs_link - create a new link + * @old_dentry: object to be linked + * @dir: new parent + * @new_dentry: where to create the new link + * @delegated_inode: returns inode needing a delegation break + * + * The caller must hold dir->i_mutex + * + * If vfs_link discovers a delegation on the to-be-linked file in need + * of breaking, it will return -EWOULDBLOCK and return a reference to the + * inode in delegated_inode. The caller should then break the delegation + * and retry. Because breaking a delegation may take a long time, the + * caller should drop the i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -3502,17 +3893,27 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de mutex_lock(&inode->i_mutex); /* Make sure we don't allow creating hardlink to an unlinked file */ - if (inode->i_nlink == 0) + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; - else - error = dir->i_op->link(old_dentry, dir, new_dentry); + else { + error = try_break_deleg(inode, delegated_inode); + if (!error) + error = dir->i_op->link(old_dentry, dir, new_dentry); + } + + if (!error && (inode->i_state & I_LINKABLE)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_LINKABLE; + spin_unlock(&inode->i_lock); + } mutex_unlock(&inode->i_mutex); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } +EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid @@ -3528,6 +3929,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, { struct dentry *new_dentry; struct path old_path, new_path; + struct inode *delegated_inode = NULL; int how = 0; int error; @@ -3546,12 +3948,13 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; - +retry: error = user_path_at(olddfd, oldname, how, &old_path); if (error) return error; - new_dentry = user_path_create(newdfd, newname, &new_path, 0); + new_dentry = user_path_create(newdfd, newname, &new_path, + (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out; @@ -3565,9 +3968,21 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); + error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) { + path_put(&old_path); + goto retry; + } + } + if (retry_estale(error, how)) { + path_put(&old_path); + how |= LOOKUP_REVAL; + goto retry; + } out: path_put(&old_path); @@ -3579,7 +3994,28 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -/* +/** + * vfs_rename - rename a filesystem object + * @old_dir: parent of source + * @old_dentry: source + * @new_dir: parent of destination + * @new_dentry: destination + * @delegated_inode: returns an inode needing a delegation break + * @flags: rename flags + * + * The caller must hold multiple mutexes--see lock_rename()). + * + * If vfs_rename discovers a delegation in need of breaking at either + * the source or destination, it will return -EWOULDBLOCK and return a + * reference to the inode in delegated_inode. The caller should then + * break the delegation and retry. Because breaking a delegation may + * take a long time, the caller should drop all locks before doing + * so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: @@ -3588,7 +4024,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * That's where 4.4 screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. - * c) we have to lock _three_ objects - parents and victim (if it exists). + * c) we have to lock _four_ objects - parents and victim (if it exists), + * and source (if it is not a directory). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change @@ -3606,149 +4043,170 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ -static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode, unsigned int flags) { - int error = 0; + int error; + bool is_dir = d_is_dir(old_dentry); + const unsigned char *old_name; + struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; + bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; - /* - * If we are going to change the parent - check write permissions, - * we'll need to flip '..'. - */ - if (new_dir != old_dir) { - error = inode_permission(old_dentry->d_inode, MAY_WRITE); - if (error) - return error; - } + if (source == target) + return 0; - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + error = may_delete(old_dir, old_dentry, is_dir); if (error) return error; - dget(new_dentry); - if (target) - mutex_lock(&target->i_mutex); + if (!target) { + error = may_create(new_dir, new_dentry); + } else { + new_is_dir = d_is_dir(new_dentry); - error = -EBUSY; - if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) - goto out; + if (!(flags & RENAME_EXCHANGE)) + error = may_delete(new_dir, new_dentry, is_dir); + else + error = may_delete(new_dir, new_dentry, new_is_dir); + } + if (error) + return error; - error = -EMLINK; - if (max_links && !target && new_dir != old_dir && - new_dir->i_nlink >= max_links) - goto out; + if (!old_dir->i_op->rename) + return -EPERM; - if (target) - shrink_dcache_parent(new_dentry); - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - goto out; + if (flags && !old_dir->i_op->rename2) + return -EINVAL; - if (target) { - target->i_flags |= S_DEAD; - dont_mount(new_dentry); + /* + * If we are going to change the parent - check write permissions, + * we'll need to flip '..'. + */ + if (new_dir != old_dir) { + if (is_dir) { + error = inode_permission(source, MAY_WRITE); + if (error) + return error; + } + if ((flags & RENAME_EXCHANGE) && new_is_dir) { + error = inode_permission(target, MAY_WRITE); + if (error) + return error; + } } -out: - if (target) - mutex_unlock(&target->i_mutex); - dput(new_dentry); - if (!error) - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry,new_dentry); - return error; -} -static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct inode *target = new_dentry->d_inode; - int error; - - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); if (error) return error; + old_name = fsnotify_oldname_init(old_dentry->d_name.name); dget(new_dentry); - if (target) + if (!is_dir || (flags & RENAME_EXCHANGE)) + lock_two_nondirectories(source, target); + else if (target) mutex_lock(&target->i_mutex); error = -EBUSY; - if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) + if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) goto out; - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (max_links && new_dir != old_dir) { + error = -EMLINK; + if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) + goto out; + if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && + old_dir->i_nlink >= max_links) + goto out; + } + if (is_dir && !(flags & RENAME_EXCHANGE) && target) + shrink_dcache_parent(new_dentry); + if (!is_dir) { + error = try_break_deleg(source, delegated_inode); + if (error) + goto out; + } + if (target && !new_is_dir) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; + } + if (!flags) { + error = old_dir->i_op->rename(old_dir, old_dentry, + new_dir, new_dentry); + } else { + error = old_dir->i_op->rename2(old_dir, old_dentry, + new_dir, new_dentry, flags); + } if (error) goto out; - if (target) + if (!(flags & RENAME_EXCHANGE) && target) { + if (is_dir) + target->i_flags |= S_DEAD; dont_mount(new_dentry); - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); + } + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { + if (!(flags & RENAME_EXCHANGE)) + d_move(old_dentry, new_dentry); + else + d_exchange(old_dentry, new_dentry); + } out: - if (target) + if (!is_dir || (flags & RENAME_EXCHANGE)) + unlock_two_nondirectories(source, target); + else if (target) mutex_unlock(&target->i_mutex); dput(new_dentry); - return error; -} - -int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - int error; - int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); - const unsigned char *old_name; - - if (old_dentry->d_inode == new_dentry->d_inode) - return 0; - - error = may_delete(old_dir, old_dentry, is_dir); - if (error) - return error; - - if (!new_dentry->d_inode) - error = may_create(new_dir, new_dentry); - else - error = may_delete(new_dir, new_dentry, is_dir); - if (error) - return error; - - if (!old_dir->i_op->rename) - return -EPERM; - - old_name = fsnotify_oldname_init(old_dentry->d_name.name); - - if (is_dir) - error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); - else - error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); - if (!error) + if (!error) { fsnotify_move(old_dir, new_dir, old_name, is_dir, - new_dentry->d_inode, old_dentry); + !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); + if (flags & RENAME_EXCHANGE) { + fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, + new_is_dir, NULL, new_dentry); + } + } fsnotify_oldname_free(old_name); return error; } +EXPORT_SYMBOL(vfs_rename); -SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname) +SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, unsigned int, flags) { struct dentry *old_dir, *new_dir; struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct nameidata oldnd, newnd; - char *from; - char *to; + struct inode *delegated_inode = NULL; + struct filename *from; + struct filename *to; + unsigned int lookup_flags = 0; + bool should_retry = false; int error; - error = user_path_parent(olddfd, oldname, &oldnd, &from); - if (error) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) + return -EINVAL; + +retry: + from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); + if (IS_ERR(from)) { + error = PTR_ERR(from); goto exit; + } - error = user_path_parent(newdfd, newname, &newnd, &to); - if (error) + to = user_path_parent(newdfd, newname, &newnd, lookup_flags); + if (IS_ERR(to)) { + error = PTR_ERR(to); goto exit1; + } error = -EXDEV; if (oldnd.path.mnt != newnd.path.mnt) @@ -3760,6 +4218,8 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, goto exit2; new_dir = newnd.path.dentry; + if (flags & RENAME_NOREPLACE) + error = -EEXIST; if (newnd.last_type != LAST_NORM) goto exit2; @@ -3769,8 +4229,10 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, oldnd.flags &= ~LOOKUP_PARENT; newnd.flags &= ~LOOKUP_PARENT; - newnd.flags |= LOOKUP_RENAME_TARGET; + if (!(flags & RENAME_EXCHANGE)) + newnd.flags |= LOOKUP_RENAME_TARGET; +retry_deleg: trap = lock_rename(new_dir, old_dir); old_dentry = lookup_hash(&oldnd); @@ -3779,62 +4241,94 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, goto exit3; /* source must exist */ error = -ENOENT; - if (!old_dentry->d_inode) + if (d_is_negative(old_dentry)) + goto exit4; + new_dentry = lookup_hash(&newnd); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) goto exit4; + error = -EEXIST; + if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) + goto exit5; + if (flags & RENAME_EXCHANGE) { + error = -ENOENT; + if (d_is_negative(new_dentry)) + goto exit5; + + if (!d_is_dir(new_dentry)) { + error = -ENOTDIR; + if (newnd.last.name[newnd.last.len]) + goto exit5; + } + } /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!S_ISDIR(old_dentry->d_inode->i_mode)) { + if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (oldnd.last.name[oldnd.last.len]) - goto exit4; - if (newnd.last.name[newnd.last.len]) - goto exit4; + goto exit5; + if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len]) + goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) - goto exit4; - new_dentry = lookup_hash(&newnd); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; + goto exit5; /* target should not be an ancestor of source */ - error = -ENOTEMPTY; + if (!(flags & RENAME_EXCHANGE)) + error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; error = security_path_rename(&oldnd.path, old_dentry, - &newnd.path, new_dentry); + &newnd.path, new_dentry, flags); if (error) goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); + new_dir->d_inode, new_dentry, + &delegated_inode, flags); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_dir, old_dir); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } mnt_drop_write(oldnd.path.mnt); exit2: + if (retry_estale(error, lookup_flags)) + should_retry = true; path_put(&newnd.path); putname(to); exit1: path_put(&oldnd.path); putname(from); + if (should_retry) { + should_retry = false; + lookup_flags |= LOOKUP_REVAL; + goto retry; + } exit: return error; } -SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) +SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname) { - return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); + return sys_renameat2(olddfd, oldname, newdfd, newname, 0); } -int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { - int len; + return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); +} - len = PTR_ERR(link); +int readlink_copy(char __user *buffer, int buflen, const char *link) +{ + int len = PTR_ERR(link); if (IS_ERR(link)) goto out; @@ -3846,6 +4340,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c out: return len; } +EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that @@ -3863,16 +4358,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(cookie)) return PTR_ERR(cookie); - res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + res = readlink_copy(buffer, buflen, nd_get_link(&nd)); if (dentry->d_inode->i_op->put_link) dentry->d_inode->i_op->put_link(dentry, &nd, cookie); return res; } - -int vfs_follow_link(struct nameidata *nd, const char *link) -{ - return __vfs_follow_link(nd, link); -} +EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) @@ -3892,14 +4383,14 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage) int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct page *page = NULL; - char *s = page_getlink(dentry, &page); - int res = vfs_readlink(dentry,buffer,buflen,s); + int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); if (page) { kunmap(page); page_cache_release(page); } return res; } +EXPORT_SYMBOL(page_readlink); void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) { @@ -3907,6 +4398,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) nd_set_link(nd, page_getlink(dentry, &page)); return page; } +EXPORT_SYMBOL(page_follow_link_light); void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { @@ -3917,6 +4409,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) page_cache_release(page); } } +EXPORT_SYMBOL(page_put_link); /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS @@ -3954,47 +4447,18 @@ retry: fail: return err; } +EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); } +EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, }; - -EXPORT_SYMBOL(user_path_at); -EXPORT_SYMBOL(follow_down_one); -EXPORT_SYMBOL(follow_down); -EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ -EXPORT_SYMBOL(getname); -EXPORT_SYMBOL(lock_rename); -EXPORT_SYMBOL(lookup_one_len); -EXPORT_SYMBOL(page_follow_link_light); -EXPORT_SYMBOL(page_put_link); -EXPORT_SYMBOL(page_readlink); -EXPORT_SYMBOL(__page_symlink); -EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(kern_path); -EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(unlock_rename); -EXPORT_SYMBOL(vfs_create); -EXPORT_SYMBOL(vfs_follow_link); -EXPORT_SYMBOL(vfs_link); -EXPORT_SYMBOL(vfs_mkdir); -EXPORT_SYMBOL(vfs_mknod); -EXPORT_SYMBOL(generic_permission); -EXPORT_SYMBOL(vfs_readlink); -EXPORT_SYMBOL(vfs_rename); -EXPORT_SYMBOL(vfs_rmdir); -EXPORT_SYMBOL(vfs_symlink); -EXPORT_SYMBOL(vfs_unlink); -EXPORT_SYMBOL(dentry_unhash); -EXPORT_SYMBOL(generic_readlink); |
