--- zzzz-none-000/linux-3.10.107/fs/namei.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/fs/namei.c 2021-02-04 17:41:59.000000000 +0000 @@ -118,25 +118,15 @@ * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ -void final_putname(struct filename *name) -{ - if (name->separate) { - __putname(name->name); - kfree(name); - } else { - __putname(name); - } -} -#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) +#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) -static struct filename * +struct filename * getname_flags(const char __user *filename, int flags, int *empty) { - struct filename *result, *err; - int len; - long max; + struct filename *result; char *kname; + int len; result = audit_reusename(filename); if (result) @@ -150,16 +140,13 @@ * First, try to embed the struct filename inside the names_cache * allocation */ - kname = (char *)result + sizeof(*result); + kname = (char *)result->iname; result->name = kname; - result->separate = false; - max = EMBEDDED_NAME_MAX; -recopy: - len = strncpy_from_user(kname, filename, max); + len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); if (unlikely(len < 0)) { - err = ERR_PTR(len); - goto error; + __putname(result); + return ERR_PTR(len); } /* @@ -168,41 +155,49 @@ * names_cache allocation for the pathname, and re-do the copy from * userland. */ - if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) { + if (unlikely(len == EMBEDDED_NAME_MAX)) { + const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; - result = kzalloc(sizeof(*result), GFP_KERNEL); - if (!result) { - err = ERR_PTR(-ENOMEM); - result = (struct filename *)kname; - goto error; + /* + * size is chosen that way we to guarantee that + * result->iname[0] is within the same object and that + * kname can't be equal to result->iname, no matter what. + */ + result = kzalloc(size, GFP_KERNEL); + if (unlikely(!result)) { + __putname(kname); + return ERR_PTR(-ENOMEM); } result->name = kname; - result->separate = true; - max = PATH_MAX; - goto recopy; + len = strncpy_from_user(kname, filename, PATH_MAX); + if (unlikely(len < 0)) { + __putname(kname); + kfree(result); + return ERR_PTR(len); + } + if (unlikely(len == PATH_MAX)) { + __putname(kname); + kfree(result); + return ERR_PTR(-ENAMETOOLONG); + } } + result->refcnt = 1; /* The empty path is special. */ if (unlikely(!len)) { if (empty) *empty = 1; - err = ERR_PTR(-ENOENT); - if (!(flags & LOOKUP_EMPTY)) - goto error; + if (!(flags & LOOKUP_EMPTY)) { + putname(result); + return ERR_PTR(-ENOENT); + } } - err = ERR_PTR(-ENAMETOOLONG); - if (unlikely(len >= PATH_MAX)) - goto error; - result->uptr = filename; + result->aname = NULL; audit_getname(result); return result; - -error: - final_putname(result); - return err; } struct filename * @@ -210,16 +205,55 @@ { return getname_flags(filename, 0, NULL); } -EXPORT_SYMBOL(getname); -#ifdef CONFIG_AUDITSYSCALL +struct filename * +getname_kernel(const char * filename) +{ + struct filename *result; + int len = strlen(filename) + 1; + + result = __getname(); + if (unlikely(!result)) + return ERR_PTR(-ENOMEM); + + if (len <= EMBEDDED_NAME_MAX) { + result->name = (char *)result->iname; + } else if (len <= PATH_MAX) { + struct filename *tmp; + + tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + if (unlikely(!tmp)) { + __putname(result); + return ERR_PTR(-ENOMEM); + } + tmp->name = (char *)result; + result = tmp; + } else { + __putname(result); + return ERR_PTR(-ENAMETOOLONG); + } + memcpy((char *)result->name, filename, len); + result->uptr = NULL; + result->aname = NULL; + result->refcnt = 1; + audit_getname(result); + + return result; +} + void putname(struct filename *name) { - if (unlikely(!audit_dummy_context())) - return audit_putname(name); - final_putname(name); + BUG_ON(name->refcnt <= 0); + + if (--name->refcnt > 0) + return; + + if (name->name != name->iname) { + __putname(name->name); + kfree(name); + } else + __putname(name); } -#endif static int check_acl(struct inode *inode, int mask) { @@ -236,27 +270,9 @@ return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); } - acl = get_cached_acl(inode, ACL_TYPE_ACCESS); - - /* - * A filesystem can force a ACL callback by just never filling the - * ACL cache. But normally you'd fill the cache either at inode - * instantiation time, or on the first ->get_acl call. - * - * If the filesystem doesn't have a get_acl() function at all, we'll - * just create the negative cache entry. - */ - if (acl == ACL_NOT_CACHED) { - if (inode->i_op->get_acl) { - acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } else { - set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); - return -EAGAIN; - } - } - + acl = get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); @@ -349,6 +365,7 @@ return -EACCES; } +EXPORT_SYMBOL(generic_permission); /* * We _really_ want to just do "generic_permission()" without @@ -404,6 +421,7 @@ return security_inode_permission(inode, mask); } +EXPORT_SYMBOL(__inode_permission); /** * sb_permission - Check superblock-level permissions @@ -446,6 +464,7 @@ return retval; return __inode_permission(inode, mask); } +EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path @@ -473,6 +492,74 @@ } EXPORT_SYMBOL(path_put); +#define EMBEDDED_LEVELS 2 +struct nameidata { + struct path path; + struct qstr last; + struct path root; + struct inode *inode; /* path.dentry.d_inode */ + unsigned int flags; + unsigned seq, m_seq; + int last_type; + unsigned depth; + int total_link_count; + struct saved { + struct path link; + void *cookie; + const char *name; + struct inode *inode; + unsigned seq; + } *stack, internal[EMBEDDED_LEVELS]; + struct filename *name; + struct nameidata *saved; + unsigned root_seq; + int dfd; +}; + +static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) +{ + struct nameidata *old = current->nameidata; + p->stack = p->internal; + p->dfd = dfd; + p->name = name; + p->total_link_count = old ? old->total_link_count : 0; + p->saved = old; + current->nameidata = p; +} + +static void restore_nameidata(void) +{ + struct nameidata *now = current->nameidata, *old = now->saved; + + current->nameidata = old; + if (old) + old->total_link_count = now->total_link_count; + if (now->stack != now->internal) { + kfree(now->stack); + now->stack = now->internal; + } +} + +static int __nd_alloc_stack(struct nameidata *nd) +{ + struct saved *p; + + if (nd->flags & LOOKUP_RCU) { + p= kmalloc(MAXSYMLINKS * sizeof(struct saved), + GFP_ATOMIC); + if (unlikely(!p)) + return -ECHILD; + } else { + p= kmalloc(MAXSYMLINKS * sizeof(struct saved), + GFP_KERNEL); + if (unlikely(!p)) + return -ENOMEM; + } + memcpy(p, nd->internal, sizeof(nd->internal)); + nd->stack = p; + return 0; +} + /** * path_connected - Verify that a path->dentry is below path->mnt.mnt_root * @path: nameidate to verify @@ -491,6 +578,81 @@ return is_subdir(path->dentry, mnt->mnt_root); } +static inline int nd_alloc_stack(struct nameidata *nd) +{ + if (likely(nd->depth != EMBEDDED_LEVELS)) + return 0; + if (likely(nd->stack != nd->internal)) + return 0; + return __nd_alloc_stack(nd); +} + +static void drop_links(struct nameidata *nd) +{ + int i = nd->depth; + while (i--) { + struct saved *last = nd->stack + i; + struct inode *inode = last->inode; + if (last->cookie && inode->i_op->put_link) { + inode->i_op->put_link(inode, last->cookie); + last->cookie = NULL; + } + } +} + +static void terminate_walk(struct nameidata *nd) +{ + drop_links(nd); + if (!(nd->flags & LOOKUP_RCU)) { + int i; + path_put(&nd->path); + for (i = 0; i < nd->depth; i++) + path_put(&nd->stack[i].link); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + } else { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + } + nd->depth = 0; +} + +/* path_put is needed afterwards regardless of success or failure */ +static bool legitimize_path(struct nameidata *nd, + struct path *path, unsigned seq) +{ + int res = __legitimize_mnt(path->mnt, nd->m_seq); + if (unlikely(res)) { + if (res > 0) + path->mnt = NULL; + path->dentry = NULL; + return false; + } + if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { + path->dentry = NULL; + return false; + } + return !read_seqcount_retry(&path->dentry->d_seq, seq); +} + +static bool legitimize_links(struct nameidata *nd) +{ + int i; + for (i = 0; i < nd->depth; i++) { + struct saved *last = nd->stack + i; + if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { + drop_links(nd); + nd->depth = i + 1; + return false; + } + } + return true; +} + /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't @@ -502,82 +664,101 @@ * to restart the path walk from the beginning in ref-walk mode. */ -static inline void lock_rcu_walk(void) -{ - br_read_lock(&vfsmount_lock); - rcu_read_lock(); -} - -static inline void unlock_rcu_walk(void) -{ - rcu_read_unlock(); - br_read_unlock(&vfsmount_lock); -} - /** * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: child of nd->path.dentry or NULL + * @seq: seq number to check dentry against * Returns: 0 on success, -ECHILD on failure * * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry * for ref-walk mode. @dentry must be a path found by a do_lookup call on * @nd or NULL. Must be called from rcu-walk context. + * Nothing should touch nameidata between unlazy_walk() failure and + * terminate_walk(). */ -static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) +static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq) { - struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; - int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - want_root = 1; - spin_lock(&fs->lock); - if (nd->root.mnt != fs->root.mnt || - nd->root.dentry != fs->root.dentry) - goto err_root; - } - spin_lock(&parent->d_lock); + + nd->flags &= ~LOOKUP_RCU; + if (unlikely(!legitimize_links(nd))) + goto out2; + if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) + goto out2; + if (unlikely(!lockref_get_not_dead(&parent->d_lockref))) + goto out1; + + /* + * For a negative lookup, the lookup sequence point is the parents + * sequence point, and it only needs to revalidate the parent dentry. + * + * For a positive lookup, we need to move both the parent and the + * dentry from the RCU domain to be properly refcounted. And the + * sequence number in the dentry validates *both* dentry counters, + * since we checked the sequence number of the parent after we got + * the child sequence number. So we know the parent must still + * be valid if the child sequence number is still valid. + */ if (!dentry) { - if (!__d_rcu_to_refcount(parent, nd->seq)) - goto err_parent; + if (read_seqcount_retry(&parent->d_seq, nd->seq)) + goto out; BUG_ON(nd->inode != parent->d_inode); } else { - if (dentry->d_parent != parent) - goto err_parent; - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!__d_rcu_to_refcount(dentry, nd->seq)) - goto err_child; - /* - * If the sequence check on the child dentry passed, then - * the child has not been removed from its parent. This - * means the parent dentry must be valid and able to take - * a reference at this point. - */ - BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); - BUG_ON(!parent->d_count); - parent->d_count++; - spin_unlock(&dentry->d_lock); - } - spin_unlock(&parent->d_lock); - if (want_root) { - path_get(&nd->root); - spin_unlock(&fs->lock); + if (!lockref_get_not_dead(&dentry->d_lockref)) + goto out; + if (read_seqcount_retry(&dentry->d_seq, seq)) + goto drop_dentry; } - mntget(nd->path.mnt); - unlock_rcu_walk(); - nd->flags &= ~LOOKUP_RCU; + /* + * Sequence counts matched. Now make sure that the root is + * still valid and get it if required. + */ + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) { + rcu_read_unlock(); + dput(dentry); + return -ECHILD; + } + } + + rcu_read_unlock(); return 0; -err_child: - spin_unlock(&dentry->d_lock); -err_parent: - spin_unlock(&parent->d_lock); -err_root: - if (want_root) - spin_unlock(&fs->lock); +drop_dentry: + rcu_read_unlock(); + dput(dentry); + goto drop_root_mnt; +out2: + nd->path.mnt = NULL; +out1: + nd->path.dentry = NULL; +out: + rcu_read_unlock(); +drop_root_mnt: + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + return -ECHILD; +} + +static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq) +{ + if (unlikely(!legitimize_path(nd, link, seq))) { + drop_links(nd); + nd->depth = 0; + nd->flags &= ~LOOKUP_RCU; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + } else if (likely(unlazy_walk(nd, NULL, 0)) == 0) { + return 0; + } + path_put(link); return -ECHILD; } @@ -602,19 +783,10 @@ int status; if (nd->flags & LOOKUP_RCU) { - nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - spin_lock(&dentry->d_lock); - if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) { - spin_unlock(&dentry->d_lock); - unlock_rcu_walk(); + if (unlikely(unlazy_walk(nd, NULL, 0))) return -ECHILD; - } - BUG_ON(nd->inode != dentry->d_inode); - spin_unlock(&dentry->d_lock); - mntget(nd->path.mnt); - unlock_rcu_walk(); } if (likely(!(nd->flags & LOOKUP_JUMPED))) @@ -630,53 +802,24 @@ if (!status) status = -ESTALE; - path_put(&nd->path); return status; } -static __always_inline void set_root(struct nameidata *nd) +static void set_root(struct nameidata *nd) { - if (!nd->root.mnt) - get_fs_root(current->fs, &nd->root); -} - -static int link_path_walk(const char *, struct nameidata *); - -static __always_inline void set_root_rcu(struct nameidata *nd) -{ - if (!nd->root.mnt) { - struct fs_struct *fs = current->fs; - unsigned seq; - - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - } + get_fs_root(current->fs, &nd->root); } -static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) +static void set_root_rcu(struct nameidata *nd) { - int ret; - - if (IS_ERR(link)) - goto fail; - - if (*link == '/') { - set_root(nd); - path_put(&nd->path); - nd->path = nd->root; - path_get(&nd->root); - nd->flags |= LOOKUP_JUMPED; - } - nd->inode = nd->path.dentry->d_inode; + struct fs_struct *fs = current->fs; + unsigned seq; - ret = link_path_walk(link, nd); - return ret; -fail: - path_put(&nd->path); - return PTR_ERR(link); + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -702,8 +845,9 @@ * Helper to directly jump to a known parsed path from ->follow_link, * caller must have taken a reference to path beforehand. */ -void nd_jump_link(struct nameidata *nd, struct path *path) +void nd_jump_link(struct path *path) { + struct nameidata *nd = current->nameidata; path_put(&nd->path); nd->path = *path; @@ -711,12 +855,14 @@ nd->flags |= LOOKUP_JUMPED; } -static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +static inline void put_link(struct nameidata *nd) { - struct inode *inode = link->dentry->d_inode; - if (inode->i_op->put_link) - inode->i_op->put_link(link->dentry, nd, cookie); - path_put(link); + struct saved *last = nd->stack + --nd->depth; + struct inode *inode = last->inode; + if (last->cookie && inode->i_op->put_link) + inode->i_op->put_link(inode, last->cookie); + if (!(nd->flags & LOOKUP_RCU)) + path_put(&last->link); } int sysctl_protected_symlinks __read_mostly = 0; @@ -724,7 +870,6 @@ /** * may_follow_link - Check symlink following for unsafe situations - * @link: The path of the symlink * @nd: nameidata pathwalk data * * In the case of the sysctl_protected_symlinks sysctl being enabled, @@ -738,31 +883,34 @@ * * Returns 0 if following the symlink is allowed, -ve on error. */ -static inline int may_follow_link(struct path *link, struct nameidata *nd) +static inline int may_follow_link(struct nameidata *nd) { const struct inode *inode; const struct inode *parent; + kuid_t puid; if (!sysctl_protected_symlinks) return 0; /* Allowed if owner and follower match. */ - inode = link->dentry->d_inode; + inode = nd->stack[0].inode; if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; /* Allowed if parent directory not sticky and world-writable. */ - parent = nd->path.dentry->d_inode; + parent = nd->inode; if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ - if (uid_eq(parent->i_uid, inode->i_uid)) + puid = parent->i_uid; + if (uid_valid(puid) && uid_eq(puid, inode->i_uid)) return 0; - audit_log_link_denied("follow_link", link); - path_put_conditional(link, nd); - path_put(&nd->path); + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + audit_log_link_denied("follow_link", &nd->stack[0].link); return -EACCES; } @@ -809,94 +957,91 @@ * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) - * - not CAP_FOWNER + * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * Returns 0 if successful, -ve on error. */ static int may_linkat(struct path *link) { - const struct cred *cred; struct inode *inode; if (!sysctl_protected_hardlinks) return 0; - cred = current_cred(); inode = link->dentry->d_inode; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) || - capable(CAP_FOWNER)) + if (inode_owner_or_capable(inode) || safe_hardlink_source(inode)) return 0; audit_log_link_denied("linkat", link); return -EPERM; } -static __always_inline int -follow_link(struct path *link, struct nameidata *nd, void **p) +static __always_inline +const char *get_link(struct nameidata *nd) { - struct dentry *dentry = link->dentry; + struct saved *last = nd->stack + nd->depth - 1; + struct dentry *dentry = last->link.dentry; + struct inode *inode = last->inode; int error; - char *s; - - BUG_ON(nd->flags & LOOKUP_RCU); - - if (link->mnt == nd->path.mnt) - mntget(link->mnt); + const char *res; - error = -ELOOP; - if (unlikely(current->total_link_count >= 40)) - goto out_put_nd_path; - - cond_resched(); - current->total_link_count++; - - touch_atime(link); - nd_set_link(nd, NULL); + if (!(nd->flags & LOOKUP_RCU)) { + touch_atime(&last->link); + cond_resched(); + } else if (atime_needs_update(&last->link, inode)) { + if (unlikely(unlazy_walk(nd, NULL, 0))) + return ERR_PTR(-ECHILD); + touch_atime(&last->link); + } - error = security_inode_follow_link(link->dentry, nd); - if (error) - goto out_put_nd_path; + error = security_inode_follow_link(dentry, inode, + nd->flags & LOOKUP_RCU); + if (unlikely(error)) + return ERR_PTR(error); nd->last_type = LAST_BIND; - *p = dentry->d_inode->i_op->follow_link(dentry, nd); - error = PTR_ERR(*p); - if (IS_ERR(*p)) - goto out_put_nd_path; - - error = 0; - s = nd_get_link(nd); - if (s) { - error = __vfs_follow_link(nd, s); - if (unlikely(error)) - put_link(nd, link, *p); + res = inode->i_link; + if (!res) { + if (nd->flags & LOOKUP_RCU) { + if (unlikely(unlazy_walk(nd, NULL, 0))) + return ERR_PTR(-ECHILD); + } + res = inode->i_op->follow_link(dentry, &last->cookie); + if (IS_ERR_OR_NULL(res)) { + last->cookie = NULL; + return res; + } } - - return error; - -out_put_nd_path: - *p = NULL; - path_put(&nd->path); - path_put(link); - return error; -} - -static int follow_up_rcu(struct path *path) -{ - struct mount *mnt = real_mount(path->mnt); - struct mount *parent; - struct dentry *mountpoint; - - parent = mnt->mnt_parent; - if (&parent->mnt == path->mnt) - return 0; - mountpoint = mnt->mnt_mountpoint; - path->dentry = mountpoint; - path->mnt = &parent->mnt; - return 1; + if (*res == '/') { + if (nd->flags & LOOKUP_RCU) { + struct dentry *d; + if (!nd->root.mnt) + set_root_rcu(nd); + nd->path = nd->root; + d = nd->path.dentry; + nd->inode = d->d_inode; + nd->seq = nd->root_seq; + if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) + return ERR_PTR(-ECHILD); + } else { + if (!nd->root.mnt) + set_root(nd); + path_put(&nd->path); + nd->path = nd->root; + path_get(&nd->root); + nd->inode = nd->path.dentry->d_inode; + } + nd->flags |= LOOKUP_JUMPED; + while (unlikely(*++res == '/')) + ; + } + if (!*res) + res = NULL; + return res; } /* @@ -915,28 +1060,29 @@ struct mount *parent; struct dentry *mountpoint; - br_read_lock(&vfsmount_lock); + read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); - br_read_unlock(&vfsmount_lock); + read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } +EXPORT_SYMBOL(follow_up); /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ -static int follow_automount(struct path *path, unsigned flags, +static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; @@ -956,13 +1102,13 @@ * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ - if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && path->dentry->d_inode) return -EISDIR; - current->total_link_count++; - if (current->total_link_count >= 40) + nd->total_link_count++; + if (nd->total_link_count >= 40) return -ELOOP; mnt = path->dentry->d_op->d_automount(path); @@ -976,7 +1122,7 @@ * the path being looked up; if it wasn't then the remainder of * the path is inaccessible and we should say so. */ - if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT)) + if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT)) return -EREMOTE; return PTR_ERR(mnt); } @@ -1016,7 +1162,7 @@ * * Serialization is taken care of in namespace.c */ -static int follow_managed(struct path *path, unsigned flags) +static int follow_managed(struct path *path, struct nameidata *nd) { struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ unsigned managed; @@ -1054,13 +1200,13 @@ /* Something is mounted on this dentry in another * namespace and/or whatever was mounted there in this - * namespace got unmounted before we managed to get the - * vfsmount_lock */ + * namespace got unmounted before lookup_mnt() could + * get it */ } /* Handle an automount point */ if (managed & DCACHE_NEED_AUTOMOUNT) { - ret = follow_automount(path, flags, &need_mntput); + ret = follow_automount(path, nd, &need_mntput); if (ret < 0) break; continue; @@ -1074,7 +1220,11 @@ mntput(path->mnt); if (ret == -EISDIR) ret = 0; - return ret < 0 ? ret : need_mntput; + if (need_mntput) + nd->flags |= LOOKUP_JUMPED; + if (unlikely(ret < 0)) + path_put_conditional(path, nd); + return ret; } int follow_down_one(struct path *path) @@ -1091,11 +1241,12 @@ } return 0; } +EXPORT_SYMBOL(follow_down_one); -static inline bool managed_dentry_might_block(struct dentry *dentry) +static inline int managed_dentry_rcu(struct dentry *dentry) { - return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && - dentry->d_op->d_manage(dentry, true) < 0); + return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? + dentry->d_op->d_manage(dentry, true) : 0; } /* @@ -1103,7 +1254,7 @@ * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode) + struct inode **inode, unsigned *seqp) { for (;;) { struct mount *mounted; @@ -1111,19 +1262,26 @@ * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - if (unlikely(managed_dentry_might_block(path->dentry))) + switch (managed_dentry_rcu(path->dentry)) { + case -ECHILD: + default: return false; + case -EISDIR: + return true; + case 0: + break; + } if (!d_mountpoint(path->dentry)) - break; + return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); - mounted = __lookup_mnt(path->mnt, path->dentry, 1); + mounted = __lookup_mnt(path->mnt, path->dentry); if (!mounted) break; path->mnt = &mounted->mnt; path->dentry = mounted->mnt.mnt_root; nd->flags |= LOOKUP_JUMPED; - nd->seq = read_seqcount_begin(&path->dentry->d_seq); + *seqp = read_seqcount_begin(&path->dentry->d_seq); /* * Update the inode too. We don't need to re-check the * dentry sequence number here after this d_inode read, @@ -1131,59 +1289,64 @@ */ *inode = path->dentry->d_inode; } - return true; -} - -static void follow_mount_rcu(struct nameidata *nd) -{ - while (d_mountpoint(nd->path.dentry)) { - struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1); - if (!mounted) - break; - nd->path.mnt = &mounted->mnt; - nd->path.dentry = mounted->mnt.mnt_root; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - } + return !read_seqretry(&mount_lock, nd->m_seq) && + !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); } static int follow_dotdot_rcu(struct nameidata *nd) { - set_root_rcu(nd); + struct inode *inode = nd->inode; + if (!nd->root.mnt) + set_root_rcu(nd); while (1) { - if (nd->path.dentry == nd->root.dentry && - nd->path.mnt == nd->root.mnt) { + if (path_equal(&nd->path, &nd->root)) break; - } if (nd->path.dentry != nd->path.mnt->mnt_root) { struct dentry *old = nd->path.dentry; struct dentry *parent = old->d_parent; unsigned seq; + inode = parent->d_inode; seq = read_seqcount_begin(&parent->d_seq); - if (read_seqcount_retry(&old->d_seq, nd->seq)) - goto failed; + if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + return -ECHILD; nd->path.dentry = parent; nd->seq = seq; if (unlikely(!path_connected(&nd->path))) - goto failed; + return -ENOENT; break; + } else { + struct mount *mnt = real_mount(nd->path.mnt); + struct mount *mparent = mnt->mnt_parent; + struct dentry *mountpoint = mnt->mnt_mountpoint; + struct inode *inode2 = mountpoint->d_inode; + unsigned seq = read_seqcount_begin(&mountpoint->d_seq); + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return -ECHILD; + if (&mparent->mnt == nd->path.mnt) + break; + /* we know that mountpoint was pinned */ + nd->path.dentry = mountpoint; + nd->path.mnt = &mparent->mnt; + inode = inode2; + nd->seq = seq; } - if (!follow_up_rcu(&nd->path)) + } + while (unlikely(d_mountpoint(nd->path.dentry))) { + struct mount *mounted; + mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return -ECHILD; + if (!mounted) break; + nd->path.mnt = &mounted->mnt; + nd->path.dentry = mounted->mnt.mnt_root; + inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } - follow_mount_rcu(nd); - nd->inode = nd->path.dentry->d_inode; + nd->inode = inode; return 0; - -failed: - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - unlock_rcu_walk(); - return -ECHILD; } /* @@ -1234,6 +1397,7 @@ } return 0; } +EXPORT_SYMBOL(follow_down); /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() @@ -1253,7 +1417,8 @@ static int follow_dotdot(struct nameidata *nd) { - set_root(nd); + if (!nd->root.mnt) + set_root(nd); while(1) { struct dentry *old = nd->path.dentry; @@ -1266,10 +1431,8 @@ /* rare case of legitimate dget_parent()... */ nd->path.dentry = dget_parent(nd->path.dentry); dput(old); - if (unlikely(!path_connected(&nd->path))) { - path_put(&nd->path); + if (unlikely(!path_connected(&nd->path))) return -ENOENT; - } break; } if (!follow_up(&nd->path)) @@ -1302,7 +1465,8 @@ if (error < 0) { dput(dentry); return ERR_PTR(error); - } else if (!d_invalidate(dentry)) { + } else { + d_invalidate(dentry); dput(dentry); dentry = NULL; } @@ -1321,8 +1485,8 @@ } /* - * Call i_op->lookup on the dentry. The dentry must be negative but may be - * hashed if it was pouplated with DCACHE_NEED_LOOKUP. + * Call i_op->lookup on the dentry. The dentry must be negative and + * unhashed. * * dir->d_inode->i_mutex must be held */ @@ -1364,7 +1528,8 @@ * It _is_ time-critical. */ static int lookup_fast(struct nameidata *nd, - struct path *path, struct inode **inode) + struct path *path, struct inode **inode, + unsigned *seqp) { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; @@ -1379,7 +1544,8 @@ */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode); + bool negative; + dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (!dentry) goto unlazy; @@ -1387,7 +1553,8 @@ * This sequence count validates that the inode matches * the dentry name information from lookup. */ - *inode = dentry->d_inode; + *inode = d_backing_inode(dentry); + negative = d_is_negative(dentry); if (read_seqcount_retry(&dentry->d_seq, seq)) return -ECHILD; @@ -1400,8 +1567,8 @@ */ if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return -ECHILD; - nd->seq = seq; + *seqp = seq; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { @@ -1410,15 +1577,18 @@ goto unlazy; } } + /* + * Note: do negative dentry check after revalidation in + * case that drops it. + */ + if (negative) + return -ENOENT; path->mnt = mnt; path->dentry = dentry; - if (unlikely(!__follow_mount_rcu(nd, path, inode))) - goto unlazy; - if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) - goto unlazy; - return 0; + if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + return 0; unlazy: - if (unlazy_walk(nd, dentry)) + if (unlazy_walk(nd, dentry, seq)) return -ECHILD; } else { dentry = __d_lookup(parent, &nd->last); @@ -1434,23 +1604,21 @@ dput(dentry); return status; } - if (!d_invalidate(dentry)) { - dput(dentry); - goto need_lookup; - } + d_invalidate(dentry); + dput(dentry); + goto need_lookup; } + if (unlikely(d_is_negative(dentry))) { + dput(dentry); + return -ENOENT; + } path->mnt = mnt; path->dentry = dentry; - err = follow_managed(path, nd->flags); - if (unlikely(err < 0)) { - path_put_conditional(path, nd); - return err; - } - if (err) - nd->flags |= LOOKUP_JUMPED; - *inode = path->dentry->d_inode; - return 0; + err = follow_managed(path, nd); + if (likely(!err)) + *inode = d_backing_inode(path->dentry); + return err; need_lookup: return 1; @@ -1460,7 +1628,6 @@ static int lookup_slow(struct nameidata *nd, struct path *path) { struct dentry *dentry, *parent; - int err; parent = nd->path.dentry; BUG_ON(nd->inode != parent->d_inode); @@ -1472,14 +1639,7 @@ return PTR_ERR(dentry); path->mnt = nd->path.mnt; path->dentry = dentry; - err = follow_managed(path, nd->flags); - if (unlikely(err < 0)) { - path_put_conditional(path, nd); - return err; - } - if (err) - nd->flags |= LOOKUP_JUMPED; - return 0; + return follow_managed(path, nd); } static inline int may_lookup(struct nameidata *nd) @@ -1488,7 +1648,7 @@ int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD) return err; - if (unlazy_walk(nd, NULL)) + if (unlazy_walk(nd, NULL, 0)) return -ECHILD; } return inode_permission(nd->inode, MAY_EXEC); @@ -1498,24 +1658,45 @@ { if (type == LAST_DOTDOT) { if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; + return follow_dotdot_rcu(nd); } else return follow_dotdot(nd); } return 0; } -static void terminate_walk(struct nameidata *nd) +static int pick_link(struct nameidata *nd, struct path *link, + struct inode *inode, unsigned seq) { + int error; + struct saved *last; + if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { + path_to_nameidata(link, nd); + return -ELOOP; + } if (!(nd->flags & LOOKUP_RCU)) { - path_put(&nd->path); - } else { - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - unlock_rcu_walk(); + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + } + error = nd_alloc_stack(nd); + if (unlikely(error)) { + if (error == -ECHILD) { + if (unlikely(unlazy_link(nd, link, seq))) + return -ECHILD; + error = nd_alloc_stack(nd); + } + if (error) { + path_put(link); + return error; + } } + + last = nd->stack + nd->depth++; + last->link = *link; + last->cookie = NULL; + last->inode = inode; + last->seq = seq; + return 1; } /* @@ -1524,127 +1705,73 @@ * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int should_follow_link(struct inode *inode, int follow) +static inline int should_follow_link(struct nameidata *nd, struct path *link, + int follow, + struct inode *inode, unsigned seq) { - if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (likely(inode->i_op->follow_link)) - return follow; - - /* This gets set once for the inode lifetime */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_NOFOLLOW; - spin_unlock(&inode->i_lock); + if (likely(!d_is_symlink(link->dentry))) + return 0; + if (!follow) + return 0; + /* make sure that d_is_symlink above matches inode */ + if (nd->flags & LOOKUP_RCU) { + if (read_seqcount_retry(&link->dentry->d_seq, seq)) + return -ECHILD; } - return 0; + return pick_link(nd, link, inode, seq); } -static inline int walk_component(struct nameidata *nd, struct path *path, - int follow) +enum {WALK_GET = 1, WALK_PUT = 2}; + +static int walk_component(struct nameidata *nd, int flags) { + struct path path; struct inode *inode; + unsigned seq; int err; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ - if (unlikely(nd->last_type != LAST_NORM)) - return handle_dots(nd, nd->last_type); - err = lookup_fast(nd, path, &inode); + if (unlikely(nd->last_type != LAST_NORM)) { + err = handle_dots(nd, nd->last_type); + if (flags & WALK_PUT) + put_link(nd); + return err; + } + err = lookup_fast(nd, &path, &inode, &seq); if (unlikely(err)) { if (err < 0) - goto out_err; + return err; - err = lookup_slow(nd, path); + err = lookup_slow(nd, &path); if (err < 0) - goto out_err; + return err; - inode = path->dentry->d_inode; + seq = 0; /* we are already out of RCU mode */ + err = -ENOENT; + if (d_is_negative(path.dentry)) + goto out_path_put; + inode = d_backing_inode(path.dentry); } - err = -ENOENT; - if (!inode) - goto out_path_put; - if (should_follow_link(inode, follow)) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(nd->path.mnt != path->mnt || - unlazy_walk(nd, path->dentry))) { - err = -ECHILD; - goto out_err; - } - } - BUG_ON(inode != path->dentry->d_inode); - return 1; - } - path_to_nameidata(path, nd); + if (flags & WALK_PUT) + put_link(nd); + err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq); + if (unlikely(err)) + return err; + path_to_nameidata(&path, nd); nd->inode = inode; + nd->seq = seq; return 0; out_path_put: - path_to_nameidata(path, nd); -out_err: - terminate_walk(nd); + path_to_nameidata(&path, nd); return err; } /* - * This limits recursive symlink follows to 8, while - * limiting consecutive symlinks to 40. - * - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ -static inline int nested_symlink(struct path *path, struct nameidata *nd) -{ - int res; - - if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { - path_put_conditional(path, nd); - path_put(&nd->path); - return -ELOOP; - } - BUG_ON(nd->depth >= MAX_NESTED_LINKS); - - nd->depth++; - current->link_count++; - - do { - struct path link = *path; - void *cookie; - - res = follow_link(&link, nd, &cookie); - if (res) - break; - res = walk_component(nd, path, LOOKUP_FOLLOW); - put_link(nd, &link, cookie); - } while (res > 0); - - current->link_count--; - nd->depth--; - return res; -} - -/* - * We really don't want to look at inode->i_op->lookup - * when we don't have to. So we keep a cache bit in - * the inode ->i_opflags field that says "yes, we can - * do lookup on this inode". - */ -static inline int can_lookup(struct inode *inode) -{ - if (likely(inode->i_opflags & IOP_LOOKUP)) - return 1; - if (likely(!inode->i_op->lookup)) - return 0; - - /* We do this once for the lifetime of the inode */ - spin_lock(&inode->i_lock); - inode->i_opflags |= IOP_LOOKUP; - spin_unlock(&inode->i_lock); - return 1; -} - -/* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * @@ -1652,11 +1779,6 @@ * do a "get_unaligned()" if this helps and is sufficiently * fast. * - * - Little-endian machines (so that we can generate the mask - * of low bytes efficiently). Again, we *could* do a byte - * swapping load on big-endian architectures if that is not - * expensive enough to make the optimization worthless. - * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. @@ -1699,7 +1821,7 @@ if (!len) goto done; } - mask = ~(~0ul << len*8); + mask = bytemask_from_count(len); hash += mask & a; done: return fold_hash(hash); @@ -1708,9 +1830,9 @@ /* * Calculate the length and hash of the path component, and - * return the length of the component; + * return the "hash_len" as the result. */ -static inline unsigned long hash_name(const char *name, unsigned int *hashp) +static inline u64 hash_name(const char *name) { unsigned long a, b, adata, bdata, mask, hash, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; @@ -1730,9 +1852,8 @@ mask = create_zero_mask(adata | bdata); hash += a & zero_bytemask(mask); - *hashp = fold_hash(hash); - - return len + find_zero(mask); + len += find_zero(mask); + return hashlen_create(fold_hash(hash), len); } #else @@ -1750,7 +1871,7 @@ * We know there's a real path component here of at least * one character. */ -static inline unsigned long hash_name(const char *name, unsigned int *hashp) +static inline u64 hash_name(const char *name) { unsigned long hash = init_name_hash(); unsigned long len = 0, c; @@ -1761,8 +1882,7 @@ hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); - *hashp = end_name_hash(hash); - return len; + return hashlen_create(end_name_hash(hash), len); } #endif @@ -1777,9 +1897,8 @@ */ static int link_path_walk(const char *name, struct nameidata *nd) { - struct path next; int err; - + while (*name=='/') name++; if (!*name) @@ -1787,20 +1906,17 @@ /* At this point we know we have a real path component. */ for(;;) { - struct qstr this; - long len; + u64 hash_len; int type; err = may_lookup(nd); if (err) - break; + return err; - len = hash_name(name, &this.hash); - this.name = name; - this.len = len; + hash_len = hash_name(name); type = LAST_NORM; - if (name[0] == '.') switch (len) { + if (name[0] == '.') switch (hashlen_len(hash_len)) { case 2: if (name[1] == '.') { type = LAST_DOTDOT; @@ -1814,93 +1930,121 @@ struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - err = parent->d_op->d_hash(parent, nd->inode, - &this); + struct qstr this = { { .hash_len = hash_len }, .name = name }; + err = parent->d_op->d_hash(parent, &this); if (err < 0) - break; + return err; + hash_len = this.hash_len; + name = this.name; } } - nd->last = this; + nd->last.hash_len = hash_len; + nd->last.name = name; nd->last_type = type; - if (!name[len]) - return 0; + name += hashlen_len(hash_len); + if (!*name) + goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { - len++; - } while (unlikely(name[len] == '/')); - if (!name[len]) - return 0; - - name += len; - - err = walk_component(nd, &next, LOOKUP_FOLLOW); + name++; + } while (unlikely(*name == '/')); + if (unlikely(!*name)) { +OK: + /* pathname body, done */ + if (!nd->depth) + return 0; + name = nd->stack[nd->depth - 1].name; + /* trailing symlink, done */ + if (!name) + return 0; + /* last component of nested symlink */ + err = walk_component(nd, WALK_GET | WALK_PUT); + } else { + err = walk_component(nd, WALK_GET); + } if (err < 0) return err; if (err) { - err = nested_symlink(&next, nd); - if (err) - return err; + const char *s = get_link(nd); + + if (IS_ERR(s)) + return PTR_ERR(s); + err = 0; + if (unlikely(!s)) { + /* jumped */ + put_link(nd); + } else { + nd->stack[nd->depth - 1].name = name; + name = s; + continue; + } } - if (!can_lookup(nd->inode)) { - err = -ENOTDIR; - break; + if (unlikely(!d_can_lookup(nd->path.dentry))) { + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; + } + return -ENOTDIR; } } - terminate_walk(nd); - return err; } -static int path_init(int dfd, const char *name, unsigned int flags, - struct nameidata *nd, struct file **fp) +static const char *path_init(struct nameidata *nd, unsigned flags) { int retval = 0; + const char *s = nd->name->name; nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_JUMPED; + nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; if (flags & LOOKUP_ROOT) { - struct inode *inode = nd->root.dentry->d_inode; - if (*name) { - if (!can_lookup(inode)) - return -ENOTDIR; + struct dentry *root = nd->root.dentry; + struct inode *inode = root->d_inode; + if (*s) { + if (!d_can_lookup(root)) + return ERR_PTR(-ENOTDIR); retval = inode_permission(inode, MAY_EXEC); if (retval) - return retval; + return ERR_PTR(retval); } nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { - lock_rcu_walk(); + rcu_read_lock(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->root_seq = nd->seq; + nd->m_seq = read_seqbegin(&mount_lock); } else { path_get(&nd->path); } - return 0; + return s; } nd->root.mnt = NULL; - if (*name=='/') { + nd->m_seq = read_seqbegin(&mount_lock); + if (*s == '/') { if (flags & LOOKUP_RCU) { - lock_rcu_walk(); + rcu_read_lock(); set_root_rcu(nd); + nd->seq = nd->root_seq; } else { set_root(nd); path_get(&nd->root); } nd->path = nd->root; - } else if (dfd == AT_FDCWD) { + } else if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; - lock_rcu_walk(); + rcu_read_lock(); do { seq = read_seqcount_begin(&fs->seq); @@ -1912,167 +2056,207 @@ } } else { /* Caller must check execute permissions on the starting path component */ - struct fd f = fdget_raw(dfd); + struct fd f = fdget_raw(nd->dfd); struct dentry *dentry; if (!f.file) - return -EBADF; + return ERR_PTR(-EBADF); dentry = f.file->f_path.dentry; - if (*name) { - if (!can_lookup(dentry->d_inode)) { + if (*s) { + if (!d_can_lookup(dentry)) { fdput(f); - return -ENOTDIR; + return ERR_PTR(-ENOTDIR); } } nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { - if (f.need_put) - *fp = f.file; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - lock_rcu_walk(); + rcu_read_lock(); + nd->inode = nd->path.dentry->d_inode; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); - fdput(f); + nd->inode = nd->path.dentry->d_inode; } + fdput(f); + return s; } nd->inode = nd->path.dentry->d_inode; - return 0; + if (!(flags & LOOKUP_RCU)) + return s; + if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) + return s; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + return ERR_PTR(-ECHILD); } -static inline int lookup_last(struct nameidata *nd, struct path *path) +static const char *trailing_symlink(struct nameidata *nd) +{ + const char *s; + int error = may_follow_link(nd); + if (unlikely(error)) + return ERR_PTR(error); + nd->flags |= LOOKUP_PARENT; + nd->stack[0].name = NULL; + s = get_link(nd); + return s ? s : ""; +} + +static inline int lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW); + return walk_component(nd, + nd->flags & LOOKUP_FOLLOW + ? nd->depth + ? WALK_PUT | WALK_GET + : WALK_GET + : 0); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ -static int path_lookupat(int dfd, const char *name, - unsigned int flags, struct nameidata *nd) +static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { - struct file *base = NULL; - struct path path; + const char *s = path_init(nd, flags); int err; - /* - * Path walking is largely split up into 2 different synchronisation - * schemes, rcu-walk and ref-walk (explained in - * Documentation/filesystems/path-lookup.txt). These share much of the - * path walk code, but some things particularly setup, cleanup, and - * following mounts are sufficiently divergent that functions are - * duplicated. Typically there is a function foo(), and its RCU - * analogue, foo_rcu(). - * - * -ECHILD is the error number of choice (just to avoid clashes) that - * is returned if some aspect of an rcu-walk fails. Such an error must - * be handled by restarting a traditional ref-walk (which will always - * be able to complete). - */ - err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); - - if (unlikely(err)) - return err; - - current->total_link_count = 0; - err = link_path_walk(name, nd); - - if (!err && !(flags & LOOKUP_PARENT)) { - err = lookup_last(nd, &path); - while (err > 0) { - void *cookie; - struct path link = path; - err = may_follow_link(&link, nd); - if (unlikely(err)) - break; - nd->flags |= LOOKUP_PARENT; - err = follow_link(&link, nd, &cookie); - if (err) - break; - err = lookup_last(nd, &path); - put_link(nd, &link, cookie); + if (IS_ERR(s)) + return PTR_ERR(s); + while (!(err = link_path_walk(s, nd)) + && ((err = lookup_last(nd)) > 0)) { + s = trailing_symlink(nd); + if (IS_ERR(s)) { + err = PTR_ERR(s); + break; } } - if (!err) err = complete_walk(nd); - if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!can_lookup(nd->inode)) { - path_put(&nd->path); + if (!err && nd->flags & LOOKUP_DIRECTORY) + if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; - } - } - - if (base) - fput(base); - - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - path_put(&nd->root); - nd->root.mnt = NULL; + if (!err) { + *path = nd->path; + nd->path.mnt = NULL; + nd->path.dentry = NULL; } + terminate_walk(nd); return err; } -static int filename_lookup(int dfd, struct filename *name, - unsigned int flags, struct nameidata *nd) +static int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root) { - int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd); + int retval; + struct nameidata nd; + if (IS_ERR(name)) + return PTR_ERR(name); + if (unlikely(root)) { + nd.root = *root; + flags |= LOOKUP_ROOT; + } + set_nameidata(&nd, dfd, name); + retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) - retval = path_lookupat(dfd, name->name, flags, nd); + retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) - retval = path_lookupat(dfd, name->name, - flags | LOOKUP_REVAL, nd); + retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) - audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT); + audit_inode(name, path->dentry, flags & LOOKUP_PARENT); + restore_nameidata(); + putname(name); return retval; } -static int do_path_lookup(int dfd, const char *name, - unsigned int flags, struct nameidata *nd) +/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +static int path_parentat(struct nameidata *nd, unsigned flags, + struct path *parent) +{ + const char *s = path_init(nd, flags); + int err; + if (IS_ERR(s)) + return PTR_ERR(s); + err = link_path_walk(s, nd); + if (!err) + err = complete_walk(nd); + if (!err) { + *parent = nd->path; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + } + terminate_walk(nd); + return err; +} + +static struct filename *filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type) { - struct filename filename = { .name = name }; + int retval; + struct nameidata nd; - return filename_lookup(dfd, &filename, flags, nd); + if (IS_ERR(name)) + return name; + set_nameidata(&nd, dfd, name); + retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); + if (unlikely(retval == -ECHILD)) + retval = path_parentat(&nd, flags, parent); + if (unlikely(retval == -ESTALE)) + retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); + if (likely(!retval)) { + *last = nd.last; + *type = nd.last_type; + audit_inode(name, parent->dentry, LOOKUP_PARENT); + } else { + putname(name); + name = ERR_PTR(retval); + } + restore_nameidata(); + return name; } /* does lookup, returns the object with parent locked */ struct dentry *kern_path_locked(const char *name, struct path *path) { - struct nameidata nd; + struct filename *filename; struct dentry *d; - int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd); - if (err) - return ERR_PTR(err); - if (nd.last_type != LAST_NORM) { - path_put(&nd.path); + struct qstr last; + int type; + + filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path, + &last, &type); + if (IS_ERR(filename)) + return ERR_CAST(filename); + if (unlikely(type != LAST_NORM)) { + path_put(path); + putname(filename); return ERR_PTR(-EINVAL); } - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - d = __lookup_hash(&nd.last, nd.path.dentry, 0); + mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = __lookup_hash(&last, path->dentry, 0); if (IS_ERR(d)) { - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); - return d; + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); } - *path = nd.path; + putname(filename); return d; } int kern_path(const char *name, unsigned int flags, struct path *path) { - struct nameidata nd; - int res = do_path_lookup(AT_FDCWD, name, flags, &nd); - if (!res) - *path = nd.path; - return res; + return filename_lookup(AT_FDCWD, getname_kernel(name), + flags, path, NULL); } +EXPORT_SYMBOL(kern_path); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair @@ -2086,27 +2270,12 @@ const char *name, unsigned int flags, struct path *path) { - struct nameidata nd; - int err; - nd.root.dentry = dentry; - nd.root.mnt = mnt; - BUG_ON(flags & LOOKUP_PARENT); - /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ - err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd); - if (!err) - *path = nd.path; - return err; -} - -/* - * Restricted form of lookup. Doesn't follow links, single-component only, - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. - */ -static struct dentry *lookup_hash(struct nameidata *nd) -{ - return __lookup_hash(&nd->last, nd->path.dentry, nd->flags); + struct path root = {.mnt = mnt, .dentry = dentry}; + /* the first argument of filename_lookup() is ignored with root */ + return filename_lookup(AT_FDCWD, getname_kernel(name), + flags , path, &root); } +EXPORT_SYMBOL(vfs_path_lookup); /** * lookup_one_len - filesystem helper to lookup single pathname component @@ -2115,9 +2284,7 @@ * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. Also note that by using this function the - * nameidata argument is passed to the filesystem methods and a filesystem - * using this helper needs to be prepared for that. + * not be called by generic code. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { @@ -2148,7 +2315,7 @@ * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { - int err = base->d_op->d_hash(base, base->d_inode, &this); + int err = base->d_op->d_hash(base, &this); if (err < 0) return ERR_PTR(err); } @@ -2159,30 +2326,15 @@ return __lookup_hash(&this, base, 0); } +EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { - struct nameidata nd; - struct filename *tmp = getname_flags(name, flags, empty); - int err = PTR_ERR(tmp); - if (!IS_ERR(tmp)) { - - BUG_ON(flags & LOOKUP_PARENT); - - err = filename_lookup(dfd, tmp, flags, &nd); - putname(tmp); - if (!err) - *path = nd.path; - } - return err; -} - -int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) -{ - return user_path_at_empty(dfd, name, flags, path, NULL); + return filename_lookup(dfd, getname_flags(name, flags, empty), + flags, path, NULL); } +EXPORT_SYMBOL(user_path_at_empty); /* * NB: most callers don't do anything directly with the reference to the @@ -2190,44 +2342,196 @@ * allocated by getname. So we must hold the reference to it until all * path-walking is complete. */ -static struct filename * -user_path_parent(int dfd, const char __user *path, struct nameidata *nd, +static inline struct filename * +user_path_parent(int dfd, const char __user *path, + struct path *parent, + struct qstr *last, + int *type, unsigned int flags) { - struct filename *s = getname(path); - int error; - /* only LOOKUP_REVAL is allowed in extra flags */ - flags &= LOOKUP_REVAL; + return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL, + parent, last, type); +} - if (IS_ERR(s)) - return s; +/** + * mountpoint_last - look up last component for umount + * @nd: pathwalk nameidata - currently pointing at parent directory of "last" + * @path: pointer to container for result + * + * This is a special lookup_last function just for umount. In this case, we + * need to resolve the path without doing any revalidation. + * + * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since + * mountpoints are always pinned in the dcache, their ancestors are too. Thus, + * in almost all cases, this lookup will be served out of the dcache. The only + * cases where it won't are if nd->last refers to a symlink or the path is + * bogus and it doesn't exist. + * + * Returns: + * -error: if there was an error during lookup. This includes -ENOENT if the + * lookup found a negative dentry. The nd->path reference will also be + * put in this case. + * + * 0: if we successfully resolved nd->path and found it to not to be a + * symlink that needs to be followed. "path" will also be populated. + * The nd->path reference will also be put. + * + * 1: if we successfully resolved nd->last and found it to be a symlink + * that needs to be followed. "path" will be populated with the path + * to the link, and nd->path will *not* be put. + */ +static int +mountpoint_last(struct nameidata *nd, struct path *path) +{ + int error = 0; + struct dentry *dentry; + struct dentry *dir = nd->path.dentry; - error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd); - if (error) { - putname(s); - return ERR_PTR(error); + /* If we're in rcuwalk, drop out of it to handle last component */ + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; + } + + nd->flags &= ~LOOKUP_PARENT; + + if (unlikely(nd->last_type != LAST_NORM)) { + error = handle_dots(nd, nd->last_type); + if (error) + return error; + dentry = dget(nd->path.dentry); + goto done; + } + + mutex_lock(&dir->d_inode->i_mutex); + dentry = d_lookup(dir, &nd->last); + if (!dentry) { + /* + * No cached dentry. Mounted dentries are pinned in the cache, + * so that means that this dentry is probably a symlink or the + * path doesn't actually point to a mounted dentry. + */ + dentry = d_alloc(dir, &nd->last); + if (!dentry) { + mutex_unlock(&dir->d_inode->i_mutex); + return -ENOMEM; + } + dentry = lookup_real(dir->d_inode, dentry, nd->flags); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->d_inode->i_mutex); + return PTR_ERR(dentry); + } + } + mutex_unlock(&dir->d_inode->i_mutex); + +done: + if (d_is_negative(dentry)) { + dput(dentry); + return -ENOENT; } + if (nd->depth) + put_link(nd); + path->dentry = dentry; + path->mnt = nd->path.mnt; + error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW, + d_backing_inode(dentry), 0); + if (unlikely(error)) + return error; + mntget(path->mnt); + follow_mount(path); + return 0; +} - return s; +/** + * path_mountpoint - look up a path to be umounted + * @nd: lookup context + * @flags: lookup flags + * @path: pointer to container for result + * + * Look up the given name, but don't attempt to revalidate the last component. + * Returns 0 and "path" will be valid on success; Returns error otherwise. + */ +static int +path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) +{ + const char *s = path_init(nd, flags); + int err; + if (IS_ERR(s)) + return PTR_ERR(s); + while (!(err = link_path_walk(s, nd)) && + (err = mountpoint_last(nd, path)) > 0) { + s = trailing_symlink(nd); + if (IS_ERR(s)) { + err = PTR_ERR(s); + break; + } + } + terminate_walk(nd); + return err; } -/* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. +static int +filename_mountpoint(int dfd, struct filename *name, struct path *path, + unsigned int flags) +{ + struct nameidata nd; + int error; + if (IS_ERR(name)) + return PTR_ERR(name); + set_nameidata(&nd, dfd, name); + error = path_mountpoint(&nd, flags | LOOKUP_RCU, path); + if (unlikely(error == -ECHILD)) + error = path_mountpoint(&nd, flags, path); + if (unlikely(error == -ESTALE)) + error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); + if (likely(!error)) + audit_inode(name, path->dentry, 0); + restore_nameidata(); + putname(name); + return error; +} + +/** + * user_path_mountpoint_at - lookup a path from userland in order to umount it + * @dfd: directory file descriptor + * @name: pathname from userland + * @flags: lookup flags + * @path: pointer to container to hold result + * + * A umount is a special case for path walking. We're not actually interested + * in the inode in this situation, and ESTALE errors can be a problem. We + * simply want track down the dentry and vfsmount attached at the mountpoint + * and avoid revalidating the last component. + * + * Returns 0 and populates "path" on success. */ -static inline int check_sticky(struct inode *dir, struct inode *inode) +int +user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, + struct path *path) +{ + return filename_mountpoint(dfd, getname(name), path, flags); +} + +int +kern_path_mountpoint(int dfd, const char *name, struct path *path, + unsigned int flags) +{ + return filename_mountpoint(dfd, getname_kernel(name), path, flags); +} +EXPORT_SYMBOL(kern_path_mountpoint); + +int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (!(dir->i_mode & S_ISVTX)) - return 0; if (uid_eq(inode->i_uid, fsuid)) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } +EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check @@ -2248,12 +2552,14 @@ * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct inode *dir,struct dentry *victim,int isdir) +static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) { + struct inode *inode = d_backing_inode(victim); int error; - if (!victim->d_inode) + if (d_is_negative(victim)) return -ENOENT; + BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); @@ -2263,15 +2569,16 @@ return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)|| - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + + if (check_sticky(dir, inode) || IS_APPEND(inode) || + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) return -EPERM; if (isdir) { - if (!S_ISDIR(victim->d_inode->i_mode)) + if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; - } else if (S_ISDIR(victim->d_inode->i_mode)) + } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; @@ -2327,9 +2634,10 @@ } mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); return NULL; } +EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { @@ -2339,6 +2647,7 @@ mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } +EXPORT_SYMBOL(unlock_rename); int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -2359,6 +2668,7 @@ fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_create); static int may_open(struct path *path, int acc_mode, int flag) { @@ -2422,7 +2732,7 @@ /* * Refuse to truncate files with mandatory locks held on them. */ - error = locks_verify_locked(inode); + error = locks_verify_locked(filp); if (!error) error = security_path_truncate(path); if (!error) { @@ -2480,6 +2790,7 @@ int acc_mode; int create_error = 0; struct dentry *const DENTRY_NOT_SET = (void *) -1UL; + bool excl; BUG_ON(dentry->d_inode); @@ -2493,10 +2804,9 @@ if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) mode &= ~current_umask(); - if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) { + excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT); + if (excl) open_flag &= ~O_TRUNC; - *opened |= FILE_CREATED; - } /* * Checking write permission is tricky, bacuse we don't know if we are @@ -2549,12 +2859,6 @@ goto out; } - acc_mode = op->acc_mode; - if (*opened & FILE_CREATED) { - fsnotify_create(dir, dentry); - acc_mode = MAY_OPEN; - } - if (error) { /* returned 1, that is */ if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; @@ -2564,9 +2868,19 @@ dput(dentry); dentry = file->f_path.dentry; } - if (create_error && dentry->d_inode == NULL) { - error = create_error; - goto out; + if (*opened & FILE_CREATED) + fsnotify_create(dir, dentry); + if (!dentry->d_inode) { + WARN_ON(*opened & FILE_CREATED); + if (create_error) { + error = create_error; + goto out; + } + } else { + if (excl && !(*opened & FILE_CREATED)) { + error = -EEXIST; + goto out; + } } goto looked_up; } @@ -2575,6 +2889,12 @@ * We didn't have the inode before the open, so check open permission * here. */ + acc_mode = op->acc_mode; + if (*opened & FILE_CREATED) { + WARN_ON(!(open_flag & O_CREAT)); + fsnotify_create(dir, dentry); + acc_mode = MAY_OPEN; + } error = may_open(&file->f_path, acc_mode, open_flag); if (error) fput(file); @@ -2588,22 +2908,10 @@ dentry = lookup_real(dir, dentry, nd->flags); if (IS_ERR(dentry)) return PTR_ERR(dentry); - - if (create_error) { - int open_flag = op->open_flag; - - error = create_error; - if ((open_flag & O_EXCL)) { - if (!dentry->d_inode) - goto out; - } else if (!dentry->d_inode) { - goto out; - } else if ((open_flag & O_TRUNC) && - S_ISREG(dentry->d_inode->i_mode)) { - goto out; - } - /* will fail later, go on to get the right error */ - } + } + if (create_error && !dentry->d_inode) { + error = create_error; + goto out; } looked_up: path->dentry = dentry; @@ -2700,61 +3008,42 @@ /* * Handle the last step of open() */ -static int do_last(struct nameidata *nd, struct path *path, +static int do_last(struct nameidata *nd, struct file *file, const struct open_flags *op, - int *opened, struct filename *name) + int *opened) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; int acc_mode = op->acc_mode; + unsigned seq; struct inode *inode; - bool symlink_ok = false; struct path save_parent = { .dentry = NULL, .mnt = NULL }; + struct path path; bool retried = false; int error; nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; - switch (nd->last_type) { - case LAST_DOTDOT: - case LAST_DOT: + if (nd->last_type != LAST_NORM) { error = handle_dots(nd, nd->last_type); - if (error) - return error; - /* fallthrough */ - case LAST_ROOT: - error = complete_walk(nd); - if (error) - return error; - audit_inode(name, nd->path.dentry, 0); - if (open_flag & O_CREAT) { - error = -EISDIR; - goto out; - } - goto finish_open; - case LAST_BIND: - error = complete_walk(nd); - if (error) + if (unlikely(error)) return error; - audit_inode(name, dir, 0); goto finish_open; } if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) - symlink_ok = true; /* we _can_ be in RCU mode here */ - error = lookup_fast(nd, path, &inode); + error = lookup_fast(nd, &path, &inode, &seq); if (likely(!error)) goto finish_lookup; if (error < 0) - goto out; + return error; BUG_ON(nd->inode != dir->d_inode); } else { @@ -2768,11 +3057,10 @@ if (error) return error; - audit_inode(name, dir, LOOKUP_PARENT); - error = -EISDIR; + audit_inode(nd->name, dir, LOOKUP_PARENT); /* trailing slashes? */ - if (nd->last.name[nd->last.len]) - goto out; + if (unlikely(nd->last.name[nd->last.len])) + return -EISDIR; } retry_lookup: @@ -2787,7 +3075,7 @@ */ } mutex_lock(&dir->d_inode->i_mutex); - error = lookup_open(nd, path, file, op, got_write, opened); + error = lookup_open(nd, &path, file, op, got_write, opened); mutex_unlock(&dir->d_inode->i_mutex); if (error <= 0) { @@ -2798,7 +3086,7 @@ !S_ISREG(file_inode(file)->i_mode)) will_truncate = false; - audit_inode(name, file->f_path.dentry, 0); + audit_inode(nd->name, file->f_path.dentry, 0); goto opened; } @@ -2807,15 +3095,15 @@ open_flag &= ~O_TRUNC; will_truncate = false; acc_mode = MAY_OPEN; - path_to_nameidata(path, nd); + path_to_nameidata(&path, nd); goto finish_open_created; } /* * create/update audit record if it already exists. */ - if (path->dentry->d_inode) - audit_inode(name, path->dentry, 0); + if (d_is_positive(path.dentry)) + audit_inode(nd->name, path.dentry, 0); /* * If atomic_open() acquired write access it is dropped now due to @@ -2827,63 +3115,59 @@ got_write = false; } - error = -EEXIST; - if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) - goto exit_dput; - - error = follow_managed(path, nd->flags); - if (error < 0) - goto exit_dput; + if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { + path_to_nameidata(&path, nd); + return -EEXIST; + } - if (error) - nd->flags |= LOOKUP_JUMPED; + error = follow_managed(&path, nd); + if (unlikely(error < 0)) + return error; BUG_ON(nd->flags & LOOKUP_RCU); - inode = path->dentry->d_inode; -finish_lookup: - /* we _can_ be in RCU mode here */ - error = -ENOENT; - if (!inode) { - path_to_nameidata(path, nd); - goto out; - } - - if (should_follow_link(inode, !symlink_ok)) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(nd->path.mnt != path->mnt || - unlazy_walk(nd, path->dentry))) { - error = -ECHILD; - goto out; - } - } - BUG_ON(inode != path->dentry->d_inode); - return 1; + seq = 0; /* out of RCU mode, so the value doesn't matter */ + if (unlikely(d_is_negative(path.dentry))) { + path_to_nameidata(&path, nd); + return -ENOENT; } + inode = d_backing_inode(path.dentry); +finish_lookup: + if (nd->depth) + put_link(nd); + error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW, + inode, seq); + if (unlikely(error)) + return error; - if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) { - path_to_nameidata(path, nd); + if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) { + path_to_nameidata(&path, nd); } else { save_parent.dentry = nd->path.dentry; - save_parent.mnt = mntget(path->mnt); - nd->path.dentry = path->dentry; + save_parent.mnt = mntget(path.mnt); + nd->path.dentry = path.dentry; } nd->inode = inode; + nd->seq = seq; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ +finish_open: error = complete_walk(nd); if (error) { path_put(&save_parent); return error; } + audit_inode(nd->name, nd->path.dentry, 0); + if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) { + error = -ELOOP; + goto out; + } error = -EISDIR; - if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) + if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) goto out; error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) + if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; - audit_inode(name, nd->path.dentry, 0); -finish_open: - if (!S_ISREG(nd->inode->i_mode)) + if (!d_is_reg(nd->path.dentry)) will_truncate = false; if (will_truncate) { @@ -2896,9 +3180,12 @@ error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); - if (error) { + + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ + error = vfs_open(&nd->path, file, current_cred()); + if (!error) { + *opened |= FILE_OPENED; + } else { if (error == -EOPENSTALE) goto stale_open; goto out; @@ -2907,7 +3194,7 @@ error = open_check_o_direct(file); if (error) goto exit_fput; - error = ima_file_check(file, op->acc_mode); + error = ima_file_check(file, op->acc_mode, *opened); if (error) goto exit_fput; @@ -2924,12 +3211,8 @@ if (got_write) mnt_drop_write(nd->path.mnt); path_put(&save_parent); - terminate_walk(nd); return error; -exit_dput: - path_put_conditional(path, nd); - goto out; exit_fput: fput(file); goto out; @@ -2953,12 +3236,69 @@ goto retry_lookup; } -static struct file *path_openat(int dfd, struct filename *pathname, - struct nameidata *nd, const struct open_flags *op, int flags) +static int do_tmpfile(struct nameidata *nd, unsigned flags, + const struct open_flags *op, + struct file *file, int *opened) +{ + static const struct qstr name = QSTR_INIT("/", 1); + struct dentry *child; + struct inode *dir; + struct path path; + int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); + if (unlikely(error)) + return error; + error = mnt_want_write(path.mnt); + if (unlikely(error)) + goto out; + dir = path.dentry->d_inode; + /* we want directory to be writable */ + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) + goto out2; + if (!dir->i_op->tmpfile) { + error = -EOPNOTSUPP; + goto out2; + } + child = d_alloc(path.dentry, &name); + if (unlikely(!child)) { + error = -ENOMEM; + goto out2; + } + dput(path.dentry); + path.dentry = child; + error = dir->i_op->tmpfile(dir, child, op->mode); + if (error) + goto out2; + audit_inode(nd->name, child, 0); + /* Don't check for other permissions, the inode was just created */ + error = may_open(&path, MAY_OPEN, op->open_flag); + if (error) + goto out2; + file->f_path.mnt = path.mnt; + error = finish_open(file, child, NULL, opened); + if (error) + goto out2; + error = open_check_o_direct(file); + if (error) { + fput(file); + } else if (!(op->open_flag & O_EXCL)) { + struct inode *inode = file_inode(file); + spin_lock(&inode->i_lock); + inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + } +out2: + mnt_drop_write(path.mnt); +out: + path_put(&path); + return error; +} + +static struct file *path_openat(struct nameidata *nd, + const struct open_flags *op, unsigned flags) { - struct file *base = NULL; + const char *s; struct file *file; - struct path path; int opened = 0; int error; @@ -2968,41 +3308,27 @@ file->f_flags = op->open_flag; - error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); - if (unlikely(error)) - goto out; - - current->total_link_count = 0; - error = link_path_walk(pathname->name, nd); - if (unlikely(error)) - goto out; + if (unlikely(file->f_flags & __O_TMPFILE)) { + error = do_tmpfile(nd, flags, op, file, &opened); + goto out2; + } - error = do_last(nd, &path, file, op, &opened, pathname); - while (unlikely(error > 0)) { /* trailing symlink */ - struct path link = path; - void *cookie; - if (!(nd->flags & LOOKUP_FOLLOW)) { - path_put_conditional(&path, nd); - path_put(&nd->path); - error = -ELOOP; - break; - } - error = may_follow_link(&link, nd); - if (unlikely(error)) - break; - nd->flags |= LOOKUP_PARENT; + s = path_init(nd, flags); + if (IS_ERR(s)) { + put_filp(file); + return ERR_CAST(s); + } + while (!(error = link_path_walk(s, nd)) && + (error = do_last(nd, file, op, &opened)) > 0) { nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - error = follow_link(&link, nd, &cookie); - if (unlikely(error)) + s = trailing_symlink(nd); + if (IS_ERR(s)) { + error = PTR_ERR(s); break; - error = do_last(nd, &path, file, op, &opened, pathname); - put_link(nd, &link, cookie); + } } -out: - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) - path_put(&nd->root); - if (base) - fput(base); + terminate_walk(nd); +out2: if (!(opened & FILE_OPENED)) { BUG_ON(!error); put_filp(file); @@ -3020,47 +3346,57 @@ } struct file *do_filp_open(int dfd, struct filename *pathname, - const struct open_flags *op, int flags) + const struct open_flags *op) { struct nameidata nd; + int flags = op->lookup_flags; struct file *filp; - filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + set_nameidata(&nd, dfd, pathname); + filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) - filp = path_openat(dfd, pathname, &nd, op, flags); + filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) - filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + filp = path_openat(&nd, op, flags | LOOKUP_REVAL); + restore_nameidata(); return filp; } struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, - const char *name, const struct open_flags *op, int flags) + const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; - struct filename filename = { .name = name }; + struct filename *filename; + int flags = op->lookup_flags | LOOKUP_ROOT; nd.root.mnt = mnt; nd.root.dentry = dentry; - flags |= LOOKUP_ROOT; - - if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); - file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); + filename = getname_kernel(name); + if (IS_ERR(filename)) + return ERR_CAST(filename); + + set_nameidata(&nd, -1, filename); + file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) - file = path_openat(-1, &filename, &nd, op, flags); + file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) - file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL); + file = path_openat(&nd, op, flags | LOOKUP_REVAL); + restore_nameidata(); + putname(filename); return file; } -struct dentry *kern_path_create(int dfd, const char *pathname, +static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); - struct nameidata nd; + struct qstr last; + int type; int err2; int error; bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); @@ -3071,39 +3407,39 @@ */ lookup_flags &= LOOKUP_REVAL; - error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd); - if (error) - return ERR_PTR(error); + name = filename_parentat(dfd, name, lookup_flags, path, &last, &type); + if (IS_ERR(name)) + return ERR_CAST(name); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ - if (nd.last_type != LAST_NORM) + if (unlikely(type != LAST_NORM)) goto out; - nd.flags &= ~LOOKUP_PARENT; - nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; /* don't fail immediately if it's r/o, at least try to report other errors */ - err2 = mnt_want_write(nd.path.mnt); + err2 = mnt_want_write(path->mnt); /* * Do the final lookup. */ - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; + mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = __lookup_hash(&last, path->dentry, lookup_flags); if (IS_ERR(dentry)) goto unlock; error = -EEXIST; - if (dentry->d_inode) + if (d_is_positive(dentry)) goto fail; + /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ - if (unlikely(!is_dir && nd.last.name[nd.last.len])) { + if (unlikely(!is_dir && last.name[last.len])) { error = -ENOENT; goto fail; } @@ -3111,19 +3447,27 @@ error = err2; goto fail; } - *path = nd.path; + putname(name); return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + mutex_unlock(&path->dentry->d_inode->i_mutex); if (!err2) - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path->mnt); out: - path_put(&nd.path); + path_put(path); + putname(name); return dentry; } + +struct dentry *kern_path_create(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) +{ + return filename_create(dfd, getname_kernel(pathname), + path, lookup_flags); +} EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) @@ -3135,16 +3479,10 @@ } EXPORT_SYMBOL(done_path_create); -struct dentry *user_path_create(int dfd, const char __user *pathname, +inline struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { - struct filename *tmp = getname(pathname); - struct dentry *res; - if (IS_ERR(tmp)) - return ERR_CAST(tmp); - res = kern_path_create(dfd, tmp->name, path, lookup_flags); - putname(tmp); - return res; + return filename_create(dfd, getname(pathname), path, lookup_flags); } EXPORT_SYMBOL(user_path_create); @@ -3174,6 +3512,7 @@ fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { @@ -3263,6 +3602,7 @@ fsnotify_mkdir(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mkdir); SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { @@ -3313,10 +3653,11 @@ { shrink_dcache_parent(dentry); spin_lock(&dentry->d_lock); - if (dentry->d_count == 1) + if (dentry->d_lockref.count == 1) __d_drop(dentry); spin_unlock(&dentry->d_lock); } +EXPORT_SYMBOL(dentry_unhash); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -3332,7 +3673,7 @@ mutex_lock(&dentry->d_inode->i_mutex); error = -EBUSY; - if (d_mountpoint(dentry)) + if (is_local_mountpoint(dentry)) goto out; error = security_inode_rmdir(dir, dentry); @@ -3346,6 +3687,7 @@ dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); + detach_mounts(dentry); out: mutex_unlock(&dentry->d_inode->i_mutex); @@ -3354,20 +3696,24 @@ d_delete(dentry); return error; } +EXPORT_SYMBOL(vfs_rmdir); static long do_rmdir(int dfd, const char __user *pathname) { int error = 0; struct filename *name; struct dentry *dentry; - struct nameidata nd; + struct path path; + struct qstr last; + int type; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, &nd, lookup_flags); + name = user_path_parent(dfd, pathname, + &path, &last, &type, lookup_flags); if (IS_ERR(name)) return PTR_ERR(name); - switch(nd.last_type) { + switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit1; @@ -3379,13 +3725,12 @@ goto exit1; } - nd.flags &= ~LOOKUP_PARENT; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto exit1; - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit2; @@ -3393,17 +3738,17 @@ error = -ENOENT; goto exit3; } - error = security_path_rmdir(&nd.path, dentry); + error = security_path_rmdir(&path, dentry); if (error) goto exit3; - error = vfs_rmdir(nd.path.dentry->d_inode, dentry); + error = vfs_rmdir(path.dentry->d_inode, dentry); exit3: dput(dentry); exit2: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - mnt_drop_write(nd.path.mnt); + mutex_unlock(&path.dentry->d_inode->i_mutex); + mnt_drop_write(path.mnt); exit1: - path_put(&nd.path); + path_put(&path); putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -3417,8 +3762,27 @@ return do_rmdir(AT_FDCWD, pathname); } -int vfs_unlink(struct inode *dir, struct dentry *dentry) +/** + * vfs_unlink - unlink a filesystem object + * @dir: parent directory + * @dentry: victim + * @delegated_inode: returns victim inode, if the inode is delegated. + * + * The caller must hold dir->i_mutex. + * + * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and + * return a reference to the inode in delegated_inode. The caller + * should then break the delegation on that inode and retry. Because + * breaking a delegation may take a long time, the caller should drop + * dir->i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { + struct inode *target = dentry->d_inode; int error = may_delete(dir, dentry, 0); if (error) @@ -3427,27 +3791,34 @@ if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&dentry->d_inode->i_mutex); - if (d_mountpoint(dentry)) + mutex_lock(&target->i_mutex); + if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; error = dir->i_op->unlink(dir, dentry); - if (!error) + if (!error) { dont_mount(dentry); + detach_mounts(dentry); + } } } - mutex_unlock(&dentry->d_inode->i_mutex); +out: + mutex_unlock(&target->i_mutex); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { - fsnotify_link_count(dentry->d_inode); + fsnotify_link_count(target); d_delete(dentry); } return error; } +EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its @@ -3460,47 +3831,56 @@ int error; struct filename *name; struct dentry *dentry; - struct nameidata nd; + struct path path; + struct qstr last; + int type; struct inode *inode = NULL; + struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, &nd, lookup_flags); + name = user_path_parent(dfd, pathname, + &path, &last, &type, lookup_flags); if (IS_ERR(name)) return PTR_ERR(name); error = -EISDIR; - if (nd.last_type != LAST_NORM) + if (type != LAST_NORM) goto exit1; - nd.flags &= ~LOOKUP_PARENT; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto exit1; - - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); +retry_deleg: + mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ - if (nd.last.name[nd.last.len]) + if (last.name[last.len]) goto slashes; inode = dentry->d_inode; - if (!inode) + if (d_is_negative(dentry)) goto slashes; ihold(inode); - error = security_path_unlink(&nd.path, dentry); + error = security_path_unlink(&path, dentry); if (error) goto exit2; - error = vfs_unlink(nd.path.dentry->d_inode, dentry); + error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode); exit2: dput(dentry); } - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + mutex_unlock(&path.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ - mnt_drop_write(nd.path.mnt); + inode = NULL; + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + mnt_drop_write(path.mnt); exit1: - path_put(&nd.path); + path_put(&path); putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -3510,8 +3890,12 @@ return error; slashes: - error = !dentry->d_inode ? -ENOENT : - S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; + if (d_is_negative(dentry)) + error = -ENOENT; + else if (d_is_dir(dentry)) + error = -EISDIR; + else + error = -ENOTDIR; goto exit2; } @@ -3550,6 +3934,7 @@ fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_symlink); SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) @@ -3587,7 +3972,26 @@ return sys_symlinkat(oldname, AT_FDCWD, newname); } -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) +/** + * vfs_link - create a new link + * @old_dentry: object to be linked + * @dir: new parent + * @new_dentry: where to create the new link + * @delegated_inode: returns inode needing a delegation break + * + * The caller must hold dir->i_mutex + * + * If vfs_link discovers a delegation on the to-be-linked file in need + * of breaking, it will return -EWOULDBLOCK and return a reference to the + * inode in delegated_inode. The caller should then break the delegation + * and retry. Because breaking a delegation may take a long time, the + * caller should drop the i_mutex before doing so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + */ +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -3619,17 +4023,27 @@ mutex_lock(&inode->i_mutex); /* Make sure we don't allow creating hardlink to an unlinked file */ - if (inode->i_nlink == 0) + if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; - else - error = dir->i_op->link(old_dentry, dir, new_dentry); + else { + error = try_break_deleg(inode, delegated_inode); + if (!error) + error = dir->i_op->link(old_dentry, dir, new_dentry); + } + + if (!error && (inode->i_state & I_LINKABLE)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_LINKABLE; + spin_unlock(&inode->i_lock); + } mutex_unlock(&inode->i_mutex); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } +EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid @@ -3645,6 +4059,7 @@ { struct dentry *new_dentry; struct path old_path, new_path; + struct inode *delegated_inode = NULL; int how = 0; int error; @@ -3683,9 +4098,16 @@ error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); + error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) { + path_put(&old_path); + goto retry; + } + } if (retry_estale(error, how)) { path_put(&old_path); how |= LOOKUP_REVAL; @@ -3702,16 +4124,38 @@ return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -/* +/** + * vfs_rename - rename a filesystem object + * @old_dir: parent of source + * @old_dentry: source + * @new_dir: parent of destination + * @new_dentry: destination + * @delegated_inode: returns an inode needing a delegation break + * @flags: rename flags + * + * The caller must hold multiple mutexes--see lock_rename()). + * + * If vfs_rename discovers a delegation in need of breaking at either + * the source or destination, it will return -EWOULDBLOCK and return a + * reference to the inode in delegated_inode. The caller should then + * break the delegation and retry. Because breaking a delegation may + * take a long time, the caller should drop all locks before doing + * so. + * + * Alternatively, a caller may pass NULL for delegated_inode. This may + * be appropriate for callers that expect the underlying filesystem not + * to be NFS exported. + * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: - * a) we can get into loop creation. Check is done in is_subdir(). + * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4 screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. - * c) we have to lock _three_ objects - parents and victim (if it exists). + * c) we have to lock _four_ objects - parents and victim (if it exists), + * and source (if it is not a directory). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change @@ -3729,228 +4173,277 @@ * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ -static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + struct inode **delegated_inode, unsigned int flags) { - int error = 0; + int error; + bool is_dir = d_is_dir(old_dentry); + struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; + bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; + struct name_snapshot old_name; /* - * If we are going to change the parent - check write permissions, - * we'll need to flip '..'. + * Check source == target. + * On overlayfs need to look at underlying inodes. */ - if (new_dir != old_dir) { - error = inode_permission(old_dentry->d_inode, MAY_WRITE); - if (error) - return error; - } + if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0)) + return 0; - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + error = may_delete(old_dir, old_dentry, is_dir); if (error) return error; - dget(new_dentry); - if (target) - mutex_lock(&target->i_mutex); + if (!target) { + error = may_create(new_dir, new_dentry); + } else { + new_is_dir = d_is_dir(new_dentry); - error = -EBUSY; - if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) - goto out; + if (!(flags & RENAME_EXCHANGE)) + error = may_delete(new_dir, new_dentry, is_dir); + else + error = may_delete(new_dir, new_dentry, new_is_dir); + } + if (error) + return error; - error = -EMLINK; - if (max_links && !target && new_dir != old_dir && - new_dir->i_nlink >= max_links) - goto out; + if (!old_dir->i_op->rename && !old_dir->i_op->rename2) + return -EPERM; - if (target) - shrink_dcache_parent(new_dentry); - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); - if (error) - goto out; + if (flags && !old_dir->i_op->rename2) + return -EINVAL; - if (target) { - target->i_flags |= S_DEAD; - dont_mount(new_dentry); + /* + * If we are going to change the parent - check write permissions, + * we'll need to flip '..'. + */ + if (new_dir != old_dir) { + if (is_dir) { + error = inode_permission(source, MAY_WRITE); + if (error) + return error; + } + if ((flags & RENAME_EXCHANGE) && new_is_dir) { + error = inode_permission(target, MAY_WRITE); + if (error) + return error; + } } -out: - if (target) - mutex_unlock(&target->i_mutex); - dput(new_dentry); - if (!error) - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry,new_dentry); - return error; -} -static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct inode *target = new_dentry->d_inode; - int error; - - error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); + error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); if (error) return error; + take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); - if (target) + if (!is_dir || (flags & RENAME_EXCHANGE)) + lock_two_nondirectories(source, target); + else if (target) mutex_lock(&target->i_mutex); error = -EBUSY; - if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) + if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (max_links && new_dir != old_dir) { + error = -EMLINK; + if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) + goto out; + if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && + old_dir->i_nlink >= max_links) + goto out; + } + if (is_dir && !(flags & RENAME_EXCHANGE) && target) + shrink_dcache_parent(new_dentry); + if (!is_dir) { + error = try_break_deleg(source, delegated_inode); + if (error) + goto out; + } + if (target && !new_is_dir) { + error = try_break_deleg(target, delegated_inode); + if (error) + goto out; + } + if (!old_dir->i_op->rename2) { + error = old_dir->i_op->rename(old_dir, old_dentry, + new_dir, new_dentry); + } else { + WARN_ON(old_dir->i_op->rename != NULL); + error = old_dir->i_op->rename2(old_dir, old_dentry, + new_dir, new_dentry, flags); + } if (error) goto out; - if (target) + if (!(flags & RENAME_EXCHANGE) && target) { + if (is_dir) + target->i_flags |= S_DEAD; dont_mount(new_dentry); - if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) - d_move(old_dentry, new_dentry); + detach_mounts(new_dentry); + } + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { + if (!(flags & RENAME_EXCHANGE)) + d_move(old_dentry, new_dentry); + else + d_exchange(old_dentry, new_dentry); + } out: - if (target) + if (!is_dir || (flags & RENAME_EXCHANGE)) + unlock_two_nondirectories(source, target); + else if (target) mutex_unlock(&target->i_mutex); dput(new_dentry); - return error; -} - -int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - int error; - int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); - const unsigned char *old_name; - - if (old_dentry->d_inode == new_dentry->d_inode) - return 0; - - error = may_delete(old_dir, old_dentry, is_dir); - if (error) - return error; - - if (!new_dentry->d_inode) - error = may_create(new_dir, new_dentry); - else - error = may_delete(new_dir, new_dentry, is_dir); - if (error) - return error; - - if (!old_dir->i_op->rename) - return -EPERM; - - old_name = fsnotify_oldname_init(old_dentry->d_name.name); - - if (is_dir) - error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); - else - error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); - if (!error) - fsnotify_move(old_dir, new_dir, old_name, is_dir, - new_dentry->d_inode, old_dentry); - fsnotify_oldname_free(old_name); + if (!error) { + fsnotify_move(old_dir, new_dir, old_name.name, is_dir, + !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry); + if (flags & RENAME_EXCHANGE) { + fsnotify_move(new_dir, old_dir, old_dentry->d_name.name, + new_is_dir, NULL, new_dentry); + } + } + release_dentry_name_snapshot(&old_name); return error; } +EXPORT_SYMBOL(vfs_rename); -SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname) +SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, unsigned int, flags) { - struct dentry *old_dir, *new_dir; struct dentry *old_dentry, *new_dentry; struct dentry *trap; - struct nameidata oldnd, newnd; + struct path old_path, new_path; + struct qstr old_last, new_last; + int old_type, new_type; + struct inode *delegated_inode = NULL; struct filename *from; struct filename *to; - unsigned int lookup_flags = 0; + unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; int error; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && + (flags & RENAME_EXCHANGE)) + return -EINVAL; + + if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) + return -EPERM; + + if (flags & RENAME_EXCHANGE) + target_flags = 0; + retry: - from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); + from = user_path_parent(olddfd, oldname, + &old_path, &old_last, &old_type, lookup_flags); if (IS_ERR(from)) { error = PTR_ERR(from); goto exit; } - to = user_path_parent(newdfd, newname, &newnd, lookup_flags); + to = user_path_parent(newdfd, newname, + &new_path, &new_last, &new_type, lookup_flags); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; } error = -EXDEV; - if (oldnd.path.mnt != newnd.path.mnt) + if (old_path.mnt != new_path.mnt) goto exit2; - old_dir = oldnd.path.dentry; error = -EBUSY; - if (oldnd.last_type != LAST_NORM) + if (old_type != LAST_NORM) goto exit2; - new_dir = newnd.path.dentry; - if (newnd.last_type != LAST_NORM) + if (flags & RENAME_NOREPLACE) + error = -EEXIST; + if (new_type != LAST_NORM) goto exit2; - error = mnt_want_write(oldnd.path.mnt); + error = mnt_want_write(old_path.mnt); if (error) goto exit2; - oldnd.flags &= ~LOOKUP_PARENT; - newnd.flags &= ~LOOKUP_PARENT; - newnd.flags |= LOOKUP_RENAME_TARGET; +retry_deleg: + trap = lock_rename(new_path.dentry, old_path.dentry); - trap = lock_rename(new_dir, old_dir); - - old_dentry = lookup_hash(&oldnd); + old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; /* source must exist */ error = -ENOENT; - if (!old_dentry->d_inode) + if (d_is_negative(old_dentry)) + goto exit4; + new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) goto exit4; + error = -EEXIST; + if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) + goto exit5; + if (flags & RENAME_EXCHANGE) { + error = -ENOENT; + if (d_is_negative(new_dentry)) + goto exit5; + + if (!d_is_dir(new_dentry)) { + error = -ENOTDIR; + if (new_last.name[new_last.len]) + goto exit5; + } + } /* unless the source is a directory trailing slashes give -ENOTDIR */ - if (!S_ISDIR(old_dentry->d_inode->i_mode)) { + if (!d_is_dir(old_dentry)) { error = -ENOTDIR; - if (oldnd.last.name[oldnd.last.len]) - goto exit4; - if (newnd.last.name[newnd.last.len]) - goto exit4; + if (old_last.name[old_last.len]) + goto exit5; + if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) + goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) - goto exit4; - new_dentry = lookup_hash(&newnd); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; + goto exit5; /* target should not be an ancestor of source */ - error = -ENOTEMPTY; + if (!(flags & RENAME_EXCHANGE)) + error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; - error = security_path_rename(&oldnd.path, old_dentry, - &newnd.path, new_dentry); + error = security_path_rename(&old_path, old_dentry, + &new_path, new_dentry, flags); if (error) goto exit5; - error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); + error = vfs_rename(old_path.dentry->d_inode, old_dentry, + new_path.dentry->d_inode, new_dentry, + &delegated_inode, flags); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: - unlock_rename(new_dir, old_dir); - mnt_drop_write(oldnd.path.mnt); + unlock_rename(new_path.dentry, old_path.dentry); + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; - path_put(&newnd.path); + path_put(&new_path); putname(to); exit1: - path_put(&oldnd.path); + path_put(&old_path); putname(from); if (should_retry) { should_retry = false; @@ -3961,16 +4454,34 @@ return error; } +SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + return sys_renameat2(olddfd, oldname, newdfd, newname, 0); +} + SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { - return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); + return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +int vfs_whiteout(struct inode *dir, struct dentry *dentry) { - int len; + int error = may_create(dir, dentry); + if (error) + return error; - len = PTR_ERR(link); + if (!dir->i_op->mknod) + return -EPERM; + + return dir->i_op->mknod(dir, dentry, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} +EXPORT_SYMBOL(vfs_whiteout); + +int readlink_copy(char __user *buffer, int buflen, const char *link) +{ + int len = PTR_ERR(link); if (IS_ERR(link)) goto out; @@ -3982,6 +4493,7 @@ out: return len; } +EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that @@ -3990,25 +4502,22 @@ */ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct nameidata nd; void *cookie; + struct inode *inode = d_inode(dentry); + const char *link = inode->i_link; int res; - nd.depth = 0; - cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); - if (IS_ERR(cookie)) - return PTR_ERR(cookie); - - res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + if (!link) { + link = inode->i_op->follow_link(dentry, &cookie); + if (IS_ERR(link)) + return PTR_ERR(link); + } + res = readlink_copy(buffer, buflen, link); + if (inode->i_op->put_link) + inode->i_op->put_link(inode, cookie); return res; } - -int vfs_follow_link(struct nameidata *nd, const char *link) -{ - return __vfs_follow_link(nd, link); -} +EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) @@ -4028,31 +4537,32 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct page *page = NULL; - char *s = page_getlink(dentry, &page); - int res = vfs_readlink(dentry,buffer,buflen,s); + int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); if (page) { kunmap(page); page_cache_release(page); } return res; } +EXPORT_SYMBOL(page_readlink); -void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) +const char *page_follow_link_light(struct dentry *dentry, void **cookie) { struct page *page = NULL; - nd_set_link(nd, page_getlink(dentry, &page)); - return page; + char *res = page_getlink(dentry, &page); + if (!IS_ERR(res)) + *cookie = page; + return res; } +EXPORT_SYMBOL(page_follow_link_light); -void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +void page_put_link(struct inode *unused, void *cookie) { struct page *page = cookie; - - if (page) { - kunmap(page); - page_cache_release(page); - } + kunmap(page); + page_cache_release(page); } +EXPORT_SYMBOL(page_put_link); /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS @@ -4090,46 +4600,18 @@ fail: return err; } +EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, - !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); + !mapping_gfp_constraint(inode->i_mapping, __GFP_FS)); } +EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, }; - -EXPORT_SYMBOL(user_path_at); -EXPORT_SYMBOL(follow_down_one); -EXPORT_SYMBOL(follow_down); -EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(get_write_access); /* nfsd */ -EXPORT_SYMBOL(lock_rename); -EXPORT_SYMBOL(lookup_one_len); -EXPORT_SYMBOL(page_follow_link_light); -EXPORT_SYMBOL(page_put_link); -EXPORT_SYMBOL(page_readlink); -EXPORT_SYMBOL(__page_symlink); -EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(kern_path); -EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(unlock_rename); -EXPORT_SYMBOL(vfs_create); -EXPORT_SYMBOL(vfs_follow_link); -EXPORT_SYMBOL(vfs_link); -EXPORT_SYMBOL(vfs_mkdir); -EXPORT_SYMBOL(vfs_mknod); -EXPORT_SYMBOL(generic_permission); -EXPORT_SYMBOL(vfs_readlink); -EXPORT_SYMBOL(vfs_rename); -EXPORT_SYMBOL(vfs_rmdir); -EXPORT_SYMBOL(vfs_symlink); -EXPORT_SYMBOL(vfs_unlink); -EXPORT_SYMBOL(dentry_unhash); -EXPORT_SYMBOL(generic_readlink);