diff options
Diffstat (limited to 'fs')
66 files changed, 1616 insertions, 1427 deletions
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 109a6c606d9..e8e5e63ac95 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -177,8 +177,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) } /* Trigger mount for path component or follow link */ } else if (ino->flags & AUTOFS_INF_PENDING || - autofs4_need_mount(flags) || - current->link_count) { + autofs4_need_mount(flags)) { DPRINTK("waiting for mount name=%.*s", dentry->d_name.len, dentry->d_name.name); @@ -262,7 +261,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) spin_unlock(&dcache_lock); spin_unlock(&sbi->fs_lock); - status = try_to_fill_dentry(dentry, 0); + status = try_to_fill_dentry(dentry, nd->flags); if (status) goto out_error; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e84ef60ffe3..97a97839a86 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1481,12 +1481,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ret = -EBADF; goto out_drop_write; } + src = src_file->f_dentry->d_inode; ret = -EINVAL; if (src == inode) goto out_fput; + /* the src must be open for reading */ + if (!(src_file->f_mode & FMODE_READ)) + goto out_fput; + ret = -EISDIR; if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) goto out_fput; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index f7c255f9c62..a8cd821226d 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -34,6 +34,7 @@ struct cachefiles_object { loff_t i_size; /* object size */ unsigned long flags; #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ +#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ atomic_t usage; /* object usage count */ uint8_t type; /* object type */ uint8_t new; /* T if object new */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d5db84a1ee0..f4a7840bf42 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -93,6 +93,59 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object, } /* + * mark the owner of a dentry, if there is one, to indicate that that dentry + * has been preemptively deleted + * - the caller must hold the i_mutex on the dentry's parent as required to + * call vfs_unlink(), vfs_rmdir() or vfs_rename() + */ +static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, + struct dentry *dentry) +{ + struct cachefiles_object *object; + struct rb_node *p; + + _enter(",'%*.*s'", + dentry->d_name.len, dentry->d_name.len, dentry->d_name.name); + + write_lock(&cache->active_lock); + + p = cache->active_nodes.rb_node; + while (p) { + object = rb_entry(p, struct cachefiles_object, active_node); + if (object->dentry > dentry) + p = p->rb_left; + else if (object->dentry < dentry) + p = p->rb_right; + else + goto found_dentry; + } + + write_unlock(&cache->active_lock); + _leave(" [no owner]"); + return; + + /* found the dentry for */ +found_dentry: + kdebug("preemptive burial: OBJ%x [%s] %p", + object->fscache.debug_id, + fscache_object_states[object->fscache.state], + dentry); + + if (object->fscache.state < FSCACHE_OBJECT_DYING) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error:" + " Can't preemptively bury live object\n"); + cachefiles_printk_object(object, NULL); + } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + printk(KERN_ERR "CacheFiles: Error:" + " Object already preemptively buried\n"); + } + + write_unlock(&cache->active_lock); + _leave(" [owner marked]"); +} + +/* * record the fact that an object is now active */ static int cachefiles_mark_object_active(struct cachefiles_cache *cache, @@ -219,7 +272,8 @@ requeue: */ static int cachefiles_bury_object(struct cachefiles_cache *cache, struct dentry *dir, - struct dentry *rep) + struct dentry *rep, + bool preemptive) { struct dentry *grave, *trap; char nbuffer[8 + 8 + 1]; @@ -229,11 +283,16 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, dir->d_name.len, dir->d_name.len, dir->d_name.name, rep->d_name.len, rep->d_name.len, rep->d_name.name); + _debug("remove %p from %p", rep, dir); + /* non-directories can just be unlinked */ if (!S_ISDIR(rep->d_inode->i_mode)) { _debug("unlink stale object"); ret = vfs_unlink(dir->d_inode, rep); + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + mutex_unlock(&dir->d_inode->i_mutex); if (ret == -EIO) @@ -325,6 +384,9 @@ try_again: if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + unlock_rename(cache->graveyard, dir); dput(grave); _leave(" = 0"); @@ -340,7 +402,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, struct dentry *dir; int ret; - _enter(",{%p}", object->dentry); + _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); ASSERT(object->dentry); ASSERT(object->dentry->d_inode); @@ -350,15 +412,25 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - /* we need to check that our parent is _still_ our parent - it may have - * been renamed */ - if (dir == object->dentry->d_parent) { - ret = cachefiles_bury_object(cache, dir, object->dentry); - } else { - /* it got moved, presumably by cachefilesd culling it, so it's - * no longer in the key path and we can ignore it */ + if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + /* object allocation for the same key preemptively deleted this + * object's file so that it could create its own file */ + _debug("object preemptively buried"); mutex_unlock(&dir->d_inode->i_mutex); ret = 0; + } else { + /* we need to check that our parent is _still_ our parent - it + * may have been renamed */ + if (dir == object->dentry->d_parent) { + ret = cachefiles_bury_object(cache, dir, + object->dentry, false); + } else { + /* it got moved, presumably by cachefilesd culling it, + * so it's no longer in the key path and we can ignore + * it */ + mutex_unlock(&dir->d_inode->i_mutex); + ret = 0; + } } dput(dir); @@ -381,7 +453,9 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, const char *name; int ret, nlen; - _enter("{%p},,%s,", parent->dentry, key); + _enter("OBJ%x{%p},OBJ%x,%s,", + parent->fscache.debug_id, parent->dentry, + object->fscache.debug_id, key); cache = container_of(parent->fscache.cache, struct cachefiles_cache, cache); @@ -509,7 +583,7 @@ lookup_again: * mutex) */ object->dentry = NULL; - ret = cachefiles_bury_object(cache, dir, next); + ret = cachefiles_bury_object(cache, dir, next, true); dput(next); next = NULL; @@ -828,7 +902,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, /* actually remove the victim (drops the dir mutex) */ _debug("bury"); - ret = cachefiles_bury_object(cache, dir, victim); + ret = cachefiles_bury_object(cache, dir, victim, false); if (ret < 0) goto error; diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index b5808cdb223..039b5011d83 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -77,6 +77,8 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, /* * check the security details of the on-disk cache * - must be called with security override in force + * - must return with a security override in force - even in the case of an + * error */ int cachefiles_determine_cache_security(struct cachefiles_cache *cache, struct dentry *root, @@ -99,6 +101,8 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache, * which create files */ ret = set_create_files_as(new, root->d_inode); if (ret < 0) { + abort_creds(new); + cachefiles_begin_secure(cache, _saved_cred); _leave(" = %d [cfa]", ret); return ret; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4b42c2bb603..a9005d862ed 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -504,7 +504,6 @@ static void writepages_finish(struct ceph_osd_request *req, int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; - struct writeback_control *wbc = req->r_wbc; __s32 rc = -EIO; u64 bytes = 0; struct ceph_client *client = ceph_inode_to_client(inode); @@ -546,10 +545,6 @@ static void writepages_finish(struct ceph_osd_request *req, clear_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); - if (i >= wrote) { - dout("inode %p skipping page %p\n", inode, page); - wbc->pages_skipped++; - } ceph_put_snap_context((void *)page->private); page->private = 0; ClearPagePrivate(page); @@ -799,7 +794,6 @@ get_more_pages: alloc_page_vec(client, req); req->r_callback = writepages_finish; req->r_inode = inode; - req->r_wbc = wbc; } /* note position of first page in pvec */ diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0c168180686..d9400534b27 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -858,6 +858,8 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) } /* + * Remove a cap. Take steps to deal with a racing iterate_session_caps. + * * caller should hold i_lock. * caller will not hold session s_mutex if called from destroy_inode. */ @@ -866,15 +868,10 @@ void __ceph_remove_cap(struct ceph_cap *cap) struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + int removed = 0; dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); - /* remove from inode list */ - rb_erase(&cap->ci_node, &ci->i_caps); - cap->ci = NULL; - if (ci->i_auth_cap == cap) - ci->i_auth_cap = NULL; - /* remove from session list */ spin_lock(&session->s_cap_lock); if (session->s_cap_iterator == cap) { @@ -885,10 +882,18 @@ void __ceph_remove_cap(struct ceph_cap *cap) list_del_init(&cap->session_caps); session->s_nr_caps--; cap->session = NULL; + removed = 1; } + /* protect backpointer with s_cap_lock: see iterate_session_caps */ + cap->ci = NULL; spin_unlock(&session->s_cap_lock); - if (cap->session == NULL) + /* remove from inode list */ + rb_erase(&cap->ci_node, &ci->i_caps); + if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + if (removed) ceph_put_cap(cap); if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 261f3e6c0bc..85b4d2ffdeb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -733,6 +733,10 @@ no_change: __ceph_get_fmode(ci, cap_fmode); spin_unlock(&inode->i_lock); } + } else if (cap_fmode >= 0) { + pr_warning("mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); + __ceph_get_fmode(ci, cap_fmode); } /* update delegation info? */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 60a9a4ae47b..24561a557e0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -736,9 +736,10 @@ static void cleanup_cap_releases(struct ceph_mds_session *session) } /* - * Helper to safely iterate over all caps associated with a session. + * Helper to safely iterate over all caps associated with a session, with + * special care taken to handle a racing __ceph_remove_cap(). * - * caller must hold session s_mutex + * Caller must hold session s_mutex. */ static int iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, struct ceph_cap *, @@ -2136,7 +2137,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) struct ceph_mds_session *session = NULL; struct ceph_msg *reply; struct rb_node *p; - int err; + int err = -ENOMEM; struct ceph_pagelist *pagelist; pr_info("reconnect to recovering mds%d\n", mds); @@ -2185,7 +2186,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) goto fail; err = iterate_session_caps(session, encode_caps_cb, pagelist); if (err < 0) - goto out; + goto fail; /* * snaprealms. we provide mds with the ino, seq (version), and @@ -2213,28 +2214,31 @@ send: reply->nr_pages = calc_pages_for(0, pagelist->length); ceph_con_send(&session->s_con, reply); - if (session) { - session->s_state = CEPH_MDS_SESSION_OPEN; - __wake_requests(mdsc, &session->s_waiting); - } + session->s_state = CEPH_MDS_SESSION_OPEN; + mutex_unlock(&session->s_mutex); + + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + mutex_unlock(&mdsc->mutex); + + ceph_put_mds_session(session); -out: up_read(&mdsc->snap_rwsem); - if (session) { - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - } mutex_lock(&mdsc->mutex); return; fail: ceph_msg_put(reply); + up_read(&mdsc->snap_rwsem); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); fail_nomsg: ceph_pagelist_release(pagelist); kfree(pagelist); fail_nopagelist: - pr_err("ENOMEM preparing reconnect for mds%d\n", mds); - goto out; + pr_err("error %d preparing reconnect for mds%d\n", err, mds); + mutex_lock(&mdsc->mutex); + return; } diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 509f57d9ccb..cd4fadb6491 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -492,7 +492,14 @@ static void prepare_write_message(struct ceph_connection *con) list_move_tail(&m->list_head, &con->out_sent); } - m->hdr.seq = cpu_to_le64(++con->out_seq); + /* + * only assign outgoing seq # if we haven't sent this message + * yet. if it is requeued, resend with it's original seq. + */ + if (m->needs_out_seq) { + m->hdr.seq = cpu_to_le64(++con->out_seq); + m->needs_out_seq = false; + } dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", m, con->out_seq, le16_to_cpu(m->hdr.type), @@ -1986,6 +1993,8 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); + msg->needs_out_seq = true; + /* queue */ mutex_lock(&con->mutex); BUG_ON(!list_empty(&msg->list_head)); @@ -2085,15 +2094,19 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, kref_init(&m->kref); INIT_LIST_HEAD(&m->list_head); + m->hdr.tid = 0; m->hdr.type = cpu_to_le16(type); + m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); + m->hdr.version = 0; m->hdr.front_len = cpu_to_le32(front_len); m->hdr.middle_len = 0; m->hdr.data_len = cpu_to_le32(page_len); m->hdr.data_off = cpu_to_le16(page_off); - m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); + m->hdr.reserved = 0; m->footer.front_crc = 0; m->footer.middle_crc = 0; m->footer.data_crc = 0; + m->footer.flags = 0; m->front_max = front_len; m->front_is_vmalloc = false; m->more_to_follow = false; diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index a343dae73cd..a5caf91cc97 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -86,6 +86,7 @@ struct ceph_msg { struct kref kref; bool front_is_vmalloc; bool more_to_follow; + bool needs_out_seq; int front_max; struct ceph_msgpool *pool; diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index c7b4dedaace..3514f71ff85 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -565,7 +565,8 @@ static int __map_osds(struct ceph_osd_client *osdc, { struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; struct ceph_pg pgid; - int o = -1; + int acting[CEPH_PG_MAX_SIZE]; + int o = -1, num = 0; int err; dout("map_osds %p tid %lld\n", req, req->r_tid); @@ -576,10 +577,16 @@ static int __map_osds(struct ceph_osd_client *osdc, pgid = reqhead->layout.ol_pgid; req->r_pgid = pgid; - o = ceph_calc_pg_primary(osdc->osdmap, pgid); + err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); + if (err > 0) { + o = acting[0]; + num = err; + } if ((req->r_osd && req->r_osd->o_osd == o && - req->r_sent >= req->r_osd->o_incarnation) || + req->r_sent >= req->r_osd->o_incarnation && + req->r_num_pg_osds == num && + memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || (req->r_os |