diff options
Diffstat (limited to 'fs/ceph/super.c')
| -rw-r--r-- | fs/ceph/super.c | 274 | 
1 file changed, 213 insertions, 61 deletions
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 08b460ae053..06150fd745a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -17,7 +17,9 @@  #include "super.h"  #include "mds_client.h" +#include "cache.h" +#include <linux/ceph/ceph_features.h>  #include <linux/ceph/decode.h>  #include <linux/ceph/mon_client.h>  #include <linux/ceph/auth.h> @@ -70,17 +72,21 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)  	/*  	 * express utilization in terms of large blocks to avoid  	 * overflow on 32-bit machines. +	 * +	 * NOTE: for the time being, we make bsize == frsize to humor +	 * not-yet-ancient versions of glibc that are broken. +	 * Someday, we will probably want to report a real block +	 * size...  whatever that may mean for a network file system!  	 */  	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; +	buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;  	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); -	buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> -		(CEPH_BLOCK_SHIFT-10); +	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);  	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);  	buf->f_files = le64_to_cpu(st.num_objects);  	buf->f_ffree = -1;  	buf->f_namelen = NAME_MAX; -	buf->f_frsize = PAGE_CACHE_SIZE;  	/* leave fsid little-endian, regardless of host endianness */  	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); @@ -115,6 +121,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)  enum {  	Opt_wsize,  	Opt_rsize, +	Opt_rasize,  	Opt_caps_wanted_delay_min,  	Opt_caps_wanted_delay_max,  	Opt_cap_release_safety, @@ -130,12 +137,24 @@ enum {  	Opt_nodirstat,  	Opt_rbytes,  	Opt_norbytes, +	Opt_asyncreaddir,  	Opt_noasyncreaddir, +	Opt_dcache, +	Opt_nodcache, +	Opt_ino32, +	Opt_noino32, +	Opt_fscache, +	Opt_nofscache, +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	Opt_acl, +#endif +	Opt_noacl  };  static match_table_t fsopt_tokens = {  	{Opt_wsize, "wsize=%d"},  	{Opt_rsize, "rsize=%d"}, 
+	{Opt_rasize, "rasize=%d"},  	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},  	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},  	{Opt_cap_release_safety, "cap_release_safety=%d"}, @@ -149,7 +168,18 @@ static match_table_t fsopt_tokens = {  	{Opt_nodirstat, "nodirstat"},  	{Opt_rbytes, "rbytes"},  	{Opt_norbytes, "norbytes"}, +	{Opt_asyncreaddir, "asyncreaddir"},  	{Opt_noasyncreaddir, "noasyncreaddir"}, +	{Opt_dcache, "dcache"}, +	{Opt_nodcache, "nodcache"}, +	{Opt_ino32, "ino32"}, +	{Opt_noino32, "noino32"}, +	{Opt_fscache, "fsc"}, +	{Opt_nofscache, "nofsc"}, +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	{Opt_acl, "acl"}, +#endif +	{Opt_noacl, "noacl"},  	{-1, NULL}  }; @@ -195,6 +225,9 @@ static int parse_fsopt_token(char *c, void *private)  	case Opt_rsize:  		fsopt->rsize = intval;  		break; +	case Opt_rasize: +		fsopt->rasize = intval; +		break;  	case Opt_caps_wanted_delay_min:  		fsopt->caps_wanted_delay_min = intval;  		break; @@ -222,9 +255,38 @@ static int parse_fsopt_token(char *c, void *private)  	case Opt_norbytes:  		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;  		break; +	case Opt_asyncreaddir: +		fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; +		break;  	case Opt_noasyncreaddir:  		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;  		break; +	case Opt_dcache: +		fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; +		break; +	case Opt_nodcache: +		fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; +		break; +	case Opt_ino32: +		fsopt->flags |= CEPH_MOUNT_OPT_INO32; +		break; +	case Opt_noino32: +		fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; +		break; +	case Opt_fscache: +		fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; +		break; +	case Opt_nofscache: +		fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; +		break; +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	case Opt_acl: +		fsopt->sb_flags |= MS_POSIXACL; +		break; +#endif +	case Opt_noacl: +		fsopt->sb_flags &= ~MS_POSIXACL; +		break;  	default:  		BUG_ON(token);  	} @@ -277,7 +339,10 @@ static int parse_mount_options(struct ceph_mount_options 
**pfsopt,  {  	struct ceph_mount_options *fsopt;  	const char *dev_name_end; -	int err = -ENOMEM; +	int err; + +	if (!dev_name || !*dev_name) +		return -EINVAL;  	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);  	if (!fsopt) @@ -285,37 +350,54 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,  	dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); -        fsopt->sb_flags = flags; -        fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - -        fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; -        fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); -        fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; -        fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; -        fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; -        fsopt->congestion_kb = default_congestion_kb(); -	 -        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ -        err = -EINVAL; -        if (!dev_name) -                goto out; -        *path = strstr(dev_name, ":/"); -        if (*path == NULL) { -                pr_err("device name is missing path (no :/ in %s)\n", -                       dev_name); -                goto out; -        } -	dev_name_end = *path; -	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); +	fsopt->sb_flags = flags; +	fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + +	fsopt->rsize = CEPH_RSIZE_DEFAULT; +	fsopt->rasize = CEPH_RASIZE_DEFAULT; +	fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); +	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; +	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; +	fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; +	fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; +	fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; +	fsopt->congestion_kb = default_congestion_kb(); -	/* path on server */ -	*path += 2; +	/* +	 * Distinguish the server list from the path in "dev_name". 
+	 * Internally we do not include the leading '/' in the path. +	 * +	 * "dev_name" will look like: +	 *     <server_spec>[,<server_spec>...]:[<path>] +	 * where +	 *     <server_spec> is <ip>[:<port>] +	 *     <path> is optional, but if present must begin with '/' +	 */ +	dev_name_end = strchr(dev_name, '/'); +	if (dev_name_end) { +		/* skip over leading '/' for path */ +		*path = dev_name_end + 1; +	} else { +		/* path is empty */ +		dev_name_end = dev_name + strlen(dev_name); +		*path = dev_name_end; +	} +	err = -EINVAL; +	dev_name_end--;		/* back up to ':' separator */ +	if (dev_name_end < dev_name || *dev_name_end != ':') { +		pr_err("device name is missing path (no : separator in %s)\n", +				dev_name); +		goto out; +	} +	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);  	dout("server path '%s'\n", *path); -	err = ceph_parse_options(popt, options, dev_name, dev_name_end, +	*popt = ceph_parse_options(options, dev_name, dev_name_end,  				 parse_fsopt_token, (void *)fsopt); -	if (err) +	if (IS_ERR(*popt)) { +		err = PTR_ERR(*popt);  		goto out; +	}  	/* success */  	*pfsopt = fsopt; @@ -329,11 +411,11 @@ out:  /**   * ceph_show_options - Show mount options in /proc/mounts   * @m: seq_file to write to - * @mnt: mount descriptor + * @root: root of that (sub)tree   */ -static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) +static int ceph_show_options(struct seq_file *m, struct dentry *root)  { -	struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb); +	struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);  	struct ceph_mount_options *fsopt = fsc->mount_options;  	struct ceph_options *opt = fsc->client->options; @@ -346,15 +428,13 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)  	if (opt->name)  		seq_printf(m, ",name=%s", opt->name); -	if (opt->secret) +	if (opt->key)  		seq_puts(m, ",secret=<hidden>");  	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)  		seq_printf(m, 
",mount_timeout=%d", opt->mount_timeout);  	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)  		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); -	if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) -		seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);  	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)  		seq_printf(m, ",osdkeepalivetimeout=%d",  			   opt->osd_keepalive_timeout); @@ -365,11 +445,28 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)  		seq_puts(m, ",norbytes");  	if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)  		seq_puts(m, ",noasyncreaddir"); +	if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) +		seq_puts(m, ",dcache"); +	else +		seq_puts(m, ",nodcache"); +	if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) +		seq_puts(m, ",fsc"); +	else +		seq_puts(m, ",nofsc"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	if (fsopt->sb_flags & MS_POSIXACL) +		seq_puts(m, ",acl"); +	else +		seq_puts(m, ",noacl"); +#endif  	if (fsopt->wsize)  		seq_printf(m, ",wsize=%d", fsopt->wsize); -	if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) +	if (fsopt->rsize != CEPH_RSIZE_DEFAULT)  		seq_printf(m, ",rsize=%d", fsopt->rsize); +	if (fsopt->rasize != CEPH_RASIZE_DEFAULT) +		seq_printf(m, ",rasize=%d", fsopt->rasize);  	if (fsopt->congestion_kb != default_congestion_kb())  		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);  	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) @@ -412,23 +509,29 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)  /*   * create a new fs client   */ -struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, +static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,  					struct ceph_options *opt)  {  	struct ceph_fs_client *fsc; +	const u64 supported_features = +		CEPH_FEATURE_FLOCK | +		CEPH_FEATURE_DIRLAYOUTHASH; +	const u64 required_features = 0; +	int page_count; +	size_t size;  	int err = -ENOMEM;  	fsc = 
kzalloc(sizeof(*fsc), GFP_KERNEL);  	if (!fsc)  		return ERR_PTR(-ENOMEM); -	fsc->client = ceph_create_client(opt, fsc); +	fsc->client = ceph_create_client(opt, fsc, supported_features, +					 required_features);  	if (IS_ERR(fsc->client)) {  		err = PTR_ERR(fsc->client);  		goto fail;  	}  	fsc->client->extra_mon_dispatch = extra_mon_dispatch; -	fsc->client->supported_features |= CEPH_FEATURE_FLOCK;  	fsc->client->monc.want_mdsmap = 1;  	fsc->mount_options = fsopt; @@ -443,28 +546,40 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,  		goto fail_client;  	err = -ENOMEM; -	fsc->wb_wq = create_workqueue("ceph-writeback"); +	/* +	 * The number of concurrent works can be high but they don't need +	 * to be processed in parallel, limit concurrency. +	 */ +	fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);  	if (fsc->wb_wq == NULL)  		goto fail_bdi; -	fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); +	fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);  	if (fsc->pg_inv_wq == NULL)  		goto fail_wb_wq; -	fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); +	fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);  	if (fsc->trunc_wq == NULL)  		goto fail_pg_inv_wq;  	/* set up mempools */  	err = -ENOMEM; -	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, -			      fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); +	page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT; +	size = sizeof (struct page *) * (page_count ? 
page_count : 1); +	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);  	if (!fsc->wb_pagevec_pool)  		goto fail_trunc_wq; +	/* setup fscache */ +	if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) && +	    (ceph_fscache_register_fs(fsc) != 0)) +		goto fail_fscache; +  	/* caps */  	fsc->min_caps = fsopt->max_readdir;  	return fsc; +fail_fscache: +	ceph_fscache_unregister_fs(fsc);  fail_trunc_wq:  	destroy_workqueue(fsc->trunc_wq);  fail_pg_inv_wq: @@ -480,10 +595,12 @@ fail:  	return ERR_PTR(err);  } -void destroy_fs_client(struct ceph_fs_client *fsc) +static void destroy_fs_client(struct ceph_fs_client *fsc)  {  	dout("destroy_fs_client %p\n", fsc); +	ceph_fscache_unregister_fs(fsc); +  	destroy_workqueue(fsc->wb_wq);  	destroy_workqueue(fsc->pg_inv_wq);  	destroy_workqueue(fsc->trunc_wq); @@ -518,6 +635,8 @@ static void ceph_inode_init_once(void *foo)  static int __init init_caches(void)  { +	int error = -ENOMEM; +  	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",  				      sizeof(struct ceph_inode_info),  				      __alignof__(struct ceph_inode_info), @@ -541,23 +660,33 @@ static int __init init_caches(void)  	if (ceph_file_cachep == NULL)  		goto bad_file; -	return 0; +	if ((error = ceph_fscache_register())) +		goto bad_file; +	return 0;  bad_file:  	kmem_cache_destroy(ceph_dentry_cachep);  bad_dentry:  	kmem_cache_destroy(ceph_cap_cachep);  bad_cap:  	kmem_cache_destroy(ceph_inode_cachep); -	return -ENOMEM; +	return error;  }  static void destroy_caches(void)  { +	/* +	 * Make sure all delayed rcu free inodes are flushed before we +	 * destroy cache. 
+	 */ +	rcu_barrier(); +  	kmem_cache_destroy(ceph_inode_cachep);  	kmem_cache_destroy(ceph_cap_cachep);  	kmem_cache_destroy(ceph_dentry_cachep);  	kmem_cache_destroy(ceph_file_cachep); + +	ceph_fscache_unregister();  } @@ -580,6 +709,7 @@ static const struct super_operations ceph_super_ops = {  	.alloc_inode	= ceph_alloc_inode,  	.destroy_inode	= ceph_destroy_inode,  	.write_inode    = ceph_write_inode, +	.drop_inode	= ceph_drop_inode,  	.sync_fs        = ceph_sync_fs,  	.put_super	= ceph_put_super,  	.show_options   = ceph_show_options, @@ -614,17 +744,25 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,  	req->r_num_caps = 2;  	err = ceph_mdsc_do_request(mdsc, NULL, req);  	if (err == 0) { -		dout("open_root_inode success\n"); -		if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && -		    fsc->sb->s_root == NULL) -			root = d_alloc_root(req->r_target_inode); -		else -			root = d_obtain_alias(req->r_target_inode); +		struct inode *inode = req->r_target_inode;  		req->r_target_inode = NULL; +		dout("open_root_inode success\n"); +		if (ceph_ino(inode) == CEPH_INO_ROOT && +		    fsc->sb->s_root == NULL) { +			root = d_make_root(inode); +			if (!root) { +				root = ERR_PTR(-ENOMEM); +				goto out; +			} +		} else { +			root = d_obtain_alias(inode); +		} +		ceph_init_dentry(root);  		dout("open_root_inode success, root dentry is %p\n", root);  	} else {  		root = ERR_PTR(err);  	} +out:  	ceph_mdsc_put_request(req);  	return root;  } @@ -705,6 +843,7 @@ static int ceph_set_super(struct super_block *s, void *data)  	s->s_flags = fsc->mount_options->sb_flags;  	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */ +	s->s_xattr = ceph_xattr_handlers;  	s->s_fs_info = fsc;  	fsc->sb = s; @@ -763,12 +902,16 @@ static int ceph_register_bdi(struct super_block *sb,  {  	int err; -	/* set ra_pages based on rsize mount option? */ -	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) +	/* set ra_pages based on rasize mount option? 
*/ +	if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)  		fsc->backing_dev_info.ra_pages = -			(fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) +			(fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)  			>> PAGE_SHIFT; -	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", +	else +		fsc->backing_dev_info.ra_pages = +			default_backing_dev_info.ra_pages; + +	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",  			   atomic_long_inc_return(&bdi_seq));  	if (!err)  		sb->s_bdi = &fsc->backing_dev_info; @@ -788,6 +931,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,  	struct ceph_options *opt = NULL;  	dout("ceph_mount\n"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL +	flags |= MS_POSIXACL; +#endif  	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);  	if (err < 0) {  		res = ERR_PTR(err); @@ -798,8 +945,8 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,  	fsc = create_fs_client(fsopt, opt);  	if (IS_ERR(fsc)) {  		res = ERR_CAST(fsc); -		kfree(fsopt); -		kfree(opt); +		destroy_mount_options(fsopt); +		ceph_destroy_options(opt);  		goto out_final;  	} @@ -811,7 +958,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,  	if (ceph_test_opt(fsc->client, NOSHARE))  		compare_super = NULL; -	sb = sget(fs_type, compare_super, ceph_set_super, fsc); +	sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc);  	if (IS_ERR(sb)) {  		res = ERR_CAST(sb);  		goto out; @@ -868,6 +1015,7 @@ static struct file_system_type ceph_fs_type = {  	.kill_sb	= ceph_kill_sb,  	.fs_flags	= FS_RENAME_DOES_D_MOVE,  }; +MODULE_ALIAS_FS("ceph");  #define _STRINGIFY(x) #x  #define STRINGIFY(x) _STRINGIFY(x) @@ -878,6 +1026,8 @@ static int __init init_ceph(void)  	if (ret)  		goto out; +	ceph_flock_init(); +	ceph_xattr_init();  	ret = register_filesystem(&ceph_fs_type);  	if (ret)  		goto out_icache; @@ -887,6 +1037,7 @@ static int __init init_ceph(void)  	return 0;  out_icache: +	ceph_xattr_exit();  	
destroy_caches();  out:  	return ret; @@ -896,6 +1047,7 @@ static void __exit exit_ceph(void)  {  	dout("exit_ceph\n");  	unregister_filesystem(&ceph_fs_type); +	ceph_xattr_exit();  	destroy_caches();  }  | 
