Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r--   drivers/md/dm.c   1309
1 file changed, 774 insertions, 535 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7cb1352f7e7..32b958dbc49 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -14,7 +14,6 @@  #include <linux/moduleparam.h>  #include <linux/blkpg.h>  #include <linux/bio.h> -#include <linux/buffer_head.h>  #include <linux/mempool.h>  #include <linux/slab.h>  #include <linux/idr.h> @@ -25,6 +24,16 @@  #define DM_MSG_PREFIX "core" +#ifdef CONFIG_PRINTK +/* + * ratelimit state to be used in DMXXX_LIMIT(). + */ +DEFINE_RATELIMIT_STATE(dm_ratelimit_state, +		       DEFAULT_RATELIMIT_INTERVAL, +		       DEFAULT_RATELIMIT_BURST); +EXPORT_SYMBOL(dm_ratelimit_state); +#endif +  /*   * Cookies are numeric values sent with CHANGE and REMOVE   * uevents while resuming, removing or renaming the device. @@ -32,13 +41,21 @@  #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"  #define DM_COOKIE_LENGTH 24 -static DEFINE_MUTEX(dm_mutex);  static const char *_name = DM_NAME;  static unsigned int major = 0;  static unsigned int _major = 0; +static DEFINE_IDR(_minor_idr); +  static DEFINE_SPINLOCK(_minor_lock); + +static void do_deferred_remove(struct work_struct *w); + +static DECLARE_WORK(deferred_remove_work, do_deferred_remove); + +static struct workqueue_struct *deferred_remove_workqueue; +  /*   * For bio-based dm.   * One of these is allocated per bio. @@ -50,17 +67,7 @@ struct dm_io {  	struct bio *bio;  	unsigned long start_time;  	spinlock_t endio_lock; -}; - -/* - * For bio-based dm. - * One of these is allocated per target within a bio.  Hopefully - * this will be simplified out one day. - */ -struct dm_target_io { -	struct dm_io *io; -	struct dm_target *ti; -	union map_info info; +	struct dm_stats_aux stats_aux;  };  /* @@ -76,21 +83,19 @@ struct dm_rq_target_io {  };  /* - * For request-based dm. - * One of these is allocated per bio. + * For request-based dm - the bio clones we allocate are embedded in these + * structs. + * + * We allocate these with bio_alloc_bioset, using the front_pad parameter when + * the bioset is created - this means the bio has to come at the end of the + * struct.   */  struct dm_rq_clone_bio_info {  	struct bio *orig;  	struct dm_rq_target_io *tio; +	struct bio clone;  }; -union map_info *dm_get_mapinfo(struct bio *bio) -{ -	if (bio && bio->bi_private) -		return &((struct dm_target_io *)bio->bi_private)->info; -	return NULL; -} -  union map_info *dm_get_rq_mapinfo(struct request *rq)  {  	if (rq && rq->end_io_data) @@ -110,17 +115,33 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);  #define DMF_FREEING 3  #define DMF_DELETING 4  #define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_MERGE_IS_OPTIONAL 6 +#define DMF_DEFERRED_REMOVE 7 + +/* + * A dummy definition to make RCU happy. + * struct dm_table should never be dereferenced in this file. + */ +struct dm_table { +	int undefined__; +};  /*   * Work processed by per-device workqueue.   */  struct mapped_device { -	struct rw_semaphore io_lock; +	struct srcu_struct io_barrier;  	struct mutex suspend_lock; -	rwlock_t map_lock;  	atomic_t holders;  	atomic_t open_count; +	/* +	 * The current mapping. +	 * Use dm_get_live_table{_fast} or take suspend_lock for +	 * dereference. +	 */ +	struct dm_table *map; +  	unsigned long flags;  	struct request_queue *queue; @@ -128,6 +149,8 @@ struct mapped_device {  	/* Protect queue and type against concurrent access. */  	struct mutex type_lock; +	struct target_type *immutable_target_type; +  	struct gendisk *disk;  	char name[16]; @@ -148,15 +171,9 @@ struct mapped_device {  	struct workqueue_struct *wq;  	/* -	 * The current mapping. 
-	 */ -	struct dm_table *map; - -	/*  	 * io objects are allocated from here.  	 */  	mempool_t *io_pool; -	mempool_t *tio_pool;  	struct bio_set *bs; @@ -178,14 +195,13 @@ struct mapped_device {  	/* forced geometry settings */  	struct hd_geometry geometry; -	/* For saving the address of __make_request for request based dm */ -	make_request_fn *saved_make_request_fn; - -	/* sysfs handle */ -	struct kobject kobj; +	/* kobject and completion */ +	struct dm_kobject_holder kobj_holder;  	/* zero-length flush that will be cloned and submitted to targets */  	struct bio flush_bio; + +	struct dm_stats stats;  };  /* @@ -193,15 +209,57 @@ struct mapped_device {   */  struct dm_md_mempools {  	mempool_t *io_pool; -	mempool_t *tio_pool;  	struct bio_set *bs;  }; -#define MIN_IOS 256 +#define RESERVED_BIO_BASED_IOS		16 +#define RESERVED_REQUEST_BASED_IOS	256 +#define RESERVED_MAX_IOS		1024  static struct kmem_cache *_io_cache; -static struct kmem_cache *_tio_cache;  static struct kmem_cache *_rq_tio_cache; -static struct kmem_cache *_rq_bio_info_cache; + +/* + * Bio-based DM's mempools' reserved IOs set by the user. + */ +static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; + +/* + * Request-based DM's mempools' reserved IOs set by the user. + */ +static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; + +static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, +				      unsigned def, unsigned max) +{ +	unsigned ios = ACCESS_ONCE(*reserved_ios); +	unsigned modified_ios = 0; + +	if (!ios) +		modified_ios = def; +	else if (ios > max) +		modified_ios = max; + +	if (modified_ios) { +		(void)cmpxchg(reserved_ios, ios, modified_ios); +		ios = modified_ios; +	} + +	return ios; +} + +unsigned dm_get_reserved_bio_based_ios(void) +{ +	return __dm_get_reserved_ios(&reserved_bio_based_ios, +				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); +} +EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); + +unsigned dm_get_reserved_rq_based_ios(void) +{ +	return __dm_get_reserved_ios(&reserved_rq_based_ios, +				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); +} +EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);  static int __init local_init(void)  { @@ -212,41 +270,36 @@ static int __init local_init(void)  	if (!_io_cache)  		return r; -	/* allocate a slab for the target ios */ -	_tio_cache = KMEM_CACHE(dm_target_io, 0); -	if (!_tio_cache) -		goto out_free_io_cache; -  	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);  	if (!_rq_tio_cache) -		goto out_free_tio_cache; - -	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); -	if (!_rq_bio_info_cache) -		goto out_free_rq_tio_cache; +		goto out_free_io_cache;  	r = dm_uevent_init();  	if (r) -		goto out_free_rq_bio_info_cache; +		goto out_free_rq_tio_cache; + +	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); +	if (!deferred_remove_workqueue) { +		r = -ENOMEM; +		goto out_uevent_exit; +	}  	_major = major;  	r = register_blkdev(_major, _name);  	if (r < 0) -		goto out_uevent_exit; +		goto out_free_workqueue;  	if (!_major)  		_major = r;  	return 0; +out_free_workqueue: +	destroy_workqueue(deferred_remove_workqueue);  out_uevent_exit:  	dm_uevent_exit(); -out_free_rq_bio_info_cache: -	kmem_cache_destroy(_rq_bio_info_cache);  out_free_rq_tio_cache:  	kmem_cache_destroy(_rq_tio_cache); -out_free_tio_cache: -	kmem_cache_destroy(_tio_cache);  out_free_io_cache:  	kmem_cache_destroy(_io_cache); @@ -255,9 +308,10 @@ out_free_io_cache:  static void local_exit(void)  { -	kmem_cache_destroy(_rq_bio_info_cache); +	
flush_scheduled_work(); +	destroy_workqueue(deferred_remove_workqueue); +  	kmem_cache_destroy(_rq_tio_cache); -	kmem_cache_destroy(_tio_cache);  	kmem_cache_destroy(_io_cache);  	unregister_blkdev(_major, _name);  	dm_uevent_exit(); @@ -275,6 +329,7 @@ static int (*_inits[])(void) __initdata = {  	dm_io_init,  	dm_kcopyd_init,  	dm_interface_init, +	dm_statistics_init,  };  static void (*_exits[])(void) = { @@ -285,6 +340,7 @@ static void (*_exits[])(void) = {  	dm_io_exit,  	dm_kcopyd_exit,  	dm_interface_exit, +	dm_statistics_exit,  };  static int __init dm_init(void) @@ -314,6 +370,11 @@ static void __exit dm_exit(void)  	while (i--)  		_exits[i](); + +	/* +	 * Should be empty by this point. +	 */ +	idr_destroy(&_minor_idr);  }  /* @@ -328,7 +389,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)  {  	struct mapped_device *md; -	mutex_lock(&dm_mutex);  	spin_lock(&_minor_lock);  	md = bdev->bd_disk->private_data; @@ -346,21 +406,23 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)  out:  	spin_unlock(&_minor_lock); -	mutex_unlock(&dm_mutex);  	return md ? 0 : -ENXIO;  } -static int dm_blk_close(struct gendisk *disk, fmode_t mode) +static void dm_blk_close(struct gendisk *disk, fmode_t mode)  {  	struct mapped_device *md = disk->private_data; -	mutex_lock(&dm_mutex); -	atomic_dec(&md->open_count); +	spin_lock(&_minor_lock); + +	if (atomic_dec_and_test(&md->open_count) && +	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) +		queue_work(deferred_remove_workqueue, &deferred_remove_work); +  	dm_put(md); -	mutex_unlock(&dm_mutex); -	return 0; +	spin_unlock(&_minor_lock);  }  int dm_open_count(struct mapped_device *md) @@ -371,14 +433,18 @@ int dm_open_count(struct mapped_device *md)  /*   * Guarantees nothing is using the device before it's deleted.   
*/ -int dm_lock_for_deletion(struct mapped_device *md) +int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)  {  	int r = 0;  	spin_lock(&_minor_lock); -	if (dm_open_count(md)) +	if (dm_open_count(md)) {  		r = -EBUSY; +		if (mark_deferred) +			set_bit(DMF_DEFERRED_REMOVE, &md->flags); +	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) +		r = -EEXIST;  	else  		set_bit(DMF_DELETING, &md->flags); @@ -387,6 +453,42 @@ int dm_lock_for_deletion(struct mapped_device *md)  	return r;  } +int dm_cancel_deferred_remove(struct mapped_device *md) +{ +	int r = 0; + +	spin_lock(&_minor_lock); + +	if (test_bit(DMF_DELETING, &md->flags)) +		r = -EBUSY; +	else +		clear_bit(DMF_DEFERRED_REMOVE, &md->flags); + +	spin_unlock(&_minor_lock); + +	return r; +} + +static void do_deferred_remove(struct work_struct *w) +{ +	dm_deferred_remove(); +} + +sector_t dm_get_size(struct mapped_device *md) +{ +	return get_capacity(md->disk); +} + +struct request_queue *dm_get_md_queue(struct mapped_device *md) +{ +	return md->queue; +} + +struct dm_stats *dm_get_stats(struct mapped_device *md) +{ +	return &md->stats; +} +  static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)  {  	struct mapped_device *md = bdev->bd_disk->private_data; @@ -398,10 +500,14 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,  			unsigned int cmd, unsigned long arg)  {  	struct mapped_device *md = bdev->bd_disk->private_data; -	struct dm_table *map = dm_get_live_table(md); +	int srcu_idx; +	struct dm_table *map;  	struct dm_target *tgt;  	int r = -ENOTTY; +retry: +	map = dm_get_live_table(md, &srcu_idx); +  	if (!map || !dm_table_get_size(map))  		goto out; @@ -420,7 +526,12 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,  		r = tgt->type->ioctl(tgt, cmd, arg);  out: -	dm_table_put(map); +	dm_put_live_table(md, srcu_idx); + +	if (r == -ENOTCONN) { +		msleep(10); +		goto retry; +	}  	return r;  } @@ -437,28 +548,18 @@ static void free_io(struct mapped_device *md, struct dm_io *io)  static void free_tio(struct mapped_device *md, struct dm_target_io *tio)  { -	mempool_free(tio, md->tio_pool); +	bio_put(&tio->clone);  }  static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,  					    gfp_t gfp_mask)  { -	return mempool_alloc(md->tio_pool, gfp_mask); +	return mempool_alloc(md->io_pool, gfp_mask);  }  static void free_rq_tio(struct dm_rq_target_io *tio)  { -	mempool_free(tio, tio->md->tio_pool); -} - -static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) -{ -	return mempool_alloc(md->io_pool, GFP_ATOMIC); -} - -static void free_bio_info(struct dm_rq_clone_bio_info *info) -{ -	mempool_free(info, info->tio->md->io_pool); +	mempool_free(tio, tio->md->io_pool);  }  static int md_in_flight(struct mapped_device *md) @@ -470,15 +571,21 @@ static int md_in_flight(struct mapped_device *md)  static void start_io_acct(struct dm_io *io)  {  	struct mapped_device *md = io->md; +	struct bio *bio = io->bio;  	int cpu; -	int rw = bio_data_dir(io->bio); +	int rw = bio_data_dir(bio);  	io->start_time = jiffies;  	cpu = part_stat_lock();  	part_round_stats(cpu, &dm_disk(md)->part0);  	part_stat_unlock(); -	dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); +	atomic_set(&dm_disk(md)->part0.in_flight[rw], +		atomic_inc_return(&md->pending[rw])); + +	if (unlikely(dm_stats_used(&md->stats))) +		dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, +				    bio_sectors(bio), false, 
0, &io->stats_aux);  }  static void end_io_acct(struct dm_io *io) @@ -494,12 +601,16 @@ static void end_io_acct(struct dm_io *io)  	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);  	part_stat_unlock(); +	if (unlikely(dm_stats_used(&md->stats))) +		dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, +				    bio_sectors(bio), true, duration, &io->stats_aux); +  	/*  	 * After this is decremented the bio must not be touched if it is  	 * a flush.  	 */ -	dm_disk(md)->part0.in_flight[rw] = pending = -		atomic_dec_return(&md->pending[rw]); +	pending = atomic_dec_return(&md->pending[rw]); +	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);  	pending += atomic_read(&md->pending[rw^0x1]);  	/* nudge anyone waiting on suspend queue */ @@ -523,20 +634,39 @@ static void queue_io(struct mapped_device *md, struct bio *bio)  /*   * Everyone (including functions in this file), should use this   * function to access the md->map field, and make sure they call - * dm_table_put() when finished. + * dm_put_live_table() when finished.   */ -struct dm_table *dm_get_live_table(struct mapped_device *md) +struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)  { -	struct dm_table *t; -	unsigned long flags; +	*srcu_idx = srcu_read_lock(&md->io_barrier); -	read_lock_irqsave(&md->map_lock, flags); -	t = md->map; -	if (t) -		dm_table_get(t); -	read_unlock_irqrestore(&md->map_lock, flags); +	return srcu_dereference(md->map, &md->io_barrier); +} -	return t; +void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) +{ +	srcu_read_unlock(&md->io_barrier, srcu_idx); +} + +void dm_sync_table(struct mapped_device *md) +{ +	synchronize_srcu(&md->io_barrier); +	synchronize_rcu_expedited(); +} + +/* + * A fast alternative to dm_get_live_table/dm_put_live_table. + * The caller must not block between these two functions. + */ +static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) +{ +	rcu_read_lock(); +	return rcu_dereference(md->map); +} + +static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) +{ +	rcu_read_unlock();  }  /* @@ -621,7 +751,7 @@ static void dec_pending(struct dm_io *io, int error)  		if (io_error == DM_ENDIO_REQUEUE)  			return; -		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { +		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {  			/*  			 * Preflush done for flush with data, reissue  			 * without REQ_FLUSH. 
@@ -630,16 +760,24 @@ static void dec_pending(struct dm_io *io, int error)  			queue_io(md, bio);  		} else {  			/* done with normal IO or empty flush */ -			trace_block_bio_complete(md->queue, bio); +			trace_block_bio_complete(md->queue, bio, io_error);  			bio_endio(bio, io_error);  		}  	}  } +static void disable_write_same(struct mapped_device *md) +{ +	struct queue_limits *limits = dm_get_queue_limits(md); + +	/* device doesn't really support WRITE SAME, disable it */ +	limits->max_write_same_sectors = 0; +} +  static void clone_endio(struct bio *bio, int error)  {  	int r = 0; -	struct dm_target_io *tio = bio->bi_private; +	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);  	struct dm_io *io = tio->io;  	struct mapped_device *md = tio->io->md;  	dm_endio_fn endio = tio->ti->type->end_io; @@ -648,7 +786,7 @@ static void clone_endio(struct bio *bio, int error)  		error = -EIO;  	if (endio) { -		r = endio(tio->ti, bio, error, &tio->info); +		r = endio(tio->ti, bio, error);  		if (r < 0 || r == DM_ENDIO_REQUEUE)  			/*  			 * error and requeue request are handled @@ -664,13 +802,11 @@ static void clone_endio(struct bio *bio, int error)  		}  	} -	/* -	 * Store md for cleanup instead of tio which is about to get freed. -	 */ -	bio->bi_private = md->bs; +	if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) && +		     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) +		disable_write_same(md);  	free_tio(md, tio); -	bio_put(bio);  	dec_pending(io, error);  } @@ -679,10 +815,11 @@ static void clone_endio(struct bio *bio, int error)   */  static void end_clone_bio(struct bio *clone, int error)  { -	struct dm_rq_clone_bio_info *info = clone->bi_private; +	struct dm_rq_clone_bio_info *info = +		container_of(clone, struct dm_rq_clone_bio_info, clone);  	struct dm_rq_target_io *tio = info->tio;  	struct bio *bio = info->orig; -	unsigned int nr_bytes = info->orig->bi_size; +	unsigned int nr_bytes = info->orig->bi_iter.bi_size;  	bio_put(clone); @@ -737,8 +874,14 @@ static void rq_completed(struct mapped_device *md, int rw, int run_queue)  	if (!md_in_flight(md))  		wake_up(&md->wait); +	/* +	 * Run this off this callpath, as drivers could invoke end_io while +	 * inside their request_fn (and holding the queue lock). Calling +	 * back into ->request_fn() could deadlock attempting to grab the +	 * queue lock again. +	 */  	if (run_queue) -		blk_run_queue(md->queue); +		blk_run_queue_async(md->queue);  	/*  	 * dm_put() must be at the end of this function. 
See the comment above @@ -808,8 +951,6 @@ void dm_requeue_unmapped_request(struct request *clone)  	dm_unprep_request(rq);  	spin_lock_irqsave(q->queue_lock, flags); -	if (elv_queue_empty(q)) -		blk_plug_device(q);  	blk_requeue_request(q, rq);  	spin_unlock_irqrestore(q->queue_lock, flags); @@ -850,10 +991,18 @@ static void dm_done(struct request *clone, int error, bool mapped)  {  	int r = error;  	struct dm_rq_target_io *tio = clone->end_io_data; -	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; +	dm_request_endio_fn rq_end_io = NULL; + +	if (tio->ti) { +		rq_end_io = tio->ti->type->rq_end_io; + +		if (mapped && rq_end_io) +			r = rq_end_io(tio->ti, clone, error, &tio->info); +	} -	if (mapped && rq_end_io) -		r = rq_end_io(tio->ti, clone, error, &tio->info); +	if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && +		     !clone->q->limits.max_write_same_sectors)) +		disable_write_same(tio->md);  	if (r <= 0)  		/* The target wants to complete the I/O */ @@ -953,31 +1102,90 @@ static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti  static sector_t max_io_len(sector_t sector, struct dm_target *ti)  {  	sector_t len = max_io_len_target_boundary(sector, ti); +	sector_t offset, max_len;  	/* -	 * Does the target need to split even further ? +	 * Does the target need to split even further?  	 */ -	if (ti->split_io) { -		sector_t boundary; -		sector_t offset = dm_target_offset(ti, sector); -		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) -			   - offset; -		if (len > boundary) -			len = boundary; +	if (ti->max_io_len) { +		offset = dm_target_offset(ti, sector); +		if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) +			max_len = sector_div(offset, ti->max_io_len); +		else +			max_len = offset & (ti->max_io_len - 1); +		max_len = ti->max_io_len - max_len; + +		if (len > max_len) +			len = max_len;  	}  	return len;  } -static void __map_bio(struct dm_target *ti, struct bio *clone, -		      struct dm_target_io *tio) +int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) +{ +	if (len > UINT_MAX) { +		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", +		      (unsigned long long)len, UINT_MAX); +		ti->error = "Maximum size of target IO is too large"; +		return -EINVAL; +	} + +	ti->max_io_len = (uint32_t) len; + +	return 0; +} +EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); + +/* + * A target may call dm_accept_partial_bio only from the map routine.  It is + * allowed for all bio types except REQ_FLUSH. + * + * dm_accept_partial_bio informs the dm that the target only wants to process + * additional n_sectors sectors of the bio and the rest of the data should be + * sent in a next bio. + * + * A diagram that explains the arithmetics: + * +--------------------+---------------+-------+ + * |         1          |       2       |   3   | + * +--------------------+---------------+-------+ + * + * <-------------- *tio->len_ptr ---------------> + *                      <------- bi_size -------> + *                      <-- n_sectors --> + * + * Region 1 was already iterated over with bio_advance or similar function. + *	(it may be empty if the target doesn't use bio_advance) + * Region 2 is the remaining bio size that the target wants to process. + *	(it may be empty if region 1 is non-empty, although there is no reason + *	 to make it empty) + * The target requires that region 3 is to be sent in the next bio. 
+ * + * If the target wants to receive multiple copies of the bio (via num_*bios, etc), + * the partially processed part (the sum of regions 1+2) must be the same for all + * copies of the bio. + */ +void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) +{ +	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); +	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; +	BUG_ON(bio->bi_rw & REQ_FLUSH); +	BUG_ON(bi_size > *tio->len_ptr); +	BUG_ON(n_sectors > bi_size); +	*tio->len_ptr -= bi_size - n_sectors; +	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; +} +EXPORT_SYMBOL_GPL(dm_accept_partial_bio); + +static void __map_bio(struct dm_target_io *tio)  {  	int r;  	sector_t sector;  	struct mapped_device *md; +	struct bio *clone = &tio->clone; +	struct dm_target *ti = tio->ti;  	clone->bi_end_io = clone_endio; -	clone->bi_private = tio;  	/*  	 * Map the clone.  If r == 0 we don't need to do @@ -985,24 +1193,19 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,  	 * this io.  	 */  	atomic_inc(&tio->io->io_count); -	sector = clone->bi_sector; -	r = ti->type->map(ti, clone, &tio->info); +	sector = clone->bi_iter.bi_sector; +	r = ti->type->map(ti, clone);  	if (r == DM_MAPIO_REMAPPED) {  		/* the bio has been remapped so dispatch it */ -		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, -				    tio->io->bio->bi_bdev->bd_dev, sector); +		trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, +				      tio->io->bio->bi_bdev->bd_dev, sector);  		generic_make_request(clone);  	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {  		/* error the io and bail out, or requeue it if needed */  		md = tio->io->md;  		dec_pending(tio->io, r); -		/* -		 * Store bio_set for cleanup. -		 */ -		clone->bi_private = md->bs; -		bio_put(clone);  		free_tio(md, tio);  	} else if (r) {  		DMWARN("unimplemented target map return value: %d", r); @@ -1016,155 +1219,142 @@ struct clone_info {  	struct bio *bio;  	struct dm_io *io;  	sector_t sector; -	sector_t sector_count; -	unsigned short idx; +	unsigned sector_count;  }; -static void dm_bio_destructor(struct bio *bio) +static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)  { -	struct bio_set *bs = bio->bi_private; - -	bio_free(bio, bs); +	bio->bi_iter.bi_sector = sector; +	bio->bi_iter.bi_size = to_bytes(len);  }  /* - * Creates a little bio that just does part of a bvec. + * Creates a bio that consists of range of complete bvecs.   */ -static struct bio *split_bvec(struct bio *bio, sector_t sector, -			      unsigned short idx, unsigned int offset, -			      unsigned int len, struct bio_set *bs) +static void clone_bio(struct dm_target_io *tio, struct bio *bio, +		      sector_t sector, unsigned len)  { -	struct bio *clone; -	struct bio_vec *bv = bio->bi_io_vec + idx; - -	clone = bio_alloc_bioset(GFP_NOIO, 1, bs); -	clone->bi_destructor = dm_bio_destructor; -	*clone->bi_io_vec = *bv; - -	clone->bi_sector = sector; -	clone->bi_bdev = bio->bi_bdev; -	clone->bi_rw = bio->bi_rw; -	clone->bi_vcnt = 1; -	clone->bi_size = to_bytes(len); -	clone->bi_io_vec->bv_offset = offset; -	clone->bi_io_vec->bv_len = clone->bi_size; -	clone->bi_flags |= 1 << BIO_CLONED; - -	if (bio_integrity(bio)) { -		bio_integrity_clone(clone, bio, GFP_NOIO, bs); -		bio_integrity_trim(clone, -				   bio_sector_offset(bio, idx, offset), len); -	} +	struct bio *clone = &tio->clone; -	return clone; -} +	__bio_clone_fast(clone, bio); -/* - * Creates a bio that consists of range of complete bvecs. 
- */ -static struct bio *clone_bio(struct bio *bio, sector_t sector, -			     unsigned short idx, unsigned short bv_count, -			     unsigned int len, struct bio_set *bs) -{ -	struct bio *clone; +	if (bio_integrity(bio)) +		bio_integrity_clone(clone, bio, GFP_NOIO); -	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); -	__bio_clone(clone, bio); -	clone->bi_destructor = dm_bio_destructor; -	clone->bi_sector = sector; -	clone->bi_idx = idx; -	clone->bi_vcnt = idx + bv_count; -	clone->bi_size = to_bytes(len); -	clone->bi_flags &= ~(1 << BIO_SEG_VALID); - -	if (bio_integrity(bio)) { -		bio_integrity_clone(clone, bio, GFP_NOIO, bs); - -		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) -			bio_integrity_trim(clone, -					   bio_sector_offset(bio, idx, 0), len); -	} +	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); +	clone->bi_iter.bi_size = to_bytes(len); -	return clone; +	if (bio_integrity(bio)) +		bio_integrity_trim(clone, 0, len);  }  static struct dm_target_io *alloc_tio(struct clone_info *ci, -				      struct dm_target *ti) +				      struct dm_target *ti, int nr_iovecs, +				      unsigned target_bio_nr)  { -	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); +	struct dm_target_io *tio; +	struct bio *clone; + +	clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs); +	tio = container_of(clone, struct dm_target_io, clone);  	tio->io = ci->io;  	tio->ti = ti; -	memset(&tio->info, 0, sizeof(tio->info)); +	tio->target_bio_nr = target_bio_nr;  	return tio;  } -static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, -				   unsigned request_nr, sector_t len) +static void __clone_and_map_simple_bio(struct clone_info *ci, +				       struct dm_target *ti, +				       unsigned target_bio_nr, unsigned *len)  { -	struct dm_target_io *tio = alloc_tio(ci, ti); -	struct bio *clone; +	struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); +	struct bio *clone = &tio->clone; -	tio->info.target_request_nr = request_nr; +	tio->len_ptr = len;  	/*  	 * Discard requests require the bio's inline iovecs be initialized.  	 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush  	 * and discard, so no need for concern about wasted bvec allocations.  	 
*/ -	clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); -	__bio_clone(clone, ci->bio); -	clone->bi_destructor = dm_bio_destructor; -	if (len) { -		clone->bi_sector = ci->sector; -		clone->bi_size = to_bytes(len); -	} +	 __bio_clone_fast(clone, ci->bio); +	if (len) +		bio_setup_sector(clone, ci->sector, *len); -	__map_bio(ti, clone, tio); +	__map_bio(tio);  } -static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, -				    unsigned num_requests, sector_t len) +static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, +				  unsigned num_bios, unsigned *len)  { -	unsigned request_nr; +	unsigned target_bio_nr; -	for (request_nr = 0; request_nr < num_requests; request_nr++) -		__issue_target_request(ci, ti, request_nr, len); +	for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) +		__clone_and_map_simple_bio(ci, ti, target_bio_nr, len);  } -static int __clone_and_map_empty_flush(struct clone_info *ci) +static int __send_empty_flush(struct clone_info *ci)  {  	unsigned target_nr = 0;  	struct dm_target *ti;  	BUG_ON(bio_has_data(ci->bio));  	while ((ti = dm_table_get_target(ci->map, target_nr++))) -		__issue_target_requests(ci, ti, ti->num_flush_requests, 0); +		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);  	return 0;  } -/* - * Perform all io with a single clone. - */ -static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) +static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, +				     sector_t sector, unsigned *len)  { -	struct bio *clone, *bio = ci->bio; +	struct bio *bio = ci->bio;  	struct dm_target_io *tio; +	unsigned target_bio_nr; +	unsigned num_target_bios = 1; + +	/* +	 * Does the target want to receive duplicate copies of the bio? +	 */ +	if (bio_data_dir(bio) == WRITE && ti->num_write_bios) +		num_target_bios = ti->num_write_bios(ti, bio); + +	for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { +		tio = alloc_tio(ci, ti, 0, target_bio_nr); +		tio->len_ptr = len; +		clone_bio(tio, bio, sector, *len); +		__map_bio(tio); +	} +} + +typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); + +static unsigned get_num_discard_bios(struct dm_target *ti) +{ +	return ti->num_discard_bios; +} + +static unsigned get_num_write_same_bios(struct dm_target *ti) +{ +	return ti->num_write_same_bios; +} + +typedef bool (*is_split_required_fn)(struct dm_target *ti); -	tio = alloc_tio(ci, ti); -	clone = clone_bio(bio, ci->sector, ci->idx, -			  bio->bi_vcnt - ci->idx, ci->sector_count, -			  ci->md->bs); -	__map_bio(ti, clone, tio); -	ci->sector_count = 0; +static bool is_split_required_for_discard(struct dm_target *ti) +{ +	return ti->split_discard_bios;  } -static int __clone_and_map_discard(struct clone_info *ci) +static int __send_changing_extent_only(struct clone_info *ci, +				       get_num_bios_fn get_num_bios, +				       is_split_required_fn is_split_required)  {  	struct dm_target *ti; -	sector_t len; +	unsigned len; +	unsigned num_bios;  	do {  		ti = dm_table_find_target(ci->map, ci->sector); @@ -1172,16 +1362,21 @@ static int __clone_and_map_discard(struct clone_info *ci)  			return -EIO;  		/* -		 * Even though the device advertised discard support, -		 * reconfiguration might have changed that since the +		 * Even though the device advertised support for this type of +		 * request, that does not mean every target supports it, and +		 * reconfiguration might also have changed that since the  		 * check was performed.  		 
*/ -		if (!ti->num_discard_requests) +		num_bios = get_num_bios ? get_num_bios(ti) : 0; +		if (!num_bios)  			return -EOPNOTSUPP; -		len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); +		if (is_split_required && !is_split_required(ti)) +			len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); +		else +			len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); -		__issue_target_requests(ci, ti, ti->num_discard_requests, len); +		__send_duplicate_bios(ci, ti, num_bios, &len);  		ci->sector += len;  	} while (ci->sector_count -= len); @@ -1189,108 +1384,60 @@ static int __clone_and_map_discard(struct clone_info *ci)  	return 0;  } -static int __clone_and_map(struct clone_info *ci) +static int __send_discard(struct clone_info *ci) +{ +	return __send_changing_extent_only(ci, get_num_discard_bios, +					   is_split_required_for_discard); +} + +static int __send_write_same(struct clone_info *ci)  { -	struct bio *clone, *bio = ci->bio; +	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); +} + +/* + * Select the correct strategy for processing a non-flush bio. + */ +static int __split_and_process_non_flush(struct clone_info *ci) +{ +	struct bio *bio = ci->bio;  	struct dm_target *ti; -	sector_t len = 0, max; -	struct dm_target_io *tio; +	unsigned len;  	if (unlikely(bio->bi_rw & REQ_DISCARD)) -		return __clone_and_map_discard(ci); +		return __send_discard(ci); +	else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) +		return __send_write_same(ci);  	ti = dm_table_find_target(ci->map, ci->sector);  	if (!dm_target_is_valid(ti))  		return -EIO; -	max = max_io_len(ci->sector, ti); - -	if (ci->sector_count <= max) { -		/* -		 * Optimise for the simple case where we can do all of -		 * the remaining io with a single clone. -		 */ -		__clone_and_map_simple(ci, ti); - -	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { -		/* -		 * There are some bvecs that don't span targets. -		 * Do as many of these as possible. -		 */ -		int i; -		sector_t remaining = max; -		sector_t bv_len; - -		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { -			bv_len = to_sector(bio->bi_io_vec[i].bv_len); +	len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); -			if (bv_len > remaining) -				break; +	__clone_and_map_data_bio(ci, ti, ci->sector, &len); -			remaining -= bv_len; -			len += bv_len; -		} - -		tio = alloc_tio(ci, ti); -		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, -				  ci->md->bs); -		__map_bio(ti, clone, tio); - -		ci->sector += len; -		ci->sector_count -= len; -		ci->idx = i; - -	} else { -		/* -		 * Handle a bvec that must be split between two or more targets. -		 */ -		struct bio_vec *bv = bio->bi_io_vec + ci->idx; -		sector_t remaining = to_sector(bv->bv_len); -		unsigned int offset = 0; - -		do { -			if (offset) { -				ti = dm_table_find_target(ci->map, ci->sector); -				if (!dm_target_is_valid(ti)) -					return -EIO; - -				max = max_io_len(ci->sector, ti); -			} - -			len = min(remaining, max); - -			tio = alloc_tio(ci, ti); -			clone = split_bvec(bio, ci->sector, ci->idx, -					   bv->bv_offset + offset, len, -					   ci->md->bs); - -			__map_bio(ti, clone, tio); - -			ci->sector += len; -			ci->sector_count -= len; -			offset += to_bytes(len); -		} while (remaining -= len); - -		ci->idx++; -	} +	ci->sector += len; +	ci->sector_count -= len;  	return 0;  }  /* - * Split the bio into several clones and submit it to targets. 
+ * Entry point to split a bio into clones and submit them to the targets.   */ -static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) +static void __split_and_process_bio(struct mapped_device *md, +				    struct dm_table *map, struct bio *bio)  {  	struct clone_info ci;  	int error = 0; -	ci.map = dm_get_live_table(md); -	if (unlikely(!ci.map)) { +	if (unlikely(!map)) {  		bio_io_error(bio);  		return;  	} +	ci.map = map;  	ci.md = md;  	ci.io = alloc_io(md);  	ci.io->error = 0; @@ -1298,25 +1445,24 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)  	ci.io->bio = bio;  	ci.io->md = md;  	spin_lock_init(&ci.io->endio_lock); -	ci.sector = bio->bi_sector; -	ci.idx = bio->bi_idx; +	ci.sector = bio->bi_iter.bi_sector;  	start_io_acct(ci.io); +  	if (bio->bi_rw & REQ_FLUSH) {  		ci.bio = &ci.md->flush_bio;  		ci.sector_count = 0; -		error = __clone_and_map_empty_flush(&ci); +		error = __send_empty_flush(&ci);  		/* dec_pending submits any data associated with flush */  	} else {  		ci.bio = bio;  		ci.sector_count = bio_sectors(bio);  		while (ci.sector_count && !error) -			error = __clone_and_map(&ci); +			error = __split_and_process_non_flush(&ci);  	}  	/* drop the extra reference count */  	dec_pending(ci.io, error); -	dm_table_put(ci.map);  }  /*-----------------------------------------------------------------   * CRUD END @@ -1327,7 +1473,7 @@ static int dm_merge_bvec(struct request_queue *q,  			 struct bio_vec *biovec)  {  	struct mapped_device *md = q->queuedata; -	struct dm_table *map = dm_get_live_table(md); +	struct dm_table *map = dm_get_live_table_fast(md);  	struct dm_target *ti;  	sector_t max_sectors;  	int max_size = 0; @@ -1337,7 +1483,7 @@ static int dm_merge_bvec(struct request_queue *q,  	ti = dm_table_find_target(map, bvm->bi_sector);  	if (!dm_target_is_valid(ti)) -		goto out_table; +		goto out;  	/*  	 * Find maximum amount of I/O that won't need splitting @@ -1363,13 +1509,10 @@ static int dm_merge_bvec(struct request_queue *q,  	 * just one page.  	 */  	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) -  		max_size = 0; -out_table: -	dm_table_put(map); -  out: +	dm_put_live_table_fast(md);  	/*  	 * Always allow an entire first page  	 */ @@ -1383,13 +1526,15 @@ out:   * The request function that just remaps the bio built up by   * dm_merge_bvec.   
*/ -static int _dm_request(struct request_queue *q, struct bio *bio) +static void _dm_request(struct request_queue *q, struct bio *bio)  {  	int rw = bio_data_dir(bio);  	struct mapped_device *md = q->queuedata;  	int cpu; +	int srcu_idx; +	struct dm_table *map; -	down_read(&md->io_lock); +	map = dm_get_live_table(md, &srcu_idx);  	cpu = part_stat_lock();  	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); @@ -1398,40 +1543,33 @@ static int _dm_request(struct request_queue *q, struct bio *bio)  	/* if we're suspended, we have to queue this io for later */  	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { -		up_read(&md->io_lock); +		dm_put_live_table(md, srcu_idx);  		if (bio_rw(bio) != READA)  			queue_io(md, bio);  		else  			bio_io_error(bio); -		return 0; +		return;  	} -	__split_and_process_bio(md, bio); -	up_read(&md->io_lock); -	return 0; -} - -static int dm_make_request(struct request_queue *q, struct bio *bio) -{ -	struct mapped_device *md = q->queuedata; - -	return md->saved_make_request_fn(q, bio); /* call __make_request() */ +	__split_and_process_bio(md, map, bio); +	dm_put_live_table(md, srcu_idx); +	return;  } -static int dm_request_based(struct mapped_device *md) +int dm_request_based(struct mapped_device *md)  {  	return blk_queue_stackable(md->queue);  } -static int dm_request(struct request_queue *q, struct bio *bio) +static void dm_request(struct request_queue *q, struct bio *bio)  {  	struct mapped_device *md = q->queuedata;  	if (dm_request_based(md)) -		return dm_make_request(q, bio); - -	return _dm_request(q, bio); +		blk_queue_bio(q, bio); +	else +		_dm_request(q, bio);  }  void dm_dispatch_request(struct request *rq) @@ -1448,30 +1586,16 @@ void dm_dispatch_request(struct request *rq)  }  EXPORT_SYMBOL_GPL(dm_dispatch_request); -static void dm_rq_bio_destructor(struct bio *bio) -{ -	struct dm_rq_clone_bio_info *info = bio->bi_private; -	struct mapped_device *md = info->tio->md; - -	free_bio_info(info); -	bio_free(bio, md->bs); -} -  static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,  				 void *data)  {  	struct dm_rq_target_io *tio = data; -	struct mapped_device *md = tio->md; -	struct dm_rq_clone_bio_info *info = alloc_bio_info(md); - -	if (!info) -		return -ENOMEM; +	struct dm_rq_clone_bio_info *info = +		container_of(bio, struct dm_rq_clone_bio_info, clone);  	info->orig = bio_orig;  	info->tio = tio;  	bio->bi_end_io = end_clone_bio; -	bio->bi_private = info; -	bio->bi_destructor = dm_rq_bio_destructor;  	return 0;  } @@ -1489,7 +1613,6 @@ static int setup_clone(struct request *clone, struct request *rq,  	clone->cmd = rq->cmd;  	clone->cmd_len = rq->cmd_len;  	clone->sense = rq->sense; -	clone->buffer = rq->buffer;  	clone->end_io = end_clone_request;  	clone->end_io_data = tio; @@ -1556,15 +1679,6 @@ static int map_request(struct dm_target *ti, struct request *clone,  	int r, requeued = 0;  	struct dm_rq_target_io *tio = clone->end_io_data; -	/* -	 * Hold the md reference here for the in-flight I/O. -	 * We can't rely on the reference count by device opener, -	 * because the device may be closed during the request completion -	 * when all bios are completed. -	 * See the comment in rq_completed() too. 
-	 */ -	dm_get(md); -  	tio->ti = ti;  	r = ti->type->map_rq(ti, clone, &tio->info);  	switch (r) { @@ -1596,6 +1710,26 @@ static int map_request(struct dm_target *ti, struct request *clone,  	return requeued;  } +static struct request *dm_start_request(struct mapped_device *md, struct request *orig) +{ +	struct request *clone; + +	blk_start_request(orig); +	clone = orig->special; +	atomic_inc(&md->pending[rq_data_dir(clone)]); + +	/* +	 * Hold the md reference here for the in-flight I/O. +	 * We can't rely on the reference count by device opener, +	 * because the device may be closed during the request completion +	 * when all bios are completed. +	 * See the comment in rq_completed() too. +	 */ +	dm_get(md); + +	return clone; +} +  /*   * q->request_fn for request-based dm.   * Called with the queue lock held. @@ -1603,7 +1737,8 @@ static int map_request(struct dm_target *ti, struct request *clone,  static void dm_request_fn(struct request_queue *q)  {  	struct mapped_device *md = q->queuedata; -	struct dm_table *map = dm_get_live_table(md); +	int srcu_idx; +	struct dm_table *map = dm_get_live_table(md, &srcu_idx);  	struct dm_target *ti;  	struct request *rq, *clone;  	sector_t pos; @@ -1614,10 +1749,10 @@ static void dm_request_fn(struct request_queue *q)  	 * number of in-flight I/Os after the queue is stopped in  	 * dm_suspend().  	 */ -	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { +	while (!blk_queue_stopped(q)) {  		rq = blk_peek_request(q);  		if (!rq) -			goto plug_and_out; +			goto delay_and_out;  		/* always use block 0 to find the target for flushes for now */  		pos = 0; @@ -1625,36 +1760,40 @@ static void dm_request_fn(struct request_queue *q)  			pos = blk_rq_pos(rq);  		ti = dm_table_find_target(map, pos); -		BUG_ON(!dm_target_is_valid(ti)); +		if (!dm_target_is_valid(ti)) { +			/* +			 * Must perform setup, that dm_done() requires, +			 * before calling dm_kill_unmapped_request +			 */ +			DMERR_LIMIT("request attempted access beyond the end of device"); +			clone = dm_start_request(md, rq); +			dm_kill_unmapped_request(clone, -EIO); +			continue; +		}  		if (ti->type->busy && ti->type->busy(ti)) -			goto plug_and_out; +			goto delay_and_out; -		blk_start_request(rq); -		clone = rq->special; -		atomic_inc(&md->pending[rq_data_dir(clone)]); +		clone = dm_start_request(md, rq);  		spin_unlock(q->queue_lock);  		if (map_request(ti, clone, md))  			goto requeued; -		spin_lock_irq(q->queue_lock); +		BUG_ON(!irqs_disabled()); +		spin_lock(q->queue_lock);  	}  	goto out;  requeued: -	spin_lock_irq(q->queue_lock); - -plug_and_out: -	if (!elv_queue_empty(q)) -		/* Some requests still remain, retry later */ -		blk_plug_device(q); +	BUG_ON(!irqs_disabled()); +	spin_lock(q->queue_lock); +delay_and_out: +	blk_delay_queue(q, HZ / 10);  out: -	dm_table_put(map); - -	return; +	dm_put_live_table(md, srcu_idx);  }  int dm_underlying_device_busy(struct request_queue *q) @@ -1667,32 +1806,18 @@ static int dm_lld_busy(struct request_queue *q)  {  	int r;  	struct mapped_device *md = q->queuedata; -	struct dm_table *map = dm_get_live_table(md); +	struct dm_table *map = dm_get_live_table_fast(md);  	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))  		r = 1;  	else  		r = dm_table_any_busy_target(map); -	dm_table_put(map); +	dm_put_live_table_fast(md);  	return r;  } -static void dm_unplug_all(struct request_queue *q) -{ -	struct mapped_device *md = q->queuedata; -	struct dm_table *map = dm_get_live_table(md); - -	if (map) { -		if (dm_request_based(md)) -			
generic_unplug_device(q); - -		dm_table_unplug_all(map); -		dm_table_put(map); -	} -} -  static int dm_any_congested(void *congested_data, int bdi_bits)  {  	int r = bdi_bits; @@ -1700,7 +1825,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)  	struct dm_table *map;  	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { -		map = dm_get_live_table(md); +		map = dm_get_live_table_fast(md);  		if (map) {  			/*  			 * Request-based dm cares about only own queue for @@ -1711,9 +1836,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits)  				    bdi_bits;  			else  				r = dm_table_any_congested(map, bdi_bits); - -			dm_table_put(map);  		} +		dm_put_live_table_fast(md);  	}  	return r; @@ -1722,8 +1846,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)  /*-----------------------------------------------------------------   * An IDR is used to keep track of allocated minor numbers.   *---------------------------------------------------------------*/ -static DEFINE_IDR(_minor_idr); -  static void free_minor(int minor)  {  	spin_lock(&_minor_lock); @@ -1736,62 +1858,38 @@ static void free_minor(int minor)   */  static int specific_minor(int minor)  { -	int r, m; +	int r;  	if (minor >= (1 << MINORBITS))  		return -EINVAL; -	r = idr_pre_get(&_minor_idr, GFP_KERNEL); -	if (!r) -		return -ENOMEM; - +	idr_preload(GFP_KERNEL);  	spin_lock(&_minor_lock); -	if (idr_find(&_minor_idr, minor)) { -		r = -EBUSY; -		goto out; -	} - -	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); -	if (r) -		goto out; - -	if (m != minor) { -		idr_remove(&_minor_idr, m); -		r = -EBUSY; -		goto out; -	} +	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); -out:  	spin_unlock(&_minor_lock); -	return r; +	idr_preload_end(); +	if (r < 0) +		return r == -ENOSPC ? 
-EBUSY : r; +	return 0;  }  static int next_free_minor(int *minor)  { -	int r, m; - -	r = idr_pre_get(&_minor_idr, GFP_KERNEL); -	if (!r) -		return -ENOMEM; +	int r; +	idr_preload(GFP_KERNEL);  	spin_lock(&_minor_lock); -	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); -	if (r) -		goto out; - -	if (m >= (1 << MINORBITS)) { -		idr_remove(&_minor_idr, m); -		r = -ENOSPC; -		goto out; -	} +	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); -	*minor = m; - -out:  	spin_unlock(&_minor_lock); -	return r; +	idr_preload_end(); +	if (r < 0) +		return r; +	*minor = r; +	return 0;  }  static const struct block_device_operations dm_blk_dops; @@ -1816,9 +1914,7 @@ static void dm_init_md_queue(struct mapped_device *md)  	md->queue->backing_dev_info.congested_data = md;  	blk_queue_make_request(md->queue, dm_request);  	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); -	md->queue->unplug_fn = dm_unplug_all;  	blk_queue_merge_bvec(md->queue, dm_merge_bvec); -	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);  }  /* @@ -1846,12 +1942,14 @@ static struct mapped_device *alloc_dev(int minor)  	if (r < 0)  		goto bad_minor; +	r = init_srcu_struct(&md->io_barrier); +	if (r < 0) +		goto bad_io_barrier; +  	md->type = DM_TYPE_NONE; -	init_rwsem(&md->io_lock);  	mutex_init(&md->suspend_lock);  	mutex_init(&md->type_lock);  	spin_lock_init(&md->deferred_lock); -	rwlock_init(&md->map_lock);  	atomic_set(&md->holders, 1);  	atomic_set(&md->open_count, 0);  	atomic_set(&md->event_nr, 0); @@ -1874,6 +1972,7 @@ static struct mapped_device *alloc_dev(int minor)  	init_waitqueue_head(&md->wait);  	INIT_WORK(&md->work, dm_wq_work);  	init_waitqueue_head(&md->eventq); +	init_completion(&md->kobj_holder.completion);  	md->disk->major = _major;  	md->disk->first_minor = minor; @@ -1884,7 +1983,7 @@ static struct mapped_device *alloc_dev(int minor)  	add_disk(md->disk);  	format_dev_t(md->name, MKDEV(_major, minor)); -	md->wq = create_singlethread_workqueue("kdmflush"); +	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);  	if (!md->wq)  		goto bad_thread; @@ -1896,6 +1995,8 @@ static struct mapped_device *alloc_dev(int minor)  	md->flush_bio.bi_bdev = md->bdev;  	md->flush_bio.bi_rw = WRITE_FLUSH; +	dm_stats_init(&md->stats); +  	/* Populate the mapping, nobody knows we exist yet */  	spin_lock(&_minor_lock);  	old_md = idr_replace(&_minor_idr, md, minor); @@ -1913,6 +2014,8 @@ bad_thread:  bad_disk:  	blk_cleanup_queue(md->queue);  bad_queue: +	cleanup_srcu_struct(&md->io_barrier); +bad_io_barrier:  	free_minor(minor);  bad_minor:  	module_put(THIS_MODULE); @@ -1930,14 +2033,13 @@ static void free_dev(struct mapped_device *md)  	unlock_fs(md);  	bdput(md->bdev);  	destroy_workqueue(md->wq); -	if (md->tio_pool) -		mempool_destroy(md->tio_pool);  	if (md->io_pool)  		mempool_destroy(md->io_pool);  	if (md->bs)  		bioset_free(md->bs);  	blk_integrity_unregister(md->disk);  	del_gendisk(md->disk); +	cleanup_srcu_struct(&md->io_barrier);  	free_minor(minor);  	spin_lock(&_minor_lock); @@ -1946,25 +2048,42 @@ static void free_dev(struct mapped_device *md)  	put_disk(md->disk);  	blk_cleanup_queue(md->queue); +	dm_stats_cleanup(&md->stats);  	module_put(THIS_MODULE);  	kfree(md);  }  static void __bind_mempools(struct mapped_device *md, struct dm_table *t)  { -	struct dm_md_mempools *p; +	struct dm_md_mempools *p = dm_table_get_md_mempools(t); -	if (md->io_pool && md->tio_pool && md->bs) -		/* the md already has necessary mempools */ +	if (md->io_pool && md->bs) { +		/* The md already has necessary 
mempools. */ +		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { +			/* +			 * Reload bioset because front_pad may have changed +			 * because a different table was loaded. +			 */ +			bioset_free(md->bs); +			md->bs = p->bs; +			p->bs = NULL; +		} else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) { +			/* +			 * There's no need to reload with request-based dm +			 * because the size of front_pad doesn't change. +			 * Note for future: If you are to reload bioset, +			 * prep-ed requests in the queue may refer +			 * to bio from the old bioset, so you must walk +			 * through the queue to unprep. +			 */ +		}  		goto out; +	} -	p = dm_table_get_md_mempools(t); -	BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); +	BUG_ON(!p || md->io_pool || md->bs);  	md->io_pool = p->io_pool;  	p->io_pool = NULL; -	md->tio_pool = p->tio_pool; -	p->tio_pool = NULL;  	md->bs = p->bs;  	p->bs = NULL; @@ -1992,13 +2111,67 @@ static void event_callback(void *context)  	wake_up(&md->eventq);  } +/* + * Protected by md->suspend_lock obtained by dm_swap_table(). + */  static void __set_size(struct mapped_device *md, sector_t size)  {  	set_capacity(md->disk, size); -	mutex_lock(&md->bdev->bd_inode->i_mutex);  	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); -	mutex_unlock(&md->bdev->bd_inode->i_mutex); +} + +/* + * Return 1 if the queue has a compulsory merge_bvec_fn function. + * + * If this function returns 0, then the device is either a non-dm + * device without a merge_bvec_fn, or it is a dm device that is + * able to split any bios it receives that are too big. + */ +int dm_queue_merge_is_compulsory(struct request_queue *q) +{ +	struct mapped_device *dev_md; + +	if (!q->merge_bvec_fn) +		return 0; + +	if (q->make_request_fn == dm_request) { +		dev_md = q->queuedata; +		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) +			return 0; +	} + +	return 1; +} + +static int dm_device_merge_is_compulsory(struct dm_target *ti, +					 struct dm_dev *dev, sector_t start, +					 sector_t len, void *data) +{ +	struct block_device *bdev = dev->bdev; +	struct request_queue *q = bdev_get_queue(bdev); + +	return dm_queue_merge_is_compulsory(q); +} + +/* + * Return 1 if it is acceptable to ignore merge_bvec_fn based + * on the properties of the underlying devices. + */ +static int dm_table_merge_is_optional(struct dm_table *table) +{ +	unsigned i = 0; +	struct dm_target *ti; + +	while (i < dm_table_get_num_targets(table)) { +		ti = dm_table_get_target(table, i++); + +		if (ti->type->iterate_devices && +		    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) +			return 0; +	} + +	return 1;  }  /* @@ -2010,14 +2183,14 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,  	struct dm_table *old_map;  	struct request_queue *q = md->queue;  	sector_t size; -	unsigned long flags; +	int merge_is_optional;  	size = dm_table_get_size(t);  	/*  	 * Wipe any geometry if the size of the table changed.  	 
*/ -	if (size != get_capacity(md->disk)) +	if (size != dm_get_size(md))  		memset(&md->geometry, 0, sizeof(md->geometry));  	__set_size(md, size); @@ -2036,11 +2209,18 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,  	__bind_mempools(md, t); -	write_lock_irqsave(&md->map_lock, flags); +	merge_is_optional = dm_table_merge_is_optional(t); +  	old_map = md->map; -	md->map = t; +	rcu_assign_pointer(md->map, t); +	md->immutable_target_type = dm_table_get_immutable_target_type(t); +  	dm_table_set_restrictions(t, q, limits); -	write_unlock_irqrestore(&md->map_lock, flags); +	if (merge_is_optional) +		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); +	else +		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); +	dm_sync_table(md);  	return old_map;  } @@ -2051,15 +2231,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,  static struct dm_table *__unbind(struct mapped_device *md)  {  	struct dm_table *map = md->map; -	unsigned long flags;  	if (!map)  		return NULL;  	dm_table_event_callback(map, NULL, NULL); -	write_lock_irqsave(&md->map_lock, flags); -	md->map = NULL; -	write_unlock_irqrestore(&md->map_lock, flags); +	RCU_INIT_POINTER(md->map, NULL); +	dm_sync_table(md);  	return map;  } @@ -2097,14 +2275,32 @@ void dm_unlock_md_type(struct mapped_device *md)  void dm_set_md_type(struct mapped_device *md, unsigned type)  { +	BUG_ON(!mutex_is_locked(&md->type_lock));  	md->type = type;  }  unsigned dm_get_md_type(struct mapped_device *md)  { +	BUG_ON(!mutex_is_locked(&md->type_lock));  	return md->type;  } +struct target_type *dm_get_immutable_target_type(struct mapped_device *md) +{ +	return md->immutable_target_type; +} + +/* + * The queue_limits are only valid as long as you have a reference + * count on 'md'. + */ +struct queue_limits *dm_get_queue_limits(struct mapped_device *md) +{ +	BUG_ON(!atomic_read(&md->holders)); +	return &md->queue->limits; +} +EXPORT_SYMBOL_GPL(dm_get_queue_limits); +  /*   * Fully initialize a request-based queue (->elevator, ->request_fn, etc).   */ @@ -2121,7 +2317,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)  		return 0;  	md->queue = q; -	md->saved_make_request_fn = md->queue->make_request_fn;  	dm_init_md_queue(md);  	blk_queue_softirq_done(md->queue, dm_softirq_done);  	blk_queue_prep_rq(md->queue, dm_prep_fn); @@ -2180,6 +2375,7 @@ struct mapped_device *dm_get_md(dev_t dev)  	return md;  } +EXPORT_SYMBOL_GPL(dm_get_md);  void *dm_get_mdptr(struct mapped_device *md)  { @@ -2206,11 +2402,12 @@ EXPORT_SYMBOL_GPL(dm_device_name);  static void __dm_destroy(struct mapped_device *md, bool wait)  {  	struct dm_table *map; +	int srcu_idx;  	might_sleep();  	spin_lock(&_minor_lock); -	map = dm_get_live_table(md); +	map = dm_get_live_table(md, &srcu_idx);  	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));  	set_bit(DMF_FREEING, &md->flags);  	spin_unlock(&_minor_lock); @@ -2220,6 +2417,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)  		dm_table_postsuspend_targets(map);  	} +	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */ +	dm_put_live_table(md, srcu_idx); +  	/*  	 * Rare, but there may be I/O requests still going to complete,  	 * for example.  Wait for all references to disappear. 
@@ -2234,7 +2434,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait)  		       dm_device_name(md), atomic_read(&md->holders));  	dm_sysfs_exit(md); -	dm_table_put(map);  	dm_table_destroy(__unbind(md));  	free_dev(md);  } @@ -2260,14 +2459,11 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)  	int r = 0;  	DECLARE_WAITQUEUE(wait, current); -	dm_unplug_all(md->queue); -  	add_wait_queue(&md->wait, &wait);  	while (1) {  		set_current_state(interruptible); -		smp_mb();  		if (!md_in_flight(md))  			break; @@ -2294,8 +2490,10 @@ static void dm_wq_work(struct work_struct *work)  	struct mapped_device *md = container_of(work, struct mapped_device,  						work);  	struct bio *c; +	int srcu_idx; +	struct dm_table *map; -	down_read(&md->io_lock); +	map = dm_get_live_table(md, &srcu_idx);  	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {  		spin_lock_irq(&md->deferred_lock); @@ -2305,23 +2503,19 @@ static void dm_wq_work(struct work_struct *work)  		if (!c)  			break; -		up_read(&md->io_lock); -  		if (dm_request_based(md))  			generic_make_request(c);  		else -			__split_and_process_bio(md, c); - -		down_read(&md->io_lock); +			__split_and_process_bio(md, map, c);  	} -	up_read(&md->io_lock); +	dm_put_live_table(md, srcu_idx);  }  static void dm_queue_flush(struct mapped_device *md)  {  	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic();  	queue_work(md->wq, &md->work);  } @@ -2330,7 +2524,7 @@ static void dm_queue_flush(struct mapped_device *md)   */  struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)  { -	struct dm_table *map = ERR_PTR(-EINVAL); +	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);  	struct queue_limits limits;  	int r; @@ -2340,10 +2534,25 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)  	if (!dm_suspended_md(md))  		goto out; -	r = dm_calculate_queue_limits(table, &limits); -	if (r) { -		map = ERR_PTR(r); -		goto out; +	/* +	 * If the new table has no data devices, retain the existing limits. +	 * This helps multipath with queue_if_no_path if all paths disappear, +	 * then new I/O is queued based on these limits, and then some paths +	 * reappear. +	 */ +	if (dm_table_has_no_data_devices(table)) { +		live_map = dm_get_live_table_fast(md); +		if (live_map) +			limits = md->queue->limits; +		dm_put_live_table_fast(md); +	} + +	if (!live_map) { +		r = dm_calculate_queue_limits(table, &limits); +		if (r) { +			map = ERR_PTR(r); +			goto out; +		}  	}  	map = __bind(md, table, &limits); @@ -2415,7 +2624,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)  		goto out_unlock;  	} -	map = dm_get_live_table(md); +	map = md->map;  	/*  	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -2436,7 +2645,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)  	if (!noflush && do_lockfs) {  		r = lock_fs(md);  		if (r) -			goto out; +			goto out_unlock;  	}  	/* @@ -2451,9 +2660,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)  	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call  	 * flush_workqueue(md->wq).  	 
@@ -2415,7 +2624,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		goto out_unlock;
 	}
 
-	map = dm_get_live_table(md);
+	map = md->map;
 
 	/*
 	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2436,7 +2645,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	if (!noflush && do_lockfs) {
 		r = lock_fs(md);
 		if (r)
-			goto out;
+			goto out_unlock;
 	}
 
 	/*
@@ -2451,9 +2660,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
 	 * flush_workqueue(md->wq).
 	 */
-	down_write(&md->io_lock);
 	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	up_write(&md->io_lock);
+	synchronize_srcu(&md->io_barrier);
 
 	/*
 	 * Stop md->queue before flushing md->wq in case request-based
@@ -2471,10 +2679,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 */
 	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
 
-	down_write(&md->io_lock);
 	if (noflush)
 		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-	up_write(&md->io_lock);
+	synchronize_srcu(&md->io_barrier);
 
 	/* were we interrupted ? */
 	if (r < 0) {
@@ -2484,7 +2691,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 			start_queue(md->queue);
 
 		unlock_fs(md);
-		goto out; /* pushback list is already flushed, so skip flush */
+		goto out_unlock; /* pushback list is already flushed, so skip flush */
 	}
 
 	/*
@@ -2497,9 +2704,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	dm_table_postsuspend_targets(map);
 
-out:
-	dm_table_put(map);
-
 out_unlock:
 	mutex_unlock(&md->suspend_lock);
 	return r;
@@ -2514,7 +2718,7 @@ int dm_resume(struct mapped_device *md)
 	if (!dm_suspended_md(md))
 		goto out;
 
-	map = dm_get_live_table(md);
+	map = md->map;
 	if (!map || !dm_table_get_size(map))
 		goto out;
 
@@ -2536,15 +2740,45 @@ int dm_resume(struct mapped_device *md)
 
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
-	dm_table_unplug_all(map);
 	r = 0;
 out:
-	dm_table_put(map);
 	mutex_unlock(&md->suspend_lock);
 
 	return r;
 }
 
+/*
+ * Internal suspend/resume works like userspace-driven suspend. It waits
+ * until all bios finish and prevents issuing new bios to the target drivers.
+ * It may be used only from the kernel.
+ *
+ * Internal suspend holds md->suspend_lock, which prevents interaction with
+ * userspace-driven suspend.
+ */
+
+void dm_internal_suspend(struct mapped_device *md)
+{
+	mutex_lock(&md->suspend_lock);
+	if (dm_suspended_md(md))
+		return;
+
+	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+	synchronize_srcu(&md->io_barrier);
+	flush_workqueue(md->wq);
+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+}
+
+void dm_internal_resume(struct mapped_device *md)
+{
+	if (dm_suspended_md(md))
+		goto done;
+
+	dm_queue_flush(md);
+
+done:
+	mutex_unlock(&md->suspend_lock);
+}
+
 /*-----------------------------------------------------------------
  * Event notification.
 *---------------------------------------------------------------*/
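dm_internal_suspend() and dm_internal_resume() are strictly paired: the first takes md->suspend_lock and, unless the device is already suspended, blocks new bios and waits for in-flight I/O; the second requeues deferred bios and releases the lock. Holding suspend_lock across the pair is what keeps userspace-driven suspend from interleaving. A hypothetical in-kernel caller (the function name is illustrative, and it assumes the declarations dm.c's private header exposes for these helpers) would bracket its critical section like this:

/* hypothetical caller, not part of dm.c */
static void example_update_while_quiesced(struct mapped_device *md)
{
	dm_internal_suspend(md);	/* quiesce: no new bios, in-flight I/O drained */

	/* ... touch state that must not race with I/O ... */

	dm_internal_resume(md);		/* requeue deferred bios, drop suspend_lock */
}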
@@ -2600,20 +2834,14 @@ struct gendisk *dm_disk(struct mapped_device *md)
 
 struct kobject *dm_kobject(struct mapped_device *md)
 {
-	return &md->kobj;
+	return &md->kobj_holder.kobj;
 }
 
-/*
- * struct mapped_device should not be exported outside of dm.c
- * so use this check to verify that kobj is part of md structure
- */
 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
 {
 	struct mapped_device *md;
 
-	md = container_of(kobj, struct mapped_device, kobj);
-	if (&md->kobj != kobj)
-		return NULL;
+	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
 
 	if (test_bit(DMF_FREEING, &md->flags) ||
 	    dm_deleting_md(md))
@@ -2628,6 +2856,11 @@ int dm_suspended_md(struct mapped_device *md)
 	return test_bit(DMF_SUSPENDED, &md->flags);
 }
 
+int dm_test_deferred_remove_flag(struct mapped_device *md)
+{
+	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
+}
+
 int dm_suspended(struct dm_target *ti)
 {
 	return dm_suspended_md(dm_table_get_md(ti->table));
@@ -2640,40 +2873,44 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
 {
-	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
+	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	struct kmem_cache *cachep;
+	unsigned int pool_size;
+	unsigned int front_pad;
 
 	if (!pools)
 		return NULL;
 
-	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
-			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
-			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
-	if (!pools->io_pool)
-		goto free_pools_and_out;
+	if (type == DM_TYPE_BIO_BASED) {
+		cachep = _io_cache;
+		pool_size = dm_get_reserved_bio_based_ios();
+		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+	} else if (type == DM_TYPE_REQUEST_BASED) {
+		cachep = _rq_tio_cache;
+		pool_size = dm_get_reserved_rq_based_ios();
+		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
+		/* per_bio_data_size is not used. See __bind_mempools(). */
+		WARN_ON(per_bio_data_size != 0);
+	} else
+		goto out;
 
-	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
-			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
-			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
-	if (!pools->tio_pool)
-		goto free_io_pool_and_out;
+	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+	if (!pools->io_pool)
+		goto out;
 
-	pools->bs = (type == DM_TYPE_BIO_BASED) ?
-		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
+	pools->bs = bioset_create(pool_size, front_pad);
 	if (!pools->bs)
-		goto free_tio_pool_and_out;
-
-	return pools;
+		goto out;
 
-free_tio_pool_and_out:
-	mempool_destroy(pools->tio_pool);
+	if (integrity && bioset_integrity_create(pools->bs, pool_size))
+		goto out;
 
-free_io_pool_and_out:
-	mempool_destroy(pools->io_pool);
+	return pools;
 
-free_pools_and_out:
-	kfree(pools);
+out:
+	dm_free_md_mempools(pools);
 
 	return NULL;
 }
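The front_pad arithmetic above is what lets each clone bio sit at the tail of a larger per-I/O structure: bioset_create() reserves front_pad bytes ahead of every bio it hands out, so bio_alloc_bioset() followed by container_of() recovers the wrapper from a single allocation. A stripped-down sketch of the idea with an illustrative example_tio struct (the real code additionally rounds per_bio_data_size up and places it in front of the struct):

#include <linux/bio.h>
#include <linux/kernel.h>

struct example_tio {
	void *per_io_state;		/* whatever the driver tracks per I/O */
	struct bio clone;		/* must be the final member */
};

static struct bio_set *example_create_bioset(unsigned int pool_size)
{
	/* every bio from this set is preceded by the bytes before 'clone' */
	return bioset_create(pool_size, offsetof(struct example_tio, clone));
}

static struct example_tio *example_alloc_tio(struct bio_set *bs)
{
	struct bio *clone = bio_alloc_bioset(GFP_NOIO, 0, bs);

	if (!clone)
		return NULL;
	return container_of(clone, struct example_tio, clone);
}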
@@ -2686,9 +2923,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
 	if (pools->io_pool)
 		mempool_destroy(pools->io_pool);
 
-	if (pools->tio_pool)
-		mempool_destroy(pools->tio_pool);
-
 	if (pools->bs)
 		bioset_free(pools->bs);
 
@@ -2703,8 +2937,6 @@ static const struct block_device_operations dm_blk_dops = {
 	.owner = THIS_MODULE
 };
 
-EXPORT_SYMBOL(dm_get_mapinfo);
-
 /*
  * module hooks
  */
@@ -2713,6 +2945,13 @@ module_exit(dm_exit);
 
 module_param(major, uint, 0);
 MODULE_PARM_DESC(major, "The major number of the device mapper");
+
+module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
+
+module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
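The two new module_param() lines make the reserved-IO counts runtime tunables: mode S_IRUGO | S_IWUSR means world-readable and root-writable, typically surfacing under /sys/module/dm_mod/parameters/. The same pattern in isolation, with an illustrative parameter name:

#include <linux/module.h>
#include <linux/stat.h>

/* illustrative parameter, not part of dm */
static unsigned int example_reserved_ios = 16;
module_param(example_reserved_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(example_reserved_ios, "Example reserved IO count");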
