Diffstat (limited to 'drivers/md/dm-cache-target.c')
-rw-r--r--  drivers/md/dm-cache-target.c  2584
1 file changed, 2584 insertions, 0 deletions
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
new file mode 100644
index 00000000000..0f4e84b15c3
--- /dev/null
+++ b/drivers/md/dm-cache-target.c
@@ -0,0 +1,2584 @@
+/*
+ * Copyright (C) 2012 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-bio-prison.h"
+#include "dm-cache-metadata.h"
+
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/init.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "cache"
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
+	"A percentage of time allocated for copying to and/or from cache");
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Glossary:
+ *
+ * oblock: index of an origin block
+ * cblock: index of a cache block
+ * promotion: movement of a block from origin to cache
+ * demotion: movement of a block from cache to origin
+ * migration: movement of a block between the origin and cache device,
+ *	      either direction
+ */
+
+/*----------------------------------------------------------------*/
+
+static size_t bitset_size_in_bytes(unsigned nr_entries)
+{
+	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+}
+
+static unsigned long *alloc_bitset(unsigned nr_entries)
+{
+	size_t s = bitset_size_in_bytes(nr_entries);
+	return vzalloc(s);
+}
+
+static void clear_bitset(void *bitset, unsigned nr_entries)
+{
+	size_t s = bitset_size_in_bytes(nr_entries);
+	memset(bitset, 0, s);
+}
+
+static void free_bitset(unsigned long *bits)
+{
+	vfree(bits);
+}
+
+/*----------------------------------------------------------------*/
+
+#define PRISON_CELLS 1024
+#define MIGRATION_POOL_SIZE 128
+#define COMMIT_PERIOD HZ
+#define MIGRATION_COUNT_WINDOW 10
+
+/*
+ * The block size of the device holding cache data must be >= 32KB
+ */
+#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
+
+/*
+ * FIXME: the cache is read/write for the time being.
+ */
+enum cache_mode {
+	CM_WRITE,	/* metadata may be changed */
+	CM_READ_ONLY,	/* metadata may not be changed */
+};
+
+struct cache_features {
+	enum cache_mode mode;
+	bool write_through:1;
+};
+
+struct cache_stats {
+	atomic_t read_hit;
+	atomic_t read_miss;
+	atomic_t write_hit;
+	atomic_t write_miss;
+	atomic_t demotion;
+	atomic_t promotion;
+	atomic_t copies_avoided;
+	atomic_t cache_cell_clash;
+	atomic_t commit_count;
+	atomic_t discard_count;
+};
+
+struct cache {
+	struct dm_target *ti;
+	struct dm_target_callbacks callbacks;
+
+	/*
+	 * Metadata is written to this device.
+	 */
+	struct dm_dev *metadata_dev;
+
+	/*
+	 * The slower of the two data devices. Typically a spindle.
+	 */
+	struct dm_dev *origin_dev;
+
+	/*
+	 * The faster of the two data devices. Typically an SSD.
+	 */
+	struct dm_dev *cache_dev;
+
+	/*
+	 * Cache features such as write-through.
+	 */
+	struct cache_features features;
+
+	/*
+	 * Size of the origin device in _complete_ blocks and native sectors.
+	 */
+	dm_oblock_t origin_blocks;
+	sector_t origin_sectors;
+
+	/*
+	 * Size of the cache device in blocks.
+	 */
+	dm_cblock_t cache_size;
+
+	/*
+	 * Fields for converting from sectors to blocks.
+	 */
+	uint32_t sectors_per_block;
+	int sectors_per_block_shift;
+
+	struct dm_cache_metadata *cmd;
+
+	spinlock_t lock;
+	struct bio_list deferred_bios;
+	struct bio_list deferred_flush_bios;
+	struct list_head quiesced_migrations;
+	struct list_head completed_migrations;
+	struct list_head need_commit_migrations;
+	sector_t migration_threshold;
+	atomic_t nr_migrations;
+	wait_queue_head_t migration_wait;
+
+	/*
+	 * cache_size entries, dirty if set
+	 */
+	dm_cblock_t nr_dirty;
+	unsigned long *dirty_bitset;
+
+	/*
+	 * origin_blocks entries, discarded if set.
+	 */
+	sector_t discard_block_size; /* a power of 2 times sectors per block */
+	dm_dblock_t discard_nr_blocks;
+	unsigned long *discard_bitset;
+
+	struct dm_kcopyd_client *copier;
+	struct workqueue_struct *wq;
+	struct work_struct worker;
+
+	struct delayed_work waker;
+	unsigned long last_commit_jiffies;
+
+	struct dm_bio_prison *prison;
+	struct dm_deferred_set *all_io_ds;
+
+	mempool_t *migration_pool;
+	struct dm_cache_migration *next_migration;
+
+	struct dm_cache_policy *policy;
+	unsigned policy_nr_args;
+
+	bool need_tick_bio:1;
+	bool sized:1;
+	bool quiescing:1;
+	bool commit_requested:1;
+	bool loaded_mappings:1;
+	bool loaded_discards:1;
+
+	struct cache_stats stats;
+
+	/*
+	 * Rather than reconstructing the table line for the status we just
+	 * save it and regurgitate.
+	 */
+	unsigned nr_ctr_args;
+	const char **ctr_args;
+};
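sectors_per_block and sectors_per_block_shift in struct cache above describe the same quantity two ways: when the cache block size is a power of two, the shift holds its log2 and conversions reduce to a shift and a mask; otherwise the shift is negative and the code falls back to division via sector_div(). A minimal userspace sketch of both paths, using a hypothetical geometry struct that stands in for the real cache fields:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the two conversion fields in struct cache. */
struct geometry {
	uint32_t sectors_per_block;
	int sectors_per_block_shift;	/* negative if not a power of two */
};

/* Mirrors get_bio_block() below: which cache block does a sector land in? */
static uint64_t sector_to_block(const struct geometry *g, uint64_t sector)
{
	if (g->sectors_per_block_shift >= 0)
		return sector >> g->sectors_per_block_shift;
	return sector / g->sectors_per_block;	/* sector_div() in the kernel */
}

/* Mirrors the offset term in remap_to_cache(): position within the block. */
static uint64_t sector_offset(const struct geometry *g, uint64_t sector)
{
	if (g->sectors_per_block_shift >= 0)
		return sector & (g->sectors_per_block - 1);
	return sector % g->sectors_per_block;
}

int main(void)
{
	struct geometry g = { .sectors_per_block = 128, .sectors_per_block_shift = 7 };

	assert(sector_to_block(&g, 1000) == 7);		/* 1000 / 128 */
	assert(sector_offset(&g, 1000) == 104);		/* 1000 % 128 */
	printf("block %llu, offset %llu\n",
	       (unsigned long long) sector_to_block(&g, 1000),
	       (unsigned long long) sector_offset(&g, 1000));
	return 0;
}

The shift path matters because 64-bit division on 32-bit architectures needs a library call; sector_div() exists precisely to hide that cost when division is unavoidable.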
+
+struct per_bio_data {
+	bool tick:1;
+	unsigned req_nr:2;
+	struct dm_deferred_entry *all_io_entry;
+};
+
+struct dm_cache_migration {
+	struct list_head list;
+	struct cache *cache;
+
+	unsigned long start_jiffies;
+	dm_oblock_t old_oblock;
+	dm_oblock_t new_oblock;
+	dm_cblock_t cblock;
+
+	bool err:1;
+	bool writeback:1;
+	bool demote:1;
+	bool promote:1;
+
+	struct dm_bio_prison_cell *old_ocell;
+	struct dm_bio_prison_cell *new_ocell;
+};
+
+/*
+ * Processing a bio in the worker thread may require these memory
+ * allocations. We prealloc to avoid deadlocks (the same worker thread
+ * frees them back to the mempool).
+ */
+struct prealloc {
+	struct dm_cache_migration *mg;
+	struct dm_bio_prison_cell *cell1;
+	struct dm_bio_prison_cell *cell2;
+};
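The prealloc struct above carries the worker's no-sleep allocation discipline: everything a bio could possibly need is grabbed with GFP_NOWAIT before processing starts, and if any grab fails the bio is re-deferred instead of sleeping in an allocator while prison cells are held. A toy userspace model of that refill/take pattern, with illustrative names and a generic slot array rather than the driver's mg/cell1/cell2 trio:

#include <stdbool.h>
#include <stdlib.h>

struct prealloc_model {
	void *slot[3];	/* stands in for mg, cell1 and cell2 */
};

/* Top up every slot without sleeping; on failure the caller defers the
 * work item, mirroring prealloc_data_structs() below. */
static bool prealloc_refill(struct prealloc_model *p)
{
	for (int i = 0; i < 3; i++) {
		if (!p->slot[i]) {
			p->slot[i] = malloc(64);	/* GFP_NOWAIT analogue */
			if (!p->slot[i])
				return false;
		}
	}
	return true;
}

/* Take a slot that refill guaranteed; running dry here is a logic bug,
 * which is why the driver BUG()s in the equivalent situation. */
static void *prealloc_take(struct prealloc_model *p)
{
	for (int i = 0; i < 3; i++) {
		if (p->slot[i]) {
			void *r = p->slot[i];
			p->slot[i] = NULL;
			return r;
		}
	}
	abort();
}

int main(void)
{
	struct prealloc_model p = { { NULL, NULL, NULL } };

	if (prealloc_refill(&p))
		free(prealloc_take(&p));	/* process one work item */
	return 0;
}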
+
+static void wake_worker(struct cache *cache)
+{
+	queue_work(cache->wq, &cache->worker);
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
+{
+	/* FIXME: change to use a local slab. */
+	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
+}
+
+static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+	dm_bio_prison_free_cell(cache->prison, cell);
+}
+
+static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
+{
+	if (!p->mg) {
+		p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+		if (!p->mg)
+			return -ENOMEM;
+	}
+
+	if (!p->cell1) {
+		p->cell1 = alloc_prison_cell(cache);
+		if (!p->cell1)
+			return -ENOMEM;
+	}
+
+	if (!p->cell2) {
+		p->cell2 = alloc_prison_cell(cache);
+		if (!p->cell2)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
+{
+	if (p->cell2)
+		free_prison_cell(cache, p->cell2);
+
+	if (p->cell1)
+		free_prison_cell(cache, p->cell1);
+
+	if (p->mg)
+		mempool_free(p->mg, cache->migration_pool);
+}
+
+static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
+{
+	struct dm_cache_migration *mg = p->mg;
+
+	BUG_ON(!mg);
+	p->mg = NULL;
+
+	return mg;
+}
+
+/*
+ * You must have a cell within the prealloc struct to return. If not this
+ * function will BUG() rather than returning NULL.
+ */
+static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
+{
+	struct dm_bio_prison_cell *r = NULL;
+
+	if (p->cell1) {
+		r = p->cell1;
+		p->cell1 = NULL;
+
+	} else if (p->cell2) {
+		r = p->cell2;
+		p->cell2 = NULL;
+	} else
+		BUG();
+
+	return r;
+}
+
+/*
+ * You can't have more than two cells in a prealloc struct. BUG() will be
+ * called if you try and overfill.
+ */
+static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
+{
+	if (!p->cell2)
+		p->cell2 = cell;
+
+	else if (!p->cell1)
+		p->cell1 = cell;
+
+	else
+		BUG();
+}
+
+/*----------------------------------------------------------------*/
+
+static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
+{
+	key->virtual = 0;
+	key->dev = 0;
+	key->block = from_oblock(oblock);
+}
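build_key() is what makes the prison serialise I/O per origin block: any two bios aimed at the same oblock build identical keys, so the second arrival gets detained in the first one's cell. A compact userspace model of the keying, with hypothetical type and helper names (the kernel field is spelled 'virtual'; 'virt' is used here only to keep the model C++-friendly):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct cell_key_model {
	int virt;
	int dev;
	uint64_t block;
};

static void build_key_model(uint64_t oblock, struct cell_key_model *key)
{
	key->virt = 0;
	key->dev = 0;
	key->block = oblock;
}

/* Two bios collide in the prison exactly when their keys compare equal. */
static bool keys_equal(const struct cell_key_model *a,
		       const struct cell_key_model *b)
{
	return a->virt == b->virt && a->dev == b->dev && a->block == b->block;
}

int main(void)
{
	struct cell_key_model a, b;

	build_key_model(42, &a);
	build_key_model(42, &b);
	assert(keys_equal(&a, &b));	/* same oblock, same cell */
	return 0;
}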
+
+/*
+ * The caller hands in a preallocated cell, and a free function for it.
+ * The cell will be freed if there's an error, or if it wasn't used because
+ * a cell with that key already exists.
+ */
+typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
+
+static int bio_detain(struct cache *cache, dm_oblock_t oblock,
+		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
+		      cell_free_fn free_fn, void *free_context,
+		      struct dm_bio_prison_cell **cell_result)
+{
+	int r;
+	struct dm_cell_key key;
+
+	build_key(oblock, &key);
+	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
+	if (r)
+		free_fn(free_context, cell_prealloc);
+
+	return r;
+}
+
+static int get_cell(struct cache *cache,
+		    dm_oblock_t oblock,
+		    struct prealloc *structs,
+		    struct dm_bio_prison_cell **cell_result)
+{
+	int r;
+	struct dm_cell_key key;
+	struct dm_bio_prison_cell *cell_prealloc;
+
+	cell_prealloc = prealloc_get_cell(structs);
+
+	build_key(oblock, &key);
+	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
+	if (r)
+		prealloc_put_cell(structs, cell_prealloc);
+
+	return r;
+}
+
+/*----------------------------------------------------------------*/
+
+static bool is_dirty(struct cache *cache, dm_cblock_t b)
+{
+	return test_bit(from_cblock(b), cache->dirty_bitset);
+}
+
+static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+{
+	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
+		cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
+		policy_set_dirty(cache->policy, oblock);
+	}
+}
+
+static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
+{
+	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
+		policy_clear_dirty(cache->policy, oblock);
+		cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
+		if (!from_cblock(cache->nr_dirty))
+			dm_table_event(cache->ti->table);
+	}
+}
+
+/*----------------------------------------------------------------*/
+
+static bool block_size_is_power_of_two(struct cache *cache)
+{
+	return cache->sectors_per_block_shift >= 0;
+}
+
+static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
+{
+	sector_t discard_blocks = cache->discard_block_size;
+	dm_block_t b = from_oblock(oblock);
+
+	if (!block_size_is_power_of_two(cache))
+		(void) sector_div(discard_blocks, cache->sectors_per_block);
+	else
+		discard_blocks >>= cache->sectors_per_block_shift;
+
+	(void) sector_div(b, discard_blocks);
+
+	return to_dblock(b);
+}
+
+static void set_discard(struct cache *cache, dm_dblock_t b)
+{
+	unsigned long flags;
+
+	atomic_inc(&cache->stats.discard_count);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	set_bit(from_dblock(b), cache->discard_bitset);
+	spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void clear_discard(struct cache *cache, dm_dblock_t b)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	clear_bit(from_dblock(b), cache->discard_bitset);
+	spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static bool is_discarded(struct cache *cache, dm_dblock_t b)
+{
+	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	r = test_bit(from_dblock(b), cache->discard_bitset);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	return r;
+}
+
+static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
+{
+	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
+		     cache->discard_bitset);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	return r;
+}
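oblock_to_dblock() above performs two divisions because discard_block_size is kept in sectors: it is first scaled down to a count of cache blocks, and the origin block index is then divided by that ratio to index the discard bitset. The same arithmetic worked through with sample numbers, in a hypothetical userspace rendering (the kernel routes both divisions through sector_div()):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t sectors_per_block = 128;	/* 64 KiB cache blocks */
	uint64_t discard_block_size = 1024;	/* in sectors */
	uint64_t oblock = 37;

	/* Step 1: discard block size, converted from sectors to cache blocks. */
	uint64_t blocks_per_dblock = discard_block_size / sectors_per_block; /* 8 */

	/* Step 2: origin block index down to a discard bitset index. */
	uint64_t dblock = oblock / blocks_per_dblock;	/* 37 / 8 = 4 */

	printf("oblock %llu maps to dblock %llu\n",
	       (unsigned long long) oblock, (unsigned long long) dblock);
	return 0;
}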
+
+/*----------------------------------------------------------------*/
+
+static void load_stats(struct cache *cache)
+{
+	struct dm_cache_statistics stats;
+
+	dm_cache_metadata_get_stats(cache->cmd, &stats);
+	atomic_set(&cache->stats.read_hit, stats.read_hits);
+	atomic_set(&cache->stats.read_miss, stats.read_misses);
+	atomic_set(&cache->stats.write_hit, stats.write_hits);
+	atomic_set(&cache->stats.write_miss, stats.write_misses);
+}
+
+static void save_stats(struct cache *cache)
+{
+	struct dm_cache_statistics stats;
+
+	stats.read_hits = atomic_read(&cache->stats.read_hit);
+	stats.read_misses = atomic_read(&cache->stats.read_miss);
+	stats.write_hits = atomic_read(&cache->stats.write_hit);
+	stats.write_misses = atomic_read(&cache->stats.write_miss);
+
+	dm_cache_metadata_set_stats(cache->cmd, &stats);
+}
+
+/*----------------------------------------------------------------
+ * Per bio data
+ *--------------------------------------------------------------*/
+static struct per_bio_data *get_per_bio_data(struct bio *bio)
+{
+	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+	BUG_ON(!pb);
+	return pb;
+}
+
+static struct per_bio_data *init_per_bio_data(struct bio *bio)
+{
+	struct per_bio_data *pb = get_per_bio_data(bio);
+
+	pb->tick = false;
+	pb->req_nr = dm_bio_get_target_bio_nr(bio);
+	pb->all_io_entry = NULL;
+
+	return pb;
+}
+
+/*----------------------------------------------------------------
+ * Remapping
+ *--------------------------------------------------------------*/
+static void remap_to_origin(struct cache *cache, struct bio *bio)
+{
+	bio->bi_bdev = cache->origin_dev->bdev;
+}
+
+static void remap_to_cache(struct cache *cache, struct bio *bio,
+			   dm_cblock_t cblock)
+{
+	sector_t bi_sector = bio->bi_sector;
+
+	bio->bi_bdev = cache->cache_dev->bdev;
+	if (!block_size_is_power_of_two(cache))
+		bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
+				 sector_div(bi_sector, cache->sectors_per_block);
+	else
+		bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
+				 (bi_sector & (cache->sectors_per_block - 1));
+}
+
+static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
+{
+	unsigned long flags;
+	struct per_bio_data *pb = get_per_bio_data(bio);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	if (cache->need_tick_bio &&
+	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
+		pb->tick = true;
+		cache->need_tick_bio = false;
+	}
+	spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
+					  dm_oblock_t oblock)
+{
+	check_if_tick_bio_needed(cache, bio);
+	remap_to_origin(cache, bio);
+	if (bio_data_dir(bio) == WRITE)
+		clear_discard(cache, oblock_to_dblock(cache, oblock));
+}
+
+static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
+				 dm_oblock_t oblock, dm_cblock_t cblock)
+{
+	remap_to_cache(cache, bio, cblock);
+	if (bio_data_dir(bio) == WRITE) {
+		set_dirty(cache, oblock, cblock);
+		clear_discard(cache, oblock_to_dblock(cache, oblock));
+	}
+}
+
+static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
+{
+	sector_t block_nr = bio->bi_sector;
+
+	if (!block_size_is_power_of_two(cache))
+		(void) sector_div(block_nr, cache->sectors_per_block);
+	else
+		block_nr >>= cache->sectors_per_block_shift;
+
+	return to_oblock(block_nr);
+}
+
+static int bio_triggers_commit(struct cache *cache, struct bio *bio)
+{
+	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
+}
+
+static void issue(struct cache *cache, struct bio *bio)
+{
+	unsigned long flags;
+
+	if (!bio_triggers_commit(cache, bio)) {
+		generic_make_request(bio);
+		return;
+	}
+
+	/*
+	 * Batch together any bios that trigger commits and then issue a
+	 * single commit for them in do_worker().
+	 */
+	spin_lock_irqsave(&cache->lock, flags);
+	cache->commit_requested = true;
+	bio_list_add(&cache->deferred_flush_bios, bio);
+	spin_unlock_irqrestore(&cache->lock, flags);
+}
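issue() never commits metadata itself: bios carrying REQ_FLUSH or REQ_FUA are parked on deferred_flush_bios and commit_requested is raised, so the worker can amortise a single commit over the whole batch. A userspace model of that decision, with hypothetical bio and cache types standing in for the kernel structures:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct bio_model {
	bool flush_or_fua;	/* would force a metadata commit */
	struct bio_model *next;
};

struct cache_model {
	pthread_mutex_t lock;
	bool commit_requested;
	struct bio_model *deferred_flush_bios;	/* the parked batch */
};

static void submit(struct bio_model *bio)
{
	(void) bio;	/* stands in for generic_make_request() */
}

static void issue_model(struct cache_model *c, struct bio_model *bio)
{
	if (!bio->flush_or_fua) {
		submit(bio);	/* common case: straight to the device */
		return;
	}

	/* Park the bio; the worker commits once, then releases the batch. */
	pthread_mutex_lock(&c->lock);
	c->commit_requested = true;
	bio->next = c->deferred_flush_bios;
	c->deferred_flush_bios = bio;
	pthread_mutex_unlock(&c->lock);
}

int main(void)
{
	struct cache_model c = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct bio_model w = { .flush_or_fua = false };
	struct bio_model f = { .flush_or_fua = true };

	issue_model(&c, &w);
	issue_model(&c, &f);
	return c.commit_requested ? 0 : 1;
}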
+
+/*----------------------------------------------------------------
+ * Migration processing
+ *
+ * Migration covers moving data from the origin device to the cache, or
+ * vice versa.
+ *--------------------------------------------------------------*/
+static void free_migration(struct dm_cache_migration *mg)
+{
+	mempool_free(mg, mg->cache->migration_pool);
+}
+
+static void inc_nr_migrations(struct cache *cache)
+{
+	atomic_inc(&cache->nr_migrations);
+}
+
+static void dec_nr_migrations(struct cache *cache)
+{
+	atomic_dec(&cache->nr_migrations);
+
+	/*
+	 * Wake the worker in case we're suspending the target.
+	 */
+	wake_up(&cache->migration_wait);
+}
+
+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
+			 bool holder)
+{
+	(holder ? dm_cell_release : dm_cell_release_no_holder)
+		(cache->prison, cell, &cache->deferred_bios);
+	free_prison_cell(cache, cell);
+}
+
+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
+		       bool holder)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	__cell_defer(cache, cell, holder);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void cleanup_migration(struct dm_cache_migration *mg)
+{
+	dec_nr_migrations(mg->cache);
+	free_migration(mg);
+}
+
+static void migration_failure(struct dm_cache_migration *mg)
+{
+	struct cache *cache = mg->cache;
+
+	if (mg->writeback) {
+		DMWARN_LIMIT("writeback failed; couldn't copy block");
+		set_dirty(cache, mg->old_oblock, mg->cblock);
+		cell_defer(cache, mg->old_ocell, false);
+
+	} else if (mg->demote) {
+		DMWARN_LIMIT("demotion failed; couldn't copy block");
+		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
+
+		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
+		if (mg->promote)
+			cell_defer(cache, mg->new_ocell, 1);
+	} else {
+		DMWARN_LIMIT("promotion failed; couldn't copy block");
+		policy_remove_mapping(cache->policy, mg->new_oblock);
+		cell_defer(cache, mg->new_ocell, 1);
+	}
+
+	cleanup_migration(mg);
+}
+
+static void migration_success_pre_commit(struct dm_cache_migration *mg)
+{
+	unsigned long flags;
+	struct cache *cache = mg->cache;
+
+	if (mg->writeback) {
+		cell_defer(cache, mg->old_ocell, false);
+		clear_dirty(cache, mg->old_oblock, mg->cblock);
+		cleanup_migration(mg);
+		return;
+
+	} else if (mg->demote) {
+		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
+			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+			policy_force_mapping(cache->policy, mg->new_oblock,
+					     mg->old_oblock);
+			if (mg->promote)
+				cell_defer(cache, mg->new_ocell, true);
+			cleanup_migration(mg);
+			return;
+		}
+	} else {
+		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
+			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+			policy_remove_mapping(cache->policy, mg->new_oblock);
+			cleanup_migration(mg);
+			return;
+		}
+	}
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_add_tail(&mg->list, &cache->need_commit_migrations);
+	cache->commit_requested = true;
+	spin_unlock_irqrestore(&cache->lock, flags);
+}
+
+static void migration_success_post_commit(struct dm_cache_migration *mg)
+{
+	unsigned long flags;
+	struct cache *cache = mg->cache;
+
+	if (mg->writeback) {
+		DMWARN("writeback unexpectedly triggered commit");
+		return;
+
+	} else if (mg->demote) {
+		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
+
+		if (mg->promote) {
+			mg->demote = false;
+
+			spin_lock_irqsave(&cache->lock, flags);
+			list_add_tail(&mg->list, &cache->quiesced_migrations);
+			spin_unlock_irqrestore(&cache->lock, flags);
+
+		} else
+			cleanup_migration(mg);
+
+	} else {
+		cell_defer(cache, mg->new_ocell, true);
+		clear_dirty(cache, mg->new_oblock, mg->cblock);
+		cleanup_migration(mg);
+	}
+}
+
+static void copy_complete(int read_err, unsigned long write_err, void *context)
+{
+	unsigned long flags;
+	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
+	struct cache *cache = mg->cache;
+
+	if (read_err || write_err)
+		mg->err = true;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_add_tail(&mg->list, &cache->completed_migrations);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void issue_copy_real(struct dm_cache_migration *mg)
+{
+	int r;
+	struct dm_io_region o_region, c_region;
+	struct cache *cache = mg->cache;
+
+	o_region.bdev = cache->origin_dev->bdev;
+	o_region.count = cache->sectors_per_block;
+
+	c_region.bdev = cache->cache_dev->bdev;
+	c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
+	c_region.count = cache->sectors_per_block;
+
+	if (mg->writeback || mg->demote) {
+		/* demote */
+		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
+		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
+	} else {
+		/* promote */
+		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
+		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
+	}
+
+	if (r < 0)
+		migration_failure(mg);
+}
+
+static void avoid_copy(struct dm_cache_migration *mg)
+{
+	atomic_inc(&mg->cache->stats.copies_avoided);
+	migration_success_pre_commit(mg);
+}
+
+static void issue_copy(struct dm_cache_migration *mg)
+{
+	bool avoid;
+	struct cache *cache = mg->cache;
+
+	if (mg->writeback || mg->demote)
+		avoid = !is_dirty(cache, mg->cblock) ||
+			is_discarded_oblock(cache, mg->old_oblock);
+	else
+		avoid = is_discarded_oblock(cache, mg->new_oblock);
+
+	avoid ? avoid_copy(mg) : issue_copy_real(mg);
+}
+
+static void complete_migration(struct dm_cache_migration *mg)
+{
+	if (mg->err)
+		migration_failure(mg);
+	else
+		migration_success_pre_commit(mg);
+}
+
+static void process_migrations(struct cache *cache, struct list_head *head,
+			       void (*fn)(struct dm_cache_migration *))
+{
+	unsigned long flags;
+	struct list_head list;
+	struct dm_cache_migration *mg, *tmp;
+
+	INIT_LIST_HEAD(&list);
+	spin_lock_irqsave(&cache->lock, flags);
+	list_splice_init(head, &list);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	list_for_each_entry_safe(mg, tmp, &list, list)
+		fn(mg);
+}
+
+static void __queue_quiesced_migration(struct dm_cache_migration *mg)
+{
+	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
+}
+
+static void queue_quiesced_migration(struct dm_cache_migration *mg)
+{
+	unsigned long flags;
+	struct cache *cache = mg->cache;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	__queue_quiesced_migration(mg);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
+{
+	unsigned long flags;
+	struct dm_cache_migration *mg, *tmp;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_for_each_entry_safe(mg, tmp, work, list)
+		__queue_quiesced_migration(mg);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void check_for_quiesced_migrations(struct cache *cache,
+					  struct per_bio_data *pb)
+{
+	struct list_head work;
+
+	if (!pb->all_io_entry)
+		return;
+
+	INIT_LIST_HEAD(&work);
+	if (pb->all_io_entry)
+		dm_deferred_entry_dec(pb->all_io_entry, &work);
+
+	if (!list_empty(&work))
+		queue_quiesced_migrations(cache, &work);
+}
+
+static void quiesce_migration(struct dm_cache_migration *mg)
+{
+	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
+		queue_quiesced_migration(mg);
+}
+
+static void promote(struct cache *cache, struct prealloc *structs,
+		    dm_oblock_t oblock, dm_cblock_t cblock,
+		    struct dm_bio_prison_cell *cell)
+{
+	struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+	mg->err = false;
+	mg->writeback = false;
+	mg->demote = false;
+	mg->promote = true;
+	mg->cache = cache;
+	mg->new_oblock = oblock;
+	mg->cblock = cblock;
+	mg->old_ocell = NULL;
+	mg->new_ocell = cell;
+	mg->start_jiffies = jiffies;
+
+	inc_nr_migrations(cache);
+	quiesce_migration(mg);
+}
+
+static void writeback(struct cache *cache, struct prealloc *structs,
+		      dm_oblock_t oblock, dm_cblock_t cblock,
+		      struct dm_bio_prison_cell *cell)
+{
+	struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+	mg->err = false;
+	mg->writeback = true;
+	mg->demote = false;
+	mg->promote = false;
+	mg->cache = cache;
+	mg->old_oblock = oblock;
+	mg->cblock = cblock;
+	mg->old_ocell = cell;
+	mg->new_ocell = NULL;
+	mg->start_jiffies = jiffies;
+
+	inc_nr_migrations(cache);
+	quiesce_migration(mg);
+}
+
+static void demote_then_promote(struct cache *cache, struct prealloc *structs,
+				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
+				dm_cblock_t cblock,
+				struct dm_bio_prison_cell *old_ocell,
+				struct dm_bio_prison_cell *new_ocell)
+{
+	struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+	mg->err = false;
+	mg->writeback = false;
+	mg->demote = true;
+	mg->promote = true;
+	mg->cache = cache;
+	mg->old_oblock = old_oblock;
+	mg->new_oblock = new_oblock;
+	mg->cblock = cblock;
+	mg->old_ocell = old_ocell;
+	mg->new_ocell = new_ocell;
+	mg->start_jiffies = jiffies;
+
+	inc_nr_migrations(cache);
+	quiesce_migration(mg);
+}
+
+/*----------------------------------------------------------------
+ * bio processing
+ *--------------------------------------------------------------*/
+static void defer_bio(struct cache *cache, struct bio *bio)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	bio_list_add(&cache->deferred_bios, bio);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void process_flush_bio(struct cache *cache, struct bio *bio)
+{
+	struct per_bio_data *pb = get_per_bio_data(bio);
+
+	BUG_ON(bio->bi_size);
+	if (!pb->req_nr)
+		remap_to_origin(cache, bio);
+	else
+		remap_to_cache(cache, bio, 0);
+
+	issue(cache, bio);
+}
+
+/*
+ * People generally discard large parts of a device, e.g. the whole device
+ * when formatting.  Splitting these large discards up into cache block
+ * sized ios and then quiescing (always necessary for discard) takes too
+ * long.
+ *
+ * We keep it simple, and allow any size of discard to come in, and just
+ * mark off blocks on the discard bitset.  No passdown occurs!
+ *
+ * To implement passdown we need to change the bio_prison such that a cell
+ * can have a key that spans many blocks.
+ */
+static void process_discard_bio(struct cache *cache, struct bio *bio)
+{
+	dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
+						  cache->discard_block_size);
+	dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
+	dm_block_t b;
+
+	(void) sector_div(end_block, cache->discard_block_size);
+
+	for (b = start_block; b < end_block; b++)
+		set_discard(cache, to_dblock(b));
+
+	bio_endio(bio, 0);
+}
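The rounding in process_discard_bio() above deserves spelling out: dm_sector_div_up() rounds the start of the discard up to a discard block boundary, while the plain sector_div() rounds the end down, so only discard blocks lying entirely inside the bio are marked and partially covered edges are left alone. Worked through with illustrative numbers, in a hypothetical userspace rendering of the same arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t discard_block_size = 1024;	/* sectors per discard block */
	uint64_t bi_sector = 1500;		/* bio start */
	uint64_t nr_sectors = 4000;		/* bio length */

	/* Round the start up and the end down to block boundaries. */
	uint64_t start = (bi_sector + discard_block_size - 1) / discard_block_size;
	uint64_t end = (bi_sector + nr_sectors) / discard_block_size;

	/* Marks dblocks 2..4, i.e. sectors 2048-5119, wholly inside [1500, 5500). */
	for (uint64_t b = start; b < end; b++)
		printf("set_discard(dblock %llu)\n", (unsigned long long) b);
	return 0;
}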
+
+static bool spare_migration_bandwidth(struct cache *cache)
+{
+	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
+		cache->sectors_per_block;
+	return current_volume < cache->migration_threshold;
+}
+
+static bool is_writethrough_io(struct cache *cache, struct bio *bio,
+			       dm_cblock_t cblock)
+{
+	return bio_data_dir(bio) == WRITE &&
+		cache->features.write_through && !is_dirty(cache, cblock);
+}
+
+static void inc_hit_counter(struct cache *cache, struct bio *bio)
+{
+	atomic_inc(bio_data_dir(bio) == READ ?
+		   &cache->stats.read_hit : &cache->stats.write_hit);
+}
+
+static void inc_miss_counter(struct cache *cache, struct bio *bio)
+{
+	atomic_inc(bio_data_dir(bio) == READ ?
+		   &cache->stats.read_miss : &cache->stats.write_miss);
+}
+
+static void process_bio(struct cache *cache, struct prealloc *structs,
+			struct bio *bio)
+{
+	int r;
+	bool release_cell = true;
+	dm_oblock_t block = get_bio_block(cache, bio);
+	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
+	struct policy_result lookup_result;
+	struct per_bio_data *pb = get_per_bio_data(bio);
+	bool discarded_block = is_discarded_oblock(cache, block);
+	bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
+
+	/*
+	 * Check to see if that block is currently migrating.
+	 */
+	cell_prealloc = prealloc_get_cell(structs);
+	r = bio_detain(cache, block, bio, cell_prealloc,
+		       (cell_free_fn) prealloc_put_cell,
+		       structs, &new_ocell);
+	if (r > 0)
+		return;
+
+	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
+		       bio, &lookup_result);
+
+	if (r == -EWOULDBLOCK)
+		/* migration has been denied */
+		lookup_result.op = POLICY_MISS;
+
+	switch (lookup_result.op) {
+	case POLICY_HIT:
+		inc_hit_counter(cache, bio);
+		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+
+		if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
+			/*
+			 * No need to mark anything dirty in write through mode.
+			 */
+			pb->req_nr == 0 ?
+				remap_to_cache(cache, bio, lookup_result.cblock) :
+				remap_to_origin_clear_discard(cache, bio, block);
+		} else
+			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+
+		issue(cache, bio);
+		break;
+
+	case POLICY_MISS:
+		inc_miss_counter(cache, bio);
+		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+
+		if (pb->req_nr != 0) {
+			/*
+			 * This is a duplicate writethrough io that is no
+			 * longer needed because the block has been demoted.
+			 */
+			bio_endio(bio, 0);
+		} else {
+			remap_to_origin_clear_discard(cache, bio, block);
+			issue(cache, bio);
+		}
+		break;
+
+	case POLICY_NEW:
+		atomic_inc(&cache->stats.promotion);
+		promote(cache, structs, block, lookup_result.cblock, new_ocell);
+		release_cell = false;
+		break;
+
+	case POLICY_REPLACE:
+		cell_prealloc = prealloc_get_cell(structs);
+		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
+			       (cell_free_fn) prealloc_put_cell,
+			       structs, &old_ocell);
+		if (r > 0) {
+			/*
+			 * We have to be careful to avoid lock inversion of
+			 * the cells.  So we back off, and wait for the
+			 * old_ocell to become free.
+			 */
+			policy_force_mapping(cache->policy, block,
+					     lookup_result.old_oblock);
+			atomic_inc(&cache->stats.cache_cell_clash);
+			break;
+		}
+		atomic_inc(&cache->stats.demotion);
+		atomic_inc(&cache->stats.promotion);
+
+		demote_then_promote(cache, structs, lookup_result.old_oblock,
+				    block, lookup_result.cblock,
+				    old_ocell, new_ocell);
+		release_cell = false;
+		break;
+
+	default:
+		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
+			    (unsigned) lookup_result.op);
+		bio_io_error(bio);
+	}
+
+	if (release_cell)
+		cell_defer(cache, new_ocell, false);
+}
+
+static int need_commit_due_to_time(struct cache *cache)
+{
+	return jiffies < cache->last_commit_jiffies ||
+	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+}
+
+static int commit_if_needed(struct cache *cache)
+{
+	if (dm_cache_changed_this_transaction(cache->cmd) &&
+	    (cache->commit_requested || need_commit_due_to_time(cache))) {
+		atomic_inc(&cache->stats.commit_count);
+		cache->last_commit_jiffies = jiffies;
+		cache->commit_requested = false;
+		return dm_cache_commit(cache->cmd, false);
+	}
+
+	return 0;
+}
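need_commit_due_to_time() reads oddly until the wraparound case is spelled out: jiffies is a free-running counter, so jiffies < last_commit_jiffies can only mean the counter wrapped since the stamp was taken, and committing immediately is the safe response. A userspace model using a 32-bit counter so the wrap is easy to demonstrate (constants are illustrative; kernel code would more often phrase this with time_after()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define COMMIT_PERIOD_MODEL 100u	/* stands in for COMMIT_PERIOD (HZ) */

/* Commit when a full period has elapsed, or when the counter has wrapped
 * and comparisons against the old stamp can no longer be trusted. */
static bool need_commit(uint32_t now, uint32_t last_commit)
{
	return now < last_commit || now > last_commit + COMMIT_PERIOD_MODEL;
}

int main(void)
{
	printf("%d\n", need_commit(250, 100));		/* 1: period elapsed */
	printf("%d\n", need_commit(150, 100));		/* 0: within period */
	printf("%d\n", need_commit(5, 4294967000u));	/* 1: counter wrapped */
	return 0;
}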
+
+static void process_deferred_bios(struct cache *cache)
+{
+	unsigned long flags;
+	struct bio_list bios;
+	struct bio *bio;
+	struct prealloc structs;
+
+	memset(&structs, 0, sizeof(structs));
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	bio_list_merge(&bios, &cache->deferred_bios);
+	bio_list_init(&cache->deferred_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	while (!bio_list_empty(&bios)) {
+		/*
+		 * If we've got no free migration structs, and processing
+		 * this bio might require one, we pause until there are some
+		 * prepared mappings to process.
+		 */
+		if (prealloc_data_structs(cache, &structs)) {
+			spin_lock_irqsave(&cache->lock, flags);
+			bio_list_merge(&cache->deferred_bios, &bios);
+			spin_unlock_irqrestore(&cache->lock, flags);
+			break;
+		}
+
+		bio = bio_list_pop(&bios);
+
+		if (bio->bi_rw & REQ_FLUSH)
+			process_flush_bio(cache, bio);
+		else if (bio->bi_rw & REQ_DISCARD)
+			process_discard_bio(cache, bio);
+		else
+			process_bio(cache, &structs, bio);
+	}
+
+	prealloc_free_structs(cache, &structs);
+}
+
+static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
+{
+	unsigned long flags;
+	struct bio_list bios;
+	struct bio *bio;
+
+	bio_list_init(&bios);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	bio_list_merge(&bios, &cache->deferred_flush_bios);
+	bio_list_init(&cache->deferred_flush_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);