diff options
-rw-r--r-- | Documentation/device-mapper/cache.txt | 243 | ||||
-rw-r--r-- | drivers/md/Kconfig | 13 | ||||
-rw-r--r-- | drivers/md/Makefile | 2 | ||||
-rw-r--r-- | drivers/md/dm-bio-prison.c | 9 | ||||
-rw-r--r-- | drivers/md/dm-bio-prison.h | 11 | ||||
-rw-r--r-- | drivers/md/dm-cache-block-types.h | 54 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 1146 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.h | 142 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-internal.h | 124 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy.c | 161 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy.h | 228 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 2584 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-block-manager.c | 1 |
13 files changed, 4718 insertions, 0 deletions
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt new file mode 100644 index 00000000000..f50470abe24 --- /dev/null +++ b/Documentation/device-mapper/cache.txt @@ -0,0 +1,243 @@ +Introduction +============ + +dm-cache is a device mapper target written by Joe Thornber, Heinz +Mauelshagen, and Mike Snitzer. + +It aims to improve performance of a block device (eg, a spindle) by +dynamically migrating some of its data to a faster, smaller device +(eg, an SSD). + +This device-mapper solution allows us to insert this caching at +different levels of the dm stack, for instance above the data device for +a thin-provisioning pool. Caching solutions that are integrated more +closely with the virtual memory system should give better performance. + +The target reuses the metadata library used in the thin-provisioning +library. + +The decision as to what data to migrate and when is left to a plug-in +policy module. Several of these have been written as we experiment, +and we hope other people will contribute others for specific io +scenarios (eg. a vm image server). + +Glossary +======== + + Migration - Movement of the primary copy of a logical block from one + device to the other. + Promotion - Migration from slow device to fast device. + Demotion - Migration from fast device to slow device. + +The origin device always contains a copy of the logical block, which +may be out of date or kept in sync with the copy on the cache device +(depending on policy). + +Design +====== + +Sub-devices +----------- + +The target is constructed by passing three devices to it (along with +other parameters detailed later): + +1. An origin device - the big, slow one. + +2. A cache device - the small, fast one. + +3. A small metadata device - records which blocks are in the cache, + which are dirty, and extra hints for use by the policy object. 
+ This information could be put on the cache device, but having it + separate allows the volume manager to configure it differently, + e.g. as a mirror for extra robustness. + +Fixed block size +---------------- + +The origin is divided up into blocks of a fixed size. This block size +is configurable when you first create the cache. Typically we've been +using block sizes of 256k - 1024k. + +Having a fixed block size simplifies the target a lot. But it is +something of a compromise. For instance, a small part of a block may be +getting hit a lot, yet the whole block will be promoted to the cache. +So large block sizes are bad because they waste cache space. And small +block sizes are bad because they increase the amount of metadata (both +in core and on disk). + +Writeback/writethrough +---------------------- + +The cache has two modes, writeback and writethrough. + +If writeback, the default, is selected then a write to a block that is +cached will go only to the cache and the block will be marked dirty in +the metadata. + +If writethrough is selected then a write to a cached block will not +complete until it has hit both the origin and cache devices. Clean +blocks should remain clean. + +A simple cleaner policy is provided, which will clean (write back) all +dirty blocks in a cache. Useful for decommissioning a cache. + +Migration throttling +-------------------- + +Migrating data between the origin and cache device uses bandwidth. +The user can set a throttle to prevent more than a certain amount of +migration occurring at any one time. Currently we're not taking any +account of normal io traffic going to the devices. More work needs +doing here to avoid migrating during those peak io moments. + +For the time being, a message "migration_threshold <#sectors>" +can be used to set the maximum number of sectors being migrated, +the default being 204800 sectors (or 100MB). 
+ +Updating on-disk metadata +------------------------- + +On-disk metadata is committed every time a REQ_SYNC or REQ_FUA bio is +written. If no such requests are made then commits will occur every +second. This means the cache behaves like a physical disk that has a +write cache (the same is true of the thin-provisioning target). If +power is lost you may lose some recent writes. The metadata should +always be consistent in spite of any crash. + +The 'dirty' state for a cache block changes far too frequently for us +to keep updating it on the fly. So we treat it as a hint. In normal +operation it will be written when the dm device is suspended. If the +system crashes all cache blocks will be assumed dirty when restarted. + +Per-block policy hints +---------------------- + +Policy plug-ins can store a chunk of data per cache block. It's up to +the policy how big this chunk is, but it should be kept small. Like the +dirty flags this data is lost if there's a crash so a safe fallback +value should always be possible. + +For instance, the 'mq' policy, which is currently the default policy, +uses this facility to store the hit count of the cache blocks. If +there's a crash this information will be lost, which means the cache +may be less efficient until those hit counts are regenerated. + +Policy hints affect performance, not correctness. + +Policy messaging +---------------- + +Policies will have different tunables, specific to each one, so we +need a generic way of getting and setting these. Device-mapper +messages are used. Refer to cache-policies.txt. + +Discard bitset resolution +------------------------- + +We can avoid copying data during migration if we know the block has +been discarded. A prime example of this is when mkfs discards the +whole block device. We store a bitset tracking the discard state of +blocks. However, we allow this bitset to have a different block size +from the cache blocks. 
This is because we need to track the discard +state for all of the origin device (compare with the dirty bitset +which is just for the smaller cache device). + +Target interface +================ + +Constructor +----------- + + cache <metadata dev> <cache dev> <origin dev> <block size> + <#feature args> [<feature arg>]* + <policy> <#policy args> [policy args]* + + metadata dev : fast device holding the persistent metadata + cache dev : fast device holding cached data blocks + origin dev : slow device holding original data blocks + block size : cache unit size in sectors + + #feature args : number of feature arguments passed + feature args : writethrough. (The default is writeback.) + + policy : the replacement policy to use + #policy args : an even number of arguments corresponding to + key/value pairs passed to the policy + policy args : key/value pairs passed to the policy + E.g. 'sequential_threshold 1024' + See cache-policies.txt for details. + +Optional feature arguments are: + writethrough : write through caching that prohibits cache block + content from being different from origin block content. + Without this argument, the default behaviour is to write + back cache block contents later for performance reasons, + so they may differ from the corresponding origin blocks. + +A policy called 'default' is always registered. This is an alias for +the policy we currently think is giving best all round performance. + +As the default policy could vary between kernels, if you are relying on +the characteristics of a specific policy, always request it by name. 
+ +Status +------ + +<#used metadata blocks>/<#total metadata blocks> <#read hits> <#read misses> +<#write hits> <#write misses> <#demotions> <#promotions> <#blocks in cache> +<#dirty> <#features> <features>* <#core args> <core args>* <#policy args> +<policy args>* + +#used metadata blocks : Number of metadata blocks used +#total metadata blocks : Total number of metadata blocks +#read hits : Number of times a READ bio has been mapped + to the cache +#read misses : Number of times a READ bio has been mapped + to the origin +#write hits : Number of times a WRITE bio has been mapped + to the cache +#write misses : Number of times a WRITE bio has been + mapped to the origin +#demotions : Number of times a block has been removed + from the cache +#promotions : Number of times a block has been moved to + the cache +#blocks in cache : Number of blocks resident in the cache +#dirty : Number of blocks in the cache that differ + from the origin +#feature args : Number of feature args to follow +feature args : 'writethrough' (optional) +#core args : Number of core arguments (must be even) +core args : Key/value pairs for tuning the core + e.g. migration_threshold +#policy args : Number of policy arguments to follow (must be even) +policy args : Key/value pairs + e.g. 'sequential_threshold 1024' + +Messages +-------- + +Policies will have different tunables, specific to each one, so we +need a generic way of getting and setting these. Device-mapper +messages are used. (A sysfs interface would also be possible.) + +The message format is: + + <key> <value> + +E.g. 
+ dmsetup message my_cache 0 sequential_threshold 1024 + +Examples +======== + +The test suite can be found here: + +https://github.com/jthornber/thinp-test-suite + +dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ + /dev/mapper/ssd /dev/mapper/origin 512 1 writeback default 0' +dmsetup create my_cache --table '0 41943040 cache /dev/mapper/metadata \ + /dev/mapper/ssd /dev/mapper/origin 1024 1 writeback \ + mq 4 sequential_threshold 1024 random_threshold 8' diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 7cdf359d6b2..1a4fbcdb5ca 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -268,6 +268,19 @@ config DM_DEBUG_BLOCK_STACK_TRACING If unsure, say N. +config DM_CACHE + tristate "Cache target (EXPERIMENTAL)" + depends on BLK_DEV_DM + default n + select DM_PERSISTENT_DATA + select DM_BIO_PRISON + ---help--- + dm-cache attempts to improve performance of a block device by + moving frequently used data to a smaller, higher performance + device. Different 'policy' plugins can be used to change the + algorithms used to select which blocks are promoted, demoted, + cleaned etc. It supports writeback and writethrough modes. 
+ config DM_MIRROR tristate "Mirror target" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 94dce8b4932..24b52560f4d 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -11,6 +11,7 @@ dm-mirror-y += dm-raid1.o dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o +dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o md-mod-y += md.o bitmap.o raid456-y += raid5.o @@ -44,6 +45,7 @@ obj-$(CONFIG_DM_ZERO) += dm-zero.o obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o +obj-$(CONFIG_DM_CACHE) += dm-cache.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index 144067c95ab..85f0b707425 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -179,6 +179,15 @@ int dm_bio_detain(struct dm_bio_prison *prison, } EXPORT_SYMBOL_GPL(dm_bio_detain); +int dm_get_cell(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + return bio_detain(prison, key, NULL, cell_prealloc, cell_result); +} +EXPORT_SYMBOL_GPL(dm_get_cell); + /* * @inmates must have been initialised prior to this call */ diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index 981a02d3a05..3f833190ead 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -57,6 +57,17 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell); /* + * Creates, or retrieves a cell for the given key. + * + * Returns 1 if pre-existing cell returned, zero if new cell created using + * @cell_prealloc. 
+ */ +int dm_get_cell(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result); + +/* * An atomic op that combines retrieving a cell, and adding a bio to it. * * Returns 1 if the cell was already held, 0 if @inmate is the new holder. diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h new file mode 100644 index 00000000000..bed4ad4e1b7 --- /dev/null +++ b/drivers/md/dm-cache-block-types.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_BLOCK_TYPES_H +#define DM_CACHE_BLOCK_TYPES_H + +#include "persistent-data/dm-block-manager.h" + +/*----------------------------------------------------------------*/ + +/* + * It's helpful to get sparse to differentiate between indexes into the + * origin device, indexes into the cache device, and indexes into the + * discard bitset. + */ + +typedef dm_block_t __bitwise__ dm_oblock_t; +typedef uint32_t __bitwise__ dm_cblock_t; +typedef dm_block_t __bitwise__ dm_dblock_t; + +static inline dm_oblock_t to_oblock(dm_block_t b) +{ + return (__force dm_oblock_t) b; +} + +static inline dm_block_t from_oblock(dm_oblock_t b) +{ + return (__force dm_block_t) b; +} + +static inline dm_cblock_t to_cblock(uint32_t b) +{ + return (__force dm_cblock_t) b; +} + +static inline uint32_t from_cblock(dm_cblock_t b) +{ + return (__force uint32_t) b; +} + +static inline dm_dblock_t to_dblock(dm_block_t b) +{ + return (__force dm_dblock_t) b; +} + +static inline dm_block_t from_dblock(dm_dblock_t b) +{ + return (__force dm_block_t) b; +} + +#endif /* DM_CACHE_BLOCK_TYPES_H */ diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c new file mode 100644 index 00000000000..fbd3625f274 --- /dev/null +++ b/drivers/md/dm-cache-metadata.c @@ -0,0 +1,1146 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#include "dm-cache-metadata.h" + +#include "persistent-data/dm-array.h" +#include "persistent-data/dm-bitset.h" +#include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-space-map-disk.h" +#include "persistent-data/dm-transaction-manager.h" + +#include <linux/device-mapper.h> + +/*----------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "cache metadata" + +#define CACHE_SUPERBLOCK_MAGIC 06142003 +#define CACHE_SUPERBLOCK_LOCATION 0 +#define CACHE_VERSION 1 +#define CACHE_METADATA_CACHE_SIZE 64 + +/* + * 3 for btree insert + + * 2 for btree lookup used within space map + */ +#define CACHE_MAX_CONCURRENT_LOCKS 5 +#define SPACE_MAP_ROOT_SIZE 128 + +enum superblock_flag_bits { + /* for spotting crashes that would invalidate the dirty bitset */ + CLEAN_SHUTDOWN, +}; + +/* + * Each mapping from cache block -> origin block carries a set of flags. + */ +enum mapping_bits { + /* + * A valid mapping. Because we're using an array we clear this + * flag for a non-existent mapping. + */ + M_VALID = 1, + + /* + * The data on the cache is different from that on the origin. 
+ */ + M_DIRTY = 2 +}; + +struct cache_disk_superblock { + __le32 csum; + __le32 flags; + __le64 blocknr; + + __u8 uuid[16]; + __le64 magic; + __le32 version; + + __u8 policy_name[CACHE_POLICY_NAME_SIZE]; + __le32 policy_hint_size; + + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + __le64 mapping_root; + __le64 hint_root; + + __le64 discard_root; + __le64 discard_block_size; + __le64 discard_nr_blocks; + + __le32 data_block_size; + __le32 metadata_block_size; + __le32 cache_blocks; + + __le32 compat_flags; + __le32 compat_ro_flags; + __le32 incompat_flags; + + __le32 read_hits; + __le32 read_misses; + __le32 write_hits; + __le32 write_misses; +} __packed; + +struct dm_cache_metadata { + struct block_device *bdev; + struct dm_block_manager *bm; + struct dm_space_map *metadata_sm; + struct dm_transaction_manager *tm; + + struct dm_array_info info; + struct dm_array_info hint_info; + struct dm_disk_bitset discard_info; + + struct rw_semaphore root_lock; + dm_block_t root; + dm_block_t hint_root; + dm_block_t discard_root; + + sector_t discard_block_size; + dm_dblock_t discard_nr_blocks; + + sector_t data_block_size; + dm_cblock_t cache_blocks; + bool changed:1; + bool clean_when_opened:1; + + char policy_name[CACHE_POLICY_NAME_SIZE]; + size_t policy_hint_size; + struct dm_cache_statistics stats; +}; + +/*------------------------------------------------------------------- + * superblock validator + *-----------------------------------------------------------------*/ + +#define SUPERBLOCK_CSUM_XOR 9031977 + +static void sb_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t sb_block_size) +{ + struct cache_disk_superblock *disk_super = dm_block_data(b); + + disk_super->blocknr = cpu_to_le64(dm_block_location(b)); + disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); +} + +static int sb_check(struct dm_block_validator *v, + struct dm_block *b, + size_t sb_block_size) 
+{ + struct cache_disk_superblock *disk_super = dm_block_data(b); + __le32 csum_le; + + if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { + DMERR("sb_check failed: blocknr %llu: wanted %llu", + le64_to_cpu(disk_super->blocknr), + (unsigned long long)dm_block_location(b)); + return -ENOTBLK; + } + + if (le64_to_cpu(disk_super->magic) != CACHE_SUPERBLOCK_MAGIC) { + DMERR("sb_check failed: magic %llu: wanted %llu", + le64_to_cpu(disk_super->magic), + (unsigned long long)CACHE_SUPERBLOCK_MAGIC); + return -EILSEQ; + } + + csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); + if (csum_le != disk_super->csum) { + DMERR("sb_check failed: csum %u: wanted %u", + le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator sb_validator = { + .name = "superblock", + .prepare_for_write = sb_prepare_for_write, + .check = sb_check +}; + +/*----------------------------------------------------------------*/ + +static int superblock_read_lock(struct dm_cache_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock_zero(struct dm_cache_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock(struct dm_cache_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +/*----------------------------------------------------------------*/ + +static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) +{ + int r; + unsigned i; + struct dm_block *b; + __le64 *data_le, zero = cpu_to_le64(0); + unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64); + + /* + * We can't use a validator here - it may be all zeroes. 
+ */ + r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b); + if (r) + return r; + + data_le = dm_block_data(b); + *result = 1; + for (i = 0; i < sb_block_size; i++) { + if (data_le[i] != zero) { + *result = 0; + break; + } + } + + return dm_bm_unlock(b); +} + +static void __setup_mapping_info(struct dm_cache_metadata *cmd) +{ + struct dm_btree_value_type vt; + + vt.context = NULL; + vt.size = sizeof(__le64); + vt.inc = NULL; + vt.dec = NULL; + vt.equal = NULL; + dm_array_info_init(&cmd->info, cmd->tm, &vt); + + if (cmd->policy_hint_size) { + vt.size = sizeof(__le32); + dm_array_info_init(&cmd->hint_info, cmd->tm, &vt); + } +} + +static int __write_initial_superblock(struct dm_cache_metadata *cmd) +{ + int r; + struct dm_block *sblock; + size_t metadata_len; + struct cache_disk_superblock *disk_super; + sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT; + + /* FIXME: see if we can lose the max sectors limit */ + if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS) + bdev_size = DM_CACHE_METADATA_MAX_SECTORS; + + r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); + if (r < 0) + return r; + + r = dm_tm_pre_commit(cmd->tm); + if (r < 0) + return r; + + r = superblock_lock_zero(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + disk_super->flags = 0; + memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); + disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); + disk_super->version = cpu_to_le32(CACHE_VERSION); + memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE); + disk_super->policy_hint_size = 0; + + r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, + metadata_len); + if (r < 0) + goto bad_locked; + + disk_super->mapping_root = cpu_to_le64(cmd->root); + disk_super->hint_root = cpu_to_le64(cmd->hint_root); + disk_super->discard_root = cpu_to_le64(cmd->discard_root); + disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); + disk_super->discard_nr_blocks = 
cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); + disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); + disk_super->cache_blocks = cpu_to_le32(0); + memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); + + disk_super->read_hits = cpu_to_le32(0); + disk_super->read_misses = cpu_to_le32(0); + disk_super->write_hits = cpu_to_le32(0); + disk_super->write_misses = cpu_to_le32(0); + + return dm_tm_commit(cmd->tm, sblock); + +bad_locked: + dm_bm_unlock(sblock); + return r; +} + +static int __format_metadata(struct dm_cache_metadata *cmd) +{ + int r; + + r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &cmd->tm, &cmd->metadata_sm); + if (r < 0) { + DMERR("tm_create_with_sm failed"); + return r; + } + + __setup_mapping_info(cmd); + + r = dm_array_empty(&cmd->info, &cmd->root); + if (r < 0) + goto bad; + + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); + + r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root); + if (r < 0) + goto bad; + + cmd->discard_block_size = 0; + cmd->discard_nr_blocks = 0; + + r = __write_initial_superblock(cmd); + if (r) + goto bad; + + cmd->clean_when_opened = true; + return 0; + +bad: + dm_tm_destroy(cmd->tm); + dm_sm_destroy(cmd->metadata_sm); + + return r; +} + +static int __check_incompat_features(struct cache_disk_superblock *disk_super, + struct dm_cache_metadata *cmd) +{ + uint32_t features; + + features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; + if (features) { + DMERR("could not access metadata due to unsupported optional features (%lx).", + (unsigned long)features); + return -EINVAL; + } + + /* + * Check for read-only metadata to skip the following RDWR checks. 
+ */ + if (get_disk_ro(cmd->bdev->bd_disk)) + return 0; + + features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP; + if (features) { + DMERR("could not access metadata RDWR due to unsupported optional features (%lx).", + (unsigned long)features); + return -EINVAL; + } + + return 0; +} + +static int __open_metadata(struct dm_cache_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct cache_disk_superblock *disk_super; + unsigned long sb_flags; + + r = superblock_read_lock(cmd, &sblock); + if (r < 0) { + DMERR("couldn't read lock superblock"); + return r; + } + + disk_super = dm_block_data(sblock); + + r = __check_incompat_features(disk_super, cmd); + if (r < 0) + goto bad; + + r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + disk_super->metadata_space_map_root, + sizeof(disk_super->metadata_space_map_root), + &cmd->tm, &cmd->metadata_sm); + if (r < 0) { + DMERR("tm_open_with_sm failed"); + goto bad; + } + + __setup_mapping_info(cmd); + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); + sb_flags = le32_to_cpu(disk_super->flags); + cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); + return dm_bm_unlock(sblock); + +bad: + dm_bm_unlock(sblock); + return r; +} + +static int __open_or_format_metadata(struct dm_cache_metadata *cmd, + bool format_device) +{ + int r, unformatted; + + r = __superblock_all_zeroes(cmd->bm, &unformatted); + if (r) + return r; + + if (unformatted) + return format_device ? 
__format_metadata(cmd) : -EPERM; + + return __open_metadata(cmd); +} + +static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, + bool may_format_device) +{ + int r; + cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE, + CACHE_METADATA_CACHE_SIZE, + CACHE_MAX_CONCURRENT_LOCKS); + if (IS_ERR(cmd->bm)) { + DMERR("could not create block manager"); + return PTR_ERR(cmd->bm); + } + + r = __open_or_format_metadata(cmd, may_format_device); + if (r) + dm_block_manager_destroy(cmd->bm); + + return r; +} + +static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd) +{ + dm_sm_destroy(cmd->metadata_sm); + dm_tm_destroy(cmd->tm); + dm_block_manager_destroy(cmd->bm); +} + +typedef unsigned long (*flags_mutator)(unsigned long); + +static void update_flags(struct cache_disk_superblock *disk_super, + flags_mutator mutator) +{ + uint32_t sb_flags = mutator(le32_to_cpu(disk_super->flags)); + disk_super->flags = cpu_to_le32(sb_flags); +} + +static unsigned long set_clean_shutdown(unsigned long flags) +{ + set_bit(CLEAN_SHUTDOWN, &flags); + return flags; +} + +static unsigned long clear_clean_shutdown(unsigned long flags) +{ + clear_bit(CLEAN_SHUTDOWN, &flags); + return flags; +} + +static void read_superblock_fields(struct dm_cache_metadata *cmd, + struct cache_disk_superblock *disk_super) +{ + cmd->root = le64_to_cpu(disk_super->mapping_root); + cmd->hint_root = le64_to_cpu(disk_super->hint_root); + cmd->discard_root = le64_to_cpu(disk_super->discard_root); + cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size); + cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks)); + cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); + cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); + strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); + cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size); + + cmd->stats.read_hits = 
le32_to_cpu(disk_super->read_hits); + cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses); + cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits); + cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses); + + cmd->changed = false; +} + +/* + * The mutator updates the superblock flags. + */ +static int __begin_transaction_flags(struct dm_cache_metadata *cmd, + flags_mutator mutator) +{ + int r; + struct cache_disk_superblock *disk_super; + struct dm_block *sblock; + + r = superblock_lock(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + update_flags(disk_super, mutator); + read_superblock_fields(cmd, disk_super); + + return dm_bm_flush_and_unlock(cmd->bm, sblock); +} + +static int __begin_transaction(struct dm_cache_metadata *cmd) +{ + int r; + struct cache_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We re-read the superblock every time. Shouldn't need to do this + * really. + */ + r = superblock_read_lock(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + read_superblock_fields(cmd, disk_super); + dm_bm_unlock(sblock); + + return 0; +} + +static int __commit_transaction(struct dm_cache_metadata *cmd, + flags_mutator mutator) +{ + int r; + size_t metadata_len; + struct cache_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We need to know if the cache_disk_superblock exceeds a 512-byte sector. 
+ */ + BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512); + + r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, + &cmd->discard_root); + if (r) + return r; + + r = dm_tm_pre_commit(cmd->tm); + if (r < 0) + return r; + + r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); + if (r < 0) + return r; + + r = superblock_lock(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + + if (mutator) + update_flags(disk_super, mutator); + + disk_super->mapping_root = cpu_to_le64(cmd->root); + disk_super->hint_root = cpu_to_le64(cmd->hint_root); + disk_super->discard_root = cpu_to_le64(cmd->discard_root); + disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); + disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); + disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); + strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); + + disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); + disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); + disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits); + disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses); + + r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root, + metadata_len); + if (r < 0) { + dm_bm_unlock(sblock); + return r; + } + + return dm_tm_commit(cmd->tm, sblock); +} + +/*----------------------------------------------------------------*/ + +/* + * The mappings are held in a dm-array that has 64-bit values stored in + * little-endian format. The index is the cblock, the high 48bits of the + * value are the oblock and the low 16 bit the flags. 
+ */ +#define FLAGS_MASK ((1 << 16) - 1) + +static __le64 pack_value(dm_oblock_t block, unsigned flags) +{ + uint64_t value = from_oblock(block); + value <<= 16; + value = value | (flags & FLAGS_MASK); + return cpu_to_le64(value); +} + +static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags) +{ + uint64_t value = le64_to_cpu(value_le); + uint64_t b = value >> 16; + *block = to_oblock(b); + *flags = value & FLAGS_MASK; +} + +/*----------------------------------------------------------------*/ + +struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size) +{ + int r; + struct dm_cache_metadata *cmd; + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) { + DMERR("could not allocate metadata struct"); + return NULL; + } + + init_rwsem(&cmd->root_lock); + cmd->bdev = bdev; + cmd->data_block_size = data_block_size; + cmd->cache_blocks = 0; + cmd->policy_hint_size = policy |