Diffstat (limited to 'drivers/md')
145 files changed, 74587 insertions, 13791 deletions
diff --git a/drivers/md/.gitignore b/drivers/md/.gitignore deleted file mode 100644 index a7afec6b19c..00000000000 --- a/drivers/md/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -mktables -raid6altivec*.c -raid6int*.c -raid6tables.c diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 07d92c11b5d..5bdedf6df15 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -30,6 +30,20 @@ config BLK_DEV_MD If unsure, say N. +config MD_AUTODETECT + bool "Autodetect RAID arrays during kernel boot" + depends on BLK_DEV_MD=y + default y + ---help--- + If you say Y here, then the kernel will try to autodetect raid + arrays as part of its boot process. + + If you don't use raid and say Y, this autodetection can cause + a several-second delay in the boot time due to various + synchronisation steps that are part of this step. + + If unsure, say Y. + config MD_LINEAR tristate "Linear (append) mode" depends on BLK_DEV_MD @@ -86,8 +100,8 @@ config MD_RAID1 If unsure, say Y. config MD_RAID10 - tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" - depends on BLK_DEV_MD && EXPERIMENTAL + tristate "RAID-10 (mirrored striping) mode" + depends on BLK_DEV_MD ---help--- RAID-10 provides a combination of striping (RAID-0) and mirroring (RAID-1) with easier configuration and more flexible @@ -107,8 +121,11 @@ config MD_RAID10 config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD + select RAID6_PQ select ASYNC_MEMCPY select ASYNC_XOR + select ASYNC_PQ + select ASYNC_RAID6_RECOV ---help--- A RAID-5 set of N drives with a capacity of C MB per drive provides the capacity of C * (N - 1) MB, and protects against a failure @@ -137,44 +154,14 @@ config MD_RAID456 If unsure, say Y. -config MD_RAID5_RESHAPE - bool "Support adding drives to a raid-5 array" - depends on MD_RAID456 - default y - ---help--- - A RAID-5 set can be expanded by adding extra drives. This - requires "restriping" the array which means (almost) every - block must be written to a different place. - - This option allows such restriping to be done while the array - is online. - - You will need mdadm version 2.4.1 or later to use this - feature safely. During the early stage of reshape there is - a critical section where live data is being over-written. A - crash during this time needs extra care for recovery. The - newer mdadm takes a copy of the data in the critical section - and will restore it, if necessary, after a crash. - - The mdadm usage is e.g. - mdadm --grow /dev/md1 --raid-disks=6 - to grow '/dev/md1' to having 6 disks. - - Note: The array can only be expanded, not contracted. - There should be enough spares already present to make the new - array workable. - - If unsure, say Y. - config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD help - Multipath-IO is the ability of certain devices to address the same - physical disk over multiple 'IO paths'. The code ensures that such - paths can be defined and handled at runtime, and ensures that a - transparent failover to the backup path(s) happens if a IO errors - arrives on the primary path. + MD_MULTIPATH provides a simple multi-path personality for use + the MD framework. It is not under active development. New + projects should consider using DM_MULTIPATH which has more + features and more testing. If unsure, say N. @@ -187,8 +174,14 @@ config MD_FAULTY In unsure, say N. 
+source "drivers/md/bcache/Kconfig" + +config BLK_DEV_DM_BUILTIN + boolean + config BLK_DEV_DM tristate "Device mapper support" + select BLK_DEV_DM_BUILTIN ---help--- Device-mapper is a low level volume manager. It works by allowing people to specify mappings for ranges of logical sectors. Various @@ -210,6 +203,23 @@ config DM_DEBUG If unsure, say N. +config DM_BUFIO + tristate + depends on BLK_DEV_DM + ---help--- + This interface allows you to do buffered I/O on a device and acts + as a cache, holding recently-read blocks in memory and performing + delayed writes. + +config DM_BIO_PRISON + tristate + depends on BLK_DEV_DM + ---help--- + Some bio locking schemes used by other device-mapper targets + including thin provisioning. + +source "drivers/md/persistent-data/Kconfig" + config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM @@ -232,9 +242,60 @@ config DM_CRYPT config DM_SNAPSHOT tristate "Snapshot target" depends on BLK_DEV_DM + select DM_BUFIO ---help--- Allow volume managers to take writable snapshots of a device. +config DM_THIN_PROVISIONING + tristate "Thin provisioning target" + depends on BLK_DEV_DM + select DM_PERSISTENT_DATA + select DM_BIO_PRISON + ---help--- + Provides thin provisioning and snapshots that share a data store. + +config DM_CACHE + tristate "Cache target (EXPERIMENTAL)" + depends on BLK_DEV_DM + default n + select DM_PERSISTENT_DATA + select DM_BIO_PRISON + ---help--- + dm-cache attempts to improve performance of a block device by + moving frequently used data to a smaller, higher performance + device. Different 'policy' plugins can be used to change the + algorithms used to select which blocks are promoted, demoted, + cleaned etc. It supports writeback and writethrough modes. + +config DM_CACHE_MQ + tristate "MQ Cache Policy (EXPERIMENTAL)" + depends on DM_CACHE + default y + ---help--- + A cache policy that uses a multiqueue ordered by recent hit + count to select which blocks should be promoted and demoted. + This is meant to be a general purpose policy. It prioritises + reads over writes. + +config DM_CACHE_CLEANER + tristate "Cleaner Cache Policy (EXPERIMENTAL)" + depends on DM_CACHE + default y + ---help--- + A simple cache policy that writes back all data to the + origin. Used when decommissioning a dm-cache. + +config DM_ERA + tristate "Era target (EXPERIMENTAL)" + depends on BLK_DEV_DM + default n + select DM_PERSISTENT_DATA + select DM_BIO_PRISON + ---help--- + dm-era tracks which parts of a block device are written to + over time. Useful for maintaining cache coherency when using + vendor snapshots. + config DM_MIRROR tristate "Mirror target" depends on BLK_DEV_DM @@ -242,6 +303,43 @@ config DM_MIRROR Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. +config DM_LOG_USERSPACE + tristate "Mirror userspace logging" + depends on DM_MIRROR && NET + select CONNECTOR + ---help--- + The userspace logging module provides a mechanism for + relaying the dm-dirty-log API to userspace. Log designs + which are more suited to userspace implementation (e.g. + shared storage logs) or experimental logs can be implemented + by leveraging this framework. 
+ +config DM_RAID + tristate "RAID 1/4/5/6/10 target" + depends on BLK_DEV_DM + select MD_RAID1 + select MD_RAID10 + select MD_RAID456 + select BLK_DEV_MD + ---help--- + A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings + + A RAID-5 set of N drives with a capacity of C MB per drive provides + the capacity of C * (N - 1) MB, and protects against a failure + of a single drive. For a given sector (row) number, (N - 1) drives + contain data sectors, and one drive contains the parity protection. + For a RAID-4 set, the parity blocks are present on a single drive, + while a RAID-5 set distributes the parity across the drives in one + of the available parity distribution methods. + + A RAID-6 set of N drives with a capacity of C MB per drive + provides the capacity of C * (N - 2) MB, and protects + against a failure of any two drives. For a given sector + (row) number, (N - 2) drives contain data sectors, and two + drives contains two independent redundancy syndromes. Like + RAID-5, RAID-6 distributes the syndromes across the drives + in one of the available parity distribution methods. + config DM_ZERO tristate "Zero target" depends on BLK_DEV_DM @@ -260,9 +358,28 @@ config DM_MULTIPATH ---help--- Allow volume managers to support multipath hardware. +config DM_MULTIPATH_QL + tristate "I/O Path Selector based on the number of in-flight I/Os" + depends on DM_MULTIPATH + ---help--- + This path selector is a dynamic load balancer which selects + the path with the least number of in-flight I/Os. + + If unsure, say N. + +config DM_MULTIPATH_ST + tristate "I/O Path Selector based on the service time" + depends on DM_MULTIPATH + ---help--- + This path selector is a dynamic load balancer which selects + the path expected to complete the incoming I/O in the shortest + time. + + If unsure, say N. + config DM_DELAY - tristate "I/O delaying target (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + tristate "I/O delaying target" + depends on BLK_DEV_DM ---help--- A target that delays reads and/or writes and can send them to different devices. Useful for testing. @@ -270,9 +387,49 @@ config DM_DELAY If unsure, say N. config DM_UEVENT - bool "DM uevents (EXPERIMENTAL)" - depends on BLK_DEV_DM && EXPERIMENTAL + bool "DM uevents" + depends on BLK_DEV_DM ---help--- Generate udev events for DM events. +config DM_FLAKEY + tristate "Flakey target" + depends on BLK_DEV_DM + ---help--- + A target that intermittently fails I/O for debugging purposes. + +config DM_VERITY + tristate "Verity target support" + depends on BLK_DEV_DM + select CRYPTO + select CRYPTO_HASH + select DM_BUFIO + ---help--- + This device-mapper target creates a read-only device that + transparently validates the data on one underlying device against + a pre-generated tree of cryptographic checksums stored on a second + device. + + You'll need to activate the digests you're going to use in the + cryptoapi configuration. + + To compile this code as a module, choose M here: the module will + be called dm-verity. + + If unsure, say N. + +config DM_SWITCH + tristate "Switch target support (EXPERIMENTAL)" + depends on BLK_DEV_DM + ---help--- + This device-mapper target creates a device that supports an arbitrary + mapping of fixed-size regions of I/O across a fixed set of paths. + The path used for any specific region can be switched dynamically + by sending the target a message. + + To compile this code as a module, choose M here: the module will + be called dm-switch. + + If unsure, say N. 
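The DM_RAID help text earlier in this hunk (like the MD_RAID456 text before it) states the usable-capacity rule: a RAID-4/5 set of N drives of C MB each yields C * (N - 1) MB, and RAID-6 yields C * (N - 2) MB. As a quick worked check of that arithmetic, here is a standalone C snippet (illustration only, not part of this patch; raid_capacity_mb is a made-up helper):

#include <stdio.h>

/* Per the Kconfig help text: one drive's worth of space goes to parity
 * for RAID-4/5, two drives' worth for RAID-6. */
static unsigned long raid_capacity_mb(unsigned int n_drives,
				      unsigned long c_mb_per_drive,
				      unsigned int parity_drives)
{
	if (n_drives <= parity_drives)
		return 0;
	return c_mb_per_drive * (n_drives - parity_drives);
}

int main(void)
{
	/* e.g. six 1048576 MB (1 TiB) drives */
	printf("RAID-5: %lu MB usable\n", raid_capacity_mb(6, 1048576, 1));
	printf("RAID-6: %lu MB usable\n", raid_capacity_mb(6, 1048576, 2));
	return 0;
}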
+ endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f1ef33dfd8c..a2da532b1c2 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -2,19 +2,21 @@ # Makefile for the kernel software RAID and LVM drivers. # -dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ - dm-ioctl.o dm-io.o dm-kcopyd.o -dm-multipath-objs := dm-path-selector.o dm-mpath.o -dm-snapshot-objs := dm-snap.o dm-exception-store.o -dm-mirror-objs := dm-raid1.o -md-mod-objs := md.o bitmap.o -raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \ - raid6int1.o raid6int2.o raid6int4.o \ - raid6int8.o raid6int16.o raid6int32.o \ - raid6altivec1.o raid6altivec2.o raid6altivec4.o \ - raid6altivec8.o \ - raid6mmx.o raid6sse1.o raid6sse2.o -hostprogs-y := mktables +dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ + dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o +dm-multipath-y += dm-path-selector.o dm-mpath.o +dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ + dm-snap-persistent.o +dm-mirror-y += dm-raid1.o +dm-log-userspace-y \ + += dm-log-userspace-base.o dm-log-userspace-transfer.o +dm-thin-pool-y += dm-thin.o dm-thin-metadata.o +dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o +dm-cache-mq-y += dm-cache-policy-mq.o +dm-cache-cleaner-y += dm-cache-policy-cleaner.o +dm-era-y += dm-era-target.o +md-mod-y += md.o bitmap.o +raid456-y += raid5.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise @@ -28,84 +30,32 @@ obj-$(CONFIG_MD_RAID10) += raid10.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o +obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o +obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o +obj-$(CONFIG_DM_BUFIO) += dm-bufio.o +obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o +obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o +obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o +obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o +obj-$(CONFIG_DM_SWITCH) += dm-switch.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o -obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o +obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ +obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o +obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o - -quiet_cmd_unroll = UNROLL $@ - cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \ - < $< > $@ || ( rm -f $@ && exit 1 ) - -ifeq ($(CONFIG_ALTIVEC),y) -altivec_flags := -maltivec -mabi=altivec -endif +obj-$(CONFIG_DM_RAID) += dm-raid.o +obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o +obj-$(CONFIG_DM_VERITY) += dm-verity.o +obj-$(CONFIG_DM_CACHE) += dm-cache.o +obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o +obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o +obj-$(CONFIG_DM_ERA) += dm-era.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o endif - -targets += raid6int1.c -$(obj)/raid6int1.c: UNROLL := 1 -$(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -targets += raid6int2.c -$(obj)/raid6int2.c: UNROLL := 2 -$(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -targets += raid6int4.c -$(obj)/raid6int4.c: UNROLL := 4 -$(obj)/raid6int4.c: 
$(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -targets += raid6int8.c -$(obj)/raid6int8.c: UNROLL := 8 -$(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -targets += raid6int16.c -$(obj)/raid6int16.c: UNROLL := 16 -$(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -targets += raid6int32.c -$(obj)/raid6int32.c: UNROLL := 32 -$(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec1.o += $(altivec_flags) -targets += raid6altivec1.c -$(obj)/raid6altivec1.c: UNROLL := 1 -$(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec2.o += $(altivec_flags) -targets += raid6altivec2.c -$(obj)/raid6altivec2.c: UNROLL := 2 -$(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec4.o += $(altivec_flags) -targets += raid6altivec4.c -$(obj)/raid6altivec4.c: UNROLL := 4 -$(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -CFLAGS_raid6altivec8.o += $(altivec_flags) -targets += raid6altivec8.c -$(obj)/raid6altivec8.c: UNROLL := 8 -$(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE - $(call if_changed,unroll) - -quiet_cmd_mktable = TABLE $@ - cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) - -targets += raid6tables.c -$(obj)/raid6tables.c: $(obj)/mktables FORCE - $(call if_changed,mktable) diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig new file mode 100644 index 00000000000..4d200883c50 --- /dev/null +++ b/drivers/md/bcache/Kconfig @@ -0,0 +1,26 @@ + +config BCACHE + tristate "Block device as cache" + ---help--- + Allows a block device to be used as cache for other devices; uses + a btree for indexing and the layout is optimized for SSDs. + + See Documentation/bcache.txt for details. + +config BCACHE_DEBUG + bool "Bcache debugging" + depends on BCACHE + ---help--- + Don't select this option unless you're a developer + + Enables extra debugging tools, allows expensive runtime checks to be + turned on. + +config BCACHE_CLOSURES_DEBUG + bool "Debug closures" + depends on BCACHE + select DEBUG_FS + ---help--- + Keeps all active closures in a linked list and provides a debugfs + interface to list them, which makes it possible to see asynchronous + operations that get stuck. diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile new file mode 100644 index 00000000000..c488b846f83 --- /dev/null +++ b/drivers/md/bcache/Makefile @@ -0,0 +1,8 @@ + +obj-$(CONFIG_BCACHE) += bcache.o + +bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ + io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o + +CFLAGS_request.o += -Iblock diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c new file mode 100644 index 00000000000..443d03fbac4 --- /dev/null +++ b/drivers/md/bcache/alloc.c @@ -0,0 +1,696 @@ +/* + * Primary bucket allocation code + * + * Copyright 2012 Google, Inc. + * + * Allocation in bcache is done in terms of buckets: + * + * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in + * btree pointers - they must match for the pointer to be considered valid. + * + * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a + * bucket simply by incrementing its gen. 
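The paragraph above is the heart of bcache's allocation scheme: a btree pointer embeds an 8-bit gen, and it is only treated as valid while that gen matches the gen of the bucket it points into, so reclaiming a clean bucket is just one increment. A minimal userspace sketch of that rule, with hypothetical toy_* names (this is not bcache code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_bucket  { uint8_t gen; };
struct toy_pointer { unsigned int bucket; uint8_t gen; };

/* A pointer is valid only while its embedded gen matches the bucket's gen. */
static bool toy_ptr_valid(const struct toy_bucket *buckets,
			  const struct toy_pointer *p)
{
	return buckets[p->bucket].gen == p->gen;
}

/* "Freeing" a bucket: bump its gen so every existing pointer goes stale. */
static void toy_invalidate(struct toy_bucket *b)
{
	b->gen++;
}

int main(void)
{
	struct toy_bucket buckets[4] = { { 0 }, { 0 }, { 0 }, { 0 } };
	struct toy_pointer p = { .bucket = 2, .gen = buckets[2].gen };

	printf("valid before invalidate: %d\n", toy_ptr_valid(buckets, &p)); /* 1 */
	toy_invalidate(&buckets[2]);
	printf("valid after invalidate:  %d\n", toy_ptr_valid(buckets, &p)); /* 0 */
	return 0;
}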
+ * + * The gens (along with the priorities; it's really the gens are important but + * the code is named as if it's the priorities) are written in an arbitrary list + * of buckets on disk, with a pointer to them in the journal header. + * + * When we invalidate a bucket, we have to write its new gen to disk and wait + * for that write to complete before we use it - otherwise after a crash we + * could have pointers that appeared to be good but pointed to data that had + * been overwritten. + * + * Since the gens and priorities are all stored contiguously on disk, we can + * batch this up: We fill up the free_inc list with freshly invalidated buckets, + * call prio_write(), and when prio_write() finishes we pull buckets off the + * free_inc list and optionally discard them. + * + * free_inc isn't the only freelist - if it was, we'd often to sleep while + * priorities and gens were being written before we could allocate. c->free is a + * smaller freelist, and buckets on that list are always ready to be used. + * + * If we've got discards enabled, that happens when a bucket moves from the + * free_inc list to the free list. + * + * There is another freelist, because sometimes we have buckets that we know + * have nothing pointing into them - these we can reuse without waiting for + * priorities to be rewritten. These come from freed btree nodes and buckets + * that garbage collection discovered no longer had valid keys pointing into + * them (because they were overwritten). That's the unused list - buckets on the + * unused list move to the free list, optionally being discarded in the process. + * + * It's also important to ensure that gens don't wrap around - with respect to + * either the oldest gen in the btree or the gen on disk. This is quite + * difficult to do in practice, but we explicitly guard against it anyways - if + * a bucket is in danger of wrapping around we simply skip invalidating it that + * time around, and we garbage collect or rewrite the priorities sooner than we + * would have otherwise. + * + * bch_bucket_alloc() allocates a single bucket from a specific cache. + * + * bch_bucket_alloc_set() allocates one or more buckets from different caches + * out of a cache set. + * + * free_some_buckets() drives all the processes described above. It's called + * from bch_bucket_alloc() and a few other places that need to make sure free + * buckets are ready. + * + * invalidate_buckets_(lru|fifo)() find buckets that are available to be + * invalidated, and then invalidate them and stick them on the free_inc list - + * in either lru or fifo order. 
+ */ + +#include "bcache.h" +#include "btree.h" + +#include <linux/blkdev.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/random.h> +#include <trace/events/bcache.h> + +/* Bucket heap / gen */ + +uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) +{ + uint8_t ret = ++b->gen; + + ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); + WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); + + return ret; +} + +void bch_rescale_priorities(struct cache_set *c, int sectors) +{ + struct cache *ca; + struct bucket *b; + unsigned next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned i; + int r; + + atomic_sub(sectors, &c->rescale); + + do { + r = atomic_read(&c->rescale); + + if (r >= 0) + return; + } while (atomic_cmpxchg(&c->rescale, r, r + next) != r); + + mutex_lock(&c->bucket_lock); + + c->min_prio = USHRT_MAX; + + for_each_cache(ca, c, i) + for_each_bucket(b, ca) + if (b->prio && + b->prio != BTREE_PRIO && + !atomic_read(&b->pin)) { + b->prio--; + c->min_prio = min(c->min_prio, b->prio); + } + + mutex_unlock(&c->bucket_lock); +} + +/* + * Background allocation thread: scans for buckets to be invalidated, + * invalidates them, rewrites prios/gens (marking them as invalidated on disk), + * then optionally issues discard commands to the newly free buckets, then puts + * them on the various freelists. + */ + +static inline bool can_inc_bucket_gen(struct bucket *b) +{ + return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX; +} + +bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) +{ + BUG_ON(!ca->set->gc_mark_valid); + + return (!GC_MARK(b) || + GC_MARK(b) == GC_MARK_RECLAIMABLE) && + !atomic_read(&b->pin) && + can_inc_bucket_gen(b); +} + +void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + lockdep_assert_held(&ca->set->bucket_lock); + BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE); + + if (GC_SECTORS_USED(b)) + trace_bcache_invalidate(ca, b - ca->buckets); + + bch_inc_gen(ca, b); + b->prio = INITIAL_PRIO; + atomic_inc(&b->pin); +} + +static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + __bch_invalidate_one_bucket(ca, b); + + fifo_push(&ca->free_inc, b - ca->buckets); +} + +/* + * Determines what order we're going to reuse buckets, smallest bucket_prio() + * first: we also take into account the number of sectors of live data in that + * bucket, and in order for that multiply to make sense we have to scale bucket + * + * Thus, we scale the bucket priorities so that the bucket with the smallest + * prio is worth 1/8th of what INITIAL_PRIO is worth. 
+ */ + +#define bucket_prio(b) \ +({ \ + unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + \ + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ +}) + +#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) +#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) + +static void invalidate_buckets_lru(struct cache *ca) +{ + struct bucket *b; + ssize_t i; + + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (!bch_can_invalidate_bucket(ca, b)) + continue; + + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, b, bucket_max_cmp); + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_max_cmp); + } + } + + for (i = ca->heap.used / 2 - 1; i >= 0; --i) + heap_sift(&ca->heap, i, bucket_min_cmp); + + while (!fifo_full(&ca->free_inc)) { + if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { + /* + * We don't want to be calling invalidate_buckets() + * multiple times when it can't do anything + */ + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + return; + } + + bch_invalidate_one_bucket(ca, b); + } +} + +static void invalidate_buckets_fifo(struct cache *ca) +{ + struct bucket *b; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + if (ca->fifo_last_bucket < ca->sb.first_bucket || + ca->fifo_last_bucket >= ca->sb.nbuckets) + ca->fifo_last_bucket = ca->sb.first_bucket; + + b = ca->buckets + ca->fifo_last_bucket++; + + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); + + if (++checked >= ca->sb.nbuckets) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + return; + } + } +} + +static void invalidate_buckets_random(struct cache *ca) +{ + struct bucket *b; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + size_t n; + get_random_bytes(&n, sizeof(n)); + + n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); + n += ca->sb.first_bucket; + + b = ca->buckets + n; + + if (bch_can_invalidate_bucket(ca, b)) + bch_invalidate_one_bucket(ca, b); + + if (++checked >= ca->sb.nbuckets / 2) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + return; + } + } +} + +static void invalidate_buckets(struct cache *ca) +{ + BUG_ON(ca->invalidate_needs_gc); + + switch (CACHE_REPLACEMENT(&ca->sb)) { + case CACHE_REPLACEMENT_LRU: + invalidate_buckets_lru(ca); + break; + case CACHE_REPLACEMENT_FIFO: + invalidate_buckets_fifo(ca); + break; + case CACHE_REPLACEMENT_RANDOM: + invalidate_buckets_random(ca); + break; + } +} + +#define allocator_wait(ca, cond) \ +do { \ + while (1) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (cond) \ + break; \ + \ + mutex_unlock(&(ca)->set->bucket_lock); \ + if (kthread_should_stop()) \ + return 0; \ + \ + try_to_freeze(); \ + schedule(); \ + mutex_lock(&(ca)->set->bucket_lock); \ + } \ + __set_current_state(TASK_RUNNING); \ +} while (0) + +static int bch_allocator_push(struct cache *ca, long bucket) +{ + unsigned i; + + /* Prios/gens are actually the most important reserve */ + if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) + return true; + + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) + return true; + + return false; +} + +static int bch_allocator_thread(void *arg) +{ + struct cache *ca = arg; + + mutex_lock(&ca->set->bucket_lock); + + while (1) { + /* + * First, we pull buckets off of the unused and free_inc lists, + * possibly issue discards to them, then we add the bucket to + * the free list: + */ + while (!fifo_empty(&ca->free_inc)) { + long bucket; + + fifo_pop(&ca->free_inc, bucket); + + if 
(ca->discard) { + mutex_unlock(&ca->set->bucket_lock); + blkdev_issue_discard(ca->bdev, + bucket_to_sector(ca->set, bucket), + ca->sb.block_size, GFP_KERNEL, 0); + mutex_lock(&ca->set->bucket_lock); + } + + allocator_wait(ca, bch_allocator_push(ca, bucket)); + wake_up(&ca->set->btree_cache_wait); + wake_up(&ca->set->bucket_wait); + } + + /* + * We've run out of free buckets, we need to find some buckets + * we can invalidate. First, invalidate them in memory and add + * them to the free_inc list: + */ + +retry_invalidate: + allocator_wait(ca, ca->set->gc_mark_valid && + !ca->invalidate_needs_gc); + invalidate_buckets(ca); + + /* + * Now, we write their new gens to disk so we can start writing + * new stuff to them: + */ + allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); + if (CACHE_SYNC(&ca->set->sb)) { + /* + * This could deadlock if an allocation with a btree + * node locked ever blocked - having the btree node + * locked would block garbage collection, but here we're + * waiting on garbage collection before we invalidate + * and free anything. + * + * But this should be safe since the btree code always + * uses btree_check_reserve() before allocating now, and + * if it fails it blocks without btree nodes locked. + */ + if (!fifo_full(&ca->free_inc)) + goto retry_invalidate; + + bch_prio_write(ca); + } + } +} + +/* Allocation */ + +long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) +{ + DEFINE_WAIT(w); + struct bucket *b; + long r; + + /* fastpath */ + if (fifo_pop(&ca->free[RESERVE_NONE], r) || + fifo_pop(&ca->free[reserve], r)) + goto out; + + if (!wait) { + trace_bcache_alloc_fail(ca, reserve); + return -1; + } + + do { + prepare_to_wait(&ca->set->bucket_wait, &w, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&ca->set->bucket_lock); + schedule(); + mutex_lock(&ca->set->bucket_lock); + } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && + !fifo_pop(&ca->free[reserve], r)); + + finish_wait(&ca->set->bucket_wait, &w); +out: + wake_up_process(ca->alloc_thread); + + trace_bcache_alloc(ca, reserve); + + if (expensive_debug_checks(ca->set)) { + size_t iter; + long i; + unsigned j; + + for (iter = 0; iter < prio_buckets(ca) * 2; iter++) + BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each(i, &ca->free[j], iter) + BUG_ON(i == r); + fifo_for_each(i, &ca->free_inc, iter) + BUG_ON(i == r); + } + + b = ca->buckets + r; + + BUG_ON(atomic_read(&b->pin) != 1); + + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); + + if (reserve <= RESERVE_PRIO) { + SET_GC_MARK(b, GC_MARK_METADATA); + SET_GC_MOVE(b, 0); + b->prio = BTREE_PRIO; + } else { + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_MOVE(b, 0); + b->prio = INITIAL_PRIO; + } + + return r; +} + +void __bch_bucket_free(struct cache *ca, struct bucket *b) +{ + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); +} + +void bch_bucket_free(struct cache_set *c, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + __bch_bucket_free(PTR_CACHE(c, k, i), + PTR_BUCKET(c, k, i)); +} + +int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, + struct bkey *k, int n, bool wait) +{ + int i; + + lockdep_assert_held(&c->bucket_lock); + BUG_ON(!n || n > c->caches_loaded || n > 8); + + bkey_init(k); + + /* sort by free space/prio of oldest data in caches */ + + for (i = 0; i < n; i++) { + struct cache *ca = c->cache_by_alloc[i]; + long b = bch_bucket_alloc(ca, reserve, wait); + + if (b == -1) + goto err; + + k->ptr[i] = PTR(ca->buckets[b].gen, + bucket_to_sector(c, b), + 
ca->sb.nr_this_dev); + + SET_KEY_PTRS(k, i + 1); + } + + return 0; +err: + bch_bucket_free(c, k); + bkey_put(c, k); + return -1; +} + +int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, + struct bkey *k, int n, bool wait) +{ + int ret; + mutex_lock(&c->bucket_lock); + ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); + mutex_unlock(&c->bucket_lock); + return ret; +} + +/* Sector allocator */ + +struct open_bucket { + struct list_head list; + unsigned last_write_point; + unsigned sectors_free; + BKEY_PADDED(key); +}; + +/* + * We keep multiple buckets open for writes, and try to segregate different + * write streams for better cache utilization: first we look for a bucket where + * the last write to it was sequential with the current write, and failing that + * we look for a bucket that was last used by the same task. + * + * The ideas is if you've got multiple tasks pulling data into the cache at the + * same time, you'll get better cache utilization if you try to segregate their + * data and preserve locality. + * + * For example, say you've starting Firefox at the same time you're copying a + * bunch of files. Firefox will likely end up being fairly hot and stay in the + * cache awhile, but the data you copied might not be; if you wrote all that + * data to the same buckets it'd get invalidated at the same time. + * + * Both of those tasks will be doing fairly random IO so we can't rely on + * detecting sequential IO to segregate their data, but going off of the task + * should be a sane heuristic. + */ +static struct open_bucket *pick_data_bucket(struct cache_set *c, + const struct bkey *search, + unsigned write_point, + struct bkey *alloc) +{ + struct open_bucket *ret, *ret_task = NULL; + + list_for_each_entry_reverse(ret, &c->data_buckets, list) + if (!bkey_cmp(&ret->key, search)) + goto found; + else if (ret->last_write_point == write_point) + ret_task = ret; + + ret = ret_task ?: list_first_entry(&c->data_buckets, + struct open_bucket, list); +found: + if (!ret->sectors_free && KEY_PTRS(alloc)) { + ret->sectors_free = c->sb.bucket_size; + bkey_copy(&ret->key, alloc); + bkey_init(alloc); + } + + if (!ret->sectors_free) + ret = NULL; + + return ret; +} + +/* + * Allocates some space in the cache to write to, and k to point to the newly + * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the + * end of the newly allocated space). + * + * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many + * sectors were actually allocated. + * + * If s->writeback is true, will not fail. + */ +bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, + unsigned write_point, unsigned write_prio, bool wait) +{ + struct open_bucket *b; + BKEY_PADDED(key) alloc; + unsigned i; + + /* + * We might have to allocate a new bucket, which we can't do with a + * spinlock held. So if we have to allocate, we drop the lock, allocate + * and then retry. KEY_PTRS() indicates whether alloc points to + * allocated bucket(s). + */ + + bkey_init(&alloc.key); + spin_lock(&c->data_bucket_lock); + + while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { + unsigned watermark = write_prio + ? RESERVE_MOVINGGC + : RESERVE_NONE; + + spin_unlock(&c->data_bucket_lock); + + if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) + return false; + + spin_lock(&c->data_bucket_lock); + } + + /* + * If we had to allocate, we might race and not need to allocate the + * second time we call find_data_bucket(). 
If we allocated a bucket but + * didn't use it, drop the refcount bch_bucket_alloc_set() took: + */ + if (KEY_PTRS(&alloc.key)) + bkey_put(c, &alloc.key); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + EBUG_ON(ptr_stale(c, &b->key, i)); + + /* Set up the pointer to the space we're allocating: */ + + for (i = 0; i < KEY_PTRS(&b->key); i++) + k->ptr[i] = b->key.ptr[i]; + + sectors = min(sectors, b->sectors_free); + + SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); + SET_KEY_SIZE(k, sectors); + SET_KEY_PTRS(k, KEY_PTRS(&b->key)); + + /* + * Move b to the end of the lru, and keep track of what this bucket was + * last used for: + */ + list_move_tail(&b->list, &c->data_buckets); + bkey_copy_key(&b->key, k); + b->last_write_point = write_point; + + b->sectors_free -= sectors; + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); + + atomic_long_add(sectors, + &PTR_CACHE(c, &b->key, i)->sectors_written); + } + + if (b->sectors_free < c->sb.block_size) + b->sectors_free = 0; + + /* + * k takes refcounts on the buckets it points to until it's inserted + * into the btree, but if we're done with this bucket we just transfer + * get_data_bucket()'s refcount. + */ + if (b->sectors_free) + for (i = 0; i < KEY_PTRS(&b->key); i++) + atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); + + spin_unlock(&c->data_bucket_lock); + return true; +} + +/* Init */ + +void bch_open_buckets_free(struct cache_set *c) +{ + struct open_bucket *b; + + while (!list_empty(&c->data_buckets)) { + b = list_first_entry(&c->data_buckets, + struct open_bucket, list); + list_del(&b->list); + kfree(b); + } +} + +int bch_open_buckets_alloc(struct cache_set *c) +{ + int i; + + spin_lock_init(&c->data_bucket_lock); + + for (i = 0; i < 6; i++) { + struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return -ENOMEM; + + list_add(&b->list, &c->data_buckets); + } + + return 0; +} + +int bch_cache_allocator_start(struct cache *ca) +{ + struct task_struct *k = kthread_run(bch_allocator_thread, + ca, "bcache_allocator"); + if (IS_ERR(k)) + return PTR_ERR(k); + + ca->alloc_thread = k; + return 0; +} diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h new file mode 100644 index 00000000000..d2ebcf32309 --- /dev/null +++ b/drivers/md/bcache/bcache.h @@ -0,0 +1,942 @@ +#ifndef _BCACHE_H +#define _BCACHE_H + +/* + * SOME HIGH LEVEL CODE DOCUMENTATION: + * + * Bcache mostly works with cache sets, cache devices, and backing devices. + * + * Support for multiple cache devices hasn't quite been finished off yet, but + * it's about 95% plumbed through. A cache set and its cache devices is sort of + * like a md raid array and its component devices. Most of the code doesn't care + * about individual cache devices, the main abstraction is the cache set. + * + * Multiple cache devices is intended to give us the ability to mirror dirty + * cached data and metadata, without mirroring clean cached data. + * + * Backing devices are different, in that they have a lifetime independent of a + * cache set. When you register a newly formatted backing device it'll come up + * in passthrough mode, and then you can attach and detach a backing device from + * a cache set at runtime - while it's mounted and in use. Detaching implicitly + * invalidates any cached data for that backing device. + * + * A cache set can have multiple (many) backing devices attached to it. 
+ * + * There's also flash only volumes - this is the reason for the distinction + * between struct cached_dev and struct bcache_device. A flash only volume + * works much like a bcache device that has a backing device, except the + * "cached" data is always dirty. The end result is that we get thin + * provisioning with very little additional code. + * + * Flash only volumes work but they're not production ready because the moving + * garbage collector needs more work. More on that later. + * + * BUCKETS/ALLOCATION: + * + * Bcache is primarily designed for caching, which means that in normal + * operation all of our available space will be allocated. Thus, we need an + * efficient way of deleting things from the cache so we can write new things to + * it. + * + * To do this, we first divide the cache device up into buckets. A bucket is the + * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ + * works efficiently. + * + * Each bucket has a 16 bit priority, and an 8 bit generation associated with + * it. The gens and priorities for all the buckets are stored contiguously and + * packed on disk (in a linked list of buckets - aside from the superblock, all + * of bcache's metadata is stored in buckets). + * + * The priority is used to implement an LRU. We reset a bucket's priority when + * we allocate it or on cache it, and every so often we decrement the priority + * of each bucket. It could be used to implement something more sophisticated, + * if anyone ever gets around to it. + * + * The generation is used for invalidating buckets. Each pointer also has an 8 + * bit generation embedded in it; for a pointer to be considered valid, its gen + * must match the gen of the bucket it points into. Thus, to reuse a bucket all + * we have to do is increment its gen (and write its new gen to disk; we batch + * this up). + * + * Bcache is entirely COW - we never write twice to a bucket, even buckets that + * contain metadata (including btree nodes). + * + * THE BTREE: + * + * Bcache is in large part design around the btree. + * + * At a high level, the btree is just an index of key -> ptr tuples. + * + * Keys represent extents, and thus have a size field. Keys also have a variable + * number of pointers attached to them (potentially zero, which is handy for + * invalidating the cache). + * + * The key itself is an inode:offset pair. The inode number corresponds to a + * backing device or a flash only volume. The offset is the ending offset of the + * extent within the inode - not the starting offset; this makes lookups + * slightly more convenient. + * + * Pointers contain the cache device id, the offset on that device, and an 8 bit + * generation number. More on the gen later. + * + * Index lookups are not fully abstracted - cache lookups in particular are + * still somewhat mixed in with the btree code, but things are headed in that + * direction. + * + * Updates are fairly well abstracted, though. There are two different ways of + * updating the btree; insert and replace. + * + * BTREE_INSERT will just take a list of keys and insert them into the btree - + * overwriting (possibly only partially) any extents they overlap with. This is + * used to update the index after a write. + * + * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is + * overwriting a key that matches another given key. This is used for inserting + * data into the cache after a cache miss, and for background writeback, and for + * the moving garbage collector. 
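The BTREE_REPLACE description above calls it "really cmpxchg()": the new key is only inserted if the index still holds the key the caller expected to be overwriting. A tiny standalone C sketch of that compare-and-swap update rule over a single hypothetical toy_entry (nothing like the real btree code):

#include <stdbool.h>
#include <stdio.h>

struct toy_entry { unsigned long offset; unsigned long ptr; };

/*
 * Install new_ptr only if the entry still carries expected_ptr, i.e.
 * nothing raced with us and rewrote it first - the semantic that makes
 * cache-miss insertion, writeback and moving GC safe to retry.
 */
static bool toy_replace(struct toy_entry *e,
			unsigned long expected_ptr, unsigned long new_ptr)
{
	if (e->ptr != expected_ptr)
		return false;	/* lost the race; do not insert */
	e->ptr = new_ptr;
	return true;
}

int main(void)
{
	struct toy_entry e = { .offset = 1024, .ptr = 42 };

	printf("first replace: %d\n", toy_replace(&e, 42, 99)); /* 1 */
	printf("stale replace: %d\n", toy_replace(&e, 42, 7));  /* 0 - ptr is now 99 */
	return 0;
}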
+ * + * There is no "delete" operation; deleting things from the index is + * accomplished by either by invalidating pointers (by incrementing a bucket's + * gen) or by inserting a key with 0 pointers - which will overwrite anything + * previously present at that location in the index. + * + * This means that there are always stale/invalid keys in the btree. They're + * filtered out by the code that iterates through a btree node, and removed when + * a btree node is rewritten. + * + * BTREE NODES: + * + * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and + * free smaller than a bucket - so, that's how big our btree nodes are. + * + * (If buckets are really big we'll only use part of the bucket for a btree node + * - no less than 1/4th - but a bucket still contains no more than a single + * btree node. I'd actually like to change this, but for now we rely on the + * bucket's gen for deleting btree nodes when we rewrite/split a node.) + * + * Anyways, btree nodes are big - big enough to be inefficient with a textbook + * btree implementation. + * + * The way this is solved is that btree nodes are internally log structured; we + * can append new keys to an existing btree node without rewriting it. This + * means each set of keys we write is sorted, but the node is not. + * + * We maintain this log structure in memory - keeping 1Mb of keys sorted would + * be expensive, and we have to distinguish between the keys we have written and + * the keys we haven't. So to do a lookup in a btree node, we have to search + * each sorted set. But we do merge written sets together lazily, so the cost of + * these extra searches is quite low (normally most of the keys in a btree node + * will be in one big set, and then there'll be one or two sets that are much + * smaller). + * + * This log structure makes bcache's btree more of a hybrid between a + * conventional btree and a compacting data structure, with some of the + * advantages of both. + * + * GARBAGE COLLECTION: + * + * We can't just invalidate any bucket - it might contain dirty data or + * metadata. If it once contained dirty data, other writes might overwrite it + * later, leaving no valid pointers into that bucket in the index. + * + * Thus, the primary purpose of garbage collection is to find buckets to reuse. + * It also counts how much valid data it each bucket currently contains, so that + * allocation can reuse buckets sooner when they've been mostly overwritten. + * + * It also does some things that are really internal to the btree + * implementation. If a btree node contains pointers that are stale by more than + * some threshold, it rewrites the btree node to avoid the bucket's generation + * wrapping around. It also merges adjacent btree nodes if they're empty enough. + * + * THE JOURNAL: + * + * Bcache's journal is not necessary for consistency; we always strictly + * order metadata writes so that the btree and everything else is consistent on + * disk in the event of an unclean shutdown, and in fact bcache had writeback + * caching (with recovery from unclean shutdown) before journalling was + * implemented. + * + * Rather, the journal is purely a performance optimization; we can't complete a + * write until we've updated the index on disk, otherwise the cache would be + * inconsistent in the event of an unclean shutdown. 
This means that without the + * journal, on random write workloads we constantly have to update all the leaf + * nodes in the btree, and those writes will be mostly empty (appending at most + * a few keys each) - highly inefficient in terms of amount of metadata writes, + * and it puts more strain on the various btree resorting/compacting code. + * + * The journal is just a log of keys we've inserted; on startup we just reinsert + * all the keys in the open journal entries. That means that when we're updating + * a node in the btree, we can wait until a 4k block of keys fills up before + * writing them out. + * + * For simplicity, we only journal updates to leaf nodes; updates to parent + * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth + * the complexity to deal with journalling them (in particular, journal replay) + * - updates to non leaf nodes just happen synchronously (see btree_split()). + */ + +#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ + +#include <linux/bcache.h> +#include <linux/bio.h> +#include <linux/kobject.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/rbtree.h> +#include <linux/rwsem.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "bset.h" +#include "util.h" +#include "closure.h" + +struct bucket { + atomic_t pin; + uint16_t prio; + uint8_t gen; + uint8_t last_gc; /* Most out of date gen in the btree */ + uint16_t gc_mark; /* Bitfield used by GC. See below for field */ +}; + +/* + * I'd use bitfields for these, but I don't trust the compiler not to screw me + * as multiple threads touch struct bucket without locking + */ + +BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); +#define GC_MARK_RECLAIMABLE 1 +#define GC_MARK_DIRTY 2 +#define GC_MARK_METADATA 3 +#define GC_SECTORS_USED_SIZE 13 +#define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) +BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); +BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); + +#include "journal.h" +#include "stats.h" +struct search; +struct btree; +struct keybuf; + +struct keybuf_key { + struct rb_node node; + BKEY_PADDED(key); + void *private; +}; + +struct keybuf { + struct bkey last_scanned; + spinlock_t lock; + + /* + * Beginning and end of range in rb tree - so that we can skip taking + * lock and checking the rb tree when we need to check for overlapping + * keys. 
+ */ + struct bkey start; + struct bkey end; + + struct rb_root keys; + +#define KEYBUF_NR 500 + DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); +}; + +struct bio_split_pool { + struct bio_set *bio_split; + mempool_t *bio_split_hook; +}; + +struct bio_split_hook { + struct closure cl; + struct bio_split_pool *p; + struct bio *bio; + bio_end_io_t *bi_end_io; + void *bi_private; +}; + +struct bcache_device { + struct closure cl; + + struct kobject kobj; + + struct cache_set *c; + unsigned id; +#define BCACHEDEVNAME_SIZE 12 + char name[BCACHEDEVNAME_SIZE]; + + struct gendisk *disk; + + unsigned long flags; +#define BCACHE_DEV_CLOSING 0 +#define BCACHE_DEV_DETACHING 1 +#define BCACHE_DEV_UNLINK_DONE 2 + + unsigned nr_stripes; + unsigned stripe_size; + atomic_t *stripe_sectors_dirty; + unsigned long *full_dirty_stripes; + + unsigned long sectors_dirty_last; + long sectors_dirty_derivative; + + struct bio_set *bio_split; + + unsigned data_csum:1; + + int (*cache_miss)(struct btree *, struct search *, + struct bio *, unsigned); + int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long); + + struct bio_split_pool bio_split_hook; +}; + +struct io { + /* Used to track sequential IO so it can be skipped */ + struct hlist_node hash; + struct list_head lru; + + unsigned long jiffies; + unsigned sequential; + sector_t last; +}; + +struct cached_dev { + struct list_head list; + struct bcache_device disk; + struct block_device *bdev; + + struct cache_sb sb; + struct bio sb_bio; + struct bio_vec sb_bv[1]; + struct closure sb_write; + struct semaphore sb_write_mutex; + + /* Refcount on the cache set. Always nonzero when we're caching. */ + atomic_t count; + struct work_struct detach; + + /* + * Device might not be running if it's dirty and the cache set hasn't + * showed up yet. + */ + atomic_t running; + + /* + * Writes take a shared lock from start to finish; scanning for dirty + * data to refill the rb tree requires an exclusive lock. + */ + struct rw_semaphore writeback_lock; + + /* + * Nonzero, and writeback has a refcount (d->count), iff there is dirty + * data in the cache. Protected by writeback_lock; must have an + * shared lock to set and exclusive lock to clear. + */ + atomic_t has_dirty; + + struct bch_ratelimit writeback_rate; + struct delayed_work writeback_rate_update; + + /* + * Internal to the writeback code, so read_dirty() can keep track of + * where it's at. 
+ */ + sector_t last_read; + + /* Limit number of writeback bios in flight */ + struct semaphore in_flight; + struct task_struct *writeback_thread; + + struct keybuf writeback_keys; + + /* For tracking sequential IO */ +#define RECENT_IO_BITS 7 +#define RECENT_IO (1 << RECENT_IO_BITS) + struct io io[RECENT_IO]; + struct hlist_head io_hash[RECENT_IO + 1]; + struct list_head io_lru; + spinlock_t io_lock; + + struct cache_accounting accounting; + + /* The rest of this all shows up in sysfs */ + unsigned sequential_cutoff; + unsigned readahead; + + unsigned verify:1; + unsigned bypass_torture_test:1; + + unsigned partial_stripes_expensive:1; + unsigned writeback_metadata:1; + unsigned writeback_running:1; + unsigned char writeback_percent; + unsigned writeback_delay; + + uint64_t writeback_rate_target; + int64_t writeback_rate_proportional; + int64_t writeback_rate_derivative; + int64_t writeback_rate_change; + + unsigned writeback_rate_update_seconds; + unsigned writeback_rate_d_term; + unsigned writeback_rate_p_term_inverse; +}; + +enum alloc_reserve { + RESERVE_BTREE, + RESERVE_PRIO, + RESERVE_MOVINGGC, + RESERVE_NONE, + RESERVE_NR, +}; + +struct cache { + struct cache_set *set; + struct cache_sb sb; + struct bio sb_bio; + struct bio_vec sb_bv[1]; + + struct kobject kobj; + struct block_device *bdev; + + struct task_struct *alloc_thread; + + struct closure prio; + struct prio_set *disk_buckets; + + /* + * When allocating new buckets, prio_write() gets first dibs - since we + * may not be allocate at all without writing priorities and gens. + * prio_buckets[] contains the last buckets we wrote priorities to (so + * gc can mark them as metadata), prio_next[] contains the buckets + * allocated for the next prio write. + */ + uint64_t *prio_buckets; + uint64_t *prio_last_buckets; + + /* + * free: Buckets that are ready to be used + * + * free_inc: Incoming buckets - these are buckets that currently have + * cached data in them, and we can't reuse them until after we write + * their new gen to disk. After prio_write() finishes writing the new + * gens/prios, they'll be moved to the free list (and possibly discarded + * in the process) + */ + DECLARE_FIFO(long, free)[RESERVE_NR]; + DECLARE_FIFO(long, free_inc); + + size_t fifo_last_bucket; + + /* Allocation stuff: */ + struct bucket *buckets; + + DECLARE_HEAP(struct bucket *, heap); + + /* + * If nonzero, we know we aren't going to find any buckets to invalidate + * until a gc finishes - otherwise we could pointlessly burn a ton of + * cpu + */ + unsigned invalidate_needs_gc:1; + + bool discard; /* Get rid of? */ + + struct journal_device journal; + + /* The rest of this all shows up in sysfs */ +#define IO_ERROR_SHIFT 20 + atomic_t io_errors; + atomic_t io_count; + + atomic_long_t meta_sectors_written; + atomic_long_t btree_sectors_written; + atomic_long_t sectors_written; + + struct bio_split_pool bio_split_hook; +}; + +struct gc_stat { + size_t nodes; + size_t key_bytes; + + size_t nkeys; + uint64_t data; /* sectors */ + unsigned in_use; /* percent */ +}; + +/* + * Flag bits, for how the cache set is shutting down, and what phase it's at: + * + * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching + * all the backing devices first (their cached data gets invalidated, and they + * won't automatically reattach). + * + * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; + * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. + * flushing dirty data). 
+ */ +#define CACHE_SET_UNREGISTERING 0 +#define CACHE_SET_STOPPING 1 + +struct cache_set { + struct closure cl; + + struct list_head list; + struct kobject kobj; + struct kobject internal; + struct dentry *debug; + struct cache_accounting accounting; + + unsigned long flags; + + struct cache_sb sb; + + struct cache *cache[MAX_CACHES_PER_SET]; + struct cache *cache_by_alloc[MAX_CACHES_PER_SET]; + int caches_loaded; + + struct bcache_device **devices; + struct list_head cached_devs; + uint64_t cached_dev_sectors; + struct closure caching; + + struct closure sb_write; + struct semaphore sb_write_mutex; + + mempool_t *search; + mempool_t *bio_meta; + struct bio_set *bio_split; + + /* For the btree cache */ + struct shrinker shrink; + + /* For the btree cache and anything allocation related */ + struct mutex bucket_lock; + + /* log2(bucket_size), in sectors */ + unsigned short bucket_bits; + + /* log2(block_size), in sectors */ + unsigned short block_bits; + + /* + * Default number of pages for a new btree node - may be less than a + * full bucket + */ + unsigned btree_pages; + + /* + * Lists of struct btrees; lru is the list for structs that have memory + * allocated for actual btree node, freed is for structs that do not. + * + * We never free a struct btree, except on shutdown - we just put it on + * the btree_cache_freed list and reuse it later. This simplifies the + * code, and it doesn't cost us much memory as the memory usage is + * dominated by buffers that hold the actual btree node data and those + * can be freed - and the number of struct btrees allocated is + * effectively bounded. + * + * btree_cache_freeable effectively is a small cache - we use it because + * high order page allocations can be rather expensive, and it's quite + * common to delete and allocate btree nodes in quick succession. It + * should never grow past ~2-3 nodes in practice. + */ + struct list_head btree_cache; + struct list_head btree_cache_freeable; + struct list_head btree_cache_freed; + + /* Number of elements in btree_cache + btree_cache_freeable lists */ + unsigned btree_cache_used; + + /* + * If we need to allocate memory for a new btree node and that + * allocation fails, we can cannibalize another node in the btree cache + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: + */ + wait_queue_head_t btree_cache_wait; + struct task_struct *btree_cache_alloc_lock; + + /* + * When we free a btree node, we increment the gen of the bucket the + * node is in - but we can't rewrite the prios and gens until we + * finished whatever it is we were doing, otherwise after a crash the + * btree node would be freed but for say a split, we might not have the + * pointers to the new nodes inserted into the btree yet. + * + * This is a refcount that blocks prio_write() until the new keys are + * written. + */ + atomic_t prio_blocked; + wait_queue_head_t bucket_wait; + + /* + * For any bio we don't skip we subtract the number of sectors from + * rescale; when it hits 0 we rescale all the bucket priorities. + */ + atomic_t rescale; + /* + * When we invalidate buckets, we use both the priority and the amount + * of good data to determine which buckets to reuse first - to weight + * those together consistently we keep track of the smallest nonzero + * priority of any bucket. + */ + uint16_t min_prio; + + /* + * max(gen - last_gc) for all buckets. When it gets too big we have to gc + * to keep gens from wrapping around. 
+ */ + uint8_t need_gc; + struct gc_stat gc_stats; + size_t nbuckets; + + struct task_struct *gc_thread; + /* Where in the btree gc currently is */ + struct bkey gc_done; + + /* + * The allocation code needs gc_mark in struct bucket to be correct, but + * it's not while a gc is in progress. Protected by bucket_lock. + */ + int gc_mark_valid; + + /* Counts how many sectors bio_insert has added to the cache */ + atomic_t sectors_to_gc; + + wait_queue_head_t moving_gc_wait; + struct keybuf moving_gc_keys; + /* Number of moving GC bios in flight */ + struct semaphore moving_in_flight; + + struct workqueue_struct *moving_gc_wq; + + struct btree *root; + +#ifdef CONFIG_BCACHE_DEBUG + struct btree *verify_data; + struct bset *verify_ondisk; + struct mutex verify_lock; +#endif + + unsigned nr_uuids; + struct uuid_entry *uuids; + BKEY_PADDED(uuid_bucket); + struct closure uuid_write; + struct semaphore uuid_write_mutex; + + /* + * A btree node on disk could have too many bsets for an iterator to fit + * on the stack - have to dynamically allocate them + */ + mempool_t *fill_iter; + + struct bset_sort_state sort; + + /* List of buckets we're currently writing data to */ + struct list_head data_buckets; + spinlock_t data_bucket_lock; + + struct journal journal; + +#define CONGESTED_MAX 1024 + unsigned congested_last_us; + atomic_t congested; + + /* The rest of this all shows up in sysfs */ + unsigned congested_read_threshold_us; + unsigned congested_write_threshold_us; + + struct time_stats btree_gc_time; + struct time_stats btree_split_time; + struct time_stats btree_read_time; + + atomic_long_t cache_read_races; + atomic_long_t writeback_keys_done; + atomic_long_t writeback_keys_failed; + + enum { + ON_ERROR_UNREGISTER, + ON_ERROR_PANIC, + } on_error; + unsigned error_limit; + unsigned error_decay; + + unsigned short journal_delay_ms; + bool expensive_debug_checks; + unsigned verify:1; + unsigned key_merging_disabled:1; + unsigned gc_always_rewrite:1; + unsigned shrinker_disabled:1; + unsigned copy_gc_enabled:1; + +#define BUCKET_HASH_BITS 12 + struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; +}; + +struct bbio { + unsigned submit_time_us; + union { + struct bkey key; + uint64_t _pad[3]; + /* + * We only need pad = 3 here because we only ever carry around a + * single pointer - i.e. the pointer we're doing io to/from. 
+ */ + }; + struct bio bio; +}; + +#define BTREE_PRIO USHRT_MAX +#define INITIAL_PRIO 32768U + +#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) +#define btree_blocks(b) \ + ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) + +#define btree_default_blocks(c) \ + ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) + +#define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) +#define bucket_bytes(c) ((c)->sb.bucket_size << 9) +#define block_bytes(c) ((c)->sb.block_size << 9) + +#define prios_per_bucket(c) \ + ((bucket_bytes(c) - sizeof(struct prio_set)) / \ + sizeof(struct bucket_disk)) +#define prio_buckets(c) \ + DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) + +static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) +{ + return s >> c->bucket_bits; +} + +static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) +{ + return ((sector_t) b) << c->bucket_bits; +} + +static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) +{ + return s & (c->sb.bucket_size - 1); +} + +static inline struct cache *PTR_CACHE(struct cache_set *c, + const struct bkey *k, + unsigned ptr) +{ + return c->cache[PTR_DEV(k, ptr)]; +} + +static inline size_t PTR_BUCKET_NR(struct cache_set *c, + const struct bkey *k, + unsigned ptr) +{ + return sector_to_bucket(c, PTR_OFFSET(k, ptr)); +} + +static inline struct bucket *PTR_BUCKET(struct cache_set *c, + const struct bkey *k, + unsigned ptr) +{ + return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); +} + +static inline uint8_t gen_after(uint8_t a, uint8_t b) +{ + uint8_t r = a - b; + return r > 128U ? 0 : r; +} + +static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, + unsigned i) +{ + return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); +} + +static inline bool ptr_available(struct cache_set *c, const struct bkey *k, + unsigned i) +{ + return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); +} + +/* Btree key macros */ + +/* + * This is used for various on disk data structures - cache_sb, prio_set, bset, + * jset: The checksum is _always_ the first 8 bytes of these structs + */ +#define csum_set(i) \ + bch_crc64(((void *) (i)) + sizeof(uint64_t), \ + ((void *) bset_bkey_last(i)) - \ + (((void *) (i)) + sizeof(uint64_t))) + +/* Error handling macros */ + +#define btree_bug(b, ...) \ +do { \ + if (bch_cache_set_error((b)->c, __VA_ARGS__)) \ + dump_stack(); \ +} while (0) + +#define cache_bug(c, ...) \ +do { \ + if (bch_cache_set_error(c, __VA_ARGS__)) \ + dump_stack(); \ +} while (0) + +#define btree_bug_on(cond, b, ...) \ +do { \ + if (cond) \ + btree_bug(b, __VA_ARGS__); \ +} while (0) + +#define cache_bug_on(cond, c, ...) \ +do { \ + if (cond) \ + cache_bug(c, __VA_ARGS__); \ +} while (0) + +#define cache_set_err_on(cond, c, ...) 
\ +do { \ + if (cond) \ + bch_cache_set_error(c, __VA_ARGS__); \ +} while (0) + +/* Looping macros */ + +#define for_each_cache(ca, cs, iter) \ + for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++) + +#define for_each_bucket(b, ca) \ + for (b = (ca)->buckets + (ca)->sb.first_bucket; \ + b < (ca)->buckets + (ca)->sb.nbuckets; b++) + +static inline void cached_dev_put(struct cached_dev *dc) +{ + if (atomic_dec_and_test(&dc->count)) + schedule_work(&dc->detach); +} + +static inline bool cached_dev_get(struct cached_dev *dc) +{ + if (!atomic_inc_not_zero(&dc->count)) + return false; + + /* Paired with the mb in cached_dev_attach */ + smp_mb__after_atomic(); + return true; +} + +/* + * bucket_gc_gen() returns the difference between the bucket's current gen and + * the oldest gen of any pointer into that bucket in the btree (last_gc). + */ + +static inline uint8_t bucket_gc_gen(struct bucket *b) +{ + return b->gen - b->last_gc; +} + +#define BUCKET_GC_GEN_MAX 96U + +#define kobj_attribute_write(n, fn) \ + static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) + +#define kobj_attribute_rw(n, show, store) \ + static struct kobj_attribute ksysfs_##n = \ + __ATTR(n, S_IWUSR|S_IRUSR, show, store) + +static inline void wake_up_allocators(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + for_each_cache(ca, c, i) + wake_up_process(ca->alloc_thread); +} + +/* Forward declarations */ + +void bch_count_io_errors(struct cache *, int, const char *); +void bch_bbio_count_io_errors(struct cache_set *, struct bio *, + int, const char *); +void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); +void bch_bbio_free(struct bio *, struct cache_set *); +struct bio *bch_bbio_alloc(struct cache_set *); + +void bch_generic_make_request(struct bio *, struct bio_split_pool *); +void __bch_submit_bbio(struct bio *, struct cache_set *); +void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); + +uint8_t bch_inc_gen(struct cache *, struct bucket *); +void bch_rescale_priorities(struct cache_set *, int); + +bool bch_can_invalidate_bucket(struct cache *, struct bucket *); +void __bch_invalidate_one_bucket(struct cache *, struct bucket *); + +void __bch_bucket_free(struct cache *, struct bucket *); +void bch_bucket_free(struct cache_set *, struct bkey *); + +long bch_bucket_alloc(struct cache *, unsigned, bool); +int __bch_bucket_alloc_set(struct cache_set *, unsigned, + struct bkey *, int, bool); +int bch_bucket_alloc_set(struct cache_set *, unsigned, + struct bkey *, int, bool); +bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, + unsigned, unsigned, bool); + +__printf(2, 3) +bool bch_cache_set_error(struct cache_set *, const char *, ...); + +void bch_prio_write(struct cache *); +void bch_write_bdev_super(struct cached_dev *, struct closure *); + +extern struct workqueue_struct *bcache_wq; +extern const char * const bch_cache_modes[]; +extern struct mutex bch_register_lock; +extern struct list_head bch_cache_sets; + +extern struct kobj_type bch_cached_dev_ktype; +extern struct kobj_type bch_flash_dev_ktype; +extern struct kobj_type bch_cache_set_ktype; +extern struct kobj_type bch_cache_set_internal_ktype; +extern struct kobj_type bch_cache_ktype; + +void bch_cached_dev_release(struct kobject *); +void bch_flash_dev_release(struct kobject *); +void bch_cache_set_release(struct kobject *); +void bch_cache_release(struct kobject *); + +int bch_uuid_write(struct cache_set *); +void bcache_write_super(struct cache_set *); 
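+
+/*
+ * Illustrative sketch, not part of bcache: how the bucket arithmetic
+ * helpers defined above fit together.  sector_to_bucket() shifts by
+ * bucket_bits (log2 of the bucket size in sectors), bucket_to_sector()
+ * is the inverse shift, and bucket_remainder() masks off the offset
+ * within the bucket.  Assuming sb.bucket_size really is the power of
+ * two implied by bucket_bits, the round trip below always holds; the
+ * function name is made up for this example.
+ */
+static inline bool bch_example_bucket_round_trip(struct cache_set *c,
+						 sector_t s)
+{
+	size_t b = sector_to_bucket(c, s);
+
+	/* first sector of bucket b, plus the offset of s inside it */
+	return bucket_to_sector(c, b) + bucket_remainder(c, s) == s;
+}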
+ +int bch_flash_dev_create(struct cache_set *c, uint64_t size); + +int bch_cached_dev_attach(struct cached_dev *, struct cache_set *); +void bch_cached_dev_detach(struct cached_dev *); +void bch_cached_dev_run(struct cached_dev *); +void bcache_device_stop(struct bcache_device *); + +void bch_cache_set_unregister(struct cache_set *); +void bch_cache_set_stop(struct cache_set *); + +struct cache_set *bch_cache_set_alloc(struct cache_sb *); +void bch_btree_cache_free(struct cache_set *); +int bch_btree_cache_alloc(struct cache_set *); +void bch_moving_init_cache_set(struct cache_set *); +int bch_open_buckets_alloc(struct cache_set *); +void bch_open_buckets_free(struct cache_set *); + +int bch_cache_allocator_start(struct cache *ca); + +void bch_debug_exit(void); +int bch_debug_init(struct kobject *); +void bch_request_exit(void); +int bch_request_init(void); + +#endif /* _BCACHE_H */ diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c new file mode 100644 index 00000000000..54541641530 --- /dev/null +++ b/drivers/md/bcache/bset.c @@ -0,0 +1,1331 @@ +/* + * Code for working with individual keys, and sorted sets of keys with in a + * btree node + * + * Copyright 2012 Google, Inc. + */ + +#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ + +#include "util.h" +#include "bset.h" + +#include <linux/console.h> +#include <linux/random.h> +#include <linux/prefetch.h> + +#ifdef CONFIG_BCACHE_DEBUG + +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) +{ + struct bkey *k, *next; + + for (k = i->start; k < bset_bkey_last(i); k = next) { + next = bkey_next(k); + + printk(KERN_ERR "block %u key %u/%u: ", set, + (unsigned) ((u64 *) k - i->d), i->keys); + + if (b->ops->key_dump) + b->ops->key_dump(b, k); + else + printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); + + if (next < bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? + &START_KEY(next) : next) > 0) + printk(KERN_ERR "Key skipped backwards\n"); + } +} + +void bch_dump_bucket(struct btree_keys *b) +{ + unsigned i; + + console_lock(); + for (i = 0; i <= b->nsets; i++) + bch_dump_bset(b, b->set[i].data, + bset_sector_offset(b, b->set[i].data)); + console_unlock(); +} + +int __bch_count_data(struct btree_keys *b) +{ + unsigned ret = 0; + struct btree_iter iter; + struct bkey *k; + + if (b->ops->is_extents) + for_each_key(b, k, &iter) + ret += KEY_SIZE(k); + return ret; +} + +void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) +{ + va_list args; + struct bkey *k, *p = NULL; + struct btree_iter iter; + const char *err; + + for_each_key(b, k, &iter) { + if (b->ops->is_extents) { + err = "Keys out of order"; + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) + goto bug; + + if (bch_ptr_invalid(b, k)) + continue; + + err = "Overlapping keys"; + if (p && bkey_cmp(p, &START_KEY(k)) > 0) + goto bug; + } else { + if (bch_ptr_bad(b, k)) + continue; + + err = "Duplicate keys"; + if (p && !bkey_cmp(p, k)) + goto bug; + } + p = k; + } +#if 0 + err = "Key larger than btree node key"; + if (p && bkey_cmp(p, &b->key) > 0) + goto bug; +#endif + return; +bug: + bch_dump_bucket(b); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + panic("bch_check_keys error: %s:\n", err); +} + +static void bch_btree_iter_next_check(struct btree_iter *iter) +{ + struct bkey *k = iter->data->k, *next = bkey_next(k); + + if (next < iter->data->end && + bkey_cmp(k, iter->b->ops->is_extents ? 
+ &START_KEY(next) : next) > 0) { + bch_dump_bucket(iter->b); + panic("Key skipped backwards\n"); + } +} + +#else + +static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} + +#endif + +/* Keylists */ + +int __bch_keylist_realloc(struct keylist *l, unsigned u64s) +{ + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + u64s; + uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p; + uint64_t *new_keys; + + newsize = roundup_pow_of_two(newsize); + + if (newsize <= KEYLIST_INLINE || + roundup_pow_of_two(oldsize) == newsize) + return 0; + + new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO); + + if (!new_keys) + return -ENOMEM; + + if (!old_keys) + memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize); + + l->keys_p = new_keys; + l->top_p = new_keys + oldsize; + + return 0; +} + +struct bkey *bch_keylist_pop(struct keylist *l) +{ + struct bkey *k = l->keys; + + if (k == l->top) + return NULL; + + while (bkey_next(k) != l->top) + k = bkey_next(k); + + return l->top = k; +} + +void bch_keylist_pop_front(struct keylist *l) +{ + l->top_p -= bkey_u64s(l->keys); + + memmove(l->keys, + bkey_next(l->keys), + bch_keylist_bytes(l)); +} + +/* Key/pointer manipulation */ + +void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, + unsigned i) +{ + BUG_ON(i > KEY_PTRS(src)); + + /* Only copy the header, key, and one pointer. */ + memcpy(dest, src, 2 * sizeof(uint64_t)); + dest->ptr[0] = src->ptr[i]; + SET_KEY_PTRS(dest, 1); + /* We didn't copy the checksum so clear that bit. */ + SET_KEY_CSUM(dest, 0); +} + +bool __bch_cut_front(const struct bkey *where, struct bkey *k) +{ + unsigned i, len = 0; + + if (bkey_cmp(where, &START_KEY(k)) <= 0) + return false; + + if (bkey_cmp(where, k) < 0) + len = KEY_OFFSET(k) - KEY_OFFSET(where); + else + bkey_copy_key(k, where); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len); + + BUG_ON(len > KEY_SIZE(k)); + SET_KEY_SIZE(k, len); + return true; +} + +bool __bch_cut_back(const struct bkey *where, struct bkey *k) +{ + unsigned len = 0; + + if (bkey_cmp(where, k) >= 0) + return false; + + BUG_ON(KEY_INODE(where) != KEY_INODE(k)); + + if (bkey_cmp(where, &START_KEY(k)) > 0) + len = KEY_OFFSET(where) - KEY_START(k); + + bkey_copy_key(k, where); + + BUG_ON(len > KEY_SIZE(k)); + SET_KEY_SIZE(k, len); + return true; +} + +/* Auxiliary search trees */ + +/* 32 bits total: */ +#define BKEY_MID_BITS 3 +#define BKEY_EXPONENT_BITS 7 +#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS) +#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) + +struct bkey_float { + unsigned exponent:BKEY_EXPONENT_BITS; + unsigned m:BKEY_MID_BITS; + unsigned mantissa:BKEY_MANTISSA_BITS; +} __packed; + +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. + * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. 
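+ *
+ * (At 4 bytes per bkey_float and one bkey_float per BSET_CACHELINE
+ * bytes of keys, the auxiliary tree costs roughly 4/128 = ~3% of the
+ * btree node's memory - see the longer discussion in bset.h.)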
+ */ + +#define BSET_CACHELINE 128 + +/* Space required for the btree node keys */ +static inline size_t btree_keys_bytes(struct btree_keys *b) +{ + return PAGE_SIZE << b->page_order; +} + +static inline size_t btree_keys_cachelines(struct btree_keys *b) +{ + return btree_keys_bytes(b) / BSET_CACHELINE; +} + +/* Space required for the auxiliary search trees */ +static inline size_t bset_tree_bytes(struct btree_keys *b) +{ + return btree_keys_cachelines(b) * sizeof(struct bkey_float); +} + +/* Space required for the prev pointers */ +static inline size_t bset_prev_bytes(struct btree_keys *b) +{ + return btree_keys_cachelines(b) * sizeof(uint8_t); +} + +/* Memory allocation */ + +void bch_btree_keys_free(struct btree_keys *b) +{ + struct bset_tree *t = b->set; + + if (bset_prev_bytes(b) < PAGE_SIZE) + kfree(t->prev); + else + free_pages((unsigned long) t->prev, + get_order(bset_prev_bytes(b))); + + if (bset_tree_bytes(b) < PAGE_SIZE) + kfree(t->tree); + else + free_pages((unsigned long) t->tree, + get_order(bset_tree_bytes(b))); + + free_pages((unsigned long) t->data, b->page_order); + + t->prev = NULL; + t->tree = NULL; + t->data = NULL; +} +EXPORT_SYMBOL(bch_btree_keys_free); + +int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) +{ + struct bset_tree *t = b->set; + + BUG_ON(t->data); + + b->page_order = page_order; + + t->data = (void *) __get_free_pages(gfp, b->page_order); + if (!t->data) + goto err; + + t->tree = bset_tree_bytes(b) < PAGE_SIZE + ? kmalloc(bset_tree_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); + if (!t->tree) + goto err; + + t->prev = bset_prev_bytes(b) < PAGE_SIZE + ? kmalloc(bset_prev_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); + if (!t->prev) + goto err; + + return 0; +err: + bch_btree_keys_free(b); + return -ENOMEM; +} +EXPORT_SYMBOL(bch_btree_keys_alloc); + +void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks) +{ + unsigned i; + + b->ops = ops; + b->expensive_debug_checks = expensive_debug_checks; + b->nsets = 0; + b->last_set_unwritten = 0; + + /* XXX: shouldn't be needed */ + for (i = 0; i < MAX_BSETS; i++) + b->set[i].size = 0; + /* + * Second loop starts at 1 because b->keys[0]->data is the memory we + * allocated + */ + for (i = 1; i < MAX_BSETS; i++) + b->set[i].data = NULL; +} +EXPORT_SYMBOL(bch_btree_keys_init); + +/* Binary tree stuff for auxiliary search trees */ + +static unsigned inorder_next(unsigned j, unsigned size) +{ + if (j * 2 + 1 < size) { + j = j * 2 + 1; + + while (j * 2 < size) + j *= 2; + } else + j >>= ffz(j) + 1; + + return j; +} + +static unsigned inorder_prev(unsigned j, unsigned size) +{ + if (j * 2 < size) { + j = j * 2; + + while (j * 2 + 1 < size) + j = j * 2 + 1; + } else + j >>= ffs(j); + + return j; +} + +/* I have no idea why this code works... and I'm the one who wrote it + * + * However, I do know what it does: + * Given a binary tree constructed in an array (i.e. how you normally implement + * a heap), it converts a node in the tree - referenced by array index - to the + * index it would have if you did an inorder traversal. + * + * Also tested for every j, size up to size somewhere around 6 million. 
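+ *
+ * A small hand-checked case: for size == 7 (extra == 6, from the
+ * formula below), tree indices 1..6 map to inorder positions
+ * 4, 2, 6, 1, 3, 5 respectively - i.e. an inorder walk of the 6-node
+ * heap visits nodes 4, 2, 5, 1, 6, 3.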
+ * + * The binary tree starts at array index 1, not 0 + * extra is a function of size: + * extra = (size - rounddown_pow_of_two(size - 1)) << 1; + */ +static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra) +{ + unsigned b = fls(j); + unsigned shift = fls(size - 1) - b; + + j ^= 1U << (b - 1); + j <<= 1; + j |= 1; + j <<= shift; + + if (j > extra) + j -= (j - extra) >> 1; + + return j; +} + +static unsigned to_inorder(unsigned j, struct bset_tree *t) +{ + return __to_inorder(j, t->size, t->extra); +} + +static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra) +{ + unsigned shift; + + if (j > extra) + j += j - extra; + + shift = ffs(j); + + j >>= shift; + j |= roundup_pow_of_two(size) >> shift; + + return j; +} + +static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) +{ + return __inorder_to_tree(j, t->size, t->extra); +} + +#if 0 +void inorder_test(void) +{ + unsigned long done = 0; + ktime_t start = ktime_get(); + + for (unsigned size = 2; + size < 65536000; + size++) { + unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1; + unsigned i = 1, j = rounddown_pow_of_two(size - 1); + + if (!(size % 4096)) + printk(KERN_NOTICE "loop %u, %llu per us\n", size, + done / ktime_us_delta(ktime_get(), start)); + + while (1) { + if (__inorder_to_tree(i, size, extra) != j) + panic("size %10u j %10u i %10u", size, j, i); + + if (__to_inorder(j, size, extra) != i) + panic("size %10u j %10u i %10u", size, j, i); + + if (j == rounddown_pow_of_two(size) - 1) + break; + + BUG_ON(inorder_prev(inorder_next(j, size), size) != j); + + j = inorder_next(j, size); + i++; + } + + done += size - 1; + } +} +#endif + +/* + * Cacheline/offset <-> bkey pointer arithmetic: + * + * t->tree is a binary search tree in an array; each node corresponds to a key + * in one cacheline in t->set (BSET_CACHELINE bytes). + * + * This means we don't have to store the full index of the key that a node in + * the binary tree points to; to_inorder() gives us the cacheline, and then + * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes. + * + * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to + * make this work. + * + * To construct the bfloat for an arbitrary key we need to know what the key + * immediately preceding it is: we have to check if the two keys differ in the + * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size + * of the previous key so we can walk backwards to it from t->tree[j]'s key. + */ + +static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline, + unsigned offset) +{ + return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8; +} + +static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) +{ + return ((void *) k - (void *) t->data) / BSET_CACHELINE; +} + +static unsigned bkey_to_cacheline_offset(struct bset_tree *t, + unsigned cacheline, + struct bkey *k) +{ + return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); +} + +static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) +{ + return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m); +} + +static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j) +{ + return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]); +} + +/* + * For the write set - the one we're currently inserting keys into - we don't + * maintain a full search tree, we just keep a simple lookup table in t->prev. 
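+ *
+ * (Each t->prev[i] entry is the offset, in 64-bit words, from the start
+ * of cacheline i to the first key that begins at or after that
+ * boundary; table_to_bkey() below turns it straight back into a
+ * pointer.)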
+ */ +static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) +{ + return cacheline_to_bkey(t, cacheline, t->prev[cacheline]); +} + +static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) +{ + low >>= shift; + low |= (high << 1) << (63U - shift); + return low; +} + +static inline unsigned bfloat_mantissa(const struct bkey *k, + struct bkey_float *f) +{ + const uint64_t *p = &k->low - (f->exponent >> 6); + return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK; +} + +static void make_bfloat(struct bset_tree *t, unsigned j) +{ + struct bkey_float *f = &t->tree[j]; + struct bkey *m = tree_to_bkey(t, j); + struct bkey *p = tree_to_prev_bkey(t, j); + + struct bkey *l = is_power_of_2(j) + ? t->data->start + : tree_to_prev_bkey(t, j >> ffs(j)); + + struct bkey *r = is_power_of_2(j + 1) + ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end)) + : tree_to_bkey(t, j >> (ffz(j) + 1)); + + BUG_ON(m < l || m > r); + BUG_ON(bkey_next(p) != m); + + if (KEY_INODE(l) != KEY_INODE(r)) + f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; + else + f->exponent = fls64(r->low ^ l->low); + + f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0); + + /* + * Setting f->exponent = 127 flags this node as failed, and causes the + * lookup code to fall back to comparing against the original key. + */ + + if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f)) + f->mantissa = bfloat_mantissa(m, f) - 1; + else + f->exponent = 127; +} + +static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) +{ + if (t != b->set) { + unsigned j = roundup(t[-1].size, + 64 / sizeof(struct bkey_float)); + + t->tree = t[-1].tree + j; + t->prev = t[-1].prev + j; + } + + while (t < b->set + MAX_BSETS) + t++->size = 0; +} + +static void bch_bset_build_unwritten_tree(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON(b->last_set_unwritten); + b->last_set_unwritten = 1; + + bset_alloc_tree(b, t); + + if (t->tree != b->set->tree + btree_keys_cachelines(b)) { + t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start); + t->size = 1; + } +} + +void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) +{ + if (i != b->set->data) { + b->set[++b->nsets].data = i; + i->seq = b->set->data->seq; + } else + get_random_bytes(&i->seq, sizeof(uint64_t)); + + i->magic = magic; + i->version = 0; + i->keys = 0; + + bch_bset_build_unwritten_tree(b); +} +EXPORT_SYMBOL(bch_bset_init_next); + +void bch_bset_build_written_tree(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + struct bkey *prev = NULL, *k = t->data->start; + unsigned j, cacheline = 1; + + b->last_set_unwritten = 0; + + bset_alloc_tree(b, t); + + t->size = min_t(unsigned, + bkey_to_cacheline(t, bset_bkey_last(t->data)), + b->set->tree + btree_keys_cachelines(b) - t->tree); + + if (t->size < 2) { + t->size = 0; + return; + } + + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + + /* First we figure out where the first key in each cacheline is */ + for (j = inorder_next(0, t->size); + j; + j = inorder_next(j, t->size)) { + while (bkey_to_cacheline(t, k) < cacheline) + prev = k, k = bkey_next(k); + + t->prev[j] = bkey_u64s(prev); + t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k); + } + + while (bkey_next(k) != bset_bkey_last(t->data)) + k = bkey_next(k); + + t->end = *k; + + /* Then we build the tree */ + for (j = inorder_next(0, t->size); + j; + j = inorder_next(j, t->size)) + make_bfloat(t, j); +} 
+EXPORT_SYMBOL(bch_bset_build_written_tree); + +/* Insert */ + +void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) +{ + struct bset_tree *t; + unsigned inorder, j = 1; + + for (t = b->set; t <= bset_tree_last(b); t++) + if (k < bset_bkey_last(t->data)) + goto found_set; + + BUG(); +found_set: + if (!t->size || !bset_written(b, t)) + return; + + inorder = bkey_to_cacheline(t, k); + + if (k == t->data->start) + goto fix_left; + + if (bkey_next(k) == bset_bkey_last(t->data)) { + t->end = *k; + goto fix_right; + } + + j = inorder_to_tree(inorder, t); + + if (j && + j < t->size && + k == tree_to_bkey(t, j)) +fix_left: do { + make_bfloat(t, j); + j = j * 2; + } while (j < t->size); + + j = inorder_to_tree(inorder + 1, t); + + if (j && + j < t->size && + k == tree_to_prev_bkey(t, j)) +fix_right: do { + make_bfloat(t, j); + j = j * 2 + 1; + } while (j < t->size); +} +EXPORT_SYMBOL(bch_bset_fix_invalidated_key); + +static void bch_bset_fix_lookup_table(struct btree_keys *b, + struct bset_tree *t, + struct bkey *k) +{ + unsigned shift = bkey_u64s(k); + unsigned j = bkey_to_cacheline(t, k); + + /* We're getting called from btree_split() or btree_gc, just bail out */ + if (!t->size) + return; + + /* k is the key we just inserted; we need to find the entry in the + * lookup table for the first key that is strictly greater than k: + * it's either k's cacheline or the next one + */ + while (j < t->size && + table_to_bkey(t, j) <= k) + j++; + + /* Adjust all the lookup table entries, and find a new key for any that + * have gotten too big + */ + for (; j < t->size; j++) { + t->prev[j] += shift; + + if (t->prev[j] > 7) { + k = table_to_bkey(t, j - 1); + + while (k < cacheline_to_bkey(t, j, 0)) + k = bkey_next(k); + + t->prev[j] = bkey_to_cacheline_offset(t, j, k); + } + } + + if (t->size == b->set->tree + btree_keys_cachelines(b) - t->tree) + return; + + /* Possibly add a new entry to the end of the lookup table */ + + for (k = table_to_bkey(t, t->size - 1); + k != bset_bkey_last(t->data); + k = bkey_next(k)) + if (t->size == bkey_to_cacheline(t, k)) { + t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k); + t->size++; + } +} + +/* + * Tries to merge l and r: l should be lower than r + * Returns true if we were able to merge. If we did merge, l will be the merged + * key, r will be untouched. 
+ */ +bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) +{ + if (!b->ops->key_merge) + return false; + + /* + * Generic header checks + * Assumes left and right are in order + * Left and right must be exactly aligned + */ + if (!bch_bkey_equal_header(l, r) || + bkey_cmp(l, &START_KEY(r))) + return false; + + return b->ops->key_merge(b, l, r); +} +EXPORT_SYMBOL(bch_bkey_try_merge); + +void bch_bset_insert(struct btree_keys *b, struct bkey *where, + struct bkey *insert) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON(!b->last_set_unwritten); + BUG_ON(bset_byte_offset(b, t->data) + + __set_bytes(t->data, t->data->keys + bkey_u64s(insert)) > + PAGE_SIZE << b->page_order); + + memmove((uint64_t *) where + bkey_u64s(insert), + where, + (void *) bset_bkey_last(t->data) - (void *) where); + + t->data->keys += bkey_u64s(insert); + bkey_copy(where, insert); + bch_bset_fix_lookup_table(b, t, where); +} +EXPORT_SYMBOL(bch_bset_insert); + +unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k, + struct bkey *replace_key) +{ + unsigned status = BTREE_INSERT_STATUS_NO_INSERT; + struct bset *i = bset_tree_last(b)->data; + struct bkey *m, *prev = NULL; + struct btree_iter iter; + + BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); + + m = bch_btree_iter_init(b, &iter, b->ops->is_extents + ? PRECEDING_KEY(&START_KEY(k)) + : PRECEDING_KEY(k)); + + if (b->ops->insert_fixup(b, k, &iter, replace_key)) + return status; + + status = BTREE_INSERT_STATUS_INSERT; + + while (m != bset_bkey_last(i) && + bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) + prev = m, m = bkey_next(m); + + /* prev is in the tree, if we merge we're done */ + status = BTREE_INSERT_STATUS_BACK_MERGE; + if (prev && + bch_bkey_try_merge(b, prev, k)) + goto merged; +#if 0 + status = BTREE_INSERT_STATUS_OVERWROTE; + if (m != bset_bkey_last(i) && + KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) + goto copy; +#endif + status = BTREE_INSERT_STATUS_FRONT_MERGE; + if (m != bset_bkey_last(i) && + bch_bkey_try_merge(b, k, m)) + goto copy; + + bch_bset_insert(b, m, k); +copy: bkey_copy(m, k); +merged: + return status; +} +EXPORT_SYMBOL(bch_btree_insert_key); + +/* Lookup */ + +struct bset_search_iter { + struct bkey *l, *r; +}; + +static struct bset_search_iter bset_search_write_set(struct bset_tree *t, + const struct bkey *search) +{ + unsigned li = 0, ri = t->size; + + while (li + 1 != ri) { + unsigned m = (li + ri) >> 1; + + if (bkey_cmp(table_to_bkey(t, m), search) > 0) + ri = m; + else + li = m; + } + + return (struct bset_search_iter) { + table_to_bkey(t, li), + ri < t->size ? table_to_bkey(t, ri) : bset_bkey_last(t->data) + }; +} + +static struct bset_search_iter bset_search_tree(struct bset_tree *t, + const struct bkey *search) +{ + struct bkey *l, *r; + struct bkey_float *f; + unsigned inorder, j, n = 1; + + do { + unsigned p = n << 4; + p &= ((int) (p - t->size)) >> 31; + + prefetch(&t->tree[p]); + + j = n; + f = &t->tree[j]; + + /* + * n = (f->mantissa > bfloat_mantissa()) + * ? j * 2 + * : j * 2 + 1; + * + * We need to subtract 1 from f->mantissa for the sign bit trick + * to work - that's done in make_bfloat() + */ + if (likely(f->exponent != 127)) + n = j * 2 + (((unsigned) + (f->mantissa - + bfloat_mantissa(search, f))) >> 31); + else + n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) + ? j * 2 + : j * 2 + 1; + } while (n < t->size); + + inorder = to_inorder(j, t); + + /* + * n would have been the node we recursed to - the low bit tells us if + * we recursed left or recursed right. 
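+ *
+ * (Odd n means the final comparison sent us right, so the key this
+ * node points at is a lower bound and the next cacheline supplies the
+ * upper bound; even n means we went left, so it is the upper bound and
+ * the previous cacheline supplies the lower bound.)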
+ */ + if (n & 1) { + l = cacheline_to_bkey(t, inorder, f->m); + + if (++inorder != t->size) { + f = &t->tree[inorder_next(j, t->size)]; + r = cacheline_to_bkey(t, inorder, f->m); + } else + r = bset_bkey_last(t->data); + } else { + r = cacheline_to_bkey(t, inorder, f->m); + + if (--inorder) { + f = &t->tree[inorder_prev(j, t->size)]; + l = cacheline_to_bkey(t, inorder, f->m); + } else + l = t->data->start; + } + + return (struct bset_search_iter) {l, r}; +} + +struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, + const struct bkey *search) +{ + struct bset_search_iter i; + + /* + * First, we search for a cacheline, then lastly we do a linear search + * within that cacheline. + * + * To search for the cacheline, there's three different possibilities: + * * The set is too small to have a search tree, so we just do a linear + * search over the whole set. + * * The set is the one we're currently inserting into; keeping a full + * auxiliary search tree up to date would be too expensive, so we + * use a much simpler lookup table to do a binary search - + * bset_search_write_set(). + * * Or we use the auxiliary search tree we constructed earlier - + * bset_search_tree() + */ + + if (unlikely(!t->size)) { + i.l = t->data->start; + i.r = bset_bkey_last(t->data); + } else if (bset_written(b, t)) { + /* + * Each node in the auxiliary search tree covers a certain range + * of bits, and keys above and below the set it covers might + * differ outside those bits - so we have to special case the + * start and end - handle that here: + */ + + if (unlikely(bkey_cmp(search, &t->end) >= 0)) + return bset_bkey_last(t->data); + + if (unlikely(bkey_cmp(search, t->data->start) < 0)) + return t->data->start; + + i = bset_search_tree(t, search); + } else { + BUG_ON(!b->nsets && + t->size < bkey_to_cacheline(t, bset_bkey_last(t->data))); + + i = bset_search_write_set(t, search); + } + + if (btree_keys_expensive_checks(b)) { + BUG_ON(bset_written(b, t) && + i.l != t->data->start && + bkey_cmp(tree_to_prev_bkey(t, + inorder_to_tree(bkey_to_cacheline(t, i.l), t)), + search) > 0); + + BUG_ON(i.r != bset_bkey_last(t->data) && + bkey_cmp(i.r, search) <= 0); + } + + while (likely(i.l != i.r) && + bkey_cmp(i.l, search) <= 0) + i.l = bkey_next(i.l); + + return i.l; +} +EXPORT_SYMBOL(__bch_bset_search); + +/* Btree iterator */ + +typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, + struct btree_iter_set); + +static inline bool btree_iter_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + return bkey_cmp(l.k, r.k) > 0; +} + +static inline bool btree_iter_end(struct btree_iter *iter) +{ + return !iter->used; +} + +void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, + struct bkey *end) +{ + if (k != end) + BUG_ON(!heap_add(iter, + ((struct btree_iter_set) { k, end }), + btree_iter_cmp)); +} + +static struct bkey *__bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search, + struct bset_tree *start) +{ + struct bkey *ret = NULL; + iter->size = ARRAY_SIZE(iter->data); + iter->used = 0; + +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + + for (; start <= bset_tree_last(b); start++) { + ret = bch_bset_search(b, start, search); + bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); + } + + return ret; +} + +struct bkey *bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search) +{ + return __bch_btree_iter_init(b, iter, search, b->set); +} +EXPORT_SYMBOL(bch_btree_iter_init); + +static inline struct bkey 
*__bch_btree_iter_next(struct btree_iter *iter, + btree_iter_cmp_fn *cmp) +{ + struct btree_iter_set unused; + struct bkey *ret = NULL; + + if (!btree_iter_end(iter)) { + bch_btree_iter_next_check(iter); + + ret = iter->data->k; + iter->data->k = bkey_next(iter->data->k); + + if (iter->data->k > iter->data->end) { + WARN_ONCE(1, "bset was corrupt!\n"); + iter->data->k = iter->data->end; + } + + if (iter->data->k == iter->data->end) + heap_pop(iter, unused, cmp); + else + heap_sift(iter, 0, cmp); + } + + return ret; +} + +struct bkey *bch_btree_iter_next(struct btree_iter *iter) +{ + return __bch_btree_iter_next(iter, btree_iter_cmp); + +} +EXPORT_SYMBOL(bch_btree_iter_next); + +struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, + struct btree_keys *b, ptr_filter_fn fn) +{ + struct bkey *ret; + + do { + ret = bch_btree_iter_next(iter); + } while (ret && fn(b, ret)); + + return ret; +} + +/* Mergesort */ + +void bch_bset_sort_state_free(struct bset_sort_state *state) +{ + if (state->pool) + mempool_destroy(state->pool); +} + +int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) +{ + spin_lock_init(&state->time.lock); + + state->page_order = page_order; + state->crit_factor = int_sqrt(1 << page_order); + + state->pool = mempool_create_page_pool(1, page_order); + if (!state->pool) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(bch_bset_sort_state_init); + +static void btree_mergesort(struct btree_keys *b, struct bset *out, + struct btree_iter *iter, + bool fixup, bool remove_stale) +{ + int i; + struct bkey *k, *last = NULL; + BKEY_PADDED(k) tmp; + bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale + ? bch_ptr_bad + : bch_ptr_invalid; + + /* Heapify the iterator, using our comparison function */ + for (i = iter->used / 2 - 1; i >= 0; --i) + heap_sift(iter, i, b->ops->sort_cmp); + + while (!btree_iter_end(iter)) { + if (b->ops->sort_fixup && fixup) + k = b->ops->sort_fixup(iter, &tmp.k); + else + k = NULL; + + if (!k) + k = __bch_btree_iter_next(iter, b->ops->sort_cmp); + + if (bad(b, k)) + continue; + + if (!last) { + last = out->start; + bkey_copy(last, k); + } else if (!bch_bkey_try_merge(b, last, k)) { + last = bkey_next(last); + bkey_copy(last, k); + } + } + + out->keys = last ? 
(uint64_t *) bkey_next(last) - out->d : 0; + + pr_debug("sorted %i keys", out->keys); +} + +static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, + unsigned start, unsigned order, bool fixup, + struct bset_sort_state *state) +{ + uint64_t start_time; + bool used_mempool = false; + struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, + order); + if (!out) { + struct page *outp; + + BUG_ON(order > state->page_order); + + outp = mempool_alloc(state->pool, GFP_NOIO); + out = page_address(outp); + used_mempool = true; + order = state->page_order; + } + + start_time = local_clock(); + + btree_mergesort(b, out, iter, fixup, false); + b->nsets = start; + + if (!start && order == b->page_order) { + /* + * Our temporary buffer is the same size as the btree node's + * buffer, we can just swap buffers instead of doing a big + * memcpy() + */ + + out->magic = b->set->data->magic; + out->seq = b->set->data->seq; + out->version = b->set->data->version; + swap(out, b->set->data); + } else { + b->set[start].data->keys = out->keys; + memcpy(b->set[start].data->start, out->start, + (void *) bset_bkey_last(out) - (void *) out->start); + } + + if (used_mempool) + mempool_free(virt_to_page(out), state->pool); + else + free_pages((unsigned long) out, order); + + bch_bset_build_written_tree(b); + + if (!start) + bch_time_stats_update(&state->time, start_time); +} + +void bch_btree_sort_partial(struct btree_keys *b, unsigned start, + struct bset_sort_state *state) +{ + size_t order = b->page_order, keys = 0; + struct btree_iter iter; + int oldsize = bch_count_data(b); + + __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); + + if (start) { + unsigned i; + + for (i = start; i <= b->nsets; i++) + keys += b->set[i].data->keys; + + order = get_order(__set_bytes(b->set->data, keys)); + } + + __btree_sort(b, &iter, start, order, false, state); + + EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); +} +EXPORT_SYMBOL(bch_btree_sort_partial); + +void bch_btree_sort_and_fix_extents(struct btree_keys *b, + struct btree_iter *iter, + struct bset_sort_state *state) +{ + __btree_sort(b, iter, 0, b->page_order, true, state); +} + +void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, + struct bset_sort_state *state) +{ + uint64_t start_time = local_clock(); + + struct btree_iter iter; + bch_btree_iter_init(b, &iter, NULL); + + btree_mergesort(b, new->set->data, &iter, false, true); + + bch_time_stats_update(&state->time, start_time); + + new->set->size = 0; // XXX: why? 
+} + +#define SORT_CRIT (4096 / sizeof(uint64_t)) + +void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) +{ + unsigned crit = SORT_CRIT; + int i; + + /* Don't sort if nothing to do */ + if (!b->nsets) + goto out; + + for (i = b->nsets - 1; i >= 0; --i) { + crit *= state->crit_factor; + + if (b->set[i].data->keys < crit) { + bch_btree_sort_partial(b, i, state); + return; + } + } + + /* Sort if we'd overflow */ + if (b->nsets + 1 == MAX_BSETS) { + bch_btree_sort(b, state); + return; + } + +out: + bch_bset_build_written_tree(b); +} +EXPORT_SYMBOL(bch_btree_sort_lazy); + +void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) +{ + unsigned i; + + for (i = 0; i <= b->nsets; i++) { + struct bset_tree *t = &b->set[i]; + size_t bytes = t->data->keys * sizeof(uint64_t); + size_t j; + + if (bset_written(b, t)) { + stats->sets_written++; + stats->bytes_written += bytes; + + stats->floats += t->size - 1; + + for (j = 1; j < t->size; j++) + if (t->tree[j].exponent == 127) + stats->failed++; + } else { + stats->sets_unwritten++; + stats->bytes_unwritten += bytes; + } + } +} diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h new file mode 100644 index 00000000000..5f6728d5d4d --- /dev/null +++ b/drivers/md/bcache/bset.h @@ -0,0 +1,566 @@ +#ifndef _BCACHE_BSET_H +#define _BCACHE_BSET_H + +#include <linux/bcache.h> +#include <linux/kernel.h> +#include <linux/types.h> + +#include "util.h" /* for time_stats */ + +/* + * BKEYS: + * + * A bkey contains a key, a size field, a variable number of pointers, and some + * ancillary flag bits. + * + * We use two different functions for validating bkeys, bch_ptr_invalid and + * bch_ptr_bad(). + * + * bch_ptr_invalid() primarily filters out keys and pointers that would be + * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and + * pointer that occur in normal practice but don't point to real data. + * + * The one exception to the rule that ptr_invalid() filters out invalid keys is + * that it also filters out keys of size 0 - these are keys that have been + * completely overwritten. It'd be safe to delete these in memory while leaving + * them on disk, just unnecessary work - so we filter them out when resorting + * instead. + * + * We can't filter out stale keys when we're resorting, because garbage + * collection needs to find them to ensure bucket gens don't wrap around - + * unless we're rewriting the btree node those stale keys still exist on disk. + * + * We also implement functions here for removing some number of sectors from the + * front or the back of a bkey - this is mainly used for fixing overlapping + * extents, by removing the overlapping sectors from the older key. + * + * BSETS: + * + * A bset is an array of bkeys laid out contiguously in memory in sorted order, + * along with a header. A btree node is made up of a number of these, written at + * different times. + * + * There could be many of them on disk, but we never allow there to be more than + * 4 in memory - we lazily resort as needed. + * + * We implement code here for creating and maintaining auxiliary search trees + * (described below) for searching an individial bset, and on top of that we + * implement a btree iterator. + * + * BTREE ITERATOR: + * + * Most of the code in bcache doesn't care about an individual bset - it needs + * to search entire btree nodes and iterate over them in sorted order. 
+ * + * The btree iterator code serves both functions; it iterates through the keys + * in a btree node in sorted order, starting from either keys after a specific + * point (if you pass it a search key) or the start of the btree node. + * + * AUXILIARY SEARCH TREES: + * + * Since keys are variable length, we can't use a binary search on a bset - we + * wouldn't be able to find the start of the next key. But binary searches are + * slow anyways, due to terrible cache behaviour; bcache originally used binary + * searches and that code topped out at under 50k lookups/second. + * + * So we need to construct some sort of lookup table. Since we only insert keys + * into the last (unwritten) set, most of the keys within a given btree node are + * usually in sets that are mostly constant. We use two different types of + * lookup tables to take advantage of this. + * + * Both lookup tables share in common that they don't index every key in the + * set; they index one key every BSET_CACHELINE bytes, and then a linear search + * is used for the rest. + * + * For sets that have been written to disk and are no longer being inserted + * into, we construct a binary search tree in an array - traversing a binary + * search tree in an array gives excellent locality of reference and is very + * fast, since both children of any node are adjacent to each other in memory + * (and their grandchildren, and great grandchildren...) - this means + * prefetching can be used to great effect. + * + * It's quite useful performance wise to keep these nodes small - not just + * because they're more likely to be in L2, but also because we can prefetch + * more nodes on a single cacheline and thus prefetch more iterations in advance + * when traversing this tree. + * + * Nodes in the auxiliary search tree must contain both a key to compare against + * (we don't want to fetch the key from the set, that would defeat the purpose), + * and a pointer to the key. We use a few tricks to compress both of these. + * + * To compress the pointer, we take advantage of the fact that one node in the + * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have + * a function (to_inorder()) that takes the index of a node in a binary tree and + * returns what its index would be in an inorder traversal, so we only have to + * store the low bits of the offset. + * + * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To + * compress that, we take advantage of the fact that when we're traversing the + * search tree at every iteration we know that both our search key and the key + * we're looking for lie within some range - bounded by our previous + * comparisons. (We special case the start of a search so that this is true even + * at the root of the tree). + * + * So we know the key we're looking for is between a and b, and a and b don't + * differ higher than bit 50, we don't need to check anything higher than bit + * 50. + * + * We don't usually need the rest of the bits, either; we only need enough bits + * to partition the key range we're currently checking. Consider key n - the + * key our auxiliary search tree node corresponds to, and key p, the key + * immediately preceding n. The lowest bit we need to store in the auxiliary + * search tree is the highest bit that differs between n and p. + * + * Note that this could be bit 0 - we might sometimes need all 80 bits to do the + * comparison. But we'd really like our nodes in the auxiliary search tree to be + * of fixed size. 
+ * + * The solution is to make them fixed size, and when we're constructing a node + * check if p and n differed in the bits we needed them to. If they don't we + * flag that node, and when doing lookups we fallback to comparing against the + * real key. As long as this doesn't happen to often (and it seems to reliably + * happen a bit less than 1% of the time), we win - even on failures, that key + * is then more likely to be in cache than if we were doing binary searches all + * the way, since we're touching so much less memory. + * + * The keys in the auxiliary search tree are stored in (software) floating + * point, with an exponent and a mantissa. The exponent needs to be big enough + * to address all the bits in the original key, but the number of bits in the + * mantissa is somewhat arbitrary; more bits just gets us fewer failures. + * + * We need 7 bits for the exponent and 3 bits for the key's offset (since keys + * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. + * We need one node per 128 bytes in the btree node, which means the auxiliary + * search trees take up 3% as much memory as the btree itself. + * + * Constructing these auxiliary search trees is moderately expensive, and we + * don't want to be constantly rebuilding the search tree for the last set + * whenever we insert another key into it. For the unwritten set, we use a much + * simpler lookup table - it's just a flat array, so index i in the lookup table + * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing + * within each byte range works the same as with the auxiliary search trees. + * + * These are much easier to keep up to date when we insert a key - we do it + * somewhat lazily; when we shift a key up we usually just increment the pointer + * to it, only when it would overflow do we go to the trouble of finding the + * first key in that range of bytes again. + */ + +struct btree_keys; +struct btree_iter; +struct btree_iter_set; +struct bkey_float; + +#define MAX_BSETS 4U + +struct bset_tree { + /* + * We construct a binary tree in an array as if the array + * started at 1, so that things line up on the same cachelines + * better: see comments in bset.c at cacheline_to_bkey() for + * details + */ + + /* size of the binary tree and prev array */ + unsigned size; + + /* function of size - precalculated for to_inorder() */ + unsigned extra; + + /* copy of the last key in the set */ + struct bkey end; + struct bkey_float *tree; + + /* + * The nodes in the bset tree point to specific keys - this + * array holds the sizes of the previous key. + * + * Conceptually it's a member of struct bkey_float, but we want + * to keep bkey_float to 4 bytes and prev isn't used in the fast + * path. 
+ */ + uint8_t *prev; + + /* The actual btree node, with pointers to each sorted set */ + struct bset *data; +}; + +struct btree_keys_ops { + bool (*sort_cmp)(struct btree_iter_set, + struct btree_iter_set); + struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); + bool (*insert_fixup)(struct btree_keys *, struct bkey *, + struct btree_iter *, struct bkey *); + bool (*key_invalid)(struct btree_keys *, + const struct bkey *); + bool (*key_bad)(struct btree_keys *, const struct bkey *); + bool (*key_merge)(struct btree_keys *, + struct bkey *, struct bkey *); + void (*key_to_text)(char *, size_t, const struct bkey *); + void (*key_dump)(struct btree_keys *, const struct bkey *); + + /* + * Only used for deciding whether to use START_KEY(k) or just the key + * itself in a couple places + */ + bool is_extents; +}; + +struct btree_keys { + const struct btree_keys_ops *ops; + uint8_t page_order; + uint8_t nsets; + unsigned last_set_unwritten:1; + bool *expensive_debug_checks; + + /* + * Sets of sorted keys - the real btree node - plus a binary search tree + * + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. + */ + struct bset_tree set[MAX_BSETS]; +}; + +static inline struct bset_tree *bset_tree_last(struct btree_keys *b) +{ + return b->set + b->nsets; +} + +static inline bool bset_written(struct btree_keys *b, struct bset_tree *t) +{ + return t <= b->set + b->nsets - b->last_set_unwritten; +} + +static inline bool bkey_written(struct btree_keys *b, struct bkey *k) +{ + return !b->last_set_unwritten || k < b->set[b->nsets].data->start; +} + +static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) +{ + return ((size_t) i) - ((size_t) b->set->data); +} + +static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) +{ + return bset_byte_offset(b, i) >> 9; +} + +#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) +#define set_bytes(i) __set_bytes(i, i->keys) + +#define __set_blocks(i, k, block_bytes) \ + DIV_ROUND_UP(__set_bytes(i, k), block_bytes) +#define set_blocks(i, block_bytes) \ + __set_blocks(i, (i)->keys, block_bytes) + +static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) +{ + struct bset_tree *t = bset_tree_last(b); + + BUG_ON((PAGE_SIZE << b->page_order) < + (bset_byte_offset(b, t->data) + set_bytes(t->data))); + + if (!b->last_set_unwritten) + return 0; + + return ((PAGE_SIZE << b->page_order) - + (bset_byte_offset(b, t->data) + set_bytes(t->data))) / + sizeof(u64); +} + +static inline struct bset *bset_next_set(struct btree_keys *b, + unsigned block_bytes) +{ + struct bset *i = bset_tree_last(b)->data; + + return ((void *) i) + roundup(set_bytes(i), block_bytes); +} + +void bch_btree_keys_free(struct btree_keys *); +int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); +void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, + bool *); + +void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); +void bch_bset_build_written_tree(struct btree_keys *); +void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); +bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); +void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); +unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, + struct bkey *); + +enum { + BTREE_INSERT_STATUS_NO_INSERT = 
0, + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, +}; + +/* Btree key iteration */ + +struct btree_iter { + size_t size, used; +#ifdef CONFIG_BCACHE_DEBUG + struct btree_keys *b; +#endif + struct btree_iter_set { + struct bkey *k, *end; + } data[MAX_BSETS]; +}; + +typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); + +struct bkey *bch_btree_iter_next(struct btree_iter *); +struct bkey *bch_btree_iter_next_filter(struct btree_iter *, + struct btree_keys *, ptr_filter_fn); + +void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); +struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *, + struct bkey *); + +struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, + const struct bkey *); + +/* + * Returns the first key that is strictly greater than search + */ +static inline struct bkey *bch_bset_search(struct btree_keys *b, + struct bset_tree *t, + const struct bkey *search) +{ + return search ? __bch_bset_search(b, t, search) : t->data->start; +} + +#define for_each_key_filter(b, k, iter, filter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next_filter((iter), (b), filter));) + +#define for_each_key(b, k, iter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next(iter));) + +/* Sorting */ + +struct bset_sort_state { + mempool_t *pool; + + unsigned page_order; + unsigned crit_factor; + + struct time_stats time; +}; + +void bch_bset_sort_state_free(struct bset_sort_state *); +int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); +void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); +void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, + struct bset_sort_state *); +void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, + struct bset_sort_state *); +void bch_btree_sort_partial(struct btree_keys *, unsigned, + struct bset_sort_state *); + +static inline void bch_btree_sort(struct btree_keys *b, + struct bset_sort_state *state) +{ + bch_btree_sort_partial(b, 0, state); +} + +struct bset_stats { + size_t sets_written, sets_unwritten; + size_t bytes_written, bytes_unwritten; + size_t floats, failed; +}; + +void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); + +/* Bkey utility code */ + +#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) + +static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) +{ + return bkey_idx(i->start, idx); +} + +static inline void bkey_init(struct bkey *k) +{ + *k = ZERO_KEY; +} + +static __always_inline int64_t bkey_cmp(const struct bkey *l, + const struct bkey *r) +{ + return unlikely(KEY_INODE(l) != KEY_INODE(r)) + ? 
(int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r) + : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); +} + +void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, + unsigned); +bool __bch_cut_front(const struct bkey *, struct bkey *); +bool __bch_cut_back(const struct bkey *, struct bkey *); + +static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, k) > 0); + return __bch_cut_front(where, k); +} + +static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); + return __bch_cut_back(where, k); +} + +#define PRECEDING_KEY(_k) \ +({ \ + struct bkey *_ret = NULL; \ + \ + if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ + _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ + \ + if (!_ret->low) \ + _ret->high--; \ + _ret->low--; \ + } \ + \ + _ret; \ +}) + +static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) +{ + return b->ops->key_invalid(b, k); +} + +static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k) +{ + return b->ops->key_bad(b, k); +} + +static inline void bch_bkey_to_text(struct btree_keys *b, char *buf, + size_t size, const struct bkey *k) +{ + return b->ops->key_to_text(buf, size, k); +} + +static inline bool bch_bkey_equal_header(const struct bkey *l, + const struct bkey *r) +{ + return (KEY_DIRTY(l) == KEY_DIRTY(r) && + KEY_PTRS(l) == KEY_PTRS(r) && + KEY_CSUM(l) == KEY_CSUM(l)); +} + +/* Keylists */ + +struct keylist { + union { + struct bkey *keys; + uint64_t *keys_p; + }; + union { + struct bkey *top; + uint64_t *top_p; + }; + + /* Enough room for btree_split's keys without realloc */ +#define KEYLIST_INLINE 16 + uint64_t inline_keys[KEYLIST_INLINE]; +}; + +static inline void bch_keylist_init(struct keylist *l) +{ + l->top_p = l->keys_p = l->inline_keys; +} + +static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k) +{ + l->keys = k; + l->top = bkey_next(k); +} + +static inline void bch_keylist_push(struct keylist *l) +{ + l->top = bkey_next(l->top); +} + +static inline void bch_keylist_add(struct keylist *l, struct bkey *k) +{ + bkey_copy(l->top, k); + bch_keylist_push(l); +} + +static inline bool bch_keylist_empty(struct keylist *l) +{ + return l->top == l->keys; +} + +static inline void bch_keylist_reset(struct keylist *l) +{ + l->top = l->keys; +} + +static inline void bch_keylist_free(struct keylist *l) +{ + if (l->keys_p != l->inline_keys) + kfree(l->keys_p); +} + +static inline size_t bch_keylist_nkeys(struct keylist *l) +{ + return l->top_p - l->keys_p; +} + +static inline size_t bch_keylist_bytes(struct keylist *l) +{ + return bch_keylist_nkeys(l) * sizeof(uint64_t); +} + +struct bkey *bch_keylist_pop(struct keylist *); +void bch_keylist_pop_front(struct keylist *); +int __bch_keylist_realloc(struct keylist *, unsigned); + +/* Debug stuff */ + +#ifdef CONFIG_BCACHE_DEBUG + +int __bch_count_data(struct btree_keys *); +void __bch_check_keys(struct btree_keys *, const char *, ...); +void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); +void bch_dump_bucket(struct btree_keys *); + +#else + +static inline int __bch_count_data(struct btree_keys *b) { return -1; } +static inline void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) 
{} +static inline void bch_dump_bucket(struct btree_keys *b) {} +void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); + +#endif + +static inline bool btree_keys_expensive_checks(struct btree_keys *b) +{ +#ifdef CONFIG_BCACHE_DEBUG + return *b->expensive_debug_checks; +#else + return false; +#endif +} + +static inline int bch_count_data(struct btree_keys *b) +{ + return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1; +} + +#define bch_check_keys(b, ...) \ +do { \ + if (btree_keys_expensive_checks(b)) \ + __bch_check_keys(b, __VA_ARGS__); \ +} while (0) + +#endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c new file mode 100644 index 00000000000..7347b610096 --- /dev/null +++ b/drivers/md/bcache/btree.c @@ -0,0 +1,2518 @@ +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * + * Uses a block device as cache for other block devices; optimized for SSDs. + * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. + * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/bcache.txt. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" + +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/freezer.h> +#include <linux/hash.h> +#include <linux/kthread.h> +#include <linux/prefetch.h> +#include <linux/random.h> +#include <linux/rcupdate.h> +#include <trace/events/bcache.h> + +/* + * Todo: + * register_bcache: Return errors out to userspace correctly + * + * Writeback: don't undirty key until after a cache flush + * + * Create an iterator for key pointers + * + * On btree write error, mark bucket such that it won't be freed from the cache + * + * Journalling: + * Check for bad keys in replay + * Propagate barriers + * Refcount journal entries in journal_replay + * + * Garbage collection: + * Finish incremental gc + * Gc should free old UUIDs, data for invalid UUIDs + * + * Provide a way to list backing device UUIDs we have data cached for, and + * probably how long it's been since we've seen them, and a way to invalidate + * dirty data for devices that will never be attached again + * + * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so + * that based on that and how much dirty data we have we can keep writeback + * from being starved + * + * Add a tracepoint or somesuch to watch for writeback starvation + * + * When btree depth > 1 and splitting an interior node, we have to make sure + * alloc_bucket() cannot fail. This should be true but is not completely + * obvious. + * + * Plugging? 
+ * + * If data write is less than hard sector size of ssd, round up offset in open + * bucket to the next whole sector + * + * Superblock needs to be fleshed out for multiple cache devices + * + * Add a sysfs tunable for the number of writeback IOs in flight + * + * Add a sysfs tunable for the number of open data buckets + * + * IO tracking: Can we track when one process is doing io on behalf of another? + * IO tracking: Don't use just an average, weigh more recent stuff higher + * + * Test module load/unload + */ + +#define MAX_NEED_GC 64 +#define MAX_SAVE_PRIO 72 + +#define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) + +#define PTR_HASH(c, k) \ + (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) + +#define insert_lock(s, b) ((b)->level <= (s)->lock) + +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ + if (!IS_ERR(_child)) { \ + _child->parent = (b); \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define btree_root(fn, c, op, ...) 
\ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _b->parent = NULL; \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -EINTR) \ + schedule(); \ + } while (_r == -EINTR); \ + \ + finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ + _r; \ +}) + +static inline struct bset *write_block(struct btree *b) +{ + return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); +} + +static void bch_btree_init_next(struct btree *b) +{ + /* If not a leaf node, always sort */ + if (b->level && b->keys.nsets) + bch_btree_sort(&b->keys, &b->c->sort); + else + bch_btree_sort_lazy(&b->keys, &b->c->sort); + + if (b->written < btree_blocks(b)) + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); + +} + +/* Btree key manipulation */ + +void bkey_put(struct cache_set *c, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); +} + +/* Btree IO */ + +static uint64_t btree_csum_set(struct btree *b, struct bset *i) +{ + uint64_t crc = b->key.ptr[0]; + void *data = (void *) i + 8, *end = bset_bkey_last(i); + + crc = bch_crc64_update(crc, data, end - data); + return crc ^ 0xffffffffffffffffULL; +} + +void bch_btree_node_read_done(struct btree *b) +{ + const char *err = "bad btree header"; + struct bset *i = btree_bset_first(b); + struct btree_iter *iter; + + iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); + iter->size = b->c->sb.bucket_size / b->c->sb.block_size; + iter->used = 0; + +#ifdef CONFIG_BCACHE_DEBUG + iter->b = &b->keys; +#endif + + if (!i->seq) + goto err; + + for (; + b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; + i = write_block(b)) { + err = "unsupported bset version"; + if (i->version > BCACHE_BSET_VERSION) + goto err; + + err = "bad btree header"; + if (b->written + set_blocks(i, block_bytes(b->c)) > + btree_blocks(b)) + goto err; + + err = "bad magic"; + if (i->magic != bset_magic(&b->c->sb)) + goto err; + + err = "bad checksum"; + switch (i->version) { + case 0: + if (i->csum != csum_set(i)) + goto err; + break; + case BCACHE_BSET_VERSION: + if (i->csum != btree_csum_set(b, i)) + goto err; + break; + } + + err = "empty set"; + if (i != b->keys.set[0].data && !i->keys) + goto err; + + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); + + b->written += set_blocks(i, block_bytes(b->c)); + } + + err = "corrupted btree"; + for (i = write_block(b); + bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); + i = ((void *) i) + block_bytes(b->c)) + if (i->seq == b->keys.set[0].data->seq) + goto err; + + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); + + i = b->keys.set[0].data; + err = "short btree key"; + if (b->keys.set[0].size && + bkey_cmp(&b->key, &b->keys.set[0].end) < 0) + goto err; + + if (b->written < btree_blocks(b)) + bch_bset_init_next(&b->keys, write_block(b), + bset_magic(&b->c->sb)); +out: + mempool_free(iter, b->c->fill_iter); + return; +err: + set_btree_node_io_error(b); + bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", + err, PTR_BUCKET_NR(b->c, &b->key, 0), + bset_block_offset(b, i), i->keys); + goto out; +} + +static void btree_node_read_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + closure_put(cl); +} + +static void bch_btree_node_read(struct btree *b) +{ 
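+	/*
+	 * Synchronous read: allocate a bbio for the whole node, submit it and
+	 * wait on a stack closure, then hand the buffer to
+	 * bch_btree_node_read_done() to validate and sort the bsets.
+	 */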
+ uint64_t start_time = local_clock(); + struct closure cl; + struct bio *bio; + + trace_bcache_btree_read(b); + + closure_init_stack(&cl); + + bio = bch_bbio_alloc(b->c); + bio->bi_rw = REQ_META|READ_SYNC; + bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = &cl; + + bch_bio_map(bio, b->keys.set[0].data); + + bch_submit_bbio(bio, b->c, &b->key, 0); + closure_sync(&cl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_btree_node_io_error(b); + + bch_bbio_free(bio, b->c); + + if (btree_node_io_error(b)) + goto err; + + bch_btree_node_read_done(b); + bch_time_stats_update(&b->c->btree_read_time, start_time); + + return; +err: + bch_cache_set_error(b->c, "io error reading bucket %zu", + PTR_BUCKET_NR(b->c, &b->key, 0)); +} + +static void btree_complete_write(struct btree *b, struct btree_write *w) +{ + if (w->prio_blocked && + !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) + wake_up_allocators(b->c); + + if (w->journal) { + atomic_dec_bug(w->journal); + __closure_wake_up(&b->c->journal.wait); + } + + w->prio_blocked = 0; + w->journal = NULL; +} + +static void btree_node_write_unlock(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + + up(&b->io_mutex); +} + +static void __btree_node_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + struct btree_write *w = btree_prev_write(b); + + bch_bbio_free(b->bio, b->c); + b->bio = NULL; + btree_complete_write(b, w); + + if (btree_node_dirty(b)) + schedule_delayed_work(&b->work, 30 * HZ); + + closure_return_with_destructor(cl, btree_node_write_unlock); +} + +static void btree_node_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io); + struct bio_vec *bv; + int n; + + bio_for_each_segment_all(bv, b->bio, n) + __free_page(bv->bv_page); + + __btree_node_write_done(cl); +} + +static void btree_node_write_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io); + + if (error) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); + closure_put(cl); +} + +static void do_btree_node_write(struct btree *b) +{ + struct closure *cl = &b->io; + struct bset *i = btree_bset_last(b); + BKEY_PADDED(key) k; + + i->version = BCACHE_BSET_VERSION; + i->csum = btree_csum_set(b, i); + + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_node_write_endio; + b->bio->bi_private = cl; + b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; + b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); + bch_bio_map(b->bio, i); + + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. 
+ */ + + bkey_copy(&k.key, &b->key); + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + + bset_sector_offset(&b->keys, i)); + + if (!bio_alloc_pages(b->bio, GFP_NOIO)) { + int j; + struct bio_vec *bv; + void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); + + bio_for_each_segment_all(bv, b->bio, j) + memcpy(page_address(bv->bv_page), + base + j * PAGE_SIZE, PAGE_SIZE); + + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + continue_at(cl, btree_node_write_done, NULL); + } else { + b->bio->bi_vcnt = 0; + bch_bio_map(b->bio, i); + + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + closure_sync(cl); + continue_at_nobarrier(cl, __btree_node_write_done, NULL); + } +} + +void __bch_btree_node_write(struct btree *b, struct closure *parent) +{ + struct bset *i = btree_bset_last(b); + + lockdep_assert_held(&b->write_lock); + + trace_bcache_btree_write(b); + + BUG_ON(current->bio_list); + BUG_ON(b->written >= btree_blocks(b)); + BUG_ON(b->written && !i->keys); + BUG_ON(btree_bset_first(b)->seq != i->seq); + bch_check_keys(&b->keys, "writing"); + + cancel_delayed_work(&b->work); + + /* If caller isn't waiting for write, parent refcount is cache set */ + down(&b->io_mutex); + closure_init(&b->io, parent ?: &b->c->cl); + + clear_bit(BTREE_NODE_dirty, &b->flags); + change_bit(BTREE_NODE_write_idx, &b->flags); + + do_btree_node_write(b); + + atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, + &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); + + b->written += set_blocks(i, block_bytes(b->c)); +} + +void bch_btree_node_write(struct btree *b, struct closure *parent) +{ + unsigned nsets = b->keys.nsets; + + lockdep_assert_held(&b->lock); + + __bch_btree_node_write(b, parent); + + /* + * do verify if there was more than one set initially (i.e. we did a + * sort) and we sorted down to a single set: + */ + if (nsets && !b->keys.nsets) + bch_btree_verify(b); + + bch_btree_init_next(b); +} + +static void bch_btree_node_write_sync(struct btree *b) +{ + struct closure cl; + + closure_init_stack(&cl); + + mutex_lock(&b->write_lock); + bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); +} + +static void btree_node_write_work(struct work_struct *w) +{ + struct btree *b = container_of(to_delayed_work(w), struct btree, work); + + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); +} + +static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) +{ + struct bset *i = btree_bset_last(b); + struct btree_write *w = btree_current_write(b); + + lockdep_assert_held(&b->write_lock); + + BUG_ON(!b->written); + BUG_ON(!i->keys); + + if (!btree_node_dirty(b)) + schedule_delayed_work(&b->work, 30 * HZ); + + set_btree_node_dirty(b); + + if (journal_ref) { + if (w->journal && + journal_pin_cmp(b->c, w->journal, journal_ref)) { + atomic_dec_bug(w->journal); + w->journal = NULL; + } + + if (!w->journal) { + w->journal = journal_ref; + atomic_inc(w->journal); + } + } + + /* Force write if set is too big */ + if (set_bytes(i) > PAGE_SIZE - 48 && + !current->bio_list) + bch_btree_node_write(b, NULL); +} + +/* + * Btree in memory cache - allocation/freeing + * mca -> memory cache + */ + +#define mca_reserve(c) (((c->root && c->root->level) \ + ? 
c->root->level : 1) * 8 + 16) +#define mca_can_free(c) \ + max_t(int, 0, c->btree_cache_used - mca_reserve(c)) + +static void mca_data_free(struct btree *b) +{ + BUG_ON(b->io_mutex.count != 1); + + bch_btree_keys_free(&b->keys); + + b->c->btree_cache_used--; + list_move(&b->list, &b->c->btree_cache_freed); +} + +static void mca_bucket_free(struct btree *b) +{ + BUG_ON(btree_node_dirty(b)); + + b->key.ptr[0] = 0; + hlist_del_init_rcu(&b->hash); + list_move(&b->list, &b->c->btree_cache_freeable); +} + +static unsigned btree_order(struct bkey *k) +{ + return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); +} + +static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) +{ + if (!bch_btree_keys_alloc(&b->keys, + max_t(unsigned, + ilog2(b->c->btree_pages), + btree_order(k)), + gfp)) { + b->c->btree_cache_used++; + list_move(&b->list, &b->c->btree_cache); + } else { + list_move(&b->list, &b->c->btree_cache_freed); + } +} + +static struct btree *mca_bucket_alloc(struct cache_set *c, + struct bkey *k, gfp_t gfp) +{ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + if (!b) + return NULL; + + init_rwsem(&b->lock); + lockdep_set_novalidate_class(&b->lock); + mutex_init(&b->write_lock); + lockdep_set_novalidate_class(&b->write_lock); + INIT_LIST_HEAD(&b->list); + INIT_DELAYED_WORK(&b->work, btree_node_write_work); + b->c = c; + sema_init(&b->io_mutex, 1); + + mca_data_alloc(b, k, gfp); + return b; +} + +static int mca_reap(struct btree *b, unsigned min_order, bool flush) +{ + struct closure cl; + + closure_init_stack(&cl); + lockdep_assert_held(&b->c->bucket_lock); + + if (!down_write_trylock(&b->lock)) + return -ENOMEM; + + BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); + + if (b->keys.page_order < min_order) + goto out_unlock; + + if (!flush) { + if (btree_node_dirty(b)) + goto out_unlock; + + if (down_trylock(&b->io_mutex)) + goto out_unlock; + up(&b->io_mutex); + } + + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) + __bch_btree_node_write(b, &cl); + mutex_unlock(&b->write_lock); + + closure_sync(&cl); + + /* wait for any in flight btree write */ + down(&b->io_mutex); + up(&b->io_mutex); + + return 0; +out_unlock: + rw_unlock(true, b); + return -ENOMEM; +} + +static unsigned long bch_mca_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct btree *b, *t; + unsigned long i, nr = sc->nr_to_scan; + unsigned long freed = 0; + + if (c->shrinker_disabled) + return SHRINK_STOP; + + if (c->btree_cache_alloc_lock) + return SHRINK_STOP; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_IO) + mutex_lock(&c->bucket_lock); + else if (!mutex_trylock(&c->bucket_lock)) + return -1; + + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. 
The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ + nr /= c->btree_pages; + nr = min_t(unsigned long, nr, mca_can_free(c)); + + i = 0; + list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { + if (freed >= nr) + break; + + if (++i > 3 && + !mca_reap(b, 0, false)) { + mca_data_free(b); + rw_unlock(true, b); + freed++; + } + } + + for (i = 0; (nr--) && i < c->btree_cache_used; i++) { + if (list_empty(&c->btree_cache)) + goto out; + + b = list_first_entry(&c->btree_cache, struct btree, list); + list_rotate_left(&c->btree_cache); + + if (!b->accessed && + !mca_reap(b, 0, false)) { + mca_bucket_free(b); + mca_data_free(b); + rw_unlock(true, b); + freed++; + } else + b->accessed = 0; + } +out: + mutex_unlock(&c->bucket_lock); + return freed; +} + +static unsigned long bch_mca_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct cache_set *c = container_of(shrink, struct cache_set, shrink); + + if (c->shrinker_disabled) + return 0; + + if (c->btree_cache_alloc_lock) + return 0; + + return mca_can_free(c) * c->btree_pages; +} + +void bch_btree_cache_free(struct cache_set *c) +{ + struct btree *b; + struct closure cl; + closure_init_stack(&cl); + + if (c->shrink.list.next) + unregister_shrinker(&c->shrink); + + mutex_lock(&c->bucket_lock); + +#ifdef CONFIG_BCACHE_DEBUG + if (c->verify_data) + list_move(&c->verify_data->list, &c->btree_cache); + + free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); +#endif + + list_splice(&c->btree_cache_freeable, + &c->btree_cache); + + while (!list_empty(&c->btree_cache)) { + b = list_first_entry(&c->btree_cache, struct btree, list); + + if (btree_node_dirty(b)) + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + + mca_data_free(b); + } + + while (!list_empty(&c->btree_cache_freed)) { + b = list_first_entry(&c->btree_cache_freed, + struct btree, list); + list_del(&b->list); + cancel_delayed_work_sync(&b->work); + kfree(b); + } + + mutex_unlock(&c->bucket_lock); +} + +int bch_btree_cache_alloc(struct cache_set *c) +{ + unsigned i; + + for (i = 0; i < mca_reserve(c); i++) + if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) + return -ENOMEM; + + list_splice_init(&c->btree_cache, + &c->btree_cache_freeable); + +#ifdef CONFIG_BCACHE_DEBUG + mutex_init(&c->verify_lock); + + c->verify_ondisk = (void *) + __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); + + c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + + if (c->verify_data && + c->verify_data->keys.set->data) + list_del_init(&c->verify_data->list); + else + c->verify_data = NULL; +#endif + + c->shrink.count_objects = bch_mca_count; + c->shrink.scan_objects = bch_mca_scan; + c->shrink.seeks = 4; + c->shrink.batch = c->btree_pages * 2; + register_shrinker(&c->shrink); + + return 0; +} + +/* Btree in memory cache - hash table */ + +static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) +{ + return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; +} + +static struct btree *mca_find(struct cache_set *c, struct bkey *k) +{ + struct btree *b; + + rcu_read_lock(); + hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) + if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) + goto out; + b = NULL; +out: + rcu_read_unlock(); + return b; +} + +static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) +{ + struct task_struct *old; + + 
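+	/*
+	 * Open-coded trylock: atomically claim btree_cache_alloc_lock with
+	 * cmpxchg().  If another task already holds it, register on the cache
+	 * set's wait queue (when op is non-NULL) and return -EINTR so the
+	 * caller can retry; btree_root() loops on -EINTR.
+	 */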
old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); + if (old && old != current) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + return -EINTR; + } + + return 0; +} + +static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, + struct bkey *k) +{ + struct btree *b; + + trace_bcache_btree_cache_cannibalize(c); + + if (mca_cannibalize_lock(c, op)) + return ERR_PTR(-EINTR); + + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), false)) + return b; + + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), true)) + return b; + + WARN(1, "btree cache cannibalize failed\n"); + return ERR_PTR(-ENOMEM); +} + +/* + * We can only have one thread cannibalizing other cached btree nodes at a time, + * or we'll deadlock. We use an open coded mutex to ensure that, which a + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +static void bch_cannibalize_unlock(struct cache_set *c) +{ + if (c->btree_cache_alloc_lock == current) { + c->btree_cache_alloc_lock = NULL; + wake_up(&c->btree_cache_wait); + } +} + +static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level) +{ + struct btree *b; + + BUG_ON(current->bio_list); + + lockdep_assert_held(&c->bucket_lock); + + if (mca_find(c, k)) + return NULL; + + /* btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b, &c->btree_cache_freeable, list) + if (!mca_reap(b, btree_order(k), false)) + goto out; + + /* We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ + list_for_each_entry(b, &c->btree_cache_freed, list) + if (!mca_reap(b, 0, false)) { + mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); + if (!b->keys.set[0].data) + goto err; + else + goto out; + } + + b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); + if (!b) + goto err; + + BUG_ON(!down_write_trylock(&b->lock)); + if (!b->keys.set->data) + goto err; +out: + BUG_ON(b->io_mutex.count != 1); + + bkey_copy(&b->key, k); + list_move(&b->list, &c->btree_cache); + hlist_del_init_rcu(&b->hash); + hlist_add_head_rcu(&b->hash, mca_hash(c, k)); + + lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); + b->parent = (void *) ~0UL; + b->flags = 0; + b->written = 0; + b->level = level; + + if (!b->level) + bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, + &b->c->expensive_debug_checks); + else + bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, + &b->c->expensive_debug_checks); + + return b; +err: + if (b) + rw_unlock(true, b); + + b = mca_cannibalize(c, op, k); + if (!IS_ERR(b)) + goto out; + + return b; +} + +/** + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * If IO is necessary and running under generic_make_request, returns -EAGAIN. + * + * The btree node will have either a read or a write lock held, depending on + * level and op->lock. 
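+ *
+ * @c:     cache set to look the node up in
+ * @op:    btree_op for the current operation; used if we have to wait on the
+ *         cannibalize lock
+ * @k:     key whose pointer identifies the btree node on disk
+ * @level: level of the node being looked up
+ * @write: true to return the node write-locked, false for a read lock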
+ */ +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level, bool write) +{ + int i = 0; + struct btree *b; + + BUG_ON(level < 0); +retry: + b = mca_find(c, k); + + if (!b) { + if (current->bio_list) + return ERR_PTR(-EAGAIN); + + mutex_lock(&c->bucket_lock); + b = mca_alloc(c, op, k, level); + mutex_unlock(&c->bucket_lock); + + if (!b) + goto retry; + if (IS_ERR(b)) + return b; + + bch_btree_node_read(b); + + if (!write) + downgrade_write(&b->lock); + } else { + rw_lock(write, b, level); + if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { + rw_unlock(write, b); + goto retry; + } + BUG_ON(b->level != level); + } + + b->accessed = 1; + + for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { + prefetch(b->keys.set[i].tree); + prefetch(b->keys.set[i].data); + } + + for (; i <= b->keys.nsets; i++) + prefetch(b->keys.set[i].data); + + if (btree_node_io_error(b)) { + rw_unlock(write, b); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); + + return b; +} + +static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) +{ + struct btree *b; + + mutex_lock(&c->bucket_lock); + b = mca_alloc(c, NULL, k, level); + mutex_unlock(&c->bucket_lock); + + if (!IS_ERR_OR_NULL(b)) { + bch_btree_node_read(b); + rw_unlock(true, b); + } +} + +/* Btree alloc */ + +static void btree_node_free(struct btree *b) +{ + trace_bcache_btree_node_free(b); + + BUG_ON(b == b->c->root); + + mutex_lock(&b->write_lock); + + if (btree_node_dirty(b)) + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + + mutex_unlock(&b->write_lock); + + cancel_delayed_work(&b->work); + + mutex_lock(&b->c->bucket_lock); + bch_bucket_free(b->c, &b->key); + mca_bucket_free(b); + mutex_unlock(&b->c->bucket_lock); +} + +struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level) +{ + BKEY_PADDED(key) k; + struct btree *b = ERR_PTR(-EAGAIN); + + mutex_lock(&c->bucket_lock); +retry: + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL)) + goto err; + + bkey_put(c, &k.key); + SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); + + b = mca_alloc(c, op, &k.key, level); + if (IS_ERR(b)) + goto err_free; + + if (!b) { + cache_bug(c, + "Tried to allocate bucket that was in btree cache"); + goto retry; + } + + b->accessed = 1; + bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); + + mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc(b); + return b; +err_free: + bch_bucket_free(c, &k.key); +err: + mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc_fail(b); + return b; +} + +static struct btree *btree_node_alloc_replacement(struct btree *b, + struct btree_op *op) +{ + struct btree *n = bch_btree_node_alloc(b->c, op, b->level); + if (!IS_ERR_OR_NULL(n)) { + mutex_lock(&n->write_lock); + bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); + bkey_copy_key(&n->key, &b->key); + mutex_unlock(&n->write_lock); + } + + return n; +} + +static void make_btree_freeing_key(struct btree *b, struct bkey *k) +{ + unsigned i; + + mutex_lock(&b->c->bucket_lock); + + atomic_inc(&b->c->prio_blocked); + + bkey_copy(k, &b->key); + bkey_copy_key(k, &ZERO_KEY); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_PTR_GEN(k, i, + bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + PTR_BUCKET(b->c, &b->key, i))); + + mutex_unlock(&b->c->bucket_lock); +} + +static int btree_check_reserve(struct btree *b, struct btree_op *op) +{ + struct cache_set *c = b->c; + struct cache *ca; + unsigned i, reserve = 
(c->root->level - b->level) * 2 + 1; + + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) + if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { + if (op) + prepare_to_wait(&c->btree_cache_wait, &op->wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&c->bucket_lock); + return -EINTR; + } + + mutex_unlock(&c->bucket_lock); + + return mca_cannibalize_lock(b->c, op); +} + +/* Garbage collection */ + +static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, + struct bkey *k) +{ + uint8_t stale = 0; + unsigned i; + struct bucket *g; + + /* + * ptr_invalid() can't return true for the keys that mark btree nodes as + * freed, but since ptr_bad() returns true we'll never actually use them + * for anything and thus we don't want mark their pointers here + */ + if (!bkey_cmp(k, &ZERO_KEY)) + return stale; + + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(c, k, i)) + continue; + + g = PTR_BUCKET(c, k, i); + + if (gen_after(g->last_gc, PTR_GEN(k, i))) + g->last_gc = PTR_GEN(k, i); + + if (ptr_stale(c, k, i)) { + stale = max(stale, ptr_stale(c, k, i)); + continue; + } + + cache_bug_on(GC_MARK(g) && + (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), + c, "inconsistent ptrs: mark = %llu, level = %i", + GC_MARK(g), level); + + if (level) + SET_GC_MARK(g, GC_MARK_METADATA); + else if (KEY_DIRTY(k)) + SET_GC_MARK(g, GC_MARK_DIRTY); + else if (!GC_MARK(g)) + SET_GC_MARK(g, GC_MARK_RECLAIMABLE); + + /* guard against overflow */ + SET_GC_SECTORS_USED(g, min_t(unsigned, + GC_SECTORS_USED(g) + KEY_SIZE(k), + MAX_GC_SECTORS_USED)); + + BUG_ON(!GC_SECTORS_USED(g)); + } + + return stale; +} + +#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) + +void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + !ptr_stale(c, k, i)) { + struct bucket *b = PTR_BUCKET(c, k, i); + + b->gen = PTR_GEN(k, i); + + if (level && bkey_cmp(k, &ZERO_KEY)) + b->prio = BTREE_PRIO; + else if (!level && b->prio == BTREE_PRIO) + b->prio = INITIAL_PRIO; + } + + __bch_btree_mark_key(c, level, k); +} + +static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) +{ + uint8_t stale = 0; + unsigned keys = 0, good_keys = 0; + struct bkey *k; + struct btree_iter iter; + struct bset_tree *t; + + gc->nodes++; + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { + stale = max(stale, btree_mark_key(b, k)); + keys++; + + if (bch_ptr_bad(&b->keys, k)) + continue; + + gc->key_bytes += bkey_u64s(k); + gc->nkeys++; + good_keys++; + + gc->data += KEY_SIZE(k); + } + + for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) + btree_bug_on(t->size && + bset_written(&b->keys, t) && + bkey_cmp(&b->key, &t->end) < 0, + b, "found short btree key in gc"); + + if (b->c->gc_always_rewrite) + return true; + + if (stale > 10) + return true; + + if ((keys - good_keys) * 2 > keys) + return true; + + return false; +} + +#define GC_MERGE_NODES 4U + +struct gc_merge_info { + struct btree *b; + unsigned keys; +}; + +static int bch_btree_insert_node(struct btree *, struct btree_op *, + struct keylist *, atomic_t *, struct bkey *); + +static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct gc_stat *gc, struct gc_merge_info *r) +{ + unsigned i, nodes = 0, keys = 0, blocks; + struct btree *new_nodes[GC_MERGE_NODES]; + struct keylist keylist; + struct closure cl; + struct bkey *k; + + bch_keylist_init(&keylist); + + if (btree_check_reserve(b, NULL)) + return 0; + + memset(new_nodes, 0, 
sizeof(new_nodes)); + closure_init_stack(&cl); + + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) + keys += r[nodes++].keys; + + blocks = btree_default_blocks(b->c) * 2 / 3; + + if (nodes < 2 || + __set_blocks(b->keys.set[0].data, keys, + block_bytes(b->c)) > blocks * (nodes - 1)) + return 0; + + for (i = 0; i < nodes; i++) { + new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); + if (IS_ERR_OR_NULL(new_nodes[i])) + goto out_nocoalesce; + } + + /* + * We have to check the reserve here, after we've allocated our new + * nodes, to make sure the insert below will succeed - we also check + * before as an optimization to potentially avoid a bunch of expensive + * allocs/sorts + */ + if (btree_check_reserve(b, NULL)) + goto out_nocoalesce; + + for (i = 0; i < nodes; i++) + mutex_lock(&new_nodes[i]->write_lock); + + for (i = nodes - 1; i > 0; --i) { + struct bset *n1 = btree_bset_first(new_nodes[i]); + struct bset *n2 = btree_bset_first(new_nodes[i - 1]); + struct bkey *k, *last = NULL; + + keys = 0; + + if (i > 1) { + for (k = n2->start; + k < bset_bkey_last(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), + block_bytes(b->c)) > blocks) + break; + + last = k; + keys += bkey_u64s(k); + } + } else { + /* + * Last node we're not getting rid of - we're getting + * rid of the node at r[0]. Have to try and fit all of + * the remaining keys into this node; we can't ensure + * they will always fit due to rounding and variable + * length keys (shouldn't be possible in practice, + * though) + */ + if (__set_blocks(n1, n1->keys + n2->keys, + block_bytes(b->c)) > + btree_blocks(new_nodes[i])) + goto out_nocoalesce; + + keys = n2->keys; + /* Take the key of the node we're getting rid of */ + last = &r->b->key; + } + + BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > + btree_blocks(new_nodes[i])); + + if (last) + bkey_copy_key(&new_nodes[i]->key, last); + + memcpy(bset_bkey_last(n1), + n2->start, + (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); + + n1->keys += keys; + r[i].keys = n1->keys; + + memmove(n2->start, + bset_bkey_idx(n2, keys), + (void *) bset_bkey_last(n2) - + (void *) bset_bkey_idx(n2, keys)); + + n2->keys -= keys; + + if (__bch_keylist_realloc(&keylist, + bkey_u64s(&new_nodes[i]->key))) + goto out_nocoalesce; + + bch_btree_node_write(new_nodes[i], &cl); + bch_keylist_add(&keylist, &new_nodes[i]->key); + } + + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); + + closure_sync(&cl); + + /* We emptied out this node */ + BUG_ON(btree_bset_first(new_nodes[0])->keys); + btree_node_free(new_nodes[0]); + rw_unlock(true, new_nodes[0]); + + for (i = 0; i < nodes; i++) { + if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) + goto out_nocoalesce; + + make_btree_freeing_key(r[i].b, keylist.top); + bch_keylist_push(&keylist); + } + + bch_btree_insert_node(b, op, &keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keylist)); + + for (i = 0; i < nodes; i++) { + btree_node_free(r[i].b); + rw_unlock(true, r[i].b); + + r[i].b = new_nodes[i]; + } + + memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); + r[nodes - 1].b = ERR_PTR(-EINTR); + + trace_bcache_btree_gc_coalesce(nodes); + gc->nodes--; + + bch_keylist_free(&keylist); + + /* Invalidated our iterator */ + return -EINTR; + +out_nocoalesce: + closure_sync(&cl); + bch_keylist_free(&keylist); + + while ((k = bch_keylist_pop(&keylist))) + if (!bkey_cmp(k, &ZERO_KEY)) + atomic_dec(&b->c->prio_blocked); + + for (i = 0; i < nodes; i++) + if 
(!IS_ERR_OR_NULL(new_nodes[i])) { + btree_node_free(new_nodes[i]); + rw_unlock(true, new_nodes[i]); + } + return 0; +} + +static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, + struct btree *replace) +{ + struct keylist keys; + struct btree *n; + + if (btree_check_reserve(b, NULL)) + return 0; + + n = btree_node_alloc_replacement(replace, NULL); + + /* recheck reserve after allocating replacement node */ + if (btree_check_reserve(b, NULL)) { + btree_node_free(n); + rw_unlock(true, n); + return 0; + } + + bch_btree_node_write_sync(n); + + bch_keylist_init(&keys); + bch_keylist_add(&keys, &n->key); + + make_btree_freeing_key(replace, keys.top); + bch_keylist_push(&keys); + + bch_btree_insert_node(b, op, &keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&keys)); + + btree_node_free(replace); + rw_unlock(true, n); + + /* Invalidated our iterator */ + return -EINTR; +} + +static unsigned btree_gc_count_keys(struct btree *b) +{ + struct bkey *k; + struct btree_iter iter; + unsigned ret = 0; + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) + ret += bkey_u64s(k); + + return ret; +} + +static int btree_gc_recurse(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + int ret = 0; + bool should_rewrite; + struct bkey *k; + struct btree_iter iter; + struct gc_merge_info r[GC_MERGE_NODES]; + struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; + + bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + + for (i = r; i < r + ARRAY_SIZE(r); i++) + i->b = ERR_PTR(-EINTR); + + while (1) { + k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + if (k) { + r->b = bch_btree_node_get(b->c, op, k, b->level - 1, + true); + if (IS_ERR(r->b)) { + ret = PTR_ERR(r->b); + break; + } + + r->keys = btree_gc_count_keys(r->b); + + ret = btree_gc_coalesce(b, op, gc, r); + if (ret) + break; + } + + if (!last->b) + break; + + if (!IS_ERR(last->b)) { + should_rewrite = btree_gc_mark_node(last->b, gc); + if (should_rewrite) { + ret = btree_gc_rewrite_node(b, op, last->b); + if (ret) + break; + } + + if (last->b->level) { + ret = btree_gc_recurse(last->b, op, writes, gc); + if (ret) + break; + } + + bkey_copy_key(&b->c->gc_done, &last->b->key); + + /* + * Must flush leaf nodes before gc ends, since replace + * operations aren't journalled + */ + mutex_lock(&last->b->write_lock); + if (btree_node_dirty(last->b)) + bch_btree_node_write(last->b, writes); + mutex_unlock(&last->b->write_lock); + rw_unlock(true, last->b); + } + + memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); + r->b = NULL; + + if (need_resched()) { + ret = -EAGAIN; + break; + } + } + + for (i = r; i < r + ARRAY_SIZE(r); i++) + if (!IS_ERR_OR_NULL(i->b)) { + mutex_lock(&i->b->write_lock); + if (btree_node_dirty(i->b)) + bch_btree_node_write(i->b, writes); + mutex_unlock(&i->b->write_lock); + rw_unlock(true, i->b); + } + + return ret; +} + +static int bch_btree_gc_root(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + struct btree *n = NULL; + int ret = 0; + bool should_rewrite; + + should_rewrite = btree_gc_mark_node(b, gc); + if (should_rewrite) { + n = btree_node_alloc_replacement(b, NULL); + + if (!IS_ERR_OR_NULL(n)) { + bch_btree_node_write_sync(n); + + bch_btree_set_root(n); + btree_node_free(b); + rw_unlock(true, n); + + return -EINTR; + } + } + + __bch_btree_mark_key(b->c, b->level + 1, &b->key); + + if (b->level) { + ret = btree_gc_recurse(b, op, writes, gc); + if (ret) + return ret; + } + + bkey_copy_key(&b->c->gc_done, &b->key); + + 
return ret; +} + +static void btree_gc_start(struct cache_set *c) +{ + struct cache *ca; + struct bucket *b; + unsigned i; + + if (!c->gc_mark_valid) + return; + + mutex_lock(&c->bucket_lock); + + c->gc_mark_valid = 0; + c->gc_done = ZERO_KEY; + + for_each_cache(ca, c, i) + for_each_bucket(b, ca) { + b->last_gc = b->gen; + if (!atomic_read(&b->pin)) { + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); + } + } + + mutex_unlock(&c->bucket_lock); +} + +static size_t bch_btree_gc_finish(struct cache_set *c) +{ + size_t available = 0; + struct bucket *b; + struct cache *ca; + unsigned i; + + mutex_lock(&c->bucket_lock); + + set_gc_sectors(c); + c->gc_mark_valid = 1; + c->need_gc = 0; + + for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) + SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), + GC_MARK_METADATA); + + /* don't reclaim buckets to which writeback keys point */ + rcu_read_lock(); + for (i = 0; i < c->nr_uuids; i++) { + struct bcache_device *d = c->devices[i]; + struct cached_dev *dc; + struct keybuf_key *w, *n; + unsigned j; + + if (!d || UUID_FLASH_ONLY(&c->uuids[i])) + continue; + dc = container_of(d, struct cached_dev, disk); + + spin_lock(&dc->writeback_keys.lock); + rbtree_postorder_for_each_entry_safe(w, n, + &dc->writeback_keys.keys, node) + for (j = 0; j < KEY_PTRS(&w->key); j++) + SET_GC_MARK(PTR_BUCKET(c, &w->key, j), + GC_MARK_DIRTY); + spin_unlock(&dc->writeback_keys.lock); + } + rcu_read_unlock(); + + for_each_cache(ca, c, i) { + uint64_t *i; + + ca->invalidate_needs_gc = 0; + + for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) + SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + + for (i = ca->prio_buckets; + i < ca->prio_buckets + prio_buckets(ca) * 2; i++) + SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + + for_each_bucket(b, ca) { + c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + + if (atomic_read(&b->pin)) + continue; + + BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); + + if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) + available++; + } + } + + mutex_unlock(&c->bucket_lock); + return available; +} + +static void bch_btree_gc(struct cache_set *c) +{ + int ret; + unsigned long available; + struct gc_stat stats; + struct closure writes; + struct btree_op op; + uint64_t start_time = local_clock(); + + trace_bcache_gc_start(c); + + memset(&stats, 0, sizeof(struct gc_stat)); + closure_init_stack(&writes); + bch_btree_op_init(&op, SHRT_MAX); + + btree_gc_start(c); + + do { + ret = btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&writes); + + if (ret && ret != -EAGAIN) + pr_warn("gc failed!"); + } while (ret); + + available = bch_btree_gc_finish(c); + wake_up_allocators(c); + + bch_time_stats_update(&c->btree_gc_time, start_time); + + stats.key_bytes *= sizeof(uint64_t); + stats.data <<= 9; + stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; + memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); + + trace_bcache_gc_end(c); + + bch_moving_gc(c); +} + +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; + struct cache *ca; + unsigned i; + + while (1) { +again: + bch_btree_gc(c); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) + if (ca->invalidate_needs_gc) { + mutex_unlock(&c->bucket_lock); + set_current_state(TASK_RUNNING); + goto again; + } + + mutex_unlock(&c->bucket_lock); + + try_to_freeze(); + schedule(); + } + + return 0; +} + +int bch_gc_thread_start(struct cache_set *c) +{ + c->gc_thread = kthread_create(bch_gc_thread, c, 
"bcache_gc"); + if (IS_ERR(c->gc_thread)) + return PTR_ERR(c->gc_thread); + + set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); + return 0; +} + +/* Initial partial gc */ + +static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) +{ + int ret = 0; + struct bkey *k, *p = NULL; + struct btree_iter iter; + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) + bch_initial_mark_key(b->c, b->level, k); + + bch_initial_mark_key(b->c, b->level + 1, &b->key); + + if (b->level) { + bch_btree_iter_init(&b->keys, &iter, NULL); + + do { + k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad); + if (k) + btree_node_prefetch(b->c, k, b->level - 1); + + if (p) + ret = btree(check_recurse, p, b, op); + + p = k; + } while (p && !ret); + } + + return ret; +} + +int bch_btree_check(struct cache_set *c) +{ + struct btree_op op; + + bch_btree_op_init(&op, SHRT_MAX); + + return btree_root(check_recurse, c, &op); +} + +void bch_initial_gc_finish(struct cache_set *c) +{ + struct cache *ca; + struct bucket *b; + unsigned i; + + bch_btree_gc_finish(c); + + mutex_lock(&c->bucket_lock); + + /* + * We need to put some unused buckets directly on the prio freelist in + * order to get the allocator thread started - it needs freed buckets in + * order to rewrite the prios and gens, and it needs to rewrite prios + * and gens in order to free buckets. + * + * This is only safe for buckets that have no live data in them, which + * there should always be some of. + */ + for_each_cache(ca, c, i) { + for_each_bucket(b, ca) { + if (fifo_full(&ca->free[RESERVE_PRIO])) + break; + + if (bch_can_invalidate_bucket(ca, b) && + !GC_MARK(b)) { + __bch_invalidate_one_bucket(ca, b); + fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets); + } + } + } + + mutex_unlock(&c->bucket_lock); +} + +/* Btree insertion */ + +static bool btree_insert_key(struct btree *b, struct bkey *k, + struct bkey *replace_key) +{ + unsigned status; + + BUG_ON(bkey_cmp(k, &b->key) > 0); + + status = bch_btree_insert_key(&b->keys, k, replace_key); + if (status != BTREE_INSERT_STATUS_NO_INSERT) { + bch_check_keys(&b->keys, "%u for %s", status, + replace_key ? 
"replace" : "insert"); + + trace_bcache_btree_insert_key(b, k, replace_key != NULL, + status); + return true; + } else + return false; +} + +static size_t insert_u64s_remaining(struct btree *b) +{ + long ret = bch_btree_keys_u64s_remaining(&b->keys); + + /* + * Might land in the middle of an existing extent and have to split it + */ + if (b->keys.ops->is_extents) + ret -= KEY_MAX_U64S; + + return max(ret, 0L); +} + +static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct bkey *replace_key) +{ + bool ret = false; + int oldsize = bch_count_data(&b->keys); + + while (!bch_keylist_empty(insert_keys)) { + struct bkey *k = insert_keys->keys; + + if (bkey_u64s(k) > insert_u64s_remaining(b)) + break; + + if (bkey_cmp(k, &b->key) <= 0) { + if (!b->level) + bkey_put(b->c, k); + + ret |= btree_insert_key(b, k, replace_key); + bch_keylist_pop_front(insert_keys); + } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, insert_keys->keys); + + bch_cut_back(&b->key, &temp.key); + bch_cut_front(&b->key, insert_keys->keys); + + ret |= btree_insert_key(b, &temp.key, replace_key); + break; + } else { + break; + } + } + + if (!ret) + op->insert_collision = true; + + BUG_ON(!bch_keylist_empty(insert_keys) && b->level); + + BUG_ON(bch_count_data(&b->keys) < oldsize); + return ret; +} + +static int btree_split(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + struct bkey *replace_key) +{ + bool split; + struct btree *n1, *n2 = NULL, *n3 = NULL; + uint64_t start_time = local_clock(); + struct closure cl; + struct keylist parent_keys; + + closure_init_stack(&cl); + bch_keylist_init(&parent_keys); + + if (btree_check_reserve(b, op)) { + if (!b->level) + return -EINTR; + else + WARN(1, "insufficient reserve for split\n"); + } + + n1 = btree_node_alloc_replacement(b, op); + if (IS_ERR(n1)) + goto err; + + split = set_blocks(btree_bset_first(n1), + block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; + + if (split) { + unsigned keys = 0; + + trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); + + n2 = bch_btree_node_alloc(b->c, op, b->level); + if (IS_ERR(n2)) + goto err_free1; + + if (!b->parent) { + n3 = bch_btree_node_alloc(b->c, op, b->level + 1); + if (IS_ERR(n3)) + goto err_free2; + } + + mutex_lock(&n1->write_lock); + mutex_lock(&n2->write_lock); + + bch_btree_insert_keys(n1, op, insert_keys, replace_key); + + /* + * Has to be a linear search because we don't have an auxiliary + * search tree yet + */ + + while (keys < (btree_bset_first(n1)->keys * 3) / 5) + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), + keys)); + + bkey_copy_key(&n1->key, + bset_bkey_idx(btree_bset_first(n1), keys)); + keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); + + btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; + btree_bset_first(n1)->keys = keys; + + memcpy(btree_bset_first(n2)->start, + bset_bkey_last(btree_bset_first(n1)), + btree_bset_first(n2)->keys * sizeof(uint64_t)); + + bkey_copy_key(&n2->key, &b->key); + + bch_keylist_add(&parent_keys, &n2->key); + bch_btree_node_write(n2, &cl); + mutex_unlock(&n2->write_lock); + rw_unlock(true, n2); + } else { + trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); + + mutex_lock(&n1->write_lock); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); + } + + bch_keylist_add(&parent_keys, &n1->key); + bch_btree_node_write(n1, &cl); + mutex_unlock(&n1->write_lock); + + if (n3) { + /* Depth increases, make a new root 
*/ + mutex_lock(&n3->write_lock); + bkey_copy_key(&n3->key, &MAX_KEY); + bch_btree_insert_keys(n3, op, &parent_keys, NULL); + bch_btree_node_write(n3, &cl); + mutex_unlock(&n3->write_lock); + + closure_sync(&cl); + bch_btree_set_root(n3); + rw_unlock(true, n3); + } else if (!b->parent) { + /* Root filled up but didn't need to be split */ + closure_sync(&cl); + bch_btree_set_root(n1); + } else { + /* Split a non root node */ + closure_sync(&cl); + make_btree_freeing_key(b, parent_keys.top); + bch_keylist_push(&parent_keys); + + bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&parent_keys)); + } + + btree_node_free(b); + rw_unlock(true, n1); + + bch_time_stats_update(&b->c->btree_split_time, start_time); + + return 0; +err_free2: + bkey_put(b->c, &n2->key); + btree_node_free(n2); + rw_unlock(true, n2); +err_free1: + bkey_put(b->c, &n1->key); + btree_node_free(n1); + rw_unlock(true, n1); +err: + WARN(1, "bcache: btree split failed (level %u)", b->level); + + if (n3 == ERR_PTR(-EAGAIN) || + n2 == ERR_PTR(-EAGAIN) || + n1 == ERR_PTR(-EAGAIN)) + return -EAGAIN; + + return -ENOMEM; +} + +static int bch_btree_insert_node(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + atomic_t *journal_ref, + struct bkey *replace_key) +{ + struct closure cl; + + BUG_ON(b->level && replace_key); + + closure_init_stack(&cl); + + mutex_lock(&b->write_lock); + + if (write_block(b) != btree_bset_last(b) && + b->keys.last_set_unwritten) + bch_btree_init_next(b); /* just wrote a set */ + + if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { + mutex_unlock(&b->write_lock); + goto split; + } + + BUG_ON(write_block(b) != btree_bset_last(b)); + + if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { + if (!b->level) + bch_btree_leaf_dirty(b, journal_ref); + else + bch_btree_node_write(b, &cl); + } + + mutex_unlock(&b->write_lock); + + /* wait for btree node write if necessary, after unlock */ + closure_sync(&cl); + + return 0; +split: + if (current->bio_list) { + op->lock = b->c->root->level + 1; + return -EAGAIN; + } else if (op->lock <= b->c->root->level) { + op->lock = b->c->root->level + 1; + return -EINTR; + } else { + /* Invalidated all iterators */ + int ret = btree_split(b, op, insert_keys, replace_key); + + if (bch_keylist_empty(insert_keys)) + return 0; + else if (!ret) + return -EINTR; + return ret; + } +} + +int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bkey *check_key) +{ + int ret = -EINTR; + uint64_t btree_ptr = b->key.ptr[0]; + unsigned long seq = b->seq; + struct keylist insert; + bool upgrade = op->lock == -1; + + bch_keylist_init(&insert); + + if (upgrade) { + rw_unlock(false, b); + rw_lock(true, b, b->level); + + if (b->key.ptr[0] != btree_ptr || + b->seq != seq + 1) + goto out; + } + + SET_KEY_PTRS(check_key, 1); + get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); + + SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); + + bch_keylist_add(&insert, check_key); + + ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); + + BUG_ON(!ret && !bch_keylist_empty(&insert)); +out: + if (upgrade) + downgrade_write(&b->lock); + return ret; +} + +struct btree_insert_op { + struct btree_op op; + struct keylist *keys; + atomic_t *journal_ref; + struct bkey *replace_key; +}; + +static int btree_insert_fn(struct btree_op *b_op, struct btree *b) +{ + struct btree_insert_op *op = container_of(b_op, + struct btree_insert_op, op); + + int ret = bch_btree_insert_node(b, &op->op, op->keys, + op->journal_ref, 
op->replace_key); + if (ret && !bch_keylist_empty(op->keys)) + return ret; + else + return MAP_DONE; +} + +int bch_btree_insert(struct cache_set *c, struct keylist *keys, + atomic_t *journal_ref, struct bkey *replace_key) +{ + struct btree_insert_op op; + int ret = 0; + + BUG_ON(current->bio_list); + BUG_ON(bch_keylist_empty(keys)); + + bch_btree_op_init(&op.op, 0); + op.keys = keys; + op.journal_ref = journal_ref; + op.replace_key = replace_key; + + while (!ret && !bch_keylist_empty(keys)) { + op.op.lock = 0; + ret = bch_btree_map_leaf_nodes(&op.op, c, + &START_KEY(keys->keys), + btree_insert_fn); + } + + if (ret) { + struct bkey *k; + + pr_err("error %i", ret); + + while ((k = bch_keylist_pop(keys))) + bkey_put(c, k); + } else if (op.op.insert_collision) + ret = -ESRCH; + + return ret; +} + +void bch_btree_set_root(struct btree *b) +{ + unsigned i; + struct closure cl; + + closure_init_stack(&cl); + + trace_bcache_btree_set_root(b); + + BUG_ON(!b->written); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); + + mutex_lock(&b->c->bucket_lock); + list_del_init(&b->list); + mutex_unlock(&b->c->bucket_lock); + + b->c->root = b; + + bch_journal_meta(b->c, &cl); + closure_sync(&cl); +} + +/* Map across nodes or keys */ + +static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, + btree_map_nodes_fn *fn, int flags) +{ + int ret = MAP_CONTINUE; + + if (b->level) { + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(&b->keys, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + bch_ptr_bad))) { + ret = btree(map_nodes_recurse, k, b, + op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } + } + + if (!b->level || flags == MAP_ALL_NODES) + ret = fn(op, b); + + return ret; +} + +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags) +{ + return btree_root(map_nodes_recurse, c, op, from, fn, flags); +} + +static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags) +{ + int ret = MAP_CONTINUE; + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(&b->keys, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + ret = !b->level + ? 
fn(op, b, k) + : btree(map_keys_recurse, k, b, op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } + + if (!b->level && (flags & MAP_END_KEY)) + ret = fn(op, b, &KEY(KEY_INODE(&b->key), + KEY_OFFSET(&b->key), 0)); + + return ret; +} + +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags) +{ + return btree_root(map_keys_recurse, c, op, from, fn, flags); +} + +/* Keybuf code */ + +static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) +{ + /* Overlapping keys compare equal */ + if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) + return -1; + if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) + return 1; + return 0; +} + +static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, + struct keybuf_key *r) +{ + return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); +} + +struct refill { + struct btree_op op; + unsigned nr_found; + struct keybuf *buf; + struct bkey *end; + keybuf_pred_fn *pred; +}; + +static int refill_keybuf_fn(struct btree_op *op, struct btree *b, + struct bkey *k) +{ + struct refill *refill = container_of(op, struct refill, op); + struct keybuf *buf = refill->buf; + int ret = MAP_CONTINUE; + + if (bkey_cmp(k, refill->end) >= 0) { + ret = MAP_DONE; + goto out; + } + + if (!KEY_SIZE(k)) /* end key */ + goto out; + + if (refill->pred(buf, k)) { + struct keybuf_key *w; + + spin_lock(&buf->lock); + + w = array_alloc(&buf->freelist); + if (!w) { + spin_unlock(&buf->lock); + return MAP_DONE; + } + + w->private = NULL; + bkey_copy(&w->key, k); + + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) + array_free(&buf->freelist, w); + else + refill->nr_found++; + + if (array_freelist_empty(&buf->freelist)) + ret = MAP_DONE; + + spin_unlock(&buf->lock); + } +out: + buf->last_scanned = *k; + return ret; +} + +void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, + struct bkey *end, keybuf_pred_fn *pred) +{ + struct bkey start = buf->last_scanned; + struct refill refill; + + cond_resched(); + + bch_btree_op_init(&refill.op, -1); + refill.nr_found = 0; + refill.buf = buf; + refill.end = end; + refill.pred = pred; + + bch_btree_map_keys(&refill.op, c, &buf->last_scanned, + refill_keybuf_fn, MAP_END_KEY); + + trace_bcache_keyscan(refill.nr_found, + KEY_INODE(&start), KEY_OFFSET(&start), + KEY_INODE(&buf->last_scanned), + KEY_OFFSET(&buf->last_scanned)); + + spin_lock(&buf->lock); + + if (!RB_EMPTY_ROOT(&buf->keys)) { + struct keybuf_key *w; + w = RB_FIRST(&buf->keys, struct keybuf_key, node); + buf->start = START_KEY(&w->key); + + w = RB_LAST(&buf->keys, struct keybuf_key, node); + buf->end = w->key; + } else { + buf->start = MAX_KEY; + buf->end = MAX_KEY; + } + + spin_unlock(&buf->lock); +} + +static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) +{ + rb_erase(&w->node, &buf->keys); + array_free(&buf->freelist, w); +} + +void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) +{ + spin_lock(&buf->lock); + __bch_keybuf_del(buf, w); + spin_unlock(&buf->lock); +} + +bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, + struct bkey *end) +{ + bool ret = false; + struct keybuf_key *p, *w, s; + s.key = *start; + + if (bkey_cmp(end, &buf->start) <= 0 || + bkey_cmp(start, &buf->end) >= 0) + return false; + + spin_lock(&buf->lock); + w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); + + while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { + p = w; + w = RB_NEXT(w, node); + + if (p->private) + ret = true; + else 
+ __bch_keybuf_del(buf, p); + } + + spin_unlock(&buf->lock); + return ret; +} + +struct keybuf_key *bch_keybuf_next(struct keybuf *buf) +{ + struct keybuf_key *w; + spin_lock(&buf->lock); + + w = RB_FIRST(&buf->keys, struct keybuf_key, node); + + while (w && w->private) + w = RB_NEXT(w, node); + + if (w) + w->private = ERR_PTR(-EINTR); + + spin_unlock(&buf->lock); + return w; +} + +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred) +{ + struct keybuf_key *ret; + + while (1) { + ret = bch_keybuf_next(buf); + if (ret) + break; + + if (bkey_cmp(&buf->last_scanned, end) >= 0) { + pr_debug("scan finished"); + break; + } + + bch_refill_keybuf(c, buf, end, pred); + } + + return ret; +} + +void bch_keybuf_init(struct keybuf *buf) +{ + buf->last_scanned = MAX_KEY; + buf->keys = RB_ROOT; + + spin_lock_init(&buf->lock); + array_allocator_init(&buf->freelist); +} diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h new file mode 100644 index 00000000000..91dfa5e6968 --- /dev/null +++ b/drivers/md/bcache/btree.h @@ -0,0 +1,309 @@ +#ifndef _BCACHE_BTREE_H +#define _BCACHE_BTREE_H + +/* + * THE BTREE: + * + * At a high level, bcache's btree is relatively standard b+ tree. All keys and + * pointers are in the leaves; interior nodes only have pointers to the child + * nodes. + * + * In the interior nodes, a struct bkey always points to a child btree node, and + * the key is the highest key in the child node - except that the highest key in + * an interior node is always MAX_KEY. The size field refers to the size on disk + * of the child node - this would allow us to have variable sized btree nodes + * (handy for keeping the depth of the btree 1 by expanding just the root). + * + * Btree nodes are themselves log structured, but this is hidden fairly + * thoroughly. Btree nodes on disk will in practice have extents that overlap + * (because they were written at different times), but in memory we never have + * overlapping extents - when we read in a btree node from disk, the first thing + * we do is resort all the sets of keys with a mergesort, and in the same pass + * we check for overlapping extents and adjust them appropriately. + * + * struct btree_op is a central interface to the btree code. It's used for + * specifying read vs. write locking, and the embedded closure is used for + * waiting on IO or reserve memory. + * + * BTREE CACHE: + * + * Btree nodes are cached in memory; traversing the btree might require reading + * in btree nodes which is handled mostly transparently. + * + * bch_btree_node_get() looks up a btree node in the cache and reads it in from + * disk if necessary. This function is almost never called directly though - the + * btree() macro is used to get a btree node, call some function on it, and + * unlock the node after the function returns. + * + * The root is special cased - it's taken out of the cache's lru (thus pinning + * it in memory), so we can find the root of the btree by just dereferencing a + * pointer instead of looking it up in the cache. This makes locking a bit + * tricky, since the root pointer is protected by the lock in the btree node it + * points to - the btree_root() macro handles this. + * + * In various places we must be able to allocate memory for multiple btree nodes + * in order to make forward progress. To do this we use the btree cache itself + * as a reserve; if __get_free_pages() fails, we'll find a node in the btree + * cache we can reuse. 
We can't allow more than one thread to be doing this at a
+ * time, so there's a lock, implemented by a pointer to the btree_op closure -
+ * this allows the btree_root() macro to implicitly release this lock.
+ *
+ * BTREE IO:
+ *
+ * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
+ * this.
+ *
+ * For writing, we have two btree_write structs embedded in struct btree - one
+ * write in flight, and one being set up, and we toggle between them.
+ *
+ * Writing is done with a single function - bch_btree_write() really serves two
+ * different purposes and should be broken up into two different functions. When
+ * passing now = false, it merely indicates that the node is now dirty - calling
+ * it ensures that the dirty keys will be written at some point in the future.
+ *
+ * When passing now = true, bch_btree_write() causes a write to happen
+ * "immediately" (if there was already a write in flight, it'll cause the write
+ * to happen as soon as the previous write completes). It returns immediately
+ * though - but it takes a refcount on the closure in struct btree_op you passed
+ * to it, so a closure_sync() later can be used to wait for the write to
+ * complete.
+ *
+ * This is handy because btree_split() and garbage collection can issue writes
+ * in parallel, reducing the amount of time they have to hold write locks.
+ *
+ * LOCKING:
+ *
+ * When traversing the btree, we may need write locks starting at some level -
+ * inserting a key into the btree will typically only require a write lock on
+ * the leaf node.
+ *
+ * This is specified with the lock field in struct btree_op; lock = 0 means we
+ * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
+ * checks this field and returns the node with the appropriate lock held.
+ *
+ * If, after traversing the btree, the insertion code discovers it has to split
+ * then it must restart from the root and take new locks - to do this it changes
+ * the lock field and returns -EINTR, which causes the btree_root() macro to
+ * loop.
+ *
+ * Handling cache misses requires a different mechanism for upgrading to a write
+ * lock. We do cache lookups with only a read lock held, but if we get a cache
+ * miss and we wish to insert this data into the cache, we have to insert a
+ * placeholder key to detect races - otherwise, we could race with a write and
+ * overwrite the data that was just written to the cache with stale data from
+ * the backing device.
+ *
+ * For this we use a sequence number that write locks and unlocks increment - to
+ * insert the check key it unlocks the btree node and then takes a write lock,
+ * and fails if the sequence number doesn't match.
+ */
+
+#include "bset.h"
+#include "debug.h"
+
+struct btree_write {
+ atomic_t *journal;
+
+ /* If btree_split() frees a btree node, it writes a new pointer to that
+ * btree node indicating it was freed; it takes a refcount on
+ * c->prio_blocked because we can't write the gens until the new
+ * pointer is on disk. This allows btree_write_endio() to release the
+ * refcount that btree_split() took.
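+ * The prio_blocked count below tracks how many such references this
+ * in-flight write is currently holding.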
+ */ + int prio_blocked; +}; + +struct btree { + /* Hottest entries first */ + struct hlist_node hash; + + /* Key/pointer for this btree node */ + BKEY_PADDED(key); + + /* Single bit - set when accessed, cleared by shrinker */ + unsigned long accessed; + unsigned long seq; + struct rw_semaphore lock; + struct cache_set *c; + struct btree *parent; + + struct mutex write_lock; + + unsigned long flags; + uint16_t written; /* would be nice to kill */ + uint8_t level; + + struct btree_keys keys; + + /* For outstanding btree writes, used as a lock - protects write_idx */ + struct closure io; + struct semaphore io_mutex; + + struct list_head list; + struct delayed_work work; + + struct btree_write writes[2]; + struct bio *bio; +}; + +#define BTREE_FLAG(flag) \ +static inline bool btree_node_ ## flag(struct btree *b) \ +{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void set_btree_node_ ## flag(struct btree *b) \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ + +enum btree_flags { + BTREE_NODE_io_error, + BTREE_NODE_dirty, + BTREE_NODE_write_idx, +}; + +BTREE_FLAG(io_error); +BTREE_FLAG(dirty); +BTREE_FLAG(write_idx); + +static inline struct btree_write *btree_current_write(struct btree *b) +{ + return b->writes + btree_node_write_idx(b); +} + +static inline struct btree_write *btree_prev_write(struct btree *b) +{ + return b->writes + (btree_node_write_idx(b) ^ 1); +} + +static inline struct bset *btree_bset_first(struct btree *b) +{ + return b->keys.set->data; +} + +static inline struct bset *btree_bset_last(struct btree *b) +{ + return bset_tree_last(&b->keys)->data; +} + +static inline unsigned bset_block_offset(struct btree *b, struct bset *i) +{ + return bset_sector_offset(&b->keys, i) >> b->c->block_bits; +} + +static inline void set_gc_sectors(struct cache_set *c) +{ + atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); +} + +void bkey_put(struct cache_set *c, struct bkey *k); + +/* Looping macros */ + +#define for_each_cached_btree(b, c, iter) \ + for (iter = 0; \ + iter < ARRAY_SIZE((c)->bucket_hash); \ + iter++) \ + hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) + +/* Recursing down the btree */ + +struct btree_op { + /* for waiting on btree reserve in btree_split() */ + wait_queue_t wait; + + /* Btree level at which we start taking write locks */ + short lock; + + unsigned insert_collision:1; +}; + +static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) +{ + memset(op, 0, sizeof(struct btree_op)); + init_wait(&op->wait); + op->lock = write_lock_level; +} + +static inline void rw_lock(bool w, struct btree *b, int level) +{ + w ? down_write_nested(&b->lock, level + 1) + : down_read_nested(&b->lock, level + 1); + if (w) + b->seq++; +} + +static inline void rw_unlock(bool w, struct btree *b) +{ + if (w) + b->seq++; + (w ? 
up_write : up_read)(&b->lock); +} + +void bch_btree_node_read_done(struct btree *); +void __bch_btree_node_write(struct btree *, struct closure *); +void bch_btree_node_write(struct btree *, struct closure *); + +void bch_btree_set_root(struct btree *); +struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int); +struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, + struct bkey *, int, bool); + +int bch_btree_insert_check_key(struct btree *, struct btree_op *, + struct bkey *); +int bch_btree_insert(struct cache_set *, struct keylist *, + atomic_t *, struct bkey *); + +int bch_gc_thread_start(struct cache_set *); +void bch_initial_gc_finish(struct cache_set *); +void bch_moving_gc(struct cache_set *); +int bch_btree_check(struct cache_set *); +void bch_initial_mark_key(struct cache_set *, int, struct bkey *); + +static inline void wake_up_gc(struct cache_set *c) +{ + if (c->gc_thread) + wake_up_process(c->gc_thread); +} + +#define MAP_DONE 0 +#define MAP_CONTINUE 1 + +#define MAP_ALL_NODES 0 +#define MAP_LEAF_NODES 1 + +#define MAP_END_KEY 1 + +typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); +int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_nodes_fn *, int); + +static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES); +} + +static inline int bch_btree_map_leaf_nodes(struct btree_op *op, + struct cache_set *c, + struct bkey *from, + btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); +} + +typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, + struct bkey *); +int bch_btree_map_keys(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_keys_fn *, int); + +typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); + +void bch_keybuf_init(struct keybuf *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); +bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, + struct bkey *); +void bch_keybuf_del(struct keybuf *, struct keybuf_key *); +struct keybuf_key *bch_keybuf_next(struct keybuf *); +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); + +#endif diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c new file mode 100644 index 00000000000..7a228de95fd --- /dev/null +++ b/drivers/md/bcache/closure.c @@ -0,0 +1,222 @@ +/* + * Asynchronous refcounty things + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. 
+ */ + +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/seq_file.h> + +#include "closure.h" + +static inline void closure_put_after_sub(struct closure *cl, int flags) +{ + int r = flags & CLOSURE_REMAINING_MASK; + + BUG_ON(flags & CLOSURE_GUARD_MASK); + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); + + /* Must deliver precisely one wakeup */ + if (r == 1 && (flags & CLOSURE_SLEEPING)) + wake_up_process(cl->task); + + if (!r) { + if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { + atomic_set(&cl->remaining, + CLOSURE_REMAINING_INITIALIZER); + closure_queue(cl); + } else { + struct closure *parent = cl->parent; + closure_fn *destructor = cl->fn; + + closure_debug_destroy(cl); + + if (destructor) + destructor(cl); + + if (parent) + closure_put(parent); + } + } +} + +/* For clearing flags with the same atomic op as a put */ +void closure_sub(struct closure *cl, int v) +{ + closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); +} +EXPORT_SYMBOL(closure_sub); + +/** + * closure_put - decrement a closure's refcount + */ +void closure_put(struct closure *cl) +{ + closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); +} +EXPORT_SYMBOL(closure_put); + +/** + * closure_wake_up - wake up all closures on a wait list, without memory barrier + */ +void __closure_wake_up(struct closure_waitlist *wait_list) +{ + struct llist_node *list; + struct closure *cl; + struct llist_node *reverse = NULL; + + list = llist_del_all(&wait_list->list); + + /* We first reverse the list to preserve FIFO ordering and fairness */ + + while (list) { + struct llist_node *t = list; + list = llist_next(list); + + t->next = reverse; + reverse = t; + } + + /* Then do the wakeups */ + + while (reverse) { + cl = container_of(reverse, struct closure, list); + reverse = llist_next(reverse); + + closure_set_waiting(cl, 0); + closure_sub(cl, CLOSURE_WAITING + 1); + } +} +EXPORT_SYMBOL(__closure_wake_up); + +/** + * closure_wait - add a closure to a waitlist + * + * @waitlist will own a ref on @cl, which will be released when + * closure_wake_up() is called on @waitlist. + * + */ +bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +{ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + return false; + + closure_set_waiting(cl, _RET_IP_); + atomic_add(CLOSURE_WAITING + 1, &cl->remaining); + llist_add(&cl->list, &waitlist->list); + + return true; +} +EXPORT_SYMBOL(closure_wait); + +/** + * closure_sync - sleep until a closure a closure has nothing left to wait on + * + * Sleeps until the refcount hits 1 - the thread that's running the closure owns + * the last refcount. 
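+ *
+ * A typical pattern (see journal_read_bucket() in journal.c) is a stack
+ * closure set up with closure_init_stack(), handed to the IO being issued,
+ * and then waited on with closure_sync().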
+ */ +void closure_sync(struct closure *cl) +{ + while (1) { + __closure_start_sleep(cl); + closure_set_ret_ip(cl); + + if ((atomic_read(&cl->remaining) & + CLOSURE_REMAINING_MASK) == 1) + break; + + schedule(); + } + + __closure_end_sleep(cl); +} +EXPORT_SYMBOL(closure_sync); + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + +static LIST_HEAD(closure_list); +static DEFINE_SPINLOCK(closure_list_lock); + +void closure_debug_create(struct closure *cl) +{ + unsigned long flags; + + BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); + cl->magic = CLOSURE_MAGIC_ALIVE; + + spin_lock_irqsave(&closure_list_lock, flags); + list_add(&cl->all, &closure_list); + spin_unlock_irqrestore(&closure_list_lock, flags); +} +EXPORT_SYMBOL(closure_debug_create); + +void closure_debug_destroy(struct closure *cl) +{ + unsigned long flags; + + BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); + cl->magic = CLOSURE_MAGIC_DEAD; + + spin_lock_irqsave(&closure_list_lock, flags); + list_del(&cl->all); + spin_unlock_irqrestore(&closure_list_lock, flags); +} +EXPORT_SYMBOL(closure_debug_destroy); + +static struct dentry *debug; + +#define work_data_bits(work) ((unsigned long *)(&(work)->data)) + +static int debug_seq_show(struct seq_file *f, void *data) +{ + struct closure *cl; + spin_lock_irq(&closure_list_lock); + + list_for_each_entry(cl, &closure_list, all) { + int r = atomic_read(&cl->remaining); + + seq_printf(f, "%p: %pF -> %pf p %p r %i ", + cl, (void *) cl->ip, cl->fn, cl->parent, + r & CLOSURE_REMAINING_MASK); + + seq_printf(f, "%s%s%s%s\n", + test_bit(WORK_STRUCT_PENDING, + work_data_bits(&cl->work)) ? "Q" : "", + r & CLOSURE_RUNNING ? "R" : "", + r & CLOSURE_STACK ? "S" : "", + r & CLOSURE_SLEEPING ? "Sl" : ""); + + if (r & CLOSURE_WAITING) + seq_printf(f, " W %pF\n", + (void *) cl->waiting_on); + + seq_printf(f, "\n"); + } + + spin_unlock_irq(&closure_list_lock); + return 0; +} + +static int debug_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, debug_seq_show, NULL); +} + +static const struct file_operations debug_ops = { + .owner = THIS_MODULE, + .open = debug_seq_open, + .read = seq_read, + .release = single_release +}; + +void __init closure_debug_init(void) +{ + debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); +} + +#endif + +MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h new file mode 100644 index 00000000000..a08e3eeac3c --- /dev/null +++ b/drivers/md/bcache/closure.h @@ -0,0 +1,386 @@ +#ifndef _LINUX_CLOSURE_H +#define _LINUX_CLOSURE_H + +#include <linux/llist.h> +#include <linux/sched.h> +#include <linux/workqueue.h> + +/* + * Closure is perhaps the most overused and abused term in computer science, but + * since I've been unable to come up with anything better you're stuck with it + * again. + * + * What are closures? + * + * They embed a refcount. The basic idea is they count "things that are in + * progress" - in flight bios, some other thread that's doing something else - + * anything you might want to wait on. + * + * The refcount may be manipulated with closure_get() and closure_put(). + * closure_put() is where many of the interesting things happen, when it causes + * the refcount to go to 0. + * + * Closures can be used to wait on things both synchronously and asynchronously, + * and synchronous and asynchronous use can be mixed without restriction. To + * wait synchronously, use closure_sync() - you will sleep until your closure's + * refcount hits 1. 
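+ *
+ * A minimal sketch of the synchronous pattern (illustrative only -
+ * foo_submit() stands in for anything whose completion handler calls
+ * closure_put()):
+ *
+ * struct closure cl;
+ *
+ * closure_init_stack(&cl);
+ * closure_get(&cl);
+ * foo_submit(&cl);
+ * closure_sync(&cl);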
+ * + * To wait asynchronously, use + * continue_at(cl, next_function, workqueue); + * + * passing it, as you might expect, the function to run when nothing is pending + * and the workqueue to run that function out of. + * + * continue_at() also, critically, is a macro that returns the calling function. + * There's good reason for this. + * + * To use safely closures asynchronously, they must always have a refcount while + * they are running owned by the thread that is running them. Otherwise, suppose + * you submit some bios and wish to have a function run when they all complete: + * + * foo_endio(struct bio *bio, int error) + * { + * closure_put(cl); + * } + * + * closure_init(cl); + * + * do_stuff(); + * closure_get(cl); + * bio1->bi_endio = foo_endio; + * bio_submit(bio1); + * + * do_more_stuff(); + * closure_get(cl); + * bio2->bi_endio = foo_endio; + * bio_submit(bio2); + * + * continue_at(cl, complete_some_read, system_wq); + * + * If closure's refcount started at 0, complete_some_read() could run before the + * second bio was submitted - which is almost always not what you want! More + * importantly, it wouldn't be possible to say whether the original thread or + * complete_some_read()'s thread owned the closure - and whatever state it was + * associated with! + * + * So, closure_init() initializes a closure's refcount to 1 - and when a + * closure_fn is run, the refcount will be reset to 1 first. + * + * Then, the rule is - if you got the refcount with closure_get(), release it + * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount + * on a closure because you called closure_init() or you were run out of a + * closure - _always_ use continue_at(). Doing so consistently will help + * eliminate an entire class of particularly pernicious races. + * + * Lastly, you might have a wait list dedicated to a specific event, and have no + * need for specifying the condition - you just want to wait until someone runs + * closure_wake_up() on the appropriate wait list. In that case, just use + * closure_wait(). It will return either true or false, depending on whether the + * closure was already on a wait list or not - a closure can only be on one wait + * list at a time. + * + * Parents: + * + * closure_init() takes two arguments - it takes the closure to initialize, and + * a (possibly null) parent. + * + * If parent is non null, the new closure will have a refcount for its lifetime; + * a closure is considered to be "finished" when its refcount hits 0 and the + * function to run is null. Hence + * + * continue_at(cl, NULL, NULL); + * + * returns up the (spaghetti) stack of closures, precisely like normal return + * returns up the C stack. continue_at() with non null fn is better thought of + * as doing a tail call. + * + * All this implies that a closure should typically be embedded in a particular + * struct (which its refcount will normally control the lifetime of), and that + * struct can very much be thought of as a stack frame. + */ + +struct closure; +typedef void (closure_fn) (struct closure *); + +struct closure_waitlist { + struct llist_head list; +}; + +enum closure_state { + /* + * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by + * the thread that owns the closure, and cleared by the thread that's + * waking up the closure. + * + * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep + * - indicates that cl->task is valid and closure_put() may wake it up. + * Only set or cleared by the thread that owns the closure. 
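+ *
+ * (CLOSURE_WAITING and CLOSURE_SLEEPING are set independently of each other:
+ * a closure can sit on a waitlist without its owner sleeping, and
+ * closure_sync() sleeps without using a waitlist.)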
+ * + * The rest are for debugging and don't affect behaviour: + * + * CLOSURE_RUNNING: Set when a closure is running (i.e. by + * closure_init() and when closure_put() runs then next function), and + * must be cleared before remaining hits 0. Primarily to help guard + * against incorrect usage and accidentally transferring references. + * continue_at() and closure_return() clear it for you, if you're doing + * something unusual you can use closure_set_dead() which also helps + * annotate where references are being transferred. + * + * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a + * closure with this flag set + */ + + CLOSURE_BITS_START = (1 << 23), + CLOSURE_DESTRUCTOR = (1 << 23), + CLOSURE_WAITING = (1 << 25), + CLOSURE_SLEEPING = (1 << 27), + CLOSURE_RUNNING = (1 << 29), + CLOSURE_STACK = (1 << 31), +}; + +#define CLOSURE_GUARD_MASK \ + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ + CLOSURE_RUNNING|CLOSURE_STACK) << 1) + +#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) + +struct closure { + union { + struct { + struct workqueue_struct *wq; + struct task_struct *task; + struct llist_node list; + closure_fn *fn; + }; + struct work_struct work; + }; + + struct closure *parent; + + atomic_t remaining; + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#define CLOSURE_MAGIC_DEAD 0xc054dead +#define CLOSURE_MAGIC_ALIVE 0xc054a11e + + unsigned magic; + struct list_head all; + unsigned long ip; + unsigned long waiting_on; +#endif +}; + +void closure_sub(struct closure *cl, int v); +void closure_put(struct closure *cl); +void __closure_wake_up(struct closure_waitlist *list); +bool closure_wait(struct closure_waitlist *list, struct closure *cl); +void closure_sync(struct closure *cl); + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + +void closure_debug_init(void); +void closure_debug_create(struct closure *cl); +void closure_debug_destroy(struct closure *cl); + +#else + +static inline void closure_debug_init(void) {} +static inline void closure_debug_create(struct closure *cl) {} +static inline void closure_debug_destroy(struct closure *cl) {} + +#endif + +static inline void closure_set_ip(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->ip = _THIS_IP_; +#endif +} + +static inline void closure_set_ret_ip(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->ip = _RET_IP_; +#endif +} + +static inline void closure_set_waiting(struct closure *cl, unsigned long f) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->waiting_on = f; +#endif +} + +static inline void __closure_end_sleep(struct closure *cl) +{ + __set_current_state(TASK_RUNNING); + + if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) + atomic_sub(CLOSURE_SLEEPING, &cl->remaining); +} + +static inline void __closure_start_sleep(struct closure *cl) +{ + closure_set_ip(cl); + cl->task = current; + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + atomic_add(CLOSURE_SLEEPING, &cl->remaining); +} + +static inline void closure_set_stopped(struct closure *cl) +{ + atomic_sub(CLOSURE_RUNNING, &cl->remaining); +} + +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) +{ + BUG_ON(object_is_on_stack(cl)); + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; + /* between atomic_dec() in closure_put() */ + smp_mb__before_atomic(); +} + +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + if (wq) { + 
INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(cl); +} + +/** + * closure_get - increment a closure's refcount + */ +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} + +/** + * closure_init - Initialize a closure, setting the refcount to 1 + * @cl: closure to initialize + * @parent: parent of the new closure. cl will take a refcount on it for its + * lifetime; may be NULL. + */ +static inline void closure_init(struct closure *cl, struct closure *parent) +{ + memset(cl, 0, sizeof(struct closure)); + cl->parent = parent; + if (parent) + closure_get(parent); + + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + + closure_debug_create(cl); + closure_set_ip(cl); +} + +static inline void closure_init_stack(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); +} + +/** + * closure_wake_up - wake up all closures on a wait list. + */ +static inline void closure_wake_up(struct closure_waitlist *list) +{ + smp_mb(); + __closure_wake_up(list); +} + +/** + * continue_at - jump to another function with barrier + * + * After @cl is no longer waiting on anything (i.e. all outstanding refs have + * been dropped with closure_put()), it will resume execution at @fn running out + * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). + * + * NOTE: This macro expands to a return in the calling function! + * + * This is because after calling continue_at() you no longer have a ref on @cl, + * and whatever @cl owns may be freed out from under you - a running closure fn + * has a ref on its own closure which continue_at() drops. + */ +#define continue_at(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_sub(_cl, CLOSURE_RUNNING + 1); \ + return; \ +} while (0) + +/** + * closure_return - finish execution of a closure + * + * This is used to indicate that @cl is finished: when all outstanding refs on + * @cl have been dropped @cl's ref on its parent closure (as passed to + * closure_init()) will be dropped, if one was specified - thus this can be + * thought of as returning to the parent closure. + */ +#define closure_return(_cl) continue_at((_cl), NULL, NULL) + +/** + * continue_at_nobarrier - jump to another function without barrier + * + * Causes @fn to be executed out of @cl, in @wq context (or called directly if + * @wq is NULL). + * + * NOTE: like continue_at(), this macro expands to a return in the caller! + * + * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, + * thus it's not safe to touch anything protected by @cl after a + * continue_at_nobarrier(). + */ +#define continue_at_nobarrier(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_queue(_cl); \ + return; \ +} while (0) + +/** + * closure_return - finish execution of a closure, with destructor + * + * Works like closure_return(), except @destructor will be called when all + * outstanding refs on @cl have been dropped; @destructor may be used to safely + * free the memory occupied by @cl, and it is called with the ref on the parent + * closure still held - so @destructor could safely return an item to a + * freelist protected by @cl's parent. 
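+ *
+ * (Illustrative use: a closure embedded in a dynamically allocated object can
+ * pass a destructor that frees the containing object once the last ref is
+ * dropped.)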
+ */ +#define closure_return_with_destructor(_cl, _destructor) \ +do { \ + set_closure_fn(_cl, _destructor, NULL); \ + closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ + return; \ +} while (0) + +/** + * closure_call - execute @fn out of a new, uninitialized closure + * + * Typically used when running out of one closure, and we want to run @fn + * asynchronously out of a new closure - @parent will then wait for @cl to + * finish. + */ +static inline void closure_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + closure_init(cl, parent); + continue_at_nobarrier(cl, fn, wq); +} + +#endif /* _LINUX_CLOSURE_H */ diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c new file mode 100644 index 00000000000..8b1f1d5c181 --- /dev/null +++ b/drivers/md/bcache/debug.c @@ -0,0 +1,252 @@ +/* + * Assorted bcache debug code + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" + +#include <linux/console.h> +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/seq_file.h> + +static struct dentry *debug; + +#ifdef CONFIG_BCACHE_DEBUG + +#define for_each_written_bset(b, start, i) \ + for (i = (start); \ + (void *) i < (void *) (start) + (KEY_SIZE(&b->key) << 9) &&\ + i->seq == (start)->seq; \ + i = (void *) i + set_blocks(i, block_bytes(b->c)) * \ + block_bytes(b->c)) + +void bch_btree_verify(struct btree *b) +{ + struct btree *v = b->c->verify_data; + struct bset *ondisk, *sorted, *inmemory; + struct bio *bio; + + if (!b->c->verify || !b->c->verify_ondisk) + return; + + down(&b->io_mutex); + mutex_lock(&b->c->verify_lock); + + ondisk = b->c->verify_ondisk; + sorted = b->c->verify_data->keys.set->data; + inmemory = b->keys.set->data; + + bkey_copy(&v->key, &b->key); + v->written = 0; + v->level = b->level; + v->keys.ops = b->keys.ops; + + bio = bch_bbio_alloc(b->c); + bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; + bch_bio_map(bio, sorted); + + submit_bio_wait(REQ_META|READ_SYNC, bio); + bch_bbio_free(bio, b->c); + + memcpy(ondisk, sorted, KEY_SIZE(&v->key) << 9); + + bch_btree_node_read_done(v); + sorted = v->keys.set->data; + + if (inmemory->keys != sorted->keys || + memcmp(inmemory->start, + sorted->start, + (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { + struct bset *i; + unsigned j; + + console_lock(); + + printk(KERN_ERR "*** in memory:\n"); + bch_dump_bset(&b->keys, inmemory, 0); + + printk(KERN_ERR "*** read back in:\n"); + bch_dump_bset(&v->keys, sorted, 0); + + for_each_written_bset(b, ondisk, i) { + unsigned block = ((void *) i - (void *) ondisk) / + block_bytes(b->c); + + printk(KERN_ERR "*** on disk block %u:\n", block); + bch_dump_bset(&b->keys, i, block); + } + + printk(KERN_ERR "*** block %zu not written\n", + ((void *) i - (void *) ondisk) / block_bytes(b->c)); + + for (j = 0; j < inmemory->keys; j++) + if (inmemory->d[j] != sorted->d[j]) + break; + + printk(KERN_ERR "b->written %u\n", b->written); + + console_unlock(); + panic("verify failed at %u\n", j); + } + + mutex_unlock(&b->c->verify_lock); + up(&b->io_mutex); +} + +void bch_data_verify(struct cached_dev *dc, struct bio *bio) +{ + char name[BDEVNAME_SIZE]; + struct bio *check; + struct bio_vec bv, *bv2; + struct bvec_iter iter; + int i; + + check = 
bio_clone(bio, GFP_NOIO); + if (!check) + return; + + if (bio_alloc_pages(check, GFP_NOIO)) + goto out_put; + + submit_bio_wait(READ_SYNC, check); + + bio_for_each_segment(bv, bio, iter) { + void *p1 = kmap_atomic(bv.bv_page); + void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page); + + cache_set_err_on(memcmp(p1 + bv.bv_offset, + p2 + bv.bv_offset, + bv.bv_len), + dc->disk.c, + "verify failed at dev %s sector %llu", + bdevname(dc->bdev, name), + (uint64_t) bio->bi_iter.bi_sector); + + kunmap_atomic(p1); + } + + bio_for_each_segment_all(bv2, check, i) + __free_page(bv2->bv_page); +out_put: + bio_put(check); +} + +#endif + +#ifdef CONFIG_DEBUG_FS + +/* XXX: cache set refcounting */ + +struct dump_iterator { + char buf[PAGE_SIZE]; + size_t bytes; + struct cache_set *c; + struct keybuf keys; +}; + +static bool dump_pred(struct keybuf *buf, struct bkey *k) +{ + return true; +} + +static ssize_t bch_dump_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iterator *i = file->private_data; + ssize_t ret = 0; + char kbuf[80]; + + while (size) { + struct keybuf_key *w; + unsigned bytes = min(i->bytes, size); + + int err = copy_to_user(buf, i->buf, bytes); + if (err) + return err; + + ret += bytes; + buf += bytes; + size -= bytes; + i->bytes -= bytes; + memmove(i->buf, i->buf + bytes, i->bytes); + + if (i->bytes) + break; + + w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); + if (!w) + break; + + bch_extent_to_text(kbuf, sizeof(kbuf), &w->key); + i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); + bch_keybuf_del(&i->keys, w); + } + + return ret; +} + +static int bch_dump_open(struct inode *inode, struct file *file) +{ + struct cache_set *c = inode->i_private; + struct dump_iterator *i; + + i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->c = c; + bch_keybuf_init(&i->keys); + i->keys.last_scanned = KEY(0, 0, 0); + + return 0; +} + +static int bch_dump_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static const struct file_operations cache_set_debug_ops = { + .owner = THIS_MODULE, + .open = bch_dump_open, + .read = bch_dump_read, + .release = bch_dump_release +}; + +void bch_debug_init_cache_set(struct cache_set *c) +{ + if (!IS_ERR_OR_NULL(debug)) { + char name[50]; + snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); + + c->debug = debugfs_create_file(name, 0400, debug, c, + &cache_set_debug_ops); + } +} + +#endif + +void bch_debug_exit(void) +{ + if (!IS_ERR_OR_NULL(debug)) + debugfs_remove_recursive(debug); +} + +int __init bch_debug_init(struct kobject *kobj) +{ + int ret = 0; + + debug = debugfs_create_dir("bcache", NULL); + return ret; +} diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h new file mode 100644 index 00000000000..1f63c195d24 --- /dev/null +++ b/drivers/md/bcache/debug.h @@ -0,0 +1,34 @@ +#ifndef _BCACHE_DEBUG_H +#define _BCACHE_DEBUG_H + +struct bio; +struct cached_dev; +struct cache_set; + +#ifdef CONFIG_BCACHE_DEBUG + +void bch_btree_verify(struct btree *); +void bch_data_verify(struct cached_dev *, struct bio *); + +#define expensive_debug_checks(c) ((c)->expensive_debug_checks) +#define key_merging_disabled(c) ((c)->key_merging_disabled) +#define bypass_torture_test(d) ((d)->bypass_torture_test) + +#else /* DEBUG */ + +static inline void bch_btree_verify(struct btree *b) {} +static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} + +#define 
expensive_debug_checks(c) 0 +#define key_merging_disabled(c) 0 +#define bypass_torture_test(d) 0 + +#endif + +#ifdef CONFIG_DEBUG_FS +void bch_debug_init_cache_set(struct cache_set *); +#else +static inline void bch_debug_init_cache_set(struct cache_set *c) {} +#endif + +#endif diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c new file mode 100644 index 00000000000..3a0de4cf977 --- /dev/null +++ b/drivers/md/bcache/extents.c @@ -0,0 +1,620 @@ +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * + * Uses a block device as cache for other block devices; optimized for SSDs. + * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. + * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/bcache.txt. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" +#include "writeback.h" + +static void sort_key_next(struct btree_iter *iter, + struct btree_iter_set *i) +{ + i->k = bkey_next(i->k); + + if (i->k == i->end) + *i = iter->data[--iter->used]; +} + +static bool bch_key_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(l.k, r.k); + + return c ? c > 0 : l.k < r.k; +} + +static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size || + bucket < ca->sb.first_bucket || + bucket >= ca->sb.nbuckets) + return true; + } + + return false; +} + +/* Common among btree and extent ptrs */ + +static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size) + return "bad, length too big"; + if (bucket < ca->sb.first_bucket) + return "bad, short offset"; + if (bucket >= ca->sb.nbuckets) + return "bad, offset past end of device"; + if (ptr_stale(c, k, i)) + return "stale"; + } + + if (!bkey_cmp(k, &ZERO_KEY)) + return "bad, null key"; + if (!KEY_PTRS(k)) + return "bad, no pointers"; + if (!KEY_SIZE(k)) + return "zeroed key"; + return ""; +} + +void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) +{ + unsigned i = 0; + char *out = buf, *end = buf + size; + +#define p(...) 
(out += scnprintf(out, end - out, __VA_ARGS__)) + + p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); + + for (i = 0; i < KEY_PTRS(k); i++) { + if (i) + p(", "); + + if (PTR_DEV(k, i) == PTR_CHECK_DEV) + p("check dev"); + else + p("%llu:%llu gen %llu", PTR_DEV(k, i), + PTR_OFFSET(k, i), PTR_GEN(k, i)); + } + + p("]"); + + if (KEY_DIRTY(k)) + p(" dirty"); + if (KEY_CSUM(k)) + p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); +#undef p +} + +static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) +{ + struct btree *b = container_of(keys, struct btree, keys); + unsigned j; + char buf[80]; + + bch_extent_to_text(buf, sizeof(buf), k); + printk(" %s", buf); + + for (j = 0; j < KEY_PTRS(k); j++) { + size_t n = PTR_BUCKET_NR(b->c, k, j); + printk(" bucket %zu", n); + + if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) + printk(" prio %i", + PTR_BUCKET(b->c, k, j)->prio); + } + + printk(" %s\n", bch_ptr_status(b->c, k)); +} + +/* Btree ptrs */ + +bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; +bad: + bch_extent_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + return __bch_btree_ptr_invalid(b->c, k); +} + +static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) +{ + unsigned i; + char buf[80]; + struct bucket *g; + + if (mutex_trylock(&b->c->bucket_lock)) { + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(b->c, k, i)) { + g = PTR_BUCKET(b->c, k, i); + + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto err; + } + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_extent_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu", + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g)); + return true; +} + +static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned i; + + if (!bkey_cmp(k, &ZERO_KEY) || + !KEY_PTRS(k) || + bch_ptr_invalid(bk, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i) || + ptr_stale(b->c, k, i)) + return true; + + if (expensive_debug_checks(b->c) && + btree_ptr_bad_expensive(b, k)) + return true; + + return false; +} + +static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct btree *b = container_of(bk, struct btree, keys); + + if (!KEY_OFFSET(insert)) + btree_current_write(b)->prio_blocked++; + + return false; +} + +const struct btree_keys_ops bch_btree_keys_ops = { + .sort_cmp = bch_key_sort_cmp, + .insert_fixup = bch_btree_ptr_insert_fixup, + .key_invalid = bch_btree_ptr_invalid, + .key_bad = bch_btree_ptr_bad, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, +}; + +/* Extents */ + +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. 
+ * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ +static bool bch_extent_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); + + return c ? c > 0 : l.k < r.k; +} + +static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, + struct bkey *tmp) +{ + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; + + if (iter->used > 2 && + bch_extent_sort_cmp(i[0], i[1])) + i++; + + if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) + break; + + if (!KEY_SIZE(i->k)) { + sort_key_next(iter, i); + heap_sift(iter, i - top, bch_extent_sort_cmp); + continue; + } + + if (top->k > i->k) { + if (bkey_cmp(top->k, i->k) >= 0) + sort_key_next(iter, i); + else + bch_cut_front(top->k, i->k); + + heap_sift(iter, i - top, bch_extent_sort_cmp); + } else { + /* can't happen because of comparison func */ + BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); + + if (bkey_cmp(i->k, top->k) < 0) { + bkey_copy(tmp, top->k); + + bch_cut_back(&START_KEY(i->k), tmp); + bch_cut_front(i->k, top->k); + heap_sift(iter, 0, bch_extent_sort_cmp); + + return tmp; + } else { + bch_cut_back(&START_KEY(i->k), top->k); + } + } + } + + return NULL; +} + +static void bch_subtract_dirty(struct bkey *k, + struct cache_set *c, + uint64_t offset, + int sectors) +{ + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(k), + offset, -sectors); +} + +static bool bch_extent_insert_fixup(struct btree_keys *b, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key) +{ + struct cache_set *c = container_of(b, struct btree, keys)->c; + + uint64_t old_offset; + unsigned old_size, sectors_found = 0; + + BUG_ON(!KEY_OFFSET(insert)); + BUG_ON(!KEY_SIZE(insert)); + + while (1) { + struct bkey *k = bch_btree_iter_next(iter); + if (!k) + break; + + if (bkey_cmp(&START_KEY(k), insert) >= 0) { + if (KEY_SIZE(k)) + break; + else + continue; + } + + if (bkey_cmp(k, &START_KEY(insert)) <= 0) + continue; + + old_offset = KEY_START(k); + old_size = KEY_SIZE(k); + + /* + * We might overlap with 0 size extents; we can't skip these + * because if they're in the set we're inserting to we have to + * adjust them so they don't overlap with the key we're + * inserting. But we don't want to check them for replace + * operations. + */ + + if (replace_key && KEY_SIZE(k)) { + /* + * k might have been split since we inserted/found the + * key we're replacing + */ + unsigned i; + uint64_t offset = KEY_START(k) - + KEY_START(replace_key); + + /* But it must be a subset of the replace key */ + if (KEY_START(k) < KEY_START(replace_key) || + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) + goto check_failed; + + /* We didn't find a key that we were supposed to */ + if (KEY_START(k) > KEY_START(insert) + sectors_found) + goto check_failed; + + if (!bch_bkey_equal_header(k, replace_key)) + goto check_failed; + + /* skip past gen */ + offset <<= 8; + + BUG_ON(!KEY_PTRS(replace_key)); + + for (i = 0; i < KEY_PTRS(replace_key); i++) + if (k->ptr[i] != replace_key->ptr[i] + offset) + goto check_failed; + + sectors_found = KEY_OFFSET(k) - KEY_START(insert); + } + + if (bkey_cmp(insert, k) < 0 && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { + /* + * We overlapped in the middle of an existing key: that + * means we have to split the old key. 
But we have to do + * slightly different things depending on whether the + * old key has been written out yet. + */ + + struct bkey *top; + + bch_subtract_dirty(k, c, KEY_START(insert), + KEY_SIZE(insert)); + + if (bkey_written(b, k)) { + /* + * We insert a new key to cover the top of the + * old key, and the old key is modified in place + * to represent the bottom split. + * + * It's completely arbitrary whether the new key + * is the top or the bottom, but it has to match + * up with what btree_sort_fixup() does - it + * doesn't check for this kind of overlap, it + * depends on us inserting a new key for the top + * here. + */ + top = bch_bset_search(b, bset_tree_last(b), + insert); + bch_bset_insert(b, top, k); + } else { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, k); + bch_bset_insert(b, k, &temp.key); + top = bkey_next(k); + } + + bch_cut_front(insert, top); + bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + goto out; + } + + if (bkey_cmp(insert, k) < 0) { + bch_cut_front(insert, k); + } else { + if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) + old_offset = KEY_START(insert); + + if (bkey_written(b, k) && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { + /* + * Completely overwrote, so we don't have to + * invalidate the binary search tree + */ + bch_cut_front(k, k); + } else { + __bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + } + } + + bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k)); + } + +check_failed: + if (replace_key) { + if (!sectors_found) { + return true; + } else if (sectors_found < KEY_SIZE(insert)) { + SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - + (KEY_SIZE(insert) - sectors_found)); + SET_KEY_SIZE(insert, sectors_found); + } + } +out: + if (KEY_DIRTY(insert)) + bcache_dev_sectors_dirty_add(c, KEY_INODE(insert), + KEY_START(insert), + KEY_SIZE(insert)); + + return false; +} + +static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + char buf[80]; + + if (!KEY_SIZE(k)) + return true; + + if (KEY_SIZE(k) > KEY_OFFSET(k)) + goto bad; + + if (__ptr_invalid(b->c, k)) + goto bad; + + return false; +bad: + bch_extent_to_text(buf, sizeof(buf), k); + cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); + return true; +} + +static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, + unsigned ptr) +{ + struct bucket *g = PTR_BUCKET(b->c, k, ptr); + char buf[80]; + + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->c->gc_mark_valid && + (!GC_MARK(g) || + GC_MARK(g) == GC_MARK_METADATA || + (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k)))) + goto err; + + if (g->prio == BTREE_PRIO) + goto err; + + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_extent_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu", + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g)); + return true; +} + +static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + struct bucket *g; + unsigned i, stale; + + if (!KEY_PTRS(k) || + bch_extent_invalid(bk, k)) + return true; + + for (i = 0; i < KEY_PTRS(k); i++) + if (!ptr_available(b->c, k, i)) + return true; + + if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) + return false; + + for (i = 0; i < 
KEY_PTRS(k); i++) { + g = PTR_BUCKET(b->c, k, i); + stale = ptr_stale(b->c, k, i); + + btree_bug_on(stale > 96, b, + "key too stale: %i, need_gc %u", + stale, b->c->need_gc); + + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), + b, "stale dirty pointer"); + + if (stale) + return true; + + if (expensive_debug_checks(b->c) && + bch_extent_bad_expensive(b, k, i)) + return true; + } + + return false; +} + +static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +{ + return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & + ~((uint64_t)1 << 63); +} + +static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) +{ + struct btree *b = container_of(bk, struct btree, keys); + unsigned i; + + if (key_merging_disabled(b->c)) + return false; + + for (i = 0; i < KEY_PTRS(l); i++) + if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || + PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) + return false; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { + SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); + SET_KEY_SIZE(l, USHRT_MAX); + + bch_cut_front(l, r); + return false; + } + + if (KEY_CSUM(l)) { + if (KEY_CSUM(r)) + l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); + else + SET_KEY_CSUM(l, 0); + } + + SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); + SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + + return true; +} + +const struct btree_keys_ops bch_extent_keys_ops = { + .sort_cmp = bch_extent_sort_cmp, + .sort_fixup = bch_extent_sort_fixup, + .insert_fixup = bch_extent_insert_fixup, + .key_invalid = bch_extent_invalid, + .key_bad = bch_extent_bad, + .key_merge = bch_extent_merge, + .key_to_text = bch_extent_to_text, + .key_dump = bch_bkey_dump, + .is_extents = true, +}; diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h new file mode 100644 index 00000000000..e4e23409782 --- /dev/null +++ b/drivers/md/bcache/extents.h @@ -0,0 +1,13 @@ +#ifndef _BCACHE_EXTENTS_H +#define _BCACHE_EXTENTS_H + +extern const struct btree_keys_ops bch_btree_keys_ops; +extern const struct btree_keys_ops bch_extent_keys_ops; + +struct bkey; +struct cache_set; + +void bch_extent_to_text(char *, size_t, const struct bkey *); +bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); + +#endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c new file mode 100644 index 00000000000..fa028fa82df --- /dev/null +++ b/drivers/md/bcache/io.c @@ -0,0 +1,243 @@ +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcache.h" +#include "bset.h" +#include "debug.h" + +#include <linux/blkdev.h> + +static unsigned bch_bio_max_sectors(struct bio *bio) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct bio_vec bv; + struct bvec_iter iter; + unsigned ret = 0, seg = 0; + + if (bio->bi_rw & REQ_DISCARD) + return min(bio_sectors(bio), q->limits.max_discard_sectors); + + bio_for_each_segment(bv, bio, iter) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_iter.bi_sector, + .bi_size = ret << 9, + .bi_rw = bio->bi_rw, + }; + + if (seg == min_t(unsigned, BIO_MAX_PAGES, + queue_max_segments(q))) + break; + + if (q->merge_bvec_fn && + q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len) + break; + + seg++; + ret += bv.bv_len >> 9; + } + + ret = min(ret, queue_max_sectors(q)); + + WARN_ON(!ret); + ret = max_t(int, ret, bio_iovec(bio).bv_len >> 9); + + return ret; +} + +static void bch_bio_submit_split_done(struct closure *cl) +{ + struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); + + s->bio->bi_end_io = s->bi_end_io; + s->bio->bi_private = s->bi_private; + bio_endio_nodec(s->bio, 0); + + closure_debug_destroy(&s->cl); + mempool_free(s, s->p->bio_split_hook); +} + +static void bch_bio_submit_split_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); + + if (error) + clear_bit(BIO_UPTODATE, &s->bio->bi_flags); + + bio_put(bio); + closure_put(cl); +} + +void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) +{ + struct bio_split_hook *s; + struct bio *n; + + if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) + goto submit; + + if (bio_sectors(bio) <= bch_bio_max_sectors(bio)) + goto submit; + + s = mempool_alloc(p->bio_split_hook, GFP_NOIO); + closure_init(&s->cl, NULL); + + s->bio = bio; + s->p = p; + s->bi_end_io = bio->bi_end_io; + s->bi_private = bio->bi_private; + bio_get(bio); + + do { + n = bio_next_split(bio, bch_bio_max_sectors(bio), + GFP_NOIO, s->p->bio_split); + + n->bi_end_io = bch_bio_submit_split_endio; + n->bi_private = &s->cl; + + closure_get(&s->cl); + generic_make_request(n); + } while (n != bio); + + continue_at(&s->cl, bch_bio_submit_split_done, NULL); +submit: + generic_make_request(bio); +} + +/* Bios with headers */ + +void bch_bbio_free(struct bio *bio, struct cache_set *c) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + mempool_free(b, c->bio_meta); +} + +struct bio *bch_bbio_alloc(struct cache_set *c) +{ + struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); + struct bio *bio = &b->bio; + + bio_init(bio); + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = bucket_pages(c); + bio->bi_io_vec = bio->bi_inline_vecs; + + return bio; +} + +void __bch_submit_bbio(struct bio *bio, struct cache_set *c) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + + bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; + + b->submit_time_us = local_clock_us(); + closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); +} + +void bch_submit_bbio(struct bio *bio, struct cache_set *c, + struct bkey *k, unsigned ptr) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + bch_bkey_copy_single_ptr(&b->key, k, ptr); + __bch_submit_bbio(bio, c); +} + +/* IO errors */ + +void bch_count_io_errors(struct cache *ca, int error, const char *m) +{ + /* + * The halflife of an error is: + * log2(1/2)/log2(127/128) * 
refresh ~= 88 * refresh + */ + + if (ca->set->error_decay) { + unsigned count = atomic_inc_return(&ca->io_count); + + while (count > ca->set->error_decay) { + unsigned errors; + unsigned old = count; + unsigned new = count - ca->set->error_decay; + + /* + * First we subtract refresh from count; each time we + * succesfully do so, we rescale the errors once: + */ + + count = atomic_cmpxchg(&ca->io_count, old, new); + + if (count == old) { + count = new; + + errors = atomic_read(&ca->io_errors); + do { + old = errors; + new = ((uint64_t) errors * 127) / 128; + errors = atomic_cmpxchg(&ca->io_errors, + old, new); + } while (old != errors); + } + } + } + + if (error) { + char buf[BDEVNAME_SIZE]; + unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, + &ca->io_errors); + errors >>= IO_ERROR_SHIFT; + + if (errors < ca->set->error_limit) + pr_err("%s: IO error on %s, recovering", + bdevname(ca->bdev, buf), m); + else + bch_cache_set_error(ca->set, + "%s: too many IO errors %s", + bdevname(ca->bdev, buf), m); + } +} + +void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, + int error, const char *m) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + struct cache *ca = PTR_CACHE(c, &b->key, 0); + + unsigned threshold = bio->bi_rw & REQ_WRITE + ? c->congested_write_threshold_us + : c->congested_read_threshold_us; + + if (threshold) { + unsigned t = local_clock_us(); + + int us = t - b->submit_time_us; + int congested = atomic_read(&c->congested); + + if (us > (int) threshold) { + int ms = us / 1024; + c->congested_last_us = t; + + ms = min(ms, CONGESTED_MAX + congested); + atomic_sub(ms, &c->congested); + } else if (congested < 0) + atomic_inc(&c->congested); + } + + bch_count_io_errors(ca, error, m); +} + +void bch_bbio_endio(struct cache_set *c, struct bio *bio, + int error, const char *m) +{ + struct closure *cl = bio->bi_private; + + bch_bbio_count_io_errors(c, bio, error, m); + bio_put(bio); + closure_put(cl); +} diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c new file mode 100644 index 00000000000..59e82021b5b --- /dev/null +++ b/drivers/md/bcache/journal.c @@ -0,0 +1,815 @@ +/* + * bcache journalling code, for btree insertions + * + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" + +#include <trace/events/bcache.h> + +/* + * Journal replay/recovery: + * + * This code is all driven from run_cache_set(); we first read the journal + * entries, do some other stuff, then we mark all the keys in the journal + * entries (same as garbage collection would), then we replay them - reinserting + * them into the cache in precisely the same order as they appear in the + * journal. + * + * We only journal keys that go in leaf nodes, which simplifies things quite a + * bit. 
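+ *
+ * (Marking happens before replay: the bucket pins taken by bch_journal_mark()
+ * keep the buckets that journalled keys point into from being reused while
+ * replay is still in progress.)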
+ */ + +static void journal_read_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + closure_put(cl); +} + +static int journal_read_bucket(struct cache *ca, struct list_head *list, + unsigned bucket_index) +{ + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->bio; + + struct journal_replay *i; + struct jset *j, *data = ca->set->journal.w[0].data; + struct closure cl; + unsigned len, left, offset = 0; + int ret = 0; + sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + closure_init_stack(&cl); + + pr_debug("reading %u", bucket_index); + + while (offset < ca->sb.bucket_size) { +reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); + + bio_reset(bio); + bio->bi_iter.bi_sector = bucket + offset; + bio->bi_bdev = ca->bdev; + bio->bi_rw = READ; + bio->bi_iter.bi_size = len << 9; + + bio->bi_end_io = journal_read_endio; + bio->bi_private = &cl; + bch_bio_map(bio, data); + + closure_bio_submit(bio, &cl, ca); + closure_sync(&cl); + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. + */ + + j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); + + if (j->magic != jset_magic(&ca->sb)) { + pr_debug("%u: bad magic", bucket_index); + return ret; + } + + if (bytes > left << 9 || + bytes > PAGE_SIZE << JSET_BITS) { + pr_info("%u: too big, %zu bytes, offset %u", + bucket_index, bytes, offset); + return ret; + } + + if (bytes > len << 9) + goto reread; + + if (j->csum != csum_set(j)) { + pr_info("%u: bad csum, %zu bytes, offset %u", + bucket_index, bytes, offset); + return ret; + } + + blocks = set_blocks(j, block_bytes(ca->set)); + + while (!list_empty(list)) { + i = list_first_entry(list, + struct journal_replay, list); + if (i->j.seq >= j->last_seq) + break; + list_del(&i->list); + kfree(i); + } + + list_for_each_entry_reverse(i, list, list) { + if (j->seq == i->j.seq) + goto next_set; + + if (j->seq < i->j.last_seq) + goto next_set; + + if (j->seq > i->j.seq) { + where = &i->list; + goto add; + } + } + + where = list; +add: + i = kmalloc(offsetof(struct journal_replay, j) + + bytes, GFP_KERNEL); + if (!i) + return -ENOMEM; + memcpy(&i->j, j, bytes); + list_add(&i->list, where); + ret = 1; + + ja->seq[bucket_index] = j->seq; +next_set: + offset += blocks * ca->sb.block_size; + len -= blocks * ca->sb.block_size; + j = ((void *) j) + blocks * block_bytes(ca); + } + } + + return ret; +} + +int bch_journal_read(struct cache_set *c, struct list_head *list) +{ +#define read_bucket(b) \ + ({ \ + int ret = journal_read_bucket(ca, list, b); \ + __set_bit(b, bitmap); \ + if (ret < 0) \ + return ret; \ + ret; \ + }) + + struct cache *ca; + unsigned iter; + + for_each_cache(ca, c, iter) { + struct journal_device *ja = &ca->journal; + unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG]; + unsigned i, l, r, m; + uint64_t seq; + + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets", ca->sb.njournal_buckets); + + /* + * Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ca->sb.njournal_buckets; i++) { + l = (i * 2654435769U) % ca->sb.njournal_buckets; + + if (test_bit(l, bitmap)) + break; + + if (read_bucket(l)) + goto bsearch; + } + + /* + * If that fails, check all the 
buckets we haven't checked + * already + */ + pr_debug("falling back to linear search"); + + for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); + l < ca->sb.njournal_buckets; + l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1)) + if (read_bucket(l)) + goto bsearch; + + if (list_empty(list)) + continue; +bsearch: + /* Binary search */ + m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); + pr_debug("starting binary search, l %u r %u", l, r); + + while (l + 1 < r) { + seq = list_entry(list->prev, struct journal_replay, + list)->j.seq; + + m = (l + r) >> 1; + read_bucket(m); + + if (seq != list_entry(list->prev, struct journal_replay, + list)->j.seq) + l = m; + else + r = m; + } + + /* + * Read buckets in reverse order until we stop finding more + * journal entries + */ + pr_debug("finishing up: m %u njournal_buckets %u", + m, ca->sb.njournal_buckets); + l = m; + + while (1) { + if (!l--) + l = ca->sb.njournal_buckets - 1; + + if (l == m) + break; + + if (test_bit(l, bitmap)) + continue; + + if (!read_bucket(l)) + break; + } + + seq = 0; + + for (i = 0; i < ca->sb.njournal_buckets; i++) + if (ja->seq[i] > seq) { + seq = ja->seq[i]; + /* + * When journal_reclaim() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + ja->last_idx = ja->discard_idx = (i + 1) % + ca->sb.njournal_buckets; + + } + } + + if (!list_empty(list)) + c->journal.seq = list_entry(list->prev, + struct journal_replay, + list)->j.seq; + + return 0; +#undef read_bucket +} + +void bch_journal_mark(struct cache_set *c, struct list_head *list) +{ + atomic_t p = { 0 }; + struct bkey *k; + struct journal_replay *i; + struct journal *j = &c->journal; + uint64_t last = j->seq; + + /* + * journal.pin should never fill up - we never write a journal + * entry when it would fill up. But if for some reason it does, we + * iterate over the list in reverse order so that we can just skip that + * refcount instead of bugging. + */ + + list_for_each_entry_reverse(i, list, list) { + BUG_ON(last < i->j.seq); + i->pin = NULL; + + while (last-- != i->j.seq) + if (fifo_free(&j->pin) > 1) { + fifo_push_front(&j->pin, p); + atomic_set(&fifo_front(&j->pin), 0); + } + + if (fifo_free(&j->pin) > 1) { + fifo_push_front(&j->pin, p); + i->pin = &fifo_front(&j->pin); + atomic_set(i->pin, 1); + } + + for (k = i->j.start; + k < bset_bkey_last(&i->j); + k = bkey_next(k)) { + unsigned j; + + for (j = 0; j < KEY_PTRS(k); j++) + if (ptr_available(c, k, j)) + atomic_inc(&PTR_BUCKET(c, k, j)->pin); + + bch_initial_mark_key(c, 0, k); + } + } +} + +int bch_journal_replay(struct cache_set *s, struct list_head *list) +{ + int ret = 0, keys = 0, entries = 0; + struct bkey *k; + struct journal_replay *i = + list_entry(list->prev, struct journal_replay, list); + + uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + struct keylist keylist; + + list_for_each_entry(i, list, list) { + BUG_ON(i->pin && atomic_read(i->pin) != 1); + + cache_set_err_on(n != i->j.seq, s, +"bcache: journal entries %llu-%llu missing! 
(replaying %llu-%llu)", + n, i->j.seq - 1, start, end); + + for (k = i->j.start; + k < bset_bkey_last(&i->j); + k = bkey_next(k)) { + trace_bcache_journal_replay_key(k); + + bch_keylist_init_single(&keylist, k); + + ret = bch_btree_insert(s, &keylist, i->pin, NULL); + if (ret) + goto err; + + BUG_ON(!bch_keylist_empty(&keylist)); + keys++; + + cond_resched(); + } + + if (i->pin) + atomic_dec(i->pin); + n = i->j.seq + 1; + entries++; + } + + pr_info("journal replay done, %i keys in %i entries, seq %llu", + keys, entries, end); +err: + while (!list_empty(list)) { + i = list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kfree(i); + } + + return ret; +} + +/* Journalling */ + +static void btree_flush_write(struct cache_set *c) +{ + /* + * Try to find the btree node with that references the oldest journal + * entry, best is our current candidate and is locked if non NULL: + */ + struct btree *b, *best; + unsigned i; +retry: + best = NULL; + + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { + if (!best) + best = b; + else if (journal_pin_cmp(c, + btree_current_write(best)->journal, + btree_current_write(b)->journal)) { + best = b; + } + } + + b = best; + if (b) { + mutex_lock(&b->write_lock); + if (!btree_current_write(b)->journal) { + mutex_unlock(&b->write_lock); + /* We raced */ + goto retry; + } + + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); + } +} + +#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) + +static void journal_discard_endio(struct bio *bio, int error) +{ + struct journal_device *ja = + container_of(bio, struct journal_device, discard_bio); + struct cache *ca = container_of(ja, struct cache, journal); + + atomic_set(&ja->discard_in_flight, DISCARD_DONE); + + closure_wake_up(&ca->set->journal.wait); + closure_put(&ca->set->cl); +} + +static void journal_discard_work(struct work_struct *work) +{ + struct journal_device *ja = + container_of(work, struct journal_device, discard_work); + + submit_bio(0, &ja->discard_bio); +} + +static void do_journal_discard(struct cache *ca) +{ + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->discard_bio; + + if (!ca->discard) { + ja->discard_idx = ja->last_idx; + return; + } + + switch (atomic_read(&ja->discard_in_flight)) { + case DISCARD_IN_FLIGHT: + return; + + case DISCARD_DONE: + ja->discard_idx = (ja->discard_idx + 1) % + ca->sb.njournal_buckets; + + atomic_set(&ja->discard_in_flight, DISCARD_READY); + /* fallthrough */ + + case DISCARD_READY: + if (ja->discard_idx == ja->last_idx) + return; + + atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); + + bio_init(bio); + bio->bi_iter.bi_sector = bucket_to_sector(ca->set, + ca->sb.d[ja->discard_idx]); + bio->bi_bdev = ca->bdev; + bio->bi_rw = REQ_WRITE|REQ_DISCARD; + bio->bi_max_vecs = 1; + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_iter.bi_size = bucket_bytes(ca); + bio->bi_end_io = journal_discard_endio; + + closure_get(&ca->set->cl); + INIT_WORK(&ja->discard_work, journal_discard_work); + schedule_work(&ja->discard_work); + } +} + +static void journal_reclaim(struct cache_set *c) +{ + struct bkey *k = &c->journal.key; + struct cache *ca; + uint64_t last_seq; + unsigned iter, n = 0; + atomic_t p; + + while (!atomic_read(&fifo_front(&c->journal.pin))) + fifo_pop(&c->journal.pin, p); + + last_seq = last_seq(&c->journal); + + /* Update last_idx */ + + for_each_cache(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + while (ja->last_idx != ja->cur_idx && + ja->seq[ja->last_idx] < 
last_seq) + ja->last_idx = (ja->last_idx + 1) % + ca->sb.njournal_buckets; + } + + for_each_cache(ca, c, iter) + do_journal_discard(ca); + + if (c->journal.blocks_free) + goto out; + + /* + * Allocate: + * XXX: Sort by free journal space + */ + + for_each_cache(ca, c, iter) { + struct journal_device *ja = &ca->journal; + unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + + /* No space available on this device */ + if (next == ja->discard_idx) + continue; + + ja->cur_idx = next; + k->ptr[n++] = PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); + } + + bkey_init(k); + SET_KEY_PTRS(k, n); + + if (n) + c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; +out: + if (!journal_full(&c->journal)) + __closure_wake_up(&c->journal.wait); +} + +void bch_journal_next(struct journal *j) +{ + atomic_t p = { 1 }; + + j->cur = (j->cur == j->w) + ? &j->w[1] + : &j->w[0]; + + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for last_seq() to be calculated correctly + */ + BUG_ON(!fifo_push(&j->pin, p)); + atomic_set(&fifo_back(&j->pin), 1); + + j->cur->data->seq = ++j->seq; + j->cur->dirty = false; + j->cur->need_write = false; + j->cur->data->keys = 0; + + if (fifo_full(&j->pin)) + pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); +} + +static void journal_write_endio(struct bio *bio, int error) +{ + struct journal_write *w = bio->bi_private; + + cache_set_err_on(error, w->c, "journal io error"); + closure_put(&w->c->journal.io); +} + +static void journal_write(struct closure *); + +static void journal_write_done(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct journal_write *w = (j->cur == j->w) + ? &j->w[1] + : &j->w[0]; + + __closure_wake_up(&w->wait); + continue_at_nobarrier(cl, journal_write, system_wq); +} + +static void journal_write_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io); + + c->journal.io_in_flight = 0; + spin_unlock(&c->journal.lock); +} + +static void journal_write_unlocked(struct closure *cl) + __releases(c->journal.lock) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io); + struct cache *ca; + struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; + unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * + c->sb.block_size; + + struct bio *bio; + struct bio_list list; + bio_list_init(&list); + + if (!w->need_write) { + closure_return_with_destructor(cl, journal_write_unlock); + } else if (journal_full(&c->journal)) { + journal_reclaim(c); + spin_unlock(&c->journal.lock); + + btree_flush_write(c); + continue_at(cl, journal_write, system_wq); + } + + c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); + + w->data->btree_level = c->root->level; + + bkey_copy(&w->data->btree_root, &c->root->key); + bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); + + for_each_cache(ca, c, i) + w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; + + w->data->magic = jset_magic(&c->sb); + w->data->version = BCACHE_JSET_VERSION; + w->data->last_seq = last_seq(&c->journal); + w->data->csum = csum_set(w->data); + + for (i = 0; i < KEY_PTRS(k); i++) { + ca = PTR_CACHE(c, k, i); + bio = &ca->journal.bio; + + atomic_long_add(sectors, &ca->meta_sectors_written); + + bio_reset(bio); + bio->bi_iter.bi_sector = PTR_OFFSET(k, i); + bio->bi_bdev = ca->bdev; + bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; + bio->bi_iter.bi_size = sectors << 9; + + 
bio->bi_end_io = journal_write_endio; + bio->bi_private = w; + bch_bio_map(bio, w->data); + + trace_bcache_journal_write(bio); + bio_list_add(&list, bio); + + SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } + + atomic_dec_bug(&fifo_back(&c->journal.pin)); + bch_journal_next(&c->journal); + journal_reclaim(c); + + spin_unlock(&c->journal.lock); + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(bio, cl, c->cache[0]); + + continue_at(cl, journal_write_done, NULL); +} + +static void journal_write(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io); + + spin_lock(&c->journal.lock); + journal_write_unlocked(cl); +} + +static void journal_try_write(struct cache_set *c) + __releases(c->journal.lock) +{ + struct closure *cl = &c->journal.io; + struct journal_write *w = c->journal.cur; + + w->need_write = true; + + if (!c->journal.io_in_flight) { + c->journal.io_in_flight = 1; + closure_call(cl, journal_write_unlocked, NULL, &c->cl); + } else { + spin_unlock(&c->journal.lock); + } +} + +static struct journal_write *journal_wait_for_write(struct cache_set *c, + unsigned nkeys) +{ + size_t sectors; + struct closure cl; + bool wait = false; + + closure_init_stack(&cl); + + spin_lock(&c->journal.lock); + + while (1) { + struct journal_write *w = c->journal.cur; + + sectors = __set_blocks(w->data, w->data->keys + nkeys, + block_bytes(c)) * c->sb.block_size; + + if (sectors <= min_t(size_t, + c->journal.blocks_free * c->sb.block_size, + PAGE_SECTORS << JSET_BITS)) + return w; + + if (wait) + closure_wait(&c->journal.wait, &cl); + + if (!journal_full(&c->journal)) { + if (wait) + trace_bcache_journal_entry_full(c); + + /* + * XXX: If we were inserting so many keys that they + * won't fit in an _empty_ journal write, we'll + * deadlock. For now, handle this in + * bch_keylist_realloc() - but something to think about. 
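+ *
+ * (bch_keylist_realloc() in request.c refuses to grow a keylist past
+ * what fits in a single journal block, which is what keeps us out of
+ * that situation in practice.)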
+ */ + BUG_ON(!w->data->keys); + + journal_try_write(c); /* unlocks */ + } else { + if (wait) + trace_bcache_journal_full(c); + + journal_reclaim(c); + spin_unlock(&c->journal.lock); + + btree_flush_write(c); + } + + closure_sync(&cl); + spin_lock(&c->journal.lock); + wait = true; + } +} + +static void journal_write_work(struct work_struct *work) +{ + struct cache_set *c = container_of(to_delayed_work(work), + struct cache_set, + journal.work); + spin_lock(&c->journal.lock); + if (c->journal.cur->dirty) + journal_try_write(c); + else + spin_unlock(&c->journal.lock); +} + +/* + * Entry point to the journalling code - bio_insert() and btree_invalidate() + * pass bch_journal() a list of keys to be journalled, and then + * bch_journal() hands those same keys off to btree_insert_async() + */ + +atomic_t *bch_journal(struct cache_set *c, + struct keylist *keys, + struct closure *parent) +{ + struct journal_write *w; + atomic_t *ret; + + if (!CACHE_SYNC(&c->sb)) + return NULL; + + w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); + + memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys)); + w->data->keys += bch_keylist_nkeys(keys); + + ret = &fifo_back(&c->journal.pin); + atomic_inc(ret); + + if (parent) { + closure_wait(&w->wait, parent); + journal_try_write(c); + } else if (!w->dirty) { + w->dirty = true; + schedule_delayed_work(&c->journal.work, + msecs_to_jiffies(c->journal_delay_ms)); + spin_unlock(&c->journal.lock); + } else { + spin_unlock(&c->journal.lock); + } + + + return ret; +} + +void bch_journal_meta(struct cache_set *c, struct closure *cl) +{ + struct keylist keys; + atomic_t *ref; + + bch_keylist_init(&keys); + + ref = bch_journal(c, &keys, cl); + if (ref) + atomic_dec_bug(ref); +} + +void bch_journal_free(struct cache_set *c) +{ + free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); + free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); + free_fifo(&c->journal.pin); +} + +int bch_journal_alloc(struct cache_set *c) +{ + struct journal *j = &c->journal; + + spin_lock_init(&j->lock); + INIT_DELAYED_WORK(&j->work, journal_write_work); + + c->journal_delay_ms = 100; + + j->w[0].c = c; + j->w[1].c = c; + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h new file mode 100644 index 00000000000..e3c39457afb --- /dev/null +++ b/drivers/md/bcache/journal.h @@ -0,0 +1,179 @@ +#ifndef _BCACHE_JOURNAL_H +#define _BCACHE_JOURNAL_H + +/* + * THE JOURNAL: + * + * The journal is treated as a circular buffer of buckets - a journal entry + * never spans two buckets. This means (not implemented yet) we can resize the + * journal at runtime, and will be needed for bcache on raw flash support. + * + * Journal entries contain a list of keys, ordered by the time they were + * inserted; thus journal replay just has to reinsert the keys. + * + * We also keep some things in the journal header that are logically part of the + * superblock - all the things that are frequently updated. This is for future + * bcache on raw flash support; the superblock (which will become another + * journal) can't be moved or wear leveled, so it contains just enough + * information to find the main journal, and the superblock only has to be + * rewritten when we want to move/wear level the main journal. 
+ * + * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be + * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions + * from cache misses, which don't have to be journaled, and for writeback and + * moving gc we work around it by flushing the btree to disk before updating the + * gc information. But it is a potential issue with incremental garbage + * collection, and it's fragile. + * + * OPEN JOURNAL ENTRIES: + * + * Each journal entry contains, in the header, the sequence number of the last + * journal entry still open - i.e. that has keys that haven't been flushed to + * disk in the btree. + * + * We track this by maintaining a refcount for every open journal entry, in a + * fifo; each entry in the fifo corresponds to a particular journal + * entry/sequence number. When the refcount at the tail of the fifo goes to + * zero, we pop it off - thus, the size of the fifo tells us the number of open + * journal entries + * + * We take a refcount on a journal entry when we add some keys to a journal + * entry that we're going to insert (held by struct btree_op), and then when we + * insert those keys into the btree the btree write we're setting up takes a + * copy of that refcount (held by struct btree_write). That refcount is dropped + * when the btree write completes. + * + * A struct btree_write can only hold a refcount on a single journal entry, but + * might contain keys for many journal entries - we handle this by making sure + * it always has a refcount on the _oldest_ journal entry of all the journal + * entries it has keys for. + * + * JOURNAL RECLAIM: + * + * As mentioned previously, our fifo of refcounts tells us the number of open + * journal entries; from that and the current journal sequence number we compute + * last_seq - the oldest journal entry we still need. We write last_seq in each + * journal entry, and we also have to keep track of where it exists on disk so + * we don't overwrite it when we loop around the journal. + * + * To do that we track, for each journal bucket, the sequence number of the + * newest journal entry it contains - if we don't need that journal entry we + * don't need anything in that bucket anymore. From that we track the last + * journal bucket we still need; all this is tracked in struct journal_device + * and updated by journal_reclaim(). + * + * JOURNAL FILLING UP: + * + * There are two ways the journal could fill up; either we could run out of + * space to write to, or we could have too many open journal entries and run out + * of room in the fifo of refcounts. Since those refcounts are decremented + * without any locking we can't safely resize that fifo, so we handle it the + * same way. + * + * If the journal fills up, we start flushing dirty btree nodes until we can + * allocate space for a journal write again - preferentially flushing btree + * nodes that are pinning the oldest journal entries first. + */ + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + struct list_head list; + atomic_t *pin; + struct jset j; +}; + +/* + * We put two of these in struct journal; we used them for writes to the + * journal that are being staged or in flight. 
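+ *
+ * They are used double-buffered: journal.cur points at the one currently
+ * being filled, and bch_journal_next() flips to the other when a write is
+ * issued.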
+ */ +struct journal_write { + struct jset *data; +#define JSET_BITS 3 + + struct cache_set *c; + struct closure_waitlist wait; + bool dirty; + bool need_write; +}; + +/* Embedded in struct cache_set */ +struct journal { + spinlock_t lock; + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure io; + int io_in_flight; + struct delayed_work work; + + /* Number of blocks free in the bucket(s) we're currently writing to */ + unsigned blocks_free; + uint64_t seq; + DECLARE_FIFO(atomic_t, pin); + + BKEY_PADDED(key); + + struct journal_write w[2], *cur; +}; + +/* + * Embedded in struct cache. First three fields refer to the array of journal + * buckets, in cache_sb. + */ +struct journal_device { + /* + * For each journal bucket, contains the max sequence number of the + * journal writes it contains - so we know when a bucket can be reused. + */ + uint64_t seq[SB_JOURNAL_BUCKETS]; + + /* Journal bucket we're currently writing to */ + unsigned cur_idx; + + /* Last journal bucket that still contains an open journal entry */ + unsigned last_idx; + + /* Next journal bucket to be discarded */ + unsigned discard_idx; + +#define DISCARD_READY 0 +#define DISCARD_IN_FLIGHT 1 +#define DISCARD_DONE 2 + /* 1 - discard in flight, -1 - discard completed */ + atomic_t discard_in_flight; + + struct work_struct discard_work; + struct bio discard_bio; + struct bio_vec discard_bv; + + /* Bio for journal reads/writes to this device */ + struct bio bio; + struct bio_vec bv[8]; +}; + +#define journal_pin_cmp(c, l, r) \ + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) + +#define JOURNAL_PIN 20000 + +#define journal_full(j) \ + (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) + +struct closure; +struct cache_set; +struct btree_op; +struct keylist; + +atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); +void bch_journal_next(struct journal *); +void bch_journal_mark(struct cache_set *, struct list_head *); +void bch_journal_meta(struct cache_set *, struct closure *); +int bch_journal_read(struct cache_set *, struct list_head *); +int bch_journal_replay(struct cache_set *, struct list_head *); + +void bch_journal_free(struct cache_set *); +int bch_journal_alloc(struct cache_set *); + +#endif /* _BCACHE_JOURNAL_H */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c new file mode 100644 index 00000000000..cd7490311e5 --- /dev/null +++ b/drivers/md/bcache/movinggc.c @@ -0,0 +1,256 @@ +/* + * Moving/copying garbage collector + * + * Copyright 2012 Google, Inc. 
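+ *
+ * Copies live data out of mostly-empty buckets (picked by bch_moving_gc()
+ * below) and rewrites it through bch_data_insert(), so that those buckets
+ * can be invalidated and reused.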
+ */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +#include <trace/events/bcache.h> + +struct moving_io { + struct closure cl; + struct keybuf_key *w; + struct data_insert_op op; + struct bbio bio; +}; + +static bool moving_pred(struct keybuf *buf, struct bkey *k) +{ + struct cache_set *c = container_of(buf, struct cache_set, + moving_gc_keys); + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i) && + GC_MOVE(PTR_BUCKET(c, k, i))) + return true; + + return false; +} + +/* Moving GC - IO loop */ + +static void moving_io_destructor(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + kfree(io); +} + +static void write_moving_finish(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct bio *bio = &io->bio.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) + __free_page(bv->bv_page); + + if (io->op.replace_collision) + trace_bcache_gc_copy_collision(&io->w->key); + + bch_keybuf_del(&io->op.c->moving_gc_keys, io->w); + + up(&io->op.c->moving_in_flight); + + closure_return_with_destructor(cl, moving_io_destructor); +} + +static void read_moving_endio(struct bio *bio, int error) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + struct moving_io *io = container_of(bio->bi_private, + struct moving_io, cl); + + if (error) + io->op.error = error; + else if (!KEY_DIRTY(&b->key) && + ptr_stale(io->op.c, &b->key, 0)) { + io->op.error = -EINTR; + } + + bch_bbio_endio(io->op.c, bio, error, "reading data to move"); +} + +static void moving_init(struct moving_io *io) +{ + struct bio *bio = &io->bio.bio; + + bio_init(bio); + bio_get(bio); + bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; + bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), + PAGE_SECTORS); + bio->bi_private = &io->cl; + bio->bi_io_vec = bio->bi_inline_vecs; + bch_bio_map(bio, NULL); +} + +static void write_moving(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct data_insert_op *op = &io->op; + + if (!op->error) { + moving_init(io); + + io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key); + op->write_prio = 1; + op->bio = &io->bio.bio; + + op->writeback = KEY_DIRTY(&io->w->key); + op->csum = KEY_CSUM(&io->w->key); + + bkey_copy(&op->replace_key, &io->w->key); + op->replace = true; + + closure_call(&op->cl, bch_data_insert, NULL, cl); + } + + continue_at(cl, write_moving_finish, op->wq); +} + +static void read_moving_submit(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct bio *bio = &io->bio.bio; + + bch_submit_bbio(bio, io->op.c, &io->w->key, 0); + + continue_at(cl, write_moving, io->op.wq); +} + +static void read_moving(struct cache_set *c) +{ + struct keybuf_key *w; + struct moving_io *io; + struct bio *bio; + struct closure cl; + + closure_init_stack(&cl); + + /* XXX: if we error, background writeback could stall indefinitely */ + + while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { + w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, + &MAX_KEY, moving_pred); + if (!w) + break; + + if (ptr_stale(c, &w->key, 0)) { + bch_keybuf_del(&c->moving_gc_keys, w); + continue; + } + + io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) + * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; + io->w = w; + io->op.inode = 
KEY_INODE(&w->key); + io->op.c = c; + io->op.wq = c->moving_gc_wq; + + moving_init(io); + bio = &io->bio.bio; + + bio->bi_rw = READ; + bio->bi_end_io = read_moving_endio; + + if (bio_alloc_pages(bio, GFP_KERNEL)) + goto err; + + trace_bcache_gc_copy(&w->key); + + down(&c->moving_in_flight); + closure_call(&io->cl, read_moving_submit, NULL, &cl); + } + + if (0) { +err: if (!IS_ERR_OR_NULL(w->private)) + kfree(w->private); + + bch_keybuf_del(&c->moving_gc_keys, w); + } + + closure_sync(&cl); +} + +static bool bucket_cmp(struct bucket *l, struct bucket *r) +{ + return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); +} + +static unsigned bucket_heap_top(struct cache *ca) +{ + struct bucket *b; + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; +} + +void bch_moving_gc(struct cache_set *c) +{ + struct cache *ca; + struct bucket *b; + unsigned i; + + if (!c->copy_gc_enabled) + return; + + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) { + unsigned sectors_to_move = 0; + unsigned reserve_sectors = ca->sb.bucket_size * + fifo_used(&ca->free[RESERVE_MOVINGGC]); + + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (GC_MARK(b) == GC_MARK_METADATA || + !GC_SECTORS_USED(b) || + GC_SECTORS_USED(b) == ca->sb.bucket_size || + atomic_read(&b->pin)) + continue; + + if (!heap_full(&ca->heap)) { + sectors_to_move += GC_SECTORS_USED(b); + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { + sectors_to_move -= bucket_heap_top(ca); + sectors_to_move += GC_SECTORS_USED(b); + + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_cmp); + } + } + + while (sectors_to_move > reserve_sectors) { + heap_pop(&ca->heap, b, bucket_cmp); + sectors_to_move -= GC_SECTORS_USED(b); + } + + while (heap_pop(&ca->heap, b, bucket_cmp)) + SET_GC_MOVE(b, 1); + } + + mutex_unlock(&c->bucket_lock); + + c->moving_gc_keys.last_scanned = ZERO_KEY; + + read_moving(c); +} + +void bch_moving_init_cache_set(struct cache_set *c) +{ + bch_keybuf_init(&c->moving_gc_keys); + sema_init(&c->moving_in_flight, 64); +} diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c new file mode 100644 index 00000000000..15fff4f68a7 --- /dev/null +++ b/drivers/md/bcache/request.c @@ -0,0 +1,1159 @@ +/* + * Main bcache entry point - handle a read or a write request and decide what to + * do with it; the make_request functions are called by the block layer. + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. 
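+ *
+ * cached_dev_make_request() and flash_dev_make_request() below are the two
+ * entry points, for cached (backing) devices and flash only volumes
+ * respectively.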
+ */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" +#include "writeback.h" + +#include <linux/module.h> +#include <linux/hash.h> +#include <linux/random.h> + +#include <trace/events/bcache.h> + +#define CUTOFF_CACHE_ADD 95 +#define CUTOFF_CACHE_READA 90 + +struct kmem_cache *bch_search_cache; + +static void bch_data_insert_start(struct closure *); + +static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) +{ + return BDEV_CACHE_MODE(&dc->sb); +} + +static bool verify(struct cached_dev *dc, struct bio *bio) +{ + return dc->verify; +} + +static void bio_csum(struct bio *bio, struct bkey *k) +{ + struct bio_vec bv; + struct bvec_iter iter; + uint64_t csum = 0; + + bio_for_each_segment(bv, bio, iter) { + void *d = kmap(bv.bv_page) + bv.bv_offset; + csum = bch_crc64_update(csum, d, bv.bv_len); + kunmap(bv.bv_page); + } + + k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); +} + +/* Insert data into cache */ + +static void bch_data_insert_keys(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + atomic_t *journal_ref = NULL; + struct bkey *replace_key = op->replace ? &op->replace_key : NULL; + int ret; + + /* + * If we're looping, might already be waiting on + * another journal write - can't wait on more than one journal write at + * a time + * + * XXX: this looks wrong + */ +#if 0 + while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) + closure_sync(&s->cl); +#endif + + if (!op->replace) + journal_ref = bch_journal(op->c, &op->insert_keys, + op->flush_journal ? cl : NULL); + + ret = bch_btree_insert(op->c, &op->insert_keys, + journal_ref, replace_key); + if (ret == -ESRCH) { + op->replace_collision = true; + } else if (ret) { + op->error = -ENOMEM; + op->insert_data_done = true; + } + + if (journal_ref) + atomic_dec_bug(journal_ref); + + if (!op->insert_data_done) + continue_at(cl, bch_data_insert_start, op->wq); + + bch_keylist_free(&op->insert_keys); + closure_return(cl); +} + +static int bch_keylist_realloc(struct keylist *l, unsigned u64s, + struct cache_set *c) +{ + size_t oldsize = bch_keylist_nkeys(l); + size_t newsize = oldsize + u64s; + + /* + * The journalling code doesn't handle the case where the keys to insert + * is bigger than an empty write: If we just return -ENOMEM here, + * bio_insert() and bio_invalidate() will insert the keys created so far + * and finish the rest when the keylist is empty. 
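+ *
+ * Hence the cap below: the keys from a single insert must fit in one
+ * journal block (block_bytes(c) minus the jset header).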
+ */ + if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset)) + return -ENOMEM; + + return __bch_keylist_realloc(l, u64s); +} + +static void bch_data_invalidate(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio; + + pr_debug("invalidating %i sectors from %llu", + bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector); + + while (bio_sectors(bio)) { + unsigned sectors = min(bio_sectors(bio), + 1U << (KEY_SIZE_BITS - 1)); + + if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) + goto out; + + bio->bi_iter.bi_sector += sectors; + bio->bi_iter.bi_size -= sectors << 9; + + bch_keylist_add(&op->insert_keys, + &KEY(op->inode, bio->bi_iter.bi_sector, sectors)); + } + + op->insert_data_done = true; + bio_put(bio); +out: + continue_at(cl, bch_data_insert_keys, op->wq); +} + +static void bch_data_insert_error(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + + /* + * Our data write just errored, which means we've got a bunch of keys to + * insert that point to data that wasn't succesfully written. + * + * We don't have to insert those keys but we still have to invalidate + * that region of the cache - so, if we just strip off all the pointers + * from the keys we'll accomplish just that. + */ + + struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys; + + while (src != op->insert_keys.top) { + struct bkey *n = bkey_next(src); + + SET_KEY_PTRS(src, 0); + memmove(dst, src, bkey_bytes(src)); + + dst = bkey_next(dst); + src = n; + } + + op->insert_keys.top = dst; + + bch_data_insert_keys(cl); +} + +static void bch_data_insert_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + + if (error) { + /* TODO: We could try to recover from this. */ + if (op->writeback) + op->error = error; + else if (!op->replace) + set_closure_fn(cl, bch_data_insert_error, op->wq); + else + set_closure_fn(cl, NULL, NULL); + } + + bch_bbio_endio(op->c, bio, error, "writing data to cache"); +} + +static void bch_data_insert_start(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio, *n; + + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { + set_gc_sectors(op->c); + wake_up_gc(op->c); + } + + if (op->bypass) + return bch_data_invalidate(cl); + + /* + * Journal writes are marked REQ_FLUSH; if the original write was a + * flush, it'll wait on the journal write. + */ + bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA); + + do { + unsigned i; + struct bkey *k; + struct bio_set *split = op->c->bio_split; + + /* 1 for the device pointer and 1 for the chksum */ + if (bch_keylist_realloc(&op->insert_keys, + 3 + (op->csum ? 
1 : 0), + op->c)) + continue_at(cl, bch_data_insert_keys, op->wq); + + k = op->insert_keys.top; + bkey_init(k); + SET_KEY_INODE(k, op->inode); + SET_KEY_OFFSET(k, bio->bi_iter.bi_sector); + + if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), + op->write_point, op->write_prio, + op->writeback)) + goto err; + + n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split); + + n->bi_end_io = bch_data_insert_endio; + n->bi_private = cl; + + if (op->writeback) { + SET_KEY_DIRTY(k, true); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_GC_MARK(PTR_BUCKET(op->c, k, i), + GC_MARK_DIRTY); + } + + SET_KEY_CSUM(k, op->csum); + if (KEY_CSUM(k)) + bio_csum(n, k); + + trace_bcache_cache_insert(k); + bch_keylist_push(&op->insert_keys); + + n->bi_rw |= REQ_WRITE; + bch_submit_bbio(n, op->c, k, 0); + } while (n != bio); + + op->insert_data_done = true; + continue_at(cl, bch_data_insert_keys, op->wq); +err: + /* bch_alloc_sectors() blocks if s->writeback = true */ + BUG_ON(op->writeback); + + /* + * But if it's not a writeback write we'd rather just bail out if + * there aren't any buckets ready to write to - it might take awhile and + * we might be starving btree writes for gc or something. + */ + + if (!op->replace) { + /* + * Writethrough write: We can't complete the write until we've + * updated the index. But we don't want to delay the write while + * we wait for buckets to be freed up, so just invalidate the + * rest of the write. + */ + op->bypass = true; + return bch_data_invalidate(cl); + } else { + /* + * From a cache miss, we can just insert the keys for the data + * we have written or bail out if we didn't do anything. + */ + op->insert_data_done = true; + bio_put(bio); + + if (!bch_keylist_empty(&op->insert_keys)) + continue_at(cl, bch_data_insert_keys, op->wq); + else + closure_return(cl); + } +} + +/** + * bch_data_insert - stick some data in the cache + * + * This is the starting point for any data to end up in a cache device; it could + * be from a normal write, or a writeback write, or a write to a flash only + * volume - it's also used by the moving garbage collector to compact data in + * mostly empty buckets. + * + * It first writes the data to the cache, creating a list of keys to be inserted + * (if the data had to be fragmented there will be multiple keys); after the + * data is written it calls bch_journal, and after the keys have been added to + * the next journal write they're inserted into the btree. + * + * It inserts the data in s->cache_bio; bi_sector is used for the key offset, + * and op->inode is used for the key inode. + * + * If s->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by s->cache_bio and op->inode. + */ +void bch_data_insert(struct closure *cl) +{ + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + + trace_bcache_write(op->bio, op->writeback, op->bypass); + + bch_keylist_init(&op->insert_keys); + bio_get(op->bio); + bch_data_insert_start(cl); +} + +/* Congested? */ + +unsigned bch_get_congested(struct cache_set *c) +{ + int i; + long rand; + + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; + + i += CONGESTED_MAX; + + if (i > 0) + i = fract_exp_two(i, 6); + + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? 
i : 1; +} + +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); + + t->sequential_io = 0; +} + +static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) +{ + return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; +} + +static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) +{ + struct cache_set *c = dc->disk.c; + unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); + struct task_struct *task = current; + struct io *i; + + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || + c->gc_stats.in_use > CUTOFF_CACHE_ADD || + (bio->bi_rw & REQ_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + (bio->bi_rw & REQ_WRITE))) + goto skip; + + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || + bio_sectors(bio) & (c->sb.block_size - 1)) { + pr_debug("skipping unaligned io"); + goto skip; + } + + if (bypass_torture_test(dc)) { + if ((get_random_int() & 3) == 3) + goto skip; + else + goto rescale; + } + + if (!congested && !dc->sequential_cutoff) + goto rescale; + + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash) + if (i->last == bio->bi_iter.bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_iter.bi_size > i->sequential) + i->sequential += bio->bi_iter.bi_size; + + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + + sectors = max(task->sequential_io, + task->sequential_io_avg) >> 9; + + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(bio); + goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(bio); + goto skip; + } + +rescale: + bch_rescale_priorities(c, bio_sectors(bio)); + return false; +skip: + bch_mark_sectors_bypassed(c, dc, bio_sectors(bio)); + return true; +} + +/* Cache lookup */ + +struct search { + /* Stack frame for bio_complete */ + struct closure cl; + + struct bbio bio; + struct bio *orig_bio; + struct bio *cache_miss; + struct bcache_device *d; + + unsigned insert_bio_sectors; + unsigned recoverable:1; + unsigned write:1; + unsigned read_dirty_data:1; + + unsigned long start_time; + + struct btree_op op; + struct data_insert_op iop; +}; + +static void bch_cache_read_endio(struct bio *bio, int error) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + struct closure *cl = bio->bi_private; + struct search *s = container_of(cl, struct search, cl); + + /* + * If the bucket was reused while our bio was in flight, we might have + * read the wrong data. Set s->error but not error so it doesn't get + * counted against the cache device, but we'll still reread the data + * from the backing device. 
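+ *
+ * (The stale-pointer case is also counted in cache_read_races below.)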
+ */ + + if (error) + s->iop.error = error; + else if (!KEY_DIRTY(&b->key) && + ptr_stale(s->iop.c, &b->key, 0)) { + atomic_long_inc(&s->iop.c->cache_read_races); + s->iop.error = -EINTR; + } + + bch_bbio_endio(s->iop.c, bio, error, "reading from cache"); +} + +/* + * Read from a single key, handling the initial cache miss if the key starts in + * the middle of the bio + */ +static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) +{ + struct search *s = container_of(op, struct search, op); + struct bio *n, *bio = &s->bio.bio; + struct bkey *bio_key; + unsigned ptr; + + if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0) + return MAP_CONTINUE; + + if (KEY_INODE(k) != s->iop.inode || + KEY_START(k) > bio->bi_iter.bi_sector) { + unsigned bio_sectors = bio_sectors(bio); + unsigned sectors = KEY_INODE(k) == s->iop.inode + ? min_t(uint64_t, INT_MAX, + KEY_START(k) - bio->bi_iter.bi_sector) + : INT_MAX; + + int ret = s->d->cache_miss(b, s, bio, sectors); + if (ret != MAP_CONTINUE) + return ret; + + /* if this was a complete miss we shouldn't get here */ + BUG_ON(bio_sectors <= sectors); + } + + if (!KEY_SIZE(k)) + return MAP_CONTINUE; + + /* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0; + + PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + + if (KEY_DIRTY(k)) + s->read_dirty_data = true; + + n = bio_next_split(bio, min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_iter.bi_sector), + GFP_NOIO, s->d->bio_split); + + bio_key = &container_of(n, struct bbio, bio)->key; + bch_bkey_copy_single_ptr(bio_key, k, ptr); + + bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key); + bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); + + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; + + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ + + __bch_submit_bbio(n, b->c); + return n == bio ? 
MAP_DONE : MAP_CONTINUE; +} + +static void cache_lookup(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, iop.cl); + struct bio *bio = &s->bio.bio; + int ret; + + bch_btree_op_init(&s->op, -1); + + ret = bch_btree_map_keys(&s->op, s->iop.c, + &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), + cache_lookup_fn, MAP_END_KEY); + if (ret == -EAGAIN) + continue_at(cl, cache_lookup, bcache_wq); + + closure_return(cl); +} + +/* Common code for the make_request functions */ + +static void request_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + + if (error) { + struct search *s = container_of(cl, struct search, cl); + s->iop.error = error; + /* Only cache read errors are recoverable */ + s->recoverable = false; + } + + bio_put(bio); + closure_put(cl); +} + +static void bio_complete(struct search *s) +{ + if (s->orig_bio) { + int cpu, rw = bio_data_dir(s->orig_bio); + unsigned long duration = jiffies - s->start_time; + + cpu = part_stat_lock(); + part_round_stats(cpu, &s->d->disk->part0); + part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); + part_stat_unlock(); + + trace_bcache_request_end(s->d, s->orig_bio); + bio_endio(s->orig_bio, s->iop.error); + s->orig_bio = NULL; + } +} + +static void do_bio_hook(struct search *s, struct bio *orig_bio) +{ + struct bio *bio = &s->bio.bio; + + bio_init(bio); + __bio_clone_fast(bio, orig_bio); + bio->bi_end_io = request_endio; + bio->bi_private = &s->cl; + + atomic_set(&bio->bi_cnt, 3); +} + +static void search_free(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + bio_complete(s); + + if (s->iop.bio) + bio_put(s->iop.bio); + + closure_debug_destroy(cl); + mempool_free(s, s->d->c->search); +} + +static inline struct search *search_alloc(struct bio *bio, + struct bcache_device *d) +{ + struct search *s; + + s = mempool_alloc(d->c->search, GFP_NOIO); + + closure_init(&s->cl, NULL); + do_bio_hook(s, bio); + + s->orig_bio = bio; + s->cache_miss = NULL; + s->d = d; + s->recoverable = 1; + s->write = (bio->bi_rw & REQ_WRITE) != 0; + s->read_dirty_data = 0; + s->start_time = jiffies; + + s->iop.c = d->c; + s->iop.bio = NULL; + s->iop.inode = d->id; + s->iop.write_point = hash_long((unsigned long) current, 16); + s->iop.write_prio = 0; + s->iop.error = 0; + s->iop.flags = 0; + s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->iop.wq = bcache_wq; + + return s; +} + +/* Cached devices */ + +static void cached_dev_bio_complete(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + search_free(cl); + cached_dev_put(dc); +} + +/* Process reads */ + +static void cached_dev_cache_miss_done(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + + if (s->iop.replace_collision) + bch_mark_cache_miss_collision(s->iop.c, s->d); + + if (s->iop.bio) { + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, s->iop.bio, i) + __free_page(bv->bv_page); + } + + cached_dev_bio_complete(cl); +} + +static void cached_dev_read_error(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct bio *bio = &s->bio.bio; + + if (s->recoverable) { + /* Retry from the backing device: */ + trace_bcache_read_retry(s->orig_bio); + + s->iop.error = 0; + do_bio_hook(s, s->orig_bio); + + /* XXX: invalidate cache */ + + closure_bio_submit(bio, cl, s->d); + } + + continue_at(cl, cached_dev_cache_miss_done, NULL); +} + +static 
void cached_dev_read_done(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + /* + * We had a cache miss; cache_bio now contains data ready to be inserted + * into the cache. + * + * First, we copy the data we just read from cache_bio's bounce buffers + * to the buffers the original bio pointed to: + */ + + if (s->iop.bio) { + bio_reset(s->iop.bio); + s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; + s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; + s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; + bch_bio_map(s->iop.bio, NULL); + + bio_copy_data(s->cache_miss, s->iop.bio); + + bio_put(s->cache_miss); + s->cache_miss = NULL; + } + + if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) + bch_data_verify(dc, s->orig_bio); + + bio_complete(s); + + if (s->iop.bio && + !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { + BUG_ON(!s->iop.replace); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + } + + continue_at(cl, cached_dev_cache_miss_done, NULL); +} + +static void cached_dev_read_done_bh(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + bch_mark_cache_accounting(s->iop.c, s->d, + !s->cache_miss, s->iop.bypass); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); + + if (s->iop.error) + continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); + else if (s->iop.bio || verify(dc, &s->bio.bio)) + continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); + else + continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); +} + +static int cached_dev_cache_miss(struct btree *b, struct search *s, + struct bio *bio, unsigned sectors) +{ + int ret = MAP_CONTINUE; + unsigned reada = 0; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct bio *miss, *cache_bio; + + if (s->cache_miss || s->iop.bypass) { + miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); + ret = miss == bio ? MAP_DONE : MAP_CONTINUE; + goto out_submit; + } + + if (!(bio->bi_rw & REQ_RAHEAD) && + !(bio->bi_rw & REQ_META) && + s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) + reada = min_t(sector_t, dc->readahead >> 9, + bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); + + s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); + + s->iop.replace_key = KEY(s->iop.inode, + bio->bi_iter.bi_sector + s->insert_bio_sectors, + s->insert_bio_sectors); + + ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); + if (ret) + return ret; + + s->iop.replace = true; + + miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); + + /* btree_search_recurse()'s btree iterator is no good anymore */ + ret = miss == bio ? 
MAP_DONE : -EINTR; + + cache_bio = bio_alloc_bioset(GFP_NOWAIT, + DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), + dc->disk.bio_split); + if (!cache_bio) + goto out_submit; + + cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector; + cache_bio->bi_bdev = miss->bi_bdev; + cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9; + + cache_bio->bi_end_io = request_endio; + cache_bio->bi_private = &s->cl; + + bch_bio_map(cache_bio, NULL); + if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) + goto out_put; + + if (reada) + bch_mark_cache_readahead(s->iop.c, s->d); + + s->cache_miss = miss; + s->iop.bio = cache_bio; + bio_get(cache_bio); + closure_bio_submit(cache_bio, &s->cl, s->d); + + return ret; +out_put: + bio_put(cache_bio); +out_submit: + miss->bi_end_io = request_endio; + miss->bi_private = &s->cl; + closure_bio_submit(miss, &s->cl, s->d); + return ret; +} + +static void cached_dev_read(struct cached_dev *dc, struct search *s) +{ + struct closure *cl = &s->cl; + + closure_call(&s->iop.cl, cache_lookup, NULL, cl); + continue_at(cl, cached_dev_read_done_bh, NULL); +} + +/* Process writes */ + +static void cached_dev_write_complete(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + up_read_non_owner(&dc->writeback_lock); + cached_dev_bio_complete(cl); +} + +static void cached_dev_write(struct cached_dev *dc, struct search *s) +{ + struct closure *cl = &s->cl; + struct bio *bio = &s->bio.bio; + struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0); + struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); + + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); + + down_read_non_owner(&dc->writeback_lock); + if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { + /* + * We overlap with some dirty data undergoing background + * writeback, force this write to writeback + */ + s->iop.bypass = false; + s->iop.writeback = true; + } + + /* + * Discards aren't _required_ to do anything, so skipping if + * check_overlapping returned true is ok + * + * But check_overlapping drops dirty keys for which io hasn't started, + * so we still want to call it. 
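+ *
+ * (Setting bypass below turns the cache side of the write into an
+ * invalidate of the region, see bch_data_invalidate().)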
+ */ + if (bio->bi_rw & REQ_DISCARD) + s->iop.bypass = true; + + if (should_writeback(dc, s->orig_bio, + cache_mode(dc, bio), + s->iop.bypass)) { + s->iop.bypass = false; + s->iop.writeback = true; + } + + if (s->iop.bypass) { + s->iop.bio = s->orig_bio; + bio_get(s->iop.bio); + + if (!(bio->bi_rw & REQ_DISCARD) || + blk_queue_discard(bdev_get_queue(dc->bdev))) + closure_bio_submit(bio, cl, s->d); + } else if (s->iop.writeback) { + bch_writeback_add(dc); + s->iop.bio = bio; + + if (bio->bi_rw & REQ_FLUSH) { + /* Also need to send a flush to the backing device */ + struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, + dc->disk.bio_split); + + flush->bi_rw = WRITE_FLUSH; + flush->bi_bdev = bio->bi_bdev; + flush->bi_end_io = request_endio; + flush->bi_private = cl; + + closure_bio_submit(flush, cl, s->d); + } + } else { + s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); + + closure_bio_submit(bio, cl, s->d); + } + + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + continue_at(cl, cached_dev_write_complete, NULL); +} + +static void cached_dev_nodata(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct bio *bio = &s->bio.bio; + + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); + + /* If it's a flush, we send the flush to the backing device too */ + closure_bio_submit(bio, cl, s->d); + + continue_at(cl, cached_dev_bio_complete, NULL); +} + +/* Cached devices - read & write stuff */ + +static void cached_dev_make_request(struct request_queue *q, struct bio *bio) +{ + struct search *s; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + int cpu, rw = bio_data_dir(bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &d->disk->part0, ios[rw]); + part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); + part_stat_unlock(); + + bio->bi_bdev = dc->bdev; + bio->bi_iter.bi_sector += dc->sb.data_offset; + + if (cached_dev_get(dc)) { + s = search_alloc(bio, d); + trace_bcache_request_start(s->d, bio); + + if (!bio->bi_iter.bi_size) { + /* + * can't call bch_journal_meta from under + * generic_make_request + */ + continue_at_nobarrier(&s->cl, + cached_dev_nodata, + bcache_wq); + } else { + s->iop.bypass = check_should_bypass(dc, bio); + + if (rw) + cached_dev_write(dc, s); + else + cached_dev_read(dc, s); + } + } else { + if ((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(dc->bdev))) + bio_endio(bio, 0); + else + bch_generic_make_request(bio, &d->bio_split_hook); + } +} + +static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); +} + +static int cached_dev_congested(void *data, int bits) +{ + struct bcache_device *d = data; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + struct request_queue *q = bdev_get_queue(dc->bdev); + int ret = 0; + + if (bdi_congested(&q->backing_dev_info, bits)) + return 1; + + if (cached_dev_get(dc)) { + unsigned i; + struct cache *ca; + + for_each_cache(ca, d->c, i) { + q = bdev_get_queue(ca->bdev); + ret |= bdi_congested(&q->backing_dev_info, bits); + } + + cached_dev_put(dc); + } + + return ret; +} + +void bch_cached_dev_request_init(struct cached_dev *dc) +{ + struct gendisk *g = dc->disk.disk; + + g->queue->make_request_fn = cached_dev_make_request; + g->queue->backing_dev_info.congested_fn = 
cached_dev_congested; + dc->disk.cache_miss = cached_dev_cache_miss; + dc->disk.ioctl = cached_dev_ioctl; +} + +/* Flash backed devices */ + +static int flash_dev_cache_miss(struct btree *b, struct search *s, + struct bio *bio, unsigned sectors) +{ + unsigned bytes = min(sectors, bio_sectors(bio)) << 9; + + swap(bio->bi_iter.bi_size, bytes); + zero_fill_bio(bio); + swap(bio->bi_iter.bi_size, bytes); + + bio_advance(bio, bytes); + + if (!bio->bi_iter.bi_size) + return MAP_DONE; + + return MAP_CONTINUE; +} + +static void flash_dev_nodata(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); + + continue_at(cl, search_free, NULL); +} + +static void flash_dev_make_request(struct request_queue *q, struct bio *bio) +{ + struct search *s; + struct closure *cl; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + int cpu, rw = bio_data_dir(bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &d->disk->part0, ios[rw]); + part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); + part_stat_unlock(); + + s = search_alloc(bio, d); + cl = &s->cl; + bio = &s->bio.bio; + + trace_bcache_request_start(s->d, bio); + + if (!bio->bi_iter.bi_size) { + /* + * can't call bch_journal_meta from under + * generic_make_request + */ + continue_at_nobarrier(&s->cl, + flash_dev_nodata, + bcache_wq); + } else if (rw) { + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, + &KEY(d->id, bio->bi_iter.bi_sector, 0), + &KEY(d->id, bio_end_sector(bio), 0)); + + s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; + s->iop.writeback = true; + s->iop.bio = bio; + + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); + } else { + closure_call(&s->iop.cl, cache_lookup, NULL, cl); + } + + continue_at(cl, search_free, NULL); +} + +static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} + +static int flash_dev_congested(void *data, int bits) +{ + struct bcache_device *d = data; + struct request_queue *q; + struct cache *ca; + unsigned i; + int ret = 0; + + for_each_cache(ca, d->c, i) { + q = bdev_get_queue(ca->bdev); + ret |= bdi_congested(&q->backing_dev_info, bits); + } + + return ret; +} + +void bch_flash_dev_request_init(struct bcache_device *d) +{ + struct gendisk *g = d->disk; + + g->queue->make_request_fn = flash_dev_make_request; + g->queue->backing_dev_info.congested_fn = flash_dev_congested; + d->cache_miss = flash_dev_cache_miss; + d->ioctl = flash_dev_ioctl; +} + +void bch_request_exit(void) +{ + if (bch_search_cache) + kmem_cache_destroy(bch_search_cache); +} + +int __init bch_request_init(void) +{ + bch_search_cache = KMEM_CACHE(search, 0); + if (!bch_search_cache) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h new file mode 100644 index 00000000000..1ff36875c2b --- /dev/null +++ b/drivers/md/bcache/request.h @@ -0,0 +1,43 @@ +#ifndef _BCACHE_REQUEST_H_ +#define _BCACHE_REQUEST_H_ + +struct data_insert_op { + struct closure cl; + struct cache_set *c; + struct bio *bio; + struct workqueue_struct *wq; + + unsigned inode; + uint16_t write_point; + uint16_t write_prio; + short error; + + union { + uint16_t flags; + + struct { + unsigned bypass:1; + unsigned writeback:1; + unsigned flush_journal:1; + unsigned csum:1; + + unsigned replace:1; + unsigned replace_collision:1; + + unsigned insert_data_done:1; + }; + }; + + struct keylist insert_keys; + BKEY_PADDED(replace_key); +}; 
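The anonymous union above lets all of the per-request state bits be saved or restored through the single 16-bit flags word rather than touching each bitfield individually. A minimal user-space sketch of that idiom (illustrative only, not part of the patch; the field names are copied from the struct above and C11 anonymous unions are assumed):

#include <stdint.h>
#include <stdio.h>

struct insert_flags {
	union {
		uint16_t flags;			/* the whole state as one word */
		struct {
			unsigned bypass:1;
			unsigned writeback:1;
			unsigned flush_journal:1;
			unsigned csum:1;
			unsigned replace:1;
			unsigned replace_collision:1;
			unsigned insert_data_done:1;
		};
	};
};

int main(void)
{
	struct insert_flags op = { .flags = 0 };

	op.writeback = 1;
	op.csum = 1;

	uint16_t saved = op.flags;	/* snapshot every bit at once        */
	op.insert_data_done = 1;	/* mutate the state ...              */
	op.flags = saved;		/* ... and roll it back in one store */

	printf("writeback=%u csum=%u done=%u\n",
	       op.writeback, op.csum, op.insert_data_done);
	return 0;
}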
+ +unsigned bch_get_congested(struct cache_set *); +void bch_data_insert(struct closure *cl); + +void bch_cached_dev_request_init(struct cached_dev *dc); +void bch_flash_dev_request_init(struct bcache_device *d); + +extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache; + +#endif /* _BCACHE_REQUEST_H_ */ diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c new file mode 100644 index 00000000000..0ca072c20d0 --- /dev/null +++ b/drivers/md/bcache/stats.c @@ -0,0 +1,241 @@ +/* + * bcache stats code + * + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "stats.h" +#include "btree.h" +#include "sysfs.h" + +/* + * We keep absolute totals of various statistics, and addionally a set of three + * rolling averages. + * + * Every so often, a timer goes off and rescales the rolling averages. + * accounting_rescale[] is how many times the timer has to go off before we + * rescale each set of numbers; that gets us half lives of 5 minutes, one hour, + * and one day. + * + * accounting_delay is how often the timer goes off - 22 times in 5 minutes, + * and accounting_weight is what we use to rescale: + * + * pow(31 / 32, 22) ~= 1/2 + * + * So that we don't have to increment each set of numbers every time we (say) + * get a cache hit, we increment a single atomic_t in acc->collector, and when + * the rescale function runs it resets the atomic counter to 0 and adds its + * old value to each of the exported numbers. + * + * To reduce rounding error, the numbers in struct cache_stats are all + * stored left shifted by 16, and scaled back in the sysfs show() function. + */ + +static const unsigned DAY_RESCALE = 288; +static const unsigned HOUR_RESCALE = 12; +static const unsigned FIVE_MINUTE_RESCALE = 1; +static const unsigned accounting_delay = (HZ * 300) / 22; +static const unsigned accounting_weight = 32; + +/* sysfs reading/writing */ + +read_attribute(cache_hits); +read_attribute(cache_misses); +read_attribute(cache_bypass_hits); +read_attribute(cache_bypass_misses); +read_attribute(cache_hit_ratio); +read_attribute(cache_readaheads); +read_attribute(cache_miss_collisions); +read_attribute(bypassed); + +SHOW(bch_stats) +{ + struct cache_stats *s = + container_of(kobj, struct cache_stats, kobj); +#define var(stat) (s->stat >> 16) + var_print(cache_hits); + var_print(cache_misses); + var_print(cache_bypass_hits); + var_print(cache_bypass_misses); + + sysfs_print(cache_hit_ratio, + DIV_SAFE(var(cache_hits) * 100, + var(cache_hits) + var(cache_misses))); + + var_print(cache_readaheads); + var_print(cache_miss_collisions); + sysfs_hprint(bypassed, var(sectors_bypassed) << 9); +#undef var + return 0; +} + +STORE(bch_stats) +{ + return size; +} + +static void bch_stats_release(struct kobject *k) +{ +} + +static struct attribute *bch_stats_files[] = { + &sysfs_cache_hits, + &sysfs_cache_misses, + &sysfs_cache_bypass_hits, + &sysfs_cache_bypass_misses, + &sysfs_cache_hit_ratio, + &sysfs_cache_readaheads, + &sysfs_cache_miss_collisions, + &sysfs_bypassed, + NULL +}; +static KTYPE(bch_stats); + +int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, + struct kobject *parent) +{ + int ret = kobject_add(&acc->total.kobj, parent, + "stats_total"); + ret = ret ?: kobject_add(&acc->five_minute.kobj, parent, + "stats_five_minute"); + ret = ret ?: kobject_add(&acc->hour.kobj, parent, + "stats_hour"); + ret = ret ?: kobject_add(&acc->day.kobj, parent, + "stats_day"); + return ret; +} + +void bch_cache_accounting_clear(struct cache_accounting *acc) +{ + 
memset(&acc->total.cache_hits, + 0, + sizeof(unsigned long) * 7); +} + +void bch_cache_accounting_destroy(struct cache_accounting *acc) +{ + kobject_put(&acc->total.kobj); + kobject_put(&acc->five_minute.kobj); + kobject_put(&acc->hour.kobj); + kobject_put(&acc->day.kobj); + + atomic_set(&acc->closing, 1); + if (del_timer_sync(&acc->timer)) + closure_return(&acc->cl); +} + +/* EWMA scaling */ + +static void scale_stat(unsigned long *stat) +{ + *stat = ewma_add(*stat, 0, accounting_weight, 0); +} + +static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) +{ + if (++stats->rescale == rescale_at) { + stats->rescale = 0; + scale_stat(&stats->cache_hits); + scale_stat(&stats->cache_misses); + scale_stat(&stats->cache_bypass_hits); + scale_stat(&stats->cache_bypass_misses); + scale_stat(&stats->cache_readaheads); + scale_stat(&stats->cache_miss_collisions); + scale_stat(&stats->sectors_bypassed); + } +} + +static void scale_accounting(unsigned long data) +{ + struct cache_accounting *acc = (struct cache_accounting *) data; + +#define move_stat(name) do { \ + unsigned t = atomic_xchg(&acc->collector.name, 0); \ + t <<= 16; \ + acc->five_minute.name += t; \ + acc->hour.name += t; \ + acc->day.name += t; \ + acc->total.name += t; \ +} while (0) + + move_stat(cache_hits); + move_stat(cache_misses); + move_stat(cache_bypass_hits); + move_stat(cache_bypass_misses); + move_stat(cache_readaheads); + move_stat(cache_miss_collisions); + move_stat(sectors_bypassed); + + scale_stats(&acc->total, 0); + scale_stats(&acc->day, DAY_RESCALE); + scale_stats(&acc->hour, HOUR_RESCALE); + scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE); + + acc->timer.expires += accounting_delay; + + if (!atomic_read(&acc->closing)) + add_timer(&acc->timer); + else + closure_return(&acc->cl); +} + +static void mark_cache_stats(struct cache_stat_collector *stats, + bool hit, bool bypass) +{ + if (!bypass) + if (hit) + atomic_inc(&stats->cache_hits); + else + atomic_inc(&stats->cache_misses); + else + if (hit) + atomic_inc(&stats->cache_bypass_hits); + else + atomic_inc(&stats->cache_bypass_misses); +} + +void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, + bool hit, bool bypass) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + mark_cache_stats(&dc->accounting.collector, hit, bypass); + mark_cache_stats(&c->accounting.collector, hit, bypass); +} + +void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + atomic_inc(&dc->accounting.collector.cache_readaheads); + atomic_inc(&c->accounting.collector.cache_readaheads); +} + +void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + atomic_inc(&dc->accounting.collector.cache_miss_collisions); + atomic_inc(&c->accounting.collector.cache_miss_collisions); +} + +void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, + int sectors) +{ + atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); + atomic_add(sectors, &c->accounting.collector.sectors_bypassed); +} + +void bch_cache_accounting_init(struct cache_accounting *acc, + struct closure *parent) +{ + kobject_init(&acc->total.kobj, &bch_stats_ktype); + kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); + kobject_init(&acc->hour.kobj, &bch_stats_ktype); + kobject_init(&acc->day.kobj, &bch_stats_ktype); + + closure_init(&acc->cl, parent); + 
init_timer(&acc->timer); + acc->timer.expires = jiffies + accounting_delay; + acc->timer.data = (unsigned long) acc; + acc->timer.function = scale_accounting; + add_timer(&acc->timer); +} diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h new file mode 100644 index 00000000000..adbff141c88 --- /dev/null +++ b/drivers/md/bcache/stats.h @@ -0,0 +1,61 @@ +#ifndef _BCACHE_STATS_H_ +#define _BCACHE_STATS_H_ + +struct cache_stat_collector { + atomic_t cache_hits; + atomic_t cache_misses; + atomic_t cache_bypass_hits; + atomic_t cache_bypass_misses; + atomic_t cache_readaheads; + atomic_t cache_miss_collisions; + atomic_t sectors_bypassed; +}; + +struct cache_stats { + struct kobject kobj; + + unsigned long cache_hits; + unsigned long cache_misses; + unsigned long cache_bypass_hits; + unsigned long cache_bypass_misses; + unsigned long cache_readaheads; + unsigned long cache_miss_collisions; + unsigned long sectors_bypassed; + + unsigned rescale; +}; + +struct cache_accounting { + struct closure cl; + struct timer_list timer; + atomic_t closing; + + struct cache_stat_collector collector; + + struct cache_stats total; + struct cache_stats five_minute; + struct cache_stats hour; + struct cache_stats day; +}; + +struct cache_set; +struct cached_dev; +struct bcache_device; + +void bch_cache_accounting_init(struct cache_accounting *acc, + struct closure *parent); + +int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, + struct kobject *parent); + +void bch_cache_accounting_clear(struct cache_accounting *acc); + +void bch_cache_accounting_destroy(struct cache_accounting *acc); + +void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *, + bool, bool); +void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *); +void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *); +void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int); + +#endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c new file mode 100644 index 00000000000..926ded8ccbf --- /dev/null +++ b/drivers/md/bcache/super.c @@ -0,0 +1,2102 @@ +/* + * bcache setup/teardown code, and some metadata io - read a superblock and + * figure out what to do with it. + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "extents.h" +#include "request.h" +#include "writeback.h" + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/debugfs.h> +#include <linux/genhd.h> +#include <linux/idr.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/reboot.h> +#include <linux/sysfs.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); + +static const char bcache_magic[] = { + 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 +}; + +static const char invalid_uuid[] = { + 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78, + 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99 +}; + +/* Default is -1; we skip past it for struct cached_dev's cache mode */ +const char * const bch_cache_modes[] = { + "default", + "writethrough", + "writeback", + "writearound", + "none", + NULL +}; + +static struct kobject *bcache_kobj; +struct mutex bch_register_lock; +LIST_HEAD(bch_cache_sets); +static LIST_HEAD(uncached_devices); + +static int bcache_major; +static DEFINE_IDA(bcache_minor); +static wait_queue_head_t unregister_wait; +struct workqueue_struct *bcache_wq; + +#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) + +static void bio_split_pool_free(struct bio_split_pool *p) +{ + if (p->bio_split_hook) + mempool_destroy(p->bio_split_hook); + + if (p->bio_split) + bioset_free(p->bio_split); +} + +static int bio_split_pool_init(struct bio_split_pool *p) +{ + p->bio_split = bioset_create(4, 0); + if (!p->bio_split) + return -ENOMEM; + + p->bio_split_hook = mempool_create_kmalloc_pool(4, + sizeof(struct bio_split_hook)); + if (!p->bio_split_hook) + return -ENOMEM; + + return 0; +} + +/* Superblock */ + +static const char *read_super(struct cache_sb *sb, struct block_device *bdev, + struct page **res) +{ + const char *err; + struct cache_sb *s; + struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); + unsigned i; + + if (!bh) + return "IO error"; + + s = (struct cache_sb *) bh->b_data; + + sb->offset = le64_to_cpu(s->offset); + sb->version = le64_to_cpu(s->version); + + memcpy(sb->magic, s->magic, 16); + memcpy(sb->uuid, s->uuid, 16); + memcpy(sb->set_uuid, s->set_uuid, 16); + memcpy(sb->label, s->label, SB_LABEL_SIZE); + + sb->flags = le64_to_cpu(s->flags); + sb->seq = le64_to_cpu(s->seq); + sb->last_mount = le32_to_cpu(s->last_mount); + sb->first_bucket = le16_to_cpu(s->first_bucket); + sb->keys = le16_to_cpu(s->keys); + + for (i = 0; i < SB_JOURNAL_BUCKETS; i++) + sb->d[i] = le64_to_cpu(s->d[i]); + + pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", + sb->version, sb->flags, sb->seq, sb->keys); + + err = "Not a bcache superblock"; + if (sb->offset != SB_SECTOR) + goto err; + + if (memcmp(sb->magic, bcache_magic, 16)) + goto err; + + err = "Too many journal buckets"; + if (sb->keys > SB_JOURNAL_BUCKETS) + goto err; + + err = "Bad checksum"; + if (s->csum != csum_set(s)) + goto err; + + err = "Bad UUID"; + if (bch_is_zero(sb->uuid, 16)) + goto err; + + sb->block_size = le16_to_cpu(s->block_size); + + err = "Superblock block size smaller than device block size"; + if (sb->block_size << 9 < bdev_logical_block_size(bdev)) + goto err; + + switch (sb->version) { + case BCACHE_SB_VERSION_BDEV: + sb->data_offset = BDEV_DATA_START_DEFAULT; + break; + case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + sb->data_offset = le64_to_cpu(s->data_offset); + + err = "Bad data offset"; + if (sb->data_offset < 
BDEV_DATA_START_DEFAULT) + goto err; + + break; + case BCACHE_SB_VERSION_CDEV: + case BCACHE_SB_VERSION_CDEV_WITH_UUID: + sb->nbuckets = le64_to_cpu(s->nbuckets); + sb->block_size = le16_to_cpu(s->block_size); + sb->bucket_size = le16_to_cpu(s->bucket_size); + + sb->nr_in_set = le16_to_cpu(s->nr_in_set); + sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); + + err = "Too many buckets"; + if (sb->nbuckets > LONG_MAX) + goto err; + + err = "Not enough buckets"; + if (sb->nbuckets < 1 << 7) + goto err; + + err = "Bad block/bucket size"; + if (!is_power_of_2(sb->block_size) || + sb->block_size > PAGE_SECTORS || + !is_power_of_2(sb->bucket_size) || + sb->bucket_size < PAGE_SECTORS) + goto err; + + err = "Invalid superblock: device too small"; + if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets) + goto err; + + err = "Bad UUID"; + if (bch_is_zero(sb->set_uuid, 16)) + goto err; + + err = "Bad cache device number in set"; + if (!sb->nr_in_set || + sb->nr_in_set <= sb->nr_this_dev || + sb->nr_in_set > MAX_CACHES_PER_SET) + goto err; + + err = "Journal buckets not sequential"; + for (i = 0; i < sb->keys; i++) + if (sb->d[i] != sb->first_bucket + i) + goto err; + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) + goto err; + + err = "Invalid superblock: first bucket comes before end of super"; + if (sb->first_bucket * sb->bucket_size < 16) + goto err; + + break; + default: + err = "Unsupported superblock version"; + goto err; + } + + sb->last_mount = get_seconds(); + err = NULL; + + get_page(bh->b_page); + *res = bh->b_page; +err: + put_bh(bh); + return err; +} + +static void write_bdev_super_endio(struct bio *bio, int error) +{ + struct cached_dev *dc = bio->bi_private; + /* XXX: error checking */ + + closure_put(&dc->sb_write); +} + +static void __write_super(struct cache_sb *sb, struct bio *bio) +{ + struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); + unsigned i; + + bio->bi_iter.bi_sector = SB_SECTOR; + bio->bi_rw = REQ_SYNC|REQ_META; + bio->bi_iter.bi_size = SB_SIZE; + bch_bio_map(bio, NULL); + + out->offset = cpu_to_le64(sb->offset); + out->version = cpu_to_le64(sb->version); + + memcpy(out->uuid, sb->uuid, 16); + memcpy(out->set_uuid, sb->set_uuid, 16); + memcpy(out->label, sb->label, SB_LABEL_SIZE); + + out->flags = cpu_to_le64(sb->flags); + out->seq = cpu_to_le64(sb->seq); + + out->last_mount = cpu_to_le32(sb->last_mount); + out->first_bucket = cpu_to_le16(sb->first_bucket); + out->keys = cpu_to_le16(sb->keys); + + for (i = 0; i < sb->keys; i++) + out->d[i] = cpu_to_le64(sb->d[i]); + + out->csum = csum_set(out); + + pr_debug("ver %llu, flags %llu, seq %llu", + sb->version, sb->flags, sb->seq); + + submit_bio(REQ_WRITE, bio); +} + +static void bch_write_bdev_super_unlock(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write); + + up(&dc->sb_write_mutex); +} + +void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) +{ + struct closure *cl = &dc->sb_write; + struct bio *bio = &dc->sb_bio; + + down(&dc->sb_write_mutex); + closure_init(cl, parent); + + bio_reset(bio); + bio->bi_bdev = dc->bdev; + bio->bi_end_io = write_bdev_super_endio; + bio->bi_private = dc; + + closure_get(cl); + __write_super(&dc->sb, bio); + + closure_return_with_destructor(cl, bch_write_bdev_super_unlock); +} + +static void write_super_endio(struct bio *bio, int error) +{ + struct cache *ca = bio->bi_private; + + bch_count_io_errors(ca, error, "writing superblock"); + closure_put(&ca->set->sb_write); +} + 
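read_super() above refuses to register a cache device whose superblock geometry cannot work. A standalone sketch of a few of those sanity checks with concrete numbers (hypothetical helper and values, not part of the patch; PAGE_SECTORS assumes 4 KiB pages and 512-byte sectors):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SECTORS 8	/* assumed: 4 KiB page / 512-byte sector */

static bool is_pow2(uint64_t x) { return x && !(x & (x - 1)); }

/* Mirrors the block/bucket checks made for BCACHE_SB_VERSION_CDEV above. */
static const char *check_cdev_geometry(uint64_t nbuckets, unsigned block_size,
				       unsigned bucket_size, uint64_t capacity)
{
	if (nbuckets < (1 << 7))
		return "Not enough buckets";
	if (!is_pow2(block_size) || block_size > PAGE_SECTORS ||
	    !is_pow2(bucket_size) || bucket_size < PAGE_SECTORS)
		return "Bad block/bucket size";
	if (capacity < (uint64_t)bucket_size * nbuckets)
		return "Invalid superblock: device too small";
	return NULL;
}

int main(void)
{
	/* hypothetical 64 GiB cache: 2^27 sectors, 512 KiB (1024-sector) buckets */
	const char *err = check_cdev_geometry(1 << 17, 8, 1024, 1ULL << 27);

	printf("%s\n", err ? err : "superblock geometry OK");
	return 0;
}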
+static void bcache_write_super_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, sb_write); + + up(&c->sb_write_mutex); +} + +void bcache_write_super(struct cache_set *c) +{ + struct closure *cl = &c->sb_write; + struct cache *ca; + unsigned i; + + down(&c->sb_write_mutex); + closure_init(cl, &c->cl); + + c->sb.seq++; + + for_each_cache(ca, c, i) { + struct bio *bio = &ca->sb_bio; + + ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID; + ca->sb.seq = c->sb.seq; + ca->sb.last_mount = c->sb.last_mount; + + SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); + + bio_reset(bio); + bio->bi_bdev = ca->bdev; + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + + closure_get(cl); + __write_super(&ca->sb, bio); + } + + closure_return_with_destructor(cl, bcache_write_super_unlock); +} + +/* UUID io */ + +static void uuid_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); + + cache_set_err_on(error, c, "accessing uuids"); + bch_bbio_free(bio, c); + closure_put(cl); +} + +static void uuid_io_unlock(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, uuid_write); + + up(&c->uuid_write_mutex); +} + +static void uuid_io(struct cache_set *c, unsigned long rw, + struct bkey *k, struct closure *parent) +{ + struct closure *cl = &c->uuid_write; + struct uuid_entry *u; + unsigned i; + char buf[80]; + + BUG_ON(!parent); + down(&c->uuid_write_mutex); + closure_init(cl, parent); + + for (i = 0; i < KEY_PTRS(k); i++) { + struct bio *bio = bch_bbio_alloc(c); + + bio->bi_rw = REQ_SYNC|REQ_META|rw; + bio->bi_iter.bi_size = KEY_SIZE(k) << 9; + + bio->bi_end_io = uuid_endio; + bio->bi_private = cl; + bch_bio_map(bio, c->uuids); + + bch_submit_bbio(bio, c, k, i); + + if (!(rw & WRITE)) + break; + } + + bch_extent_to_text(buf, sizeof(buf), k); + pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? 
"wrote" : "read", buf); + + for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) + if (!bch_is_zero(u->uuid, 16)) + pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u", + u - c->uuids, u->uuid, u->label, + u->first_reg, u->last_reg, u->invalidated); + + closure_return_with_destructor(cl, uuid_io_unlock); +} + +static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) +{ + struct bkey *k = &j->uuid_bucket; + + if (__bch_btree_ptr_invalid(c, k)) + return "bad uuid pointer"; + + bkey_copy(&c->uuid_bucket, k); + uuid_io(c, READ_SYNC, k, cl); + + if (j->version < BCACHE_JSET_VERSION_UUIDv1) { + struct uuid_entry_v0 *u0 = (void *) c->uuids; + struct uuid_entry *u1 = (void *) c->uuids; + int i; + + closure_sync(cl); + + /* + * Since the new uuid entry is bigger than the old, we have to + * convert starting at the highest memory address and work down + * in order to do it in place + */ + + for (i = c->nr_uuids - 1; + i >= 0; + --i) { + memcpy(u1[i].uuid, u0[i].uuid, 16); + memcpy(u1[i].label, u0[i].label, 32); + + u1[i].first_reg = u0[i].first_reg; + u1[i].last_reg = u0[i].last_reg; + u1[i].invalidated = u0[i].invalidated; + + u1[i].flags = 0; + u1[i].sectors = 0; + } + } + + return NULL; +} + +static int __uuid_write(struct cache_set *c) +{ + BKEY_PADDED(key) k; + struct closure cl; + closure_init_stack(&cl); + + lockdep_assert_held(&bch_register_lock); + + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) + return 1; + + SET_KEY_SIZE(&k.key, c->sb.bucket_size); + uuid_io(c, REQ_WRITE, &k.key, &cl); + closure_sync(&cl); + + bkey_copy(&c->uuid_bucket, &k.key); + bkey_put(c, &k.key); + return 0; +} + +int bch_uuid_write(struct cache_set *c) +{ + int ret = __uuid_write(c); + + if (!ret) + bch_journal_meta(c, NULL); + + return ret; +} + +static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) +{ + struct uuid_entry *u; + + for (u = c->uuids; + u < c->uuids + c->nr_uuids; u++) + if (!memcmp(u->uuid, uuid, 16)) + return u; + + return NULL; +} + +static struct uuid_entry *uuid_find_empty(struct cache_set *c) +{ + static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + return uuid_find(c, zero_uuid); +} + +/* + * Bucket priorities/gens: + * + * For each bucket, we store on disk its + * 8 bit gen + * 16 bit priority + * + * See alloc.c for an explanation of the gen. The priority is used to implement + * lru (and in the future other) cache replacement policies; for most purposes + * it's just an opaque integer. + * + * The gens and the priorities don't have a whole lot to do with each other, and + * it's actually the gens that must be written out at specific times - it's no + * big deal if the priorities don't get written, if we lose them we just reuse + * buckets in suboptimal order. + * + * On disk they're stored in a packed array, and in as many buckets are required + * to fit them all. The buckets we use to store them form a list; the journal + * header points to the first bucket, the first bucket points to the second + * bucket, et cetera. + * + * This code is used by the allocation code; periodically (whenever it runs out + * of buckets to allocate from) the allocation code will invalidate some + * buckets, but it can't use those buckets until their new gens are safely on + * disk. 
+ */ + +static void prio_endio(struct bio *bio, int error) +{ + struct cache *ca = bio->bi_private; + + cache_set_err_on(error, ca->set, "accessing priorities"); + bch_bbio_free(bio, ca->set); + closure_put(&ca->prio); +} + +static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw) +{ + struct closure *cl = &ca->prio; + struct bio *bio = bch_bbio_alloc(ca->set); + + closure_init_stack(cl); + + bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; + bio->bi_bdev = ca->bdev; + bio->bi_rw = REQ_SYNC|REQ_META|rw; + bio->bi_iter.bi_size = bucket_bytes(ca); + + bio->bi_end_io = prio_endio; + bio->bi_private = ca; + bch_bio_map(bio, ca->disk_buckets); + + closure_bio_submit(bio, &ca->prio, ca); + closure_sync(cl); +} + +void bch_prio_write(struct cache *ca) +{ + int i; + struct bucket *b; + struct closure cl; + + closure_init_stack(&cl); + + lockdep_assert_held(&ca->set->bucket_lock); + + ca->disk_buckets->seq++; + + atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), + &ca->meta_sectors_written); + + //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), + // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + + for (i = prio_buckets(ca) - 1; i >= 0; --i) { + long bucket; + struct prio_set *p = ca->disk_buckets; + struct bucket_disk *d = p->data; + struct bucket_disk *end = d + prios_per_bucket(ca); + + for (b = ca->buckets + i * prios_per_bucket(ca); + b < ca->buckets + ca->sb.nbuckets && d < end; + b++, d++) { + d->prio = cpu_to_le16(b->prio); + d->gen = b->gen; + } + + p->next_bucket = ca->prio_buckets[i + 1]; + p->magic = pset_magic(&ca->sb); + p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); + + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); + BUG_ON(bucket == -1); + + mutex_unlock(&ca->set->bucket_lock); + prio_io(ca, bucket, REQ_WRITE); + mutex_lock(&ca->set->bucket_lock); + + ca->prio_buckets[i] = bucket; + atomic_dec_bug(&ca->buckets[bucket].pin); + } + + mutex_unlock(&ca->set->bucket_lock); + + bch_journal_meta(ca->set, &cl); + closure_sync(&cl); + + mutex_lock(&ca->set->bucket_lock); + + /* + * Don't want the old priorities to get garbage collected until after we + * finish writing the new ones, and they're journalled + */ + for (i = 0; i < prio_buckets(ca); i++) { + if (ca->prio_last_buckets[i]) + __bch_bucket_free(ca, + &ca->buckets[ca->prio_last_buckets[i]]); + + ca->prio_last_buckets[i] = ca->prio_buckets[i]; + } +} + +static void prio_read(struct cache *ca, uint64_t bucket) +{ + struct prio_set *p = ca->disk_buckets; + struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; + struct bucket *b; + unsigned bucket_nr = 0; + + for (b = ca->buckets; + b < ca->buckets + ca->sb.nbuckets; + b++, d++) { + if (d == end) { + ca->prio_buckets[bucket_nr] = bucket; + ca->prio_last_buckets[bucket_nr] = bucket; + bucket_nr++; + + prio_io(ca, bucket, READ_SYNC); + + if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) + pr_warn("bad csum reading priorities"); + + if (p->magic != pset_magic(&ca->sb)) + pr_warn("bad magic reading priorities"); + + bucket = p->next_bucket; + d = p->data; + } + + b->prio = le16_to_cpu(d->prio); + b->gen = b->last_gc = d->gen; + } +} + +/* Bcache device */ + +static int open_dev(struct block_device *b, fmode_t mode) +{ + struct bcache_device *d = b->bd_disk->private_data; + if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) + return -ENXIO; + + closure_get(&d->cl); + return 0; +} + +static void release_dev(struct gendisk *b, fmode_t mode) +{ + struct bcache_device *d = b->private_data; + closure_put(&d->cl); +} + 
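bch_prio_write() and prio_read() above store the per-bucket prio/gen pairs in the packed, chained layout described in the "Bucket priorities/gens" comment: each priority bucket carries a small header (csum, magic, seq, next_bucket) followed by as many records as fit, and next_bucket points at the bucket holding the rest. A rough standalone sketch of the sizing arithmetic (illustrative only; the real prios_per_bucket()/prio_buckets() helpers used above are defined elsewhere in the driver, and the header layout here is assumed):

#include <stdint.h>
#include <stdio.h>

/* One on-disk record, as unpacked by prio_read() above. */
struct bucket_disk {
	uint16_t prio;
	uint8_t  gen;
} __attribute__((packed));

/* Per-bucket header; field order is assumed for illustration. */
struct prio_hdr {
	uint64_t csum;		/* bch_crc64() of everything after it */
	uint64_t magic;
	uint64_t seq;
	uint64_t next_bucket;	/* chains to the next priority bucket */
};

int main(void)
{
	uint64_t nbuckets     = 1 << 20;	/* hypothetical: 1M buckets      */
	unsigned bucket_bytes = 512 << 10;	/* hypothetical: 512 KiB buckets */

	unsigned per_bucket = (bucket_bytes - sizeof(struct prio_hdr)) /
			      sizeof(struct bucket_disk);
	uint64_t prio_buckets = (nbuckets + per_bucket - 1) / per_bucket;

	printf("%u records per prio bucket, %llu prio buckets needed\n",
	       per_bucket, (unsigned long long)prio_buckets);
	return 0;
}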
+static int ioctl_dev(struct block_device *b, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct bcache_device *d = b->bd_disk->private_data; + return d->ioctl(d, mode, cmd, arg); +} + +static const struct block_device_operations bcache_ops = { + .open = open_dev, + .release = release_dev, + .ioctl = ioctl_dev, + .owner = THIS_MODULE, +}; + +void bcache_device_stop(struct bcache_device *d) +{ + if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) + closure_queue(&d->cl); +} + +static void bcache_device_unlink(struct bcache_device *d) +{ + lockdep_assert_held(&bch_register_lock); + + if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { + unsigned i; + struct cache *ca; + + sysfs_remove_link(&d->c->kobj, d->name); + sysfs_remove_link(&d->kobj, "cache"); + + for_each_cache(ca, d->c, i) + bd_unlink_disk_holder(ca->bdev, d->disk); + } +} + +static void bcache_device_link(struct bcache_device *d, struct cache_set *c, + const char *name) +{ + unsigned i; + struct cache *ca; + + for_each_cache(ca, d->c, i) + bd_link_disk_holder(ca->bdev, d->disk); + + snprintf(d->name, BCACHEDEVNAME_SIZE, + "%s%u", name, d->id); + + WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || + sysfs_create_link(&c->kobj, &d->kobj, d->name), + "Couldn't create device <-> cache set symlinks"); +} + +static void bcache_device_detach(struct bcache_device *d) +{ + lockdep_assert_held(&bch_register_lock); + + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { + struct uuid_entry *u = d->c->uuids + d->id; + + SET_UUID_FLASH_ONLY(u, 0); + memcpy(u->uuid, invalid_uuid, 16); + u->invalidated = cpu_to_le32(get_seconds()); + bch_uuid_write(d->c); + } + + bcache_device_unlink(d); + + d->c->devices[d->id] = NULL; + closure_put(&d->c->caching); + d->c = NULL; +} + +static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, + unsigned id) +{ + BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags)); + + d->id = id; + d->c = c; + c->devices[id] = d; + + closure_get(&c->caching); +} + +static void bcache_device_free(struct bcache_device *d) +{ + lockdep_assert_held(&bch_register_lock); + + pr_info("%s stopped", d->disk->disk_name); + + if (d->c) + bcache_device_detach(d); + if (d->disk && d->disk->flags & GENHD_FL_UP) + del_gendisk(d->disk); + if (d->disk && d->disk->queue) + blk_cleanup_queue(d->disk->queue); + if (d->disk) { + ida_simple_remove(&bcache_minor, d->disk->first_minor); + put_disk(d->disk); + } + + bio_split_pool_free(&d->bio_split_hook); + if (d->bio_split) + bioset_free(d->bio_split); + if (is_vmalloc_addr(d->full_dirty_stripes)) + vfree(d->full_dirty_stripes); + else + kfree(d->full_dirty_stripes); + if (is_vmalloc_addr(d->stripe_sectors_dirty)) + vfree(d->stripe_sectors_dirty); + else + kfree(d->stripe_sectors_dirty); + + closure_debug_destroy(&d->cl); +} + +static int bcache_device_init(struct bcache_device *d, unsigned block_size, + sector_t sectors) +{ + struct request_queue *q; + size_t n; + int minor; + + if (!d->stripe_size) + d->stripe_size = 1 << 31; + + d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); + + if (!d->nr_stripes || + d->nr_stripes > INT_MAX || + d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { + pr_err("nr_stripes too large"); + return -ENOMEM; + } + + n = d->nr_stripes * sizeof(atomic_t); + d->stripe_sectors_dirty = n < PAGE_SIZE << 6 + ? kzalloc(n, GFP_KERNEL) + : vzalloc(n); + if (!d->stripe_sectors_dirty) + return -ENOMEM; + + n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); + d->full_dirty_stripes = n < PAGE_SIZE << 6 + ? 
kzalloc(n, GFP_KERNEL) + : vzalloc(n); + if (!d->full_dirty_stripes) + return -ENOMEM; + + minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); + if (minor < 0) + return minor; + + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || + bio_split_pool_init(&d->bio_split_hook) || + !(d->disk = alloc_disk(1))) { + ida_simple_remove(&bcache_minor, minor); + return -ENOMEM; + } + + set_capacity(d->disk, sectors); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); + + d->disk->major = bcache_major; + d->disk->first_minor = minor; + d->disk->fops = &bcache_ops; + d->disk->private_data = d; + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + return -ENOMEM; + + blk_queue_make_request(q, NULL); + d->disk->queue = q; + q->queuedata = d; + q->backing_dev_info.congested_data = d; + q->limits.max_hw_sectors = UINT_MAX; + q->limits.max_sectors = UINT_MAX; + q->limits.max_segment_size = UINT_MAX; + q->limits.max_segments = BIO_MAX_PAGES; + q->limits.max_discard_sectors = UINT_MAX; + q->limits.discard_granularity = 512; + q->limits.io_min = block_size; + q->limits.logical_block_size = block_size; + q->limits.physical_block_size = block_size; + set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); + set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); + + blk_queue_flush(q, REQ_FLUSH|REQ_FUA); + + return 0; +} + +/* Cached device */ + +static void calc_cached_dev_sectors(struct cache_set *c) +{ + uint64_t sectors = 0; + struct cached_dev *dc; + + list_for_each_entry(dc, &c->cached_devs, list) + sectors += bdev_sectors(dc->bdev); + + c->cached_dev_sectors = sectors; +} + +void bch_cached_dev_run(struct cached_dev *dc) +{ + struct bcache_device *d = &dc->disk; + char buf[SB_LABEL_SIZE + 1]; + char *env[] = { + "DRIVER=bcache", + kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), + NULL, + NULL, + }; + + memcpy(buf, dc->sb.label, SB_LABEL_SIZE); + buf[SB_LABEL_SIZE] = '\0'; + env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); + + if (atomic_xchg(&dc->running, 1)) + return; + + if (!d->c && + BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { + struct closure cl; + closure_init_stack(&cl); + + SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE); + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + } + + add_disk(d->disk); + bd_link_disk_holder(dc->bdev, dc->disk.disk); + /* won't show up in the uevent file, use udevadm monitor -e instead + * only class / kset properties are persistent */ + kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); + kfree(env[1]); + kfree(env[2]); + + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || + sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) + pr_debug("error creating sysfs link"); +} + +static void cached_dev_detach_finish(struct work_struct *w) +{ + struct cached_dev *dc = container_of(w, struct cached_dev, detach); + char buf[BDEVNAME_SIZE]; + struct closure cl; + closure_init_stack(&cl); + + BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); + BUG_ON(atomic_read(&dc->count)); + + mutex_lock(&bch_register_lock); + + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + + bcache_device_detach(&dc->disk); + list_move(&dc->list, &uncached_devices); + + clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); + + mutex_unlock(&bch_register_lock); + + pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); + + /* Drop ref we took in cached_dev_detach() */ + closure_put(&dc->disk.cl); +} + 
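bcache_device_init() earlier in this hunk sizes the writeback bookkeeping from the stripe geometry: one dirty-sector counter per stripe (stripe_sectors_dirty) plus one bit per stripe (full_dirty_stripes), switching from kzalloc() to vzalloc() once an array exceeds 64 pages. A standalone sketch of that arithmetic with hypothetical numbers, not part of the patch (a 1 TiB backing device whose io_opt yields a 512-sector stripe):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t sectors     = 1ULL << 31;	/* 1 TiB of 512-byte sectors   */
	uint64_t stripe_size = 512;		/* 256 KiB stripes, in sectors */

	uint64_t nr_stripes = (sectors + stripe_size - 1) / stripe_size;

	/* one 32-bit dirty-sector counter per stripe (atomic_t in the kernel) */
	uint64_t counters = nr_stripes * sizeof(uint32_t);
	/* one bit per stripe, rounded up to whole 64-bit longs */
	uint64_t bitmap = ((nr_stripes + 63) / 64) * sizeof(uint64_t);

	printf("%llu stripes -> %llu KiB of counters, %llu KiB of bitmap\n",
	       (unsigned long long)nr_stripes,
	       (unsigned long long)(counters >> 10),
	       (unsigned long long)(bitmap >> 10));
	return 0;
}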
+void bch_cached_dev_detach(struct cached_dev *dc) +{ + lockdep_assert_held(&bch_register_lock); + + if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) + return; + + if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + return; + + /* + * Block the device from being closed and freed until we're finished + * detaching + */ + closure_get(&dc->disk.cl); + + bch_writeback_queue(dc); + cached_dev_put(dc); +} + +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) +{ + uint32_t rtime = cpu_to_le32(get_seconds()); + struct uuid_entry *u; + char buf[BDEVNAME_SIZE]; + + bdevname(dc->bdev, buf); + + if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)) + return -ENOENT; + + if (dc->disk.c) { + pr_err("Can't attach %s: already attached", buf); + return -EINVAL; + } + + if (test_bit(CACHE_SET_STOPPING, &c->flags)) { + pr_err("Can't attach %s: shutting down", buf); + return -EINVAL; + } + + if (dc->sb.block_size < c->sb.block_size) { + /* Will die */ + pr_err("Couldn't attach %s: block size less than set's block size", + buf); + return -EINVAL; + } + + u = uuid_find(c, dc->sb.uuid); + + if (u && + (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || + BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { + memcpy(u->uuid, invalid_uuid, 16); + u->invalidated = cpu_to_le32(get_seconds()); + u = NULL; + } + + if (!u) { + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + pr_err("Couldn't find uuid for %s in set", buf); + return -ENOENT; + } + + u = uuid_find_empty(c); + if (!u) { + pr_err("Not caching %s, no room for UUID", buf); + return -EINVAL; + } + } + + /* Deadlocks since we're called via sysfs... + sysfs_remove_file(&dc->kobj, &sysfs_attach); + */ + + if (bch_is_zero(u->uuid, 16)) { + struct closure cl; + closure_init_stack(&cl); + + memcpy(u->uuid, dc->sb.uuid, 16); + memcpy(u->label, dc->sb.label, SB_LABEL_SIZE); + u->first_reg = u->last_reg = rtime; + bch_uuid_write(c); + + memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + } else { + u->last_reg = rtime; + bch_uuid_write(c); + } + + bcache_device_attach(&dc->disk, c, u - c->uuids); + list_move(&dc->list, &c->cached_devs); + calc_cached_dev_sectors(c); + + smp_wmb(); + /* + * dc->c must be set before dc->count != 0 - paired with the mb in + * cached_dev_get() + */ + atomic_set(&dc->count, 1); + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + bch_sectors_dirty_init(dc); + atomic_set(&dc->has_dirty, 1); + atomic_inc(&dc->count); + bch_writeback_queue(dc); + } + + bch_cached_dev_run(dc); + bcache_device_link(&dc->disk, c, "bdev"); + + pr_info("Caching %s as %s on set %pU", + bdevname(dc->bdev, buf), dc->disk.disk->disk_name, + dc->disk.c->sb.set_uuid); + return 0; +} + +void bch_cached_dev_release(struct kobject *kobj) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + kfree(dc); + module_put(THIS_MODULE); +} + +static void cached_dev_free(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + + cancel_delayed_work_sync(&dc->writeback_rate_update); + kthread_stop(dc->writeback_thread); + + mutex_lock(&bch_register_lock); + + if (atomic_read(&dc->running)) + bd_unlink_disk_holder(dc->bdev, dc->disk.disk); + bcache_device_free(&dc->disk); + list_del(&dc->list); + + mutex_unlock(&bch_register_lock); + + if (!IS_ERR_OR_NULL(dc->bdev)) { + if (dc->bdev->bd_disk) + blk_sync_queue(bdev_get_queue(dc->bdev)); + + blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + } + + 
wake_up(&unregister_wait); + + kobject_put(&dc->disk.kobj); +} + +static void cached_dev_flush(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + struct bcache_device *d = &dc->disk; + + mutex_lock(&bch_register_lock); + bcache_device_unlink(d); + mutex_unlock(&bch_register_lock); + + bch_cache_accounting_destroy(&dc->accounting); + kobject_del(&d->kobj); + + continue_at(cl, cached_dev_free, system_wq); +} + +static int cached_dev_init(struct cached_dev *dc, unsigned block_size) +{ + int ret; + struct io *io; + struct request_queue *q = bdev_get_queue(dc->bdev); + + __module_get(THIS_MODULE); + INIT_LIST_HEAD(&dc->list); + closure_init(&dc->disk.cl, NULL); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); + INIT_WORK(&dc->detach, cached_dev_detach_finish); + sema_init(&dc->sb_write_mutex, 1); + INIT_LIST_HEAD(&dc->io_lru); + spin_lock_init(&dc->io_lock); + bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); + + dc->sequential_cutoff = 4 << 20; + + for (io = dc->io; io < dc->io + RECENT_IO; io++) { + list_add(&io->lru, &dc->io_lru); + hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); + } + + dc->disk.stripe_size = q->limits.io_opt >> 9; + + if (dc->disk.stripe_size) + dc->partial_stripes_expensive = + q->limits.raid_partial_stripes_expensive; + + ret = bcache_device_init(&dc->disk, block_size, + dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + if (ret) + return ret; + + set_capacity(dc->disk.disk, + dc->bdev->bd_part->nr_sects - dc->sb.data_offset); + + dc->disk.disk->queue->backing_dev_info.ra_pages = + max(dc->disk.disk->queue->backing_dev_info.ra_pages, + q->backing_dev_info.ra_pages); + + bch_cached_dev_request_init(dc); + bch_cached_dev_writeback_init(dc); + return 0; +} + +/* Cached device - bcache superblock */ + +static void register_bdev(struct cache_sb *sb, struct page *sb_page, + struct block_device *bdev, + struct cached_dev *dc) +{ + char name[BDEVNAME_SIZE]; + const char *err = "cannot allocate memory"; + struct cache_set *c; + + memcpy(&dc->sb, sb, sizeof(struct cache_sb)); + dc->bdev = bdev; + dc->bdev->bd_holder = dc; + + bio_init(&dc->sb_bio); + dc->sb_bio.bi_max_vecs = 1; + dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; + dc->sb_bio.bi_io_vec[0].bv_page = sb_page; + get_page(sb_page); + + if (cached_dev_init(dc, sb->block_size << 9)) + goto err; + + err = "error creating kobject"; + if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, + "bcache")) + goto err; + if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) + goto err; + + pr_info("registered backing device %s", bdevname(bdev, name)); + + list_add(&dc->list, &uncached_devices); + list_for_each_entry(c, &bch_cache_sets, list) + bch_cached_dev_attach(dc, c); + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || + BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) + bch_cached_dev_run(dc); + + return; +err: + pr_notice("error opening %s: %s", bdevname(bdev, name), err); + bcache_device_stop(&dc->disk); +} + +/* Flash only volumes */ + +void bch_flash_dev_release(struct kobject *kobj) +{ + struct bcache_device *d = container_of(kobj, struct bcache_device, + kobj); + kfree(d); +} + +static void flash_dev_free(struct closure *cl) +{ + struct bcache_device *d = container_of(cl, struct bcache_device, cl); + bcache_device_free(d); + kobject_put(&d->kobj); +} + +static void flash_dev_flush(struct closure *cl) +{ + struct bcache_device *d = container_of(cl, struct bcache_device, cl); + 
+ bcache_device_unlink(d); + kobject_del(&d->kobj); + continue_at(cl, flash_dev_free, system_wq); +} + +static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) +{ + struct bcache_device *d = kzalloc(sizeof(struct bcache_device), + GFP_KERNEL); + if (!d) + return -ENOMEM; + + closure_init(&d->cl, NULL); + set_closure_fn(&d->cl, flash_dev_flush, system_wq); + + kobject_init(&d->kobj, &bch_flash_dev_ktype); + + if (bcache_device_init(d, block_bytes(c), u->sectors)) + goto err; + + bcache_device_attach(d, c, u - c->uuids); + bch_flash_dev_request_init(d); + add_disk(d->disk); + + if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache")) + goto err; + + bcache_device_link(d, c, "volume"); + + return 0; +err: + kobject_put(&d->kobj); + return -ENOMEM; +} + +static int flash_devs_run(struct cache_set *c) +{ + int ret = 0; + struct uuid_entry *u; + + for (u = c->uuids; + u < c->uuids + c->nr_uuids && !ret; + u++) + if (UUID_FLASH_ONLY(u)) + ret = flash_dev_run(c, u); + + return ret; +} + +int bch_flash_dev_create(struct cache_set *c, uint64_t size) +{ + struct uuid_entry *u; + + if (test_bit(CACHE_SET_STOPPING, &c->flags)) + return -EINTR; + + u = uuid_find_empty(c); + if (!u) { + pr_err("Can't create volume, no room for UUID"); + return -EINVAL; + } + + get_random_bytes(u->uuid, 16); + memset(u->label, 0, 32); + u->first_reg = u->last_reg = cpu_to_le32(get_seconds()); + + SET_UUID_FLASH_ONLY(u, 1); + u->sectors = size >> 9; + + bch_uuid_write(c); + + return flash_dev_run(c, u); +} + +/* Cache set */ + +__printf(2, 3) +bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) +{ + va_list args; + + if (c->on_error != ON_ERROR_PANIC && + test_bit(CACHE_SET_STOPPING, &c->flags)) + return false; + + /* XXX: we can be called from atomic context + acquire_console_sem(); + */ + + printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + printk(", disabling caching\n"); + + if (c->on_error == ON_ERROR_PANIC) + panic("panic forced after error\n"); + + bch_cache_set_unregister(c); + return true; +} + +void bch_cache_set_release(struct kobject *kobj) +{ + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + kfree(c); + module_put(THIS_MODULE); +} + +static void cache_set_free(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, cl); + struct cache *ca; + unsigned i; + + if (!IS_ERR_OR_NULL(c->debug)) + debugfs_remove(c->debug); + + bch_open_buckets_free(c); + bch_btree_cache_free(c); + bch_journal_free(c); + + for_each_cache(ca, c, i) + if (ca) + kobject_put(&ca->kobj); + + bch_bset_sort_state_free(&c->sort); + free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); + + if (c->moving_gc_wq) + destroy_workqueue(c->moving_gc_wq); + if (c->bio_split) + bioset_free(c->bio_split); + if (c->fill_iter) + mempool_destroy(c->fill_iter); + if (c->bio_meta) + mempool_destroy(c->bio_meta); + if (c->search) + mempool_destroy(c->search); + kfree(c->devices); + + mutex_lock(&bch_register_lock); + list_del(&c->list); + mutex_unlock(&bch_register_lock); + + pr_info("Cache set %pU unregistered", c->sb.set_uuid); + wake_up(&unregister_wait); + + closure_debug_destroy(&c->cl); + kobject_put(&c->kobj); +} + +static void cache_set_flush(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, caching); + struct cache *ca; + struct btree *b; + unsigned i; + + bch_cache_accounting_destroy(&c->accounting); + + kobject_put(&c->internal); + kobject_del(&c->kobj); 
+ + if (c->gc_thread) + kthread_stop(c->gc_thread); + + if (!IS_ERR_OR_NULL(c->root)) + list_add(&c->root->list, &c->btree_cache); + + /* Should skip this if we're unregistering because of an error */ + list_for_each_entry(b, &c->btree_cache, list) { + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); + } + + for_each_cache(ca, c, i) + if (ca->alloc_thread) + kthread_stop(ca->alloc_thread); + + cancel_delayed_work_sync(&c->journal.work); + /* flush last journal entry if needed */ + c->journal.work.work.func(&c->journal.work.work); + + closure_return(cl); +} + +static void __cache_set_unregister(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, caching); + struct cached_dev *dc; + size_t i; + + mutex_lock(&bch_register_lock); + + for (i = 0; i < c->nr_uuids; i++) + if (c->devices[i]) { + if (!UUID_FLASH_ONLY(&c->uuids[i]) && + test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { + dc = container_of(c->devices[i], + struct cached_dev, disk); + bch_cached_dev_detach(dc); + } else { + bcache_device_stop(c->devices[i]); + } + } + + mutex_unlock(&bch_register_lock); + + continue_at(cl, cache_set_flush, system_wq); +} + +void bch_cache_set_stop(struct cache_set *c) +{ + if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) + closure_queue(&c->caching); +} + +void bch_cache_set_unregister(struct cache_set *c) +{ + set_bit(CACHE_SET_UNREGISTERING, &c->flags); + bch_cache_set_stop(c); +} + +#define alloc_bucket_pages(gfp, c) \ + ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c)))) + +struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) +{ + int iter_size; + struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); + if (!c) + return NULL; + + __module_get(THIS_MODULE); + closure_init(&c->cl, NULL); + set_closure_fn(&c->cl, cache_set_free, system_wq); + + closure_init(&c->caching, &c->cl); + set_closure_fn(&c->caching, __cache_set_unregister, system_wq); + + /* Maybe create continue_at_noreturn() and use it here? 
*/ + closure_set_stopped(&c->cl); + closure_put(&c->cl); + + kobject_init(&c->kobj, &bch_cache_set_ktype); + kobject_init(&c->internal, &bch_cache_set_internal_ktype); + + bch_cache_accounting_init(&c->accounting, &c->cl); + + memcpy(c->sb.set_uuid, sb->set_uuid, 16); + c->sb.block_size = sb->block_size; + c->sb.bucket_size = sb->bucket_size; + c->sb.nr_in_set = sb->nr_in_set; + c->sb.last_mount = sb->last_mount; + c->bucket_bits = ilog2(sb->bucket_size); + c->block_bits = ilog2(sb->block_size); + c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); + + c->btree_pages = bucket_pages(c); + if (c->btree_pages > BTREE_MAX_PAGES) + c->btree_pages = max_t(int, c->btree_pages / 4, + BTREE_MAX_PAGES); + + sema_init(&c->sb_write_mutex, 1); + mutex_init(&c->bucket_lock); + init_waitqueue_head(&c->btree_cache_wait); + init_waitqueue_head(&c->bucket_wait); + sema_init(&c->uuid_write_mutex, 1); + + spin_lock_init(&c->btree_gc_time.lock); + spin_lock_init(&c->btree_split_time.lock); + spin_lock_init(&c->btree_read_time.lock); + + bch_moving_init_cache_set(c); + + INIT_LIST_HEAD(&c->list); + INIT_LIST_HEAD(&c->cached_devs); + INIT_LIST_HEAD(&c->btree_cache); + INIT_LIST_HEAD(&c->btree_cache_freeable); + INIT_LIST_HEAD(&c->btree_cache_freed); + INIT_LIST_HEAD(&c->data_buckets); + + c->search = mempool_create_slab_pool(32, bch_search_cache); + if (!c->search) + goto err; + + iter_size = (sb->bucket_size / sb->block_size + 1) * + sizeof(struct btree_iter_set); + + if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) || + !(c->bio_meta = mempool_create_kmalloc_pool(2, + sizeof(struct bbio) + sizeof(struct bio_vec) * + bucket_pages(c))) || + !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || + !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || + !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || + !(c->moving_gc_wq = create_workqueue("bcache_gc")) || + bch_journal_alloc(c) || + bch_btree_cache_alloc(c) || + bch_open_buckets_alloc(c) || + bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) + goto err; + + c->congested_read_threshold_us = 2000; + c->congested_write_threshold_us = 20000; + c->error_limit = 8 << IO_ERROR_SHIFT; + + return c; +err: + bch_cache_set_unregister(c); + return NULL; +} + +static void run_cache_set(struct cache_set *c) +{ + const char *err = "cannot allocate memory"; + struct cached_dev *dc, *t; + struct cache *ca; + struct closure cl; + unsigned i; + + closure_init_stack(&cl); + + for_each_cache(ca, c, i) + c->nbuckets += ca->sb.nbuckets; + + if (CACHE_SYNC(&c->sb)) { + LIST_HEAD(journal); + struct bkey *k; + struct jset *j; + + err = "cannot allocate memory for journal"; + if (bch_journal_read(c, &journal)) + goto err; + + pr_debug("btree_journal_read() done"); + + err = "no journal entries found"; + if (list_empty(&journal)) + goto err; + + j = &list_entry(journal.prev, struct journal_replay, list)->j; + + err = "IO error reading priorities"; + for_each_cache(ca, c, i) + prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]); + + /* + * If prio_read() fails it'll call cache_set_error and we'll + * tear everything down right away, but if we perhaps checked + * sooner we could avoid journal replay. 
+ */ + + k = &j->btree_root; + + err = "bad btree root"; + if (__bch_btree_ptr_invalid(c, k)) + goto err; + + err = "error reading btree root"; + c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true); + if (IS_ERR_OR_NULL(c->root)) + goto err; + + list_del_init(&c->root->list); + rw_unlock(true, c->root); + + err = uuid_read(c, j, &cl); + if (err) + goto err; + + err = "error in recovery"; + if (bch_btree_check(c)) + goto err; + + bch_journal_mark(c, &journal); + bch_initial_gc_finish(c); + pr_debug("btree_check() done"); + + /* + * bcache_journal_next() can't happen sooner, or + * btree_gc_finish() will give spurious errors about last_gc > + * gc_gen - this is a hack but oh well. + */ + bch_journal_next(&c->journal); + + err = "error starting allocator thread"; + for_each_cache(ca, c, i) + if (bch_cache_allocator_start(ca)) + goto err; + + /* + * First place it's safe to allocate: btree_check() and + * btree_gc_finish() have to run before we have buckets to + * allocate, and bch_bucket_alloc_set() might cause a journal + * entry to be written so bcache_journal_next() has to be called + * first. + * + * If the uuids were in the old format we have to rewrite them + * before the next journal entry is written: + */ + if (j->version < BCACHE_JSET_VERSION_UUID) + __uuid_write(c); + + bch_journal_replay(c, &journal); + } else { + pr_notice("invalidating existing data"); + + for_each_cache(ca, c, i) { + unsigned j; + + ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, + 2, SB_JOURNAL_BUCKETS); + + for (j = 0; j < ca->sb.keys; j++) + ca->sb.d[j] = ca->sb.first_bucket + j; + } + + bch_initial_gc_finish(c); + + err = "error starting allocator thread"; + for_each_cache(ca, c, i) + if (bch_cache_allocator_start(ca)) + goto err; + + mutex_lock(&c->bucket_lock); + for_each_cache(ca, c, i) + bch_prio_write(ca); + mutex_unlock(&c->bucket_lock); + + err = "cannot allocate new UUID bucket"; + if (__uuid_write(c)) + goto err; + + err = "cannot allocate new btree root"; + c->root = bch_btree_node_alloc(c, NULL, 0); + if (IS_ERR_OR_NULL(c->root)) + goto err; + + mutex_lock(&c->root->write_lock); + bkey_copy_key(&c->root->key, &MAX_KEY); + bch_btree_node_write(c->root, &cl); + mutex_unlock(&c->root->write_lock); + + bch_btree_set_root(c->root); + rw_unlock(true, c->root); + + /* + * We don't want to write the first journal entry until + * everything is set up - fortunately journal entries won't be + * written until the SET_CACHE_SYNC() here: + */ + SET_CACHE_SYNC(&c->sb, true); + + bch_journal_next(&c->journal); + bch_journal_meta(c, &cl); + } + + err = "error starting gc thread"; + if (bch_gc_thread_start(c)) + goto err; + + closure_sync(&cl); + c->sb.last_mount = get_seconds(); + bcache_write_super(c); + + list_for_each_entry_safe(dc, t, &uncached_devices, list) + bch_cached_dev_attach(dc, c); + + flash_devs_run(c); + + return; +err: + closure_sync(&cl); + /* XXX: test this, it's broken */ + bch_cache_set_error(c, "%s", err); +} + +static bool can_attach_cache(struct cache *ca, struct cache_set *c) +{ + return ca->sb.block_size == c->sb.block_size && + ca->sb.bucket_size == c->sb.bucket_size && + ca->sb.nr_in_set == c->sb.nr_in_set; +} + +static const char *register_cache_set(struct cache *ca) +{ + char buf[12]; + const char *err = "cannot allocate memory"; + struct cache_set *c; + + list_for_each_entry(c, &bch_cache_sets, list) + if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) { + if (c->cache[ca->sb.nr_this_dev]) + return "duplicate cache set member"; + + if (!can_attach_cache(ca, c)) + return 
"cache sb does not match set"; + + if (!CACHE_SYNC(&ca->sb)) + SET_CACHE_SYNC(&c->sb, false); + + goto found; + } + + c = bch_cache_set_alloc(&ca->sb); + if (!c) + return err; + + err = "error creating kobject"; + if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) || + kobject_add(&c->internal, &c->kobj, "internal")) + goto err; + + if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) + goto err; + + bch_debug_init_cache_set(c); + + list_add(&c->list, &bch_cache_sets); +found: + sprintf(buf, "cache%i", ca->sb.nr_this_dev); + if (sysfs_create_link(&ca->kobj, &c->kobj, "set") || + sysfs_create_link(&c->kobj, &ca->kobj, buf)) + goto err; + + if (ca->sb.seq > c->sb.seq) { + c->sb.version = ca->sb.version; + memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); + c->sb.flags = ca->sb.flags; + c->sb.seq = ca->sb.seq; + pr_debug("set version = %llu", c->sb.version); + } + + ca->set = c; + ca->set->cache[ca->sb.nr_this_dev] = ca; + c->cache_by_alloc[c->caches_loaded++] = ca; + + if (c->caches_loaded == c->sb.nr_in_set) + run_cache_set(c); + + return NULL; +err: + bch_cache_set_unregister(c); + return err; +} + +/* Cache device */ + +void bch_cache_release(struct kobject *kobj) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + unsigned i; + + if (ca->set) + ca->set->cache[ca->sb.nr_this_dev] = NULL; + + bio_split_pool_free(&ca->bio_split_hook); + + free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); + kfree(ca->prio_buckets); + vfree(ca->buckets); + + free_heap(&ca->heap); + free_fifo(&ca->free_inc); + + for (i = 0; i < RESERVE_NR; i++) + free_fifo(&ca->free[i]); + + if (ca->sb_bio.bi_inline_vecs[0].bv_page) + put_page(ca->sb_bio.bi_io_vec[0].bv_page); + + if (!IS_ERR_OR_NULL(ca->bdev)) { + blk_sync_queue(bdev_get_queue(ca->bdev)); + blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + } + + kfree(ca); + module_put(THIS_MODULE); +} + +static int cache_alloc(struct cache_sb *sb, struct cache *ca) +{ + size_t free; + struct bucket *b; + + __module_get(THIS_MODULE); + kobject_init(&ca->kobj, &bch_cache_ktype); + + bio_init(&ca->journal.bio); + ca->journal.bio.bi_max_vecs = 8; + ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; + + free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; + + if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || + !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || + !init_heap(&ca->heap, free << 3, GFP_KERNEL) || + !(ca->buckets = vzalloc(sizeof(struct bucket) * + ca->sb.nbuckets)) || + !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * + 2, GFP_KERNEL)) || + !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || + bio_split_pool_init(&ca->bio_split_hook)) + return -ENOMEM; + + ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); + + for_each_bucket(b, ca) + atomic_set(&b->pin, 0); + + return 0; +} + +static void register_cache(struct cache_sb *sb, struct page *sb_page, + struct block_device *bdev, struct cache *ca) +{ + char name[BDEVNAME_SIZE]; + const char *err = "cannot allocate memory"; + + memcpy(&ca->sb, sb, sizeof(struct cache_sb)); + ca->bdev = bdev; + ca->bdev->bd_holder = ca; + + bio_init(&ca->sb_bio); + ca->sb_bio.bi_max_vecs = 1; + ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; + ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + get_page(sb_page); + + if 
(blk_queue_discard(bdev_get_queue(ca->bdev))) + ca->discard = CACHE_DISCARD(&ca->sb); + + if (cache_alloc(sb, ca) != 0) + goto err; + + err = "error creating kobject"; + if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) + goto err; + + mutex_lock(&bch_register_lock); + err = register_cache_set(ca); + mutex_unlock(&bch_register_lock); + + if (err) + goto err; + + pr_info("registered cache device %s", bdevname(bdev, name)); + return; +err: + pr_notice("error opening %s: %s", bdevname(bdev, name), err); + kobject_put(&ca->kobj); +} + +/* Global interfaces/init */ + +static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, + const char *, size_t); + +kobj_attribute_write(register, register_bcache); +kobj_attribute_write(register_quiet, register_bcache); + +static bool bch_is_open_backing(struct block_device *bdev) { + struct cache_set *c, *tc; + struct cached_dev *dc, *t; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + list_for_each_entry_safe(dc, t, &c->cached_devs, list) + if (dc->bdev == bdev) + return true; + list_for_each_entry_safe(dc, t, &uncached_devices, list) + if (dc->bdev == bdev) + return true; + return false; +} + +static bool bch_is_open_cache(struct block_device *bdev) { + struct cache_set *c, *tc; + struct cache *ca; + unsigned i; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + for_each_cache(ca, c, i) + if (ca->bdev == bdev) + return true; + return false; +} + +static bool bch_is_open(struct block_device *bdev) { + return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); +} + +static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) +{ + ssize_t ret = size; + const char *err = "cannot allocate memory"; + char *path = NULL; + struct cache_sb *sb = NULL; + struct block_device *bdev = NULL; + struct page *sb_page = NULL; + + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || + !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) + goto err; + + err = "failed to open device"; + bdev = blkdev_get_by_path(strim(path), + FMODE_READ|FMODE_WRITE|FMODE_EXCL, + sb); + if (IS_ERR(bdev)) { + if (bdev == ERR_PTR(-EBUSY)) { + bdev = lookup_bdev(strim(path)); + if (!IS_ERR(bdev) && bch_is_open(bdev)) + err = "device already registered"; + else + err = "device busy"; + } + goto err; + } + + err = "failed to set blocksize"; + if (set_blocksize(bdev, 4096)) + goto err_close; + + err = read_super(sb, bdev, &sb_page); + if (err) + goto err_close; + + if (SB_IS_BDEV(sb)) { + struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) + goto err_close; + + mutex_lock(&bch_register_lock); + register_bdev(sb, sb_page, bdev, dc); + mutex_unlock(&bch_register_lock); + } else { + struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto err_close; + + register_cache(sb, sb_page, bdev, ca); + } +out: + if (sb_page) + put_page(sb_page); + kfree(sb); + kfree(path); + module_put(THIS_MODULE); + return ret; + +err_close: + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +err: + if (attr != &ksysfs_register_quiet) + pr_info("error opening %s: %s", path, err); + ret = -EINVAL; + goto out; +} + +static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) +{ + if (code == SYS_DOWN || + code == SYS_HALT || + code == SYS_POWER_OFF) { + DEFINE_WAIT(wait); + unsigned long start = jiffies; + bool stopped = false; + + struct cache_set *c, *tc; + struct cached_dev *dc, *tdc; + + 
mutex_lock(&bch_register_lock); + + if (list_empty(&bch_cache_sets) && + list_empty(&uncached_devices)) + goto out; + + pr_info("Stopping all devices:"); + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + bch_cache_set_stop(c); + + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) + bcache_device_stop(&dc->disk); + + /* What's a condition variable? */ + while (1) { + long timeout = start + 2 * HZ - jiffies; + + stopped = list_empty(&bch_cache_sets) && + list_empty(&uncached_devices); + + if (timeout < 0 || stopped) + break; + + prepare_to_wait(&unregister_wait, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&bch_register_lock); + schedule_timeout(timeout); + mutex_lock(&bch_register_lock); + } + + finish_wait(&unregister_wait, &wait); + + if (stopped) + pr_info("All devices stopped"); + else + pr_notice("Timeout waiting for devices to be closed"); +out: + mutex_unlock(&bch_register_lock); + } + + return NOTIFY_DONE; +} + +static struct notifier_block reboot = { + .notifier_call = bcache_reboot, + .priority = INT_MAX, /* before any real devices */ +}; + +static void bcache_exit(void) +{ + bch_debug_exit(); + bch_request_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) + destroy_workqueue(bcache_wq); + if (bcache_major) + unregister_blkdev(bcache_major, "bcache"); + unregister_reboot_notifier(&reboot); +} + +static int __init bcache_init(void) +{ + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, + NULL + }; + + mutex_init(&bch_register_lock); + init_waitqueue_head(&unregister_wait); + register_reboot_notifier(&reboot); + closure_debug_init(); + + bcache_major = register_blkdev(0, "bcache"); + if (bcache_major < 0) + return bcache_major; + + if (!(bcache_wq = create_workqueue("bcache")) || + !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || + sysfs_create_files(bcache_kobj, files) || + bch_request_init() || + bch_debug_init(bcache_kobj)) + goto err; + + return 0; +err: + bcache_exit(); + return -ENOMEM; +} + +module_exit(bcache_exit); +module_init(bcache_init); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c new file mode 100644 index 00000000000..b3ff57d61dd --- /dev/null +++ b/drivers/md/bcache/sysfs.c @@ -0,0 +1,898 @@ +/* + * bcache sysfs interfaces + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcache.h" +#include "sysfs.h" +#include "btree.h" +#include "request.h" +#include "writeback.h" + +#include <linux/blkdev.h> +#include <linux/sort.h> + +static const char * const cache_replacement_policies[] = { + "lru", + "fifo", + "random", + NULL +}; + +static const char * const error_actions[] = { + "unregister", + "panic", + NULL +}; + +write_attribute(attach); +write_attribute(detach); +write_attribute(unregister); +write_attribute(stop); +write_attribute(clear_stats); +write_attribute(trigger_gc); +write_attribute(prune_cache); +write_attribute(flash_vol_create); + +read_attribute(bucket_size); +read_attribute(block_size); +read_attribute(nbuckets); +read_attribute(tree_depth); +read_attribute(root_usage_percent); +read_attribute(priority_stats); +read_attribute(btree_cache_size); +read_attribute(btree_cache_max_chain); +read_attribute(cache_available_percent); +read_attribute(written); +read_attribute(btree_written); +read_attribute(metadata_written); +read_attribute(active_journal_entries); + +sysfs_time_stats_attribute(btree_gc, sec, ms); +sysfs_time_stats_attribute(btree_split, sec, us); +sysfs_time_stats_attribute(btree_sort, ms, us); +sysfs_time_stats_attribute(btree_read, ms, us); + +read_attribute(btree_nodes); +read_attribute(btree_used_percent); +read_attribute(average_key_size); +read_attribute(dirty_data); +read_attribute(bset_tree_stats); + +read_attribute(state); +read_attribute(cache_read_races); +read_attribute(writeback_keys_done); +read_attribute(writeback_keys_failed); +read_attribute(io_errors); +read_attribute(congested); +rw_attribute(congested_read_threshold_us); +rw_attribute(congested_write_threshold_us); + +rw_attribute(sequential_cutoff); +rw_attribute(data_csum); +rw_attribute(cache_mode); +rw_attribute(writeback_metadata); +rw_attribute(writeback_running); +rw_attribute(writeback_percent); +rw_attribute(writeback_delay); +rw_attribute(writeback_rate); + +rw_attribute(writeback_rate_update_seconds); +rw_attribute(writeback_rate_d_term); +rw_attribute(writeback_rate_p_term_inverse); +read_attribute(writeback_rate_debug); + +read_attribute(stripe_size); +read_attribute(partial_stripes_expensive); + +rw_attribute(synchronous); +rw_attribute(journal_delay_ms); +rw_attribute(discard); +rw_attribute(running); +rw_attribute(label); +rw_attribute(readahead); +rw_attribute(errors); +rw_attribute(io_error_limit); +rw_attribute(io_error_halflife); +rw_attribute(verify); +rw_attribute(bypass_torture_test); +rw_attribute(key_merging_disabled); +rw_attribute(gc_always_rewrite); +rw_attribute(expensive_debug_checks); +rw_attribute(cache_replacement_policy); +rw_attribute(btree_shrinker_disabled); +rw_attribute(copy_gc_enabled); +rw_attribute(size); + +SHOW(__bch_cached_dev) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; + +#define var(stat) (dc->stat) + + if (attr == &sysfs_cache_mode) + return bch_snprint_string_list(buf, PAGE_SIZE, + bch_cache_modes + 1, + BDEV_CACHE_MODE(&dc->sb)); + + sysfs_printf(data_csum, "%i", dc->disk.data_csum); + var_printf(verify, "%i"); + var_printf(bypass_torture_test, "%i"); + var_printf(writeback_metadata, "%i"); + var_printf(writeback_running, "%i"); + var_print(writeback_delay); + var_print(writeback_percent); + sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); + + var_print(writeback_rate_update_seconds); + var_print(writeback_rate_d_term); + var_print(writeback_rate_p_term_inverse); + + if (attr == 
&sysfs_writeback_rate_debug) { + char rate[20]; + char dirty[20]; + char target[20]; + char proportional[20]; + char derivative[20]; + char change[20]; + s64 next_io; + + bch_hprint(rate, dc->writeback_rate.rate << 9); + bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); + bch_hprint(target, dc->writeback_rate_target << 9); + bch_hprint(proportional,dc->writeback_rate_proportional << 9); + bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(change, dc->writeback_rate_change << 9); + + next_io = div64_s64(dc->writeback_rate.next - local_clock(), + NSEC_PER_MSEC); + + return sprintf(buf, + "rate:\t\t%s/sec\n" + "dirty:\t\t%s\n" + "target:\t\t%s\n" + "proportional:\t%s\n" + "derivative:\t%s\n" + "change:\t\t%s/sec\n" + "next io:\t%llims\n", + rate, dirty, target, proportional, + derivative, change, next_io); + } + + sysfs_hprint(dirty_data, + bcache_dev_sectors_dirty(&dc->disk) << 9); + + sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); + var_printf(partial_stripes_expensive, "%u"); + + var_hprint(sequential_cutoff); + var_hprint(readahead); + + sysfs_print(running, atomic_read(&dc->running)); + sysfs_print(state, states[BDEV_STATE(&dc->sb)]); + + if (attr == &sysfs_label) { + memcpy(buf, dc->sb.label, SB_LABEL_SIZE); + buf[SB_LABEL_SIZE + 1] = '\0'; + strcat(buf, "\n"); + return strlen(buf); + } + +#undef var + return 0; +} +SHOW_LOCKED(bch_cached_dev) + +STORE(__cached_dev) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + unsigned v = size; + struct cache_set *c; + struct kobj_uevent_env *env; + +#define d_strtoul(var) sysfs_strtoul(var, dc->var) +#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) +#define d_strtoi_h(var) sysfs_hatoi(var, dc->var) + + sysfs_strtoul(data_csum, dc->disk.data_csum); + d_strtoul(verify); + d_strtoul(bypass_torture_test); + d_strtoul(writeback_metadata); + d_strtoul(writeback_running); + d_strtoul(writeback_delay); + + sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); + + sysfs_strtoul_clamp(writeback_rate, + dc->writeback_rate.rate, 1, INT_MAX); + + d_strtoul_nonzero(writeback_rate_update_seconds); + d_strtoul(writeback_rate_d_term); + d_strtoul_nonzero(writeback_rate_p_term_inverse); + + d_strtoi_h(sequential_cutoff); + d_strtoi_h(readahead); + + if (attr == &sysfs_clear_stats) + bch_cache_accounting_clear(&dc->accounting); + + if (attr == &sysfs_running && + strtoul_or_return(buf)) + bch_cached_dev_run(dc); + + if (attr == &sysfs_cache_mode) { + ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); + + if (v < 0) + return v; + + if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) { + SET_BDEV_CACHE_MODE(&dc->sb, v); + bch_write_bdev_super(dc, NULL); + } + } + + if (attr == &sysfs_label) { + if (size > SB_LABEL_SIZE) + return -EINVAL; + memcpy(dc->sb.label, buf, size); + if (size < SB_LABEL_SIZE) + dc->sb.label[size] = '\0'; + if (size && dc->sb.label[size - 1] == '\n') + dc->sb.label[size - 1] = '\0'; + bch_write_bdev_super(dc, NULL); + if (dc->disk.c) { + memcpy(dc->disk.c->uuids[dc->disk.id].label, + buf, SB_LABEL_SIZE); + bch_uuid_write(dc->disk.c); + } + env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + add_uevent_var(env, "DRIVER=bcache"); + add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid), + add_uevent_var(env, "CACHED_LABEL=%s", buf); + kobject_uevent_env( + &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp); + kfree(env); + } + + if (attr == &sysfs_attach) { + if (bch_parse_uuid(buf, 
dc->sb.set_uuid) < 16) + return -EINVAL; + + list_for_each_entry(c, &bch_cache_sets, list) { + v = bch_cached_dev_attach(dc, c); + if (!v) + return size; + } + + pr_err("Can't attach %s: cache set not found", buf); + size = v; + } + + if (attr == &sysfs_detach && dc->disk.c) + bch_cached_dev_detach(dc); + + if (attr == &sysfs_stop) + bcache_device_stop(&dc->disk); + + return size; +} + +STORE(bch_cached_dev) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + + mutex_lock(&bch_register_lock); + size = __cached_dev_store(kobj, attr, buf, size); + + if (attr == &sysfs_writeback_running) + bch_writeback_queue(dc); + + if (attr == &sysfs_writeback_percent) + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + mutex_unlock(&bch_register_lock); + return size; +} + +static struct attribute *bch_cached_dev_files[] = { + &sysfs_attach, + &sysfs_detach, + &sysfs_stop, +#if 0 + &sysfs_data_csum, +#endif + &sysfs_cache_mode, + &sysfs_writeback_metadata, + &sysfs_writeback_running, + &sysfs_writeback_delay, + &sysfs_writeback_percent, + &sysfs_writeback_rate, + &sysfs_writeback_rate_update_seconds, + &sysfs_writeback_rate_d_term, + &sysfs_writeback_rate_p_term_inverse, + &sysfs_writeback_rate_debug, + &sysfs_dirty_data, + &sysfs_stripe_size, + &sysfs_partial_stripes_expensive, + &sysfs_sequential_cutoff, + &sysfs_clear_stats, + &sysfs_running, + &sysfs_state, + &sysfs_label, + &sysfs_readahead, +#ifdef CONFIG_BCACHE_DEBUG + &sysfs_verify, + &sysfs_bypass_torture_test, +#endif + NULL +}; +KTYPE(bch_cached_dev); + +SHOW(bch_flash_dev) +{ + struct bcache_device *d = container_of(kobj, struct bcache_device, + kobj); + struct uuid_entry *u = &d->c->uuids[d->id]; + + sysfs_printf(data_csum, "%i", d->data_csum); + sysfs_hprint(size, u->sectors << 9); + + if (attr == &sysfs_label) { + memcpy(buf, u->label, SB_LABEL_SIZE); + buf[SB_LABEL_SIZE + 1] = '\0'; + strcat(buf, "\n"); + return strlen(buf); + } + + return 0; +} + +STORE(__bch_flash_dev) +{ + struct bcache_device *d = container_of(kobj, struct bcache_device, + kobj); + struct uuid_entry *u = &d->c->uuids[d->id]; + + sysfs_strtoul(data_csum, d->data_csum); + + if (attr == &sysfs_size) { + uint64_t v; + strtoi_h_or_return(buf, v); + + u->sectors = v >> 9; + bch_uuid_write(d->c); + set_capacity(d->disk, u->sectors); + } + + if (attr == &sysfs_label) { + memcpy(u->label, buf, SB_LABEL_SIZE); + bch_uuid_write(d->c); + } + + if (attr == &sysfs_unregister) { + set_bit(BCACHE_DEV_DETACHING, &d->flags); + bcache_device_stop(d); + } + + return size; +} +STORE_LOCKED(bch_flash_dev) + +static struct attribute *bch_flash_dev_files[] = { + &sysfs_unregister, +#if 0 + &sysfs_data_csum, +#endif + &sysfs_label, + &sysfs_size, + NULL +}; +KTYPE(bch_flash_dev); + +struct bset_stats_op { + struct btree_op op; + size_t nodes; + struct bset_stats stats; +}; + +static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b) +{ + struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op); + + op->nodes++; + bch_btree_keys_stats(&b->keys, &op->stats); + + return MAP_CONTINUE; +} + +static int bch_bset_print_stats(struct cache_set *c, char *buf) +{ + struct bset_stats_op op; + int ret; + + memset(&op, 0, sizeof(op)); + bch_btree_op_init(&op.op, -1); + + ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats); + if (ret < 0) + return ret; + + return snprintf(buf, PAGE_SIZE, + "btree nodes: %zu\n" + "written sets: %zu\n" + "unwritten sets: %zu\n" + "written key bytes: 
%zu\n" + "unwritten key bytes: %zu\n" + "floats: %zu\n" + "failed: %zu\n", + op.nodes, + op.stats.sets_written, op.stats.sets_unwritten, + op.stats.bytes_written, op.stats.bytes_unwritten, + op.stats.floats, op.stats.failed); +} + +static unsigned bch_root_usage(struct cache_set *c) +{ + unsigned bytes = 0; + struct bkey *k; + struct btree *b; + struct btree_iter iter; + + goto lock_root; + + do { + rw_unlock(false, b); +lock_root: + b = c->root; + rw_lock(false, b, b->level); + } while (b != c->root); + + for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) + bytes += bkey_bytes(k); + + rw_unlock(false, b); + + return (bytes * 100) / btree_bytes(c); +} + +static size_t bch_cache_size(struct cache_set *c) +{ + size_t ret = 0; + struct btree *b; + + mutex_lock(&c->bucket_lock); + list_for_each_entry(b, &c->btree_cache, list) + ret += 1 << (b->keys.page_order + PAGE_SHIFT); + + mutex_unlock(&c->bucket_lock); + return ret; +} + +static unsigned bch_cache_max_chain(struct cache_set *c) +{ + unsigned ret = 0; + struct hlist_head *h; + + mutex_lock(&c->bucket_lock); + + for (h = c->bucket_hash; + h < c->bucket_hash + (1 << BUCKET_HASH_BITS); + h++) { + unsigned i = 0; + struct hlist_node *p; + + hlist_for_each(p, h) + i++; + + ret = max(ret, i); + } + + mutex_unlock(&c->bucket_lock); + return ret; +} + +static unsigned bch_btree_used(struct cache_set *c) +{ + return div64_u64(c->gc_stats.key_bytes * 100, + (c->gc_stats.nodes ?: 1) * btree_bytes(c)); +} + +static unsigned bch_average_key_size(struct cache_set *c) +{ + return c->gc_stats.nkeys + ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) + : 0; +} + +SHOW(__bch_cache_set) +{ + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + + sysfs_print(synchronous, CACHE_SYNC(&c->sb)); + sysfs_print(journal_delay_ms, c->journal_delay_ms); + sysfs_hprint(bucket_size, bucket_bytes(c)); + sysfs_hprint(block_size, block_bytes(c)); + sysfs_print(tree_depth, c->root->level); + sysfs_print(root_usage_percent, bch_root_usage(c)); + + sysfs_hprint(btree_cache_size, bch_cache_size(c)); + sysfs_print(btree_cache_max_chain, bch_cache_max_chain(c)); + sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); + + sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); + sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); + sysfs_print_time_stats(&c->sort.time, btree_sort, ms, us); + sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); + + sysfs_print(btree_used_percent, bch_btree_used(c)); + sysfs_print(btree_nodes, c->gc_stats.nodes); + sysfs_hprint(average_key_size, bch_average_key_size(c)); + + sysfs_print(cache_read_races, + atomic_long_read(&c->cache_read_races)); + + sysfs_print(writeback_keys_done, + atomic_long_read(&c->writeback_keys_done)); + sysfs_print(writeback_keys_failed, + atomic_long_read(&c->writeback_keys_failed)); + + if (attr == &sysfs_errors) + return bch_snprint_string_list(buf, PAGE_SIZE, error_actions, + c->on_error); + + /* See count_io_errors for why 88 */ + sysfs_print(io_error_halflife, c->error_decay * 88); + sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); + + sysfs_hprint(congested, + ((uint64_t) bch_get_congested(c)) << 9); + sysfs_print(congested_read_threshold_us, + c->congested_read_threshold_us); + sysfs_print(congested_write_threshold_us, + c->congested_write_threshold_us); + + sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); + sysfs_printf(verify, "%i", c->verify); + sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); + 
sysfs_printf(expensive_debug_checks, + "%i", c->expensive_debug_checks); + sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); + sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + + if (attr == &sysfs_bset_tree_stats) + return bch_bset_print_stats(c, buf); + + return 0; +} +SHOW_LOCKED(bch_cache_set) + +STORE(__bch_cache_set) +{ + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + + if (attr == &sysfs_unregister) + bch_cache_set_unregister(c); + + if (attr == &sysfs_stop) + bch_cache_set_stop(c); + + if (attr == &sysfs_synchronous) { + bool sync = strtoul_or_return(buf); + + if (sync != CACHE_SYNC(&c->sb)) { + SET_CACHE_SYNC(&c->sb, sync); + bcache_write_super(c); + } + } + + if (attr == &sysfs_flash_vol_create) { + int r; + uint64_t v; + strtoi_h_or_return(buf, v); + + r = bch_flash_dev_create(c, v); + if (r) + return r; + } + + if (attr == &sysfs_clear_stats) { + atomic_long_set(&c->writeback_keys_done, 0); + atomic_long_set(&c->writeback_keys_failed, 0); + + memset(&c->gc_stats, 0, sizeof(struct gc_stat)); + bch_cache_accounting_clear(&c->accounting); + } + + if (attr == &sysfs_trigger_gc) + wake_up_gc(c); + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->shrink.scan_objects(&c->shrink, &sc); + } + + sysfs_strtoul(congested_read_threshold_us, + c->congested_read_threshold_us); + sysfs_strtoul(congested_write_threshold_us, + c->congested_write_threshold_us); + + if (attr == &sysfs_errors) { + ssize_t v = bch_read_string_list(buf, error_actions); + + if (v < 0) + return v; + + c->on_error = v; + } + + if (attr == &sysfs_io_error_limit) + c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; + + /* See count_io_errors() for why 88 */ + if (attr == &sysfs_io_error_halflife) + c->error_decay = strtoul_or_return(buf) / 88; + + sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); + sysfs_strtoul(verify, c->verify); + sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); + sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); + sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); + sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); + + return size; +} +STORE_LOCKED(bch_cache_set) + +SHOW(bch_cache_set_internal) +{ + struct cache_set *c = container_of(kobj, struct cache_set, internal); + return bch_cache_set_show(&c->kobj, attr, buf); +} + +STORE(bch_cache_set_internal) +{ + struct cache_set *c = container_of(kobj, struct cache_set, internal); + return bch_cache_set_store(&c->kobj, attr, buf, size); +} + +static void bch_cache_set_internal_release(struct kobject *k) +{ +} + +static struct attribute *bch_cache_set_files[] = { + &sysfs_unregister, + &sysfs_stop, + &sysfs_synchronous, + &sysfs_journal_delay_ms, + &sysfs_flash_vol_create, + + &sysfs_bucket_size, + &sysfs_block_size, + &sysfs_tree_depth, + &sysfs_root_usage_percent, + &sysfs_btree_cache_size, + &sysfs_cache_available_percent, + + &sysfs_average_key_size, + + &sysfs_errors, + &sysfs_io_error_limit, + &sysfs_io_error_halflife, + &sysfs_congested, + &sysfs_congested_read_threshold_us, + &sysfs_congested_write_threshold_us, + &sysfs_clear_stats, + NULL +}; +KTYPE(bch_cache_set); + +static struct attribute *bch_cache_set_internal_files[] = { + &sysfs_active_journal_entries, + + sysfs_time_stats_attribute_list(btree_gc, sec, ms) + 
sysfs_time_stats_attribute_list(btree_split, sec, us) + sysfs_time_stats_attribute_list(btree_sort, ms, us) + sysfs_time_stats_attribute_list(btree_read, ms, us) + + &sysfs_btree_nodes, + &sysfs_btree_used_percent, + &sysfs_btree_cache_max_chain, + + &sysfs_bset_tree_stats, + &sysfs_cache_read_races, + &sysfs_writeback_keys_done, + &sysfs_writeback_keys_failed, + + &sysfs_trigger_gc, + &sysfs_prune_cache, +#ifdef CONFIG_BCACHE_DEBUG + &sysfs_verify, + &sysfs_key_merging_disabled, + &sysfs_expensive_debug_checks, +#endif + &sysfs_gc_always_rewrite, + &sysfs_btree_shrinker_disabled, + &sysfs_copy_gc_enabled, + NULL +}; +KTYPE(bch_cache_set_internal); + +SHOW(__bch_cache) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + + sysfs_hprint(bucket_size, bucket_bytes(ca)); + sysfs_hprint(block_size, block_bytes(ca)); + sysfs_print(nbuckets, ca->sb.nbuckets); + sysfs_print(discard, ca->discard); + sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); + sysfs_hprint(btree_written, + atomic_long_read(&ca->btree_sectors_written) << 9); + sysfs_hprint(metadata_written, + (atomic_long_read(&ca->meta_sectors_written) + + atomic_long_read(&ca->btree_sectors_written)) << 9); + + sysfs_print(io_errors, + atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); + + if (attr == &sysfs_cache_replacement_policy) + return bch_snprint_string_list(buf, PAGE_SIZE, + cache_replacement_policies, + CACHE_REPLACEMENT(&ca->sb)); + + if (attr == &sysfs_priority_stats) { + int cmp(const void *l, const void *r) + { return *((uint16_t *) r) - *((uint16_t *) l); } + + struct bucket *b; + size_t n = ca->sb.nbuckets, i; + size_t unused = 0, available = 0, dirty = 0, meta = 0; + uint64_t sum = 0; + /* Compute 31 quantiles */ + uint16_t q[31], *p, *cached; + ssize_t ret; + + cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); + if (!p) + return -ENOMEM; + + mutex_lock(&ca->set->bucket_lock); + for_each_bucket(b, ca) { + if (!GC_SECTORS_USED(b)) + unused++; + if (GC_MARK(b) == GC_MARK_RECLAIMABLE) + available++; + if (GC_MARK(b) == GC_MARK_DIRTY) + dirty++; + if (GC_MARK(b) == GC_MARK_METADATA) + meta++; + } + + for (i = ca->sb.first_bucket; i < n; i++) + p[i] = ca->buckets[i].prio; + mutex_unlock(&ca->set->bucket_lock); + + sort(p, n, sizeof(uint16_t), cmp, NULL); + + while (n && + !cached[n - 1]) + --n; + + unused = ca->sb.nbuckets - n; + + while (cached < p + n && + *cached == BTREE_PRIO) + cached++, n--; + + for (i = 0; i < n; i++) + sum += INITIAL_PRIO - cached[i]; + + if (n) + do_div(sum, n); + + for (i = 0; i < ARRAY_SIZE(q); i++) + q[i] = INITIAL_PRIO - cached[n * (i + 1) / + (ARRAY_SIZE(q) + 1)]; + + vfree(p); + + ret = scnprintf(buf, PAGE_SIZE, + "Unused: %zu%%\n" + "Clean: %zu%%\n" + "Dirty: %zu%%\n" + "Metadata: %zu%%\n" + "Average: %llu\n" + "Sectors per Q: %zu\n" + "Quantiles: [", + unused * 100 / (size_t) ca->sb.nbuckets, + available * 100 / (size_t) ca->sb.nbuckets, + dirty * 100 / (size_t) ca->sb.nbuckets, + meta * 100 / (size_t) ca->sb.nbuckets, sum, + n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); + + for (i = 0; i < ARRAY_SIZE(q); i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "%u ", q[i]); + ret--; + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); + + return ret; + } + + return 0; +} +SHOW_LOCKED(bch_cache) + +STORE(__bch_cache) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + + if (blk_queue_discard(bdev_get_queue(ca->bdev))) + ca->discard = v; + + if (v != CACHE_DISCARD(&ca->sb)) { + 
SET_CACHE_DISCARD(&ca->sb, v); + bcache_write_super(ca->set); + } + } + + if (attr == &sysfs_cache_replacement_policy) { + ssize_t v = bch_read_string_list(buf, cache_replacement_policies); + + if (v < 0) + return v; + + if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) { + mutex_lock(&ca->set->bucket_lock); + SET_CACHE_REPLACEMENT(&ca->sb, v); + mutex_unlock(&ca->set->bucket_lock); + + bcache_write_super(ca->set); + } + } + + if (attr == &sysfs_clear_stats) { + atomic_long_set(&ca->sectors_written, 0); + atomic_long_set(&ca->btree_sectors_written, 0); + atomic_long_set(&ca->meta_sectors_written, 0); + atomic_set(&ca->io_count, 0); + atomic_set(&ca->io_errors, 0); + } + + return size; +} +STORE_LOCKED(bch_cache) + +static struct attribute *bch_cache_files[] = { + &sysfs_bucket_size, + &sysfs_block_size, + &sysfs_nbuckets, + &sysfs_priority_stats, + &sysfs_discard, + &sysfs_written, + &sysfs_btree_written, + &sysfs_metadata_written, + &sysfs_io_errors, + &sysfs_clear_stats, + &sysfs_cache_replacement_policy, + NULL +}; +KTYPE(bch_cache); diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h new file mode 100644 index 00000000000..0526fe92a68 --- /dev/null +++ b/drivers/md/bcache/sysfs.h @@ -0,0 +1,110 @@ +#ifndef _BCACHE_SYSFS_H_ +#define _BCACHE_SYSFS_H_ + +#define KTYPE(type) \ +struct kobj_type type ## _ktype = { \ + .release = type ## _release, \ + .sysfs_ops = &((const struct sysfs_ops) { \ + .show = type ## _show, \ + .store = type ## _store \ + }), \ + .default_attrs = type ## _files \ +} + +#define SHOW(fn) \ +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ + +#define STORE(fn) \ +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) \ + +#define SHOW_LOCKED(fn) \ +SHOW(fn) \ +{ \ + ssize_t ret; \ + mutex_lock(&bch_register_lock); \ + ret = __ ## fn ## _show(kobj, attr, buf); \ + mutex_unlock(&bch_register_lock); \ + return ret; \ +} + +#define STORE_LOCKED(fn) \ +STORE(fn) \ +{ \ + ssize_t ret; \ + mutex_lock(&bch_register_lock); \ + ret = __ ## fn ## _store(kobj, attr, buf, size); \ + mutex_unlock(&bch_register_lock); \ + return ret; \ +} + +#define __sysfs_attribute(_name, _mode) \ + static struct attribute sysfs_##_name = \ + { .name = #_name, .mode = _mode } + +#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) +#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) +#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) + +#define sysfs_printf(file, fmt, ...) 
\ +do { \ + if (attr == &sysfs_ ## file) \ + return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \ +} while (0) + +#define sysfs_print(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return snprint(buf, PAGE_SIZE, var); \ +} while (0) + +#define sysfs_hprint(file, val) \ +do { \ + if (attr == &sysfs_ ## file) { \ + ssize_t ret = bch_hprint(buf, val); \ + strcat(buf, "\n"); \ + return ret + 1; \ + } \ +} while (0) + +#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) +#define var_print(_var) sysfs_print(_var, var(_var)) +#define var_hprint(_var) sysfs_hprint(_var, var(_var)) + +#define sysfs_strtoul(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe(buf, var) ?: (ssize_t) size; \ +} while (0) + +#define sysfs_strtoul_clamp(file, var, min, max) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe_clamp(buf, var, min, max) \ + ?: (ssize_t) size; \ +} while (0) + +#define strtoul_or_return(cp) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (_r) \ + return _r; \ + _v; \ +}) + +#define strtoi_h_or_return(cp, v) \ +do { \ + int _r = strtoi_h(cp, &v); \ + if (_r) \ + return _r; \ +} while (0) + +#define sysfs_hatoi(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoi_h(buf, &var) ?: (ssize_t) size; \ +} while (0) + +#endif /* _BCACHE_SYSFS_H_ */ diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c new file mode 100644 index 00000000000..b7820b0d262 --- /dev/null +++ b/drivers/md/bcache/trace.c @@ -0,0 +1,52 @@ +#include "bcache.h" +#include "btree.h" + +#include <linux/blktrace_api.h> +#include <linux/module.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/bcache.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision); diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c new file mode 100644 index 00000000000..db3ae4c2b22 --- /dev/null +++ b/drivers/md/bcache/util.c @@ 
-0,0 +1,381 @@ +/* + * random utiility code, for bcache but in theory not specific to bcache + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/types.h> + +#include "util.h" + +#define simple_strtoint(c, end, base) simple_strtol(c, end, base) +#define simple_strtouint(c, end, base) simple_strtoul(c, end, base) + +#define STRTO_H(name, type) \ +int bch_ ## name ## _h(const char *cp, type *res) \ +{ \ + int u = 0; \ + char *e; \ + type i = simple_ ## name(cp, &e, 10); \ + \ + switch (tolower(*e)) { \ + default: \ + return -EINVAL; \ + case 'y': \ + case 'z': \ + u++; \ + case 'e': \ + u++; \ + case 'p': \ + u++; \ + case 't': \ + u++; \ + case 'g': \ + u++; \ + case 'm': \ + u++; \ + case 'k': \ + u++; \ + if (e++ == cp) \ + return -EINVAL; \ + case '\n': \ + case '\0': \ + if (*e == '\n') \ + e++; \ + } \ + \ + if (*e) \ + return -EINVAL; \ + \ + while (u--) { \ + if ((type) ~0 > 0 && \ + (type) ~0 / 1024 <= i) \ + return -EINVAL; \ + if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ + (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ + return -EINVAL; \ + i *= 1024; \ + } \ + \ + *res = i; \ + return 0; \ +} \ + +STRTO_H(strtoint, int) +STRTO_H(strtouint, unsigned int) +STRTO_H(strtoll, long long) +STRTO_H(strtoull, unsigned long long) + +ssize_t bch_hprint(char *buf, int64_t v) +{ + static const char units[] = "?kMGTPEZY"; + char dec[4] = ""; + int u, t = 0; + + for (u = 0; v >= 1024 || v <= -1024; u++) { + t = v & ~(~0 << 10); + v >>= 10; + } + + if (!u) + return sprintf(buf, "%llu", v); + + if (v < 100 && v > -100) + snprintf(dec, sizeof(dec), ".%i", t / 100); + + return sprintf(buf, "%lli%s%c", v, dec, units[u]); +} + +ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], + size_t selected) +{ + char *out = buf; + size_t i; + + for (i = 0; list[i]; i++) + out += snprintf(out, buf + size - out, + i == selected ? "[%s] " : "%s ", list[i]); + + out[-1] = '\n'; + return out - buf; +} + +ssize_t bch_read_string_list(const char *buf, const char * const list[]) +{ + size_t i; + char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); + if (!d) + return -ENOMEM; + + s = strim(d); + + for (i = 0; list[i]; i++) + if (!strcmp(list[i], s)) + break; + + kfree(d); + + if (!list[i]) + return -EINVAL; + + return i; +} + +bool bch_is_zero(const char *p, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + if (p[i]) + return false; + return true; +} + +int bch_parse_uuid(const char *s, char *uuid) +{ + size_t i, j, x; + memset(uuid, 0, 16); + + for (i = 0, j = 0; + i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32; + i++) { + x = s[i] | 32; + + switch (x) { + case '0'...'9': + x -= '0'; + break; + case 'a'...'f': + x -= 'a' - 10; + break; + default: + continue; + } + + if (!(j & 1)) + x <<= 4; + uuid[j++ >> 1] |= x; + } + return i; +} + +void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) +{ + uint64_t now, duration, last; + + spin_lock(&stats->lock); + + now = local_clock(); + duration = time_after64(now, start_time) + ? now - start_time : 0; + last = time_after64(now, stats->last) + ? 
now - stats->last : 0; + + stats->max_duration = max(stats->max_duration, duration); + + if (stats->last) { + ewma_add(stats->average_duration, duration, 8, 8); + + if (stats->average_frequency) + ewma_add(stats->average_frequency, last, 8, 8); + else + stats->average_frequency = last << 8; + } else { + stats->average_duration = duration << 8; + } + + stats->last = now ?: 1; + + spin_unlock(&stats->lock); +} + +/** + * bch_next_delay() - increment @d by the amount of work done, and return how + * long to delay until the next time to do some work. + * + * @d - the struct bch_ratelimit to update + * @done - the amount of work done, in arbitrary units + * + * Returns the amount of time to delay by, in jiffies + */ +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) +{ + uint64_t now = local_clock(); + + d->next += div_u64(done * NSEC_PER_SEC, d->rate); + + if (time_before64(now + NSEC_PER_SEC, d->next)) + d->next = now + NSEC_PER_SEC; + + if (time_after64(now - NSEC_PER_SEC * 2, d->next)) + d->next = now - NSEC_PER_SEC * 2; + + return time_after64(d->next, now) + ? div_u64(d->next - now, NSEC_PER_SEC / HZ) + : 0; +} + +void bch_bio_map(struct bio *bio, void *base) +{ + size_t size = bio->bi_iter.bi_size; + struct bio_vec *bv = bio->bi_io_vec; + + BUG_ON(!bio->bi_iter.bi_size); + BUG_ON(bio->bi_vcnt); + + bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; + goto start; + + for (; size; bio->bi_vcnt++, bv++) { + bv->bv_offset = 0; +start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, + size); + if (base) { + bv->bv_page = is_vmalloc_addr(base) + ? vmalloc_to_page(base) + : virt_to_page(base); + + base += bv->bv_len; + } + + size -= bv->bv_len; + } +} + +/* + * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any + * use permitted, subject to terms of PostgreSQL license; see.) + + * If we have a 64-bit integer type, then a 64-bit CRC looks just like the + * usual sort of implementation. (See Ross Williams' excellent introduction + * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from + * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) + * If we have no working 64-bit type, then fake it with two 32-bit registers. + * + * The present implementation is a normal (not "reflected", in Williams' + * terms) 64-bit CRC, using initial all-ones register contents and a final + * bit inversion. 
The chosen polynomial is borrowed from the DLT1 spec + * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): + * + * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x + 1 +*/ + +static const uint64_t crc_table[256] = { + 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, + 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, + 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, + 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, + 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, + 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, + 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, + 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, + 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, + 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, + 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, + 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, + 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, + 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, + 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, + 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, + 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, + 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, + 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, + 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, + 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, + 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, + 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, + 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, + 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, + 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, + 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, + 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, + 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, + 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, + 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, + 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, + 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, + 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, + 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, + 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, + 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, + 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, + 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, + 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, + 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, + 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, + 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, + 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, + 
0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, + 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, + 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, + 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, + 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, + 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, + 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, + 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, + 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, + 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, + 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, + 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, + 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, + 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, + 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, + 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, + 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, + 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, + 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, + 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, + 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, + 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, + 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, + 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, + 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, + 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, + 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, + 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, + 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, + 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, + 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, + 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, + 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, + 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, + 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, + 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, + 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, + 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, + 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, + 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, + 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, + 0x9AFCE626CE85B507ULL, +}; + +uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len) +{ + const unsigned char *data = _data; + + while (len--) { + int i = ((int) (crc >> 56) ^ *data++) & 0xFF; + crc = crc_table[i] ^ (crc << 8); + } + + return crc; +} + +uint64_t bch_crc64(const void *data, size_t len) +{ + uint64_t crc = 0xffffffffffffffffULL; + + crc = bch_crc64_update(crc, data, len); + + return crc ^ 0xffffffffffffffffULL; +} diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h new file mode 100644 index 00000000000..ac7d0d1f70d --- /dev/null +++ b/drivers/md/bcache/util.h @@ -0,0 +1,588 @@ + +#ifndef 
_BCACHE_UTIL_H +#define _BCACHE_UTIL_H + +#include <linux/blkdev.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/llist.h> +#include <linux/ratelimit.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> + +#include "closure.h" + +#define PAGE_SECTORS (PAGE_SIZE / 512) + +struct closure; + +#ifdef CONFIG_BCACHE_DEBUG + +#define EBUG_ON(cond) BUG_ON(cond) +#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) +#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) + +#else /* DEBUG */ + +#define EBUG_ON(cond) do { if (cond); } while (0) +#define atomic_dec_bug(v) atomic_dec(v) +#define atomic_inc_bug(v, i) atomic_inc(v) + +#endif + +#define DECLARE_HEAP(type, name) \ + struct { \ + size_t size, used; \ + type *data; \ + } name + +#define init_heap(heap, _size, gfp) \ +({ \ + size_t _bytes; \ + (heap)->used = 0; \ + (heap)->size = (_size); \ + _bytes = (heap)->size * sizeof(*(heap)->data); \ + (heap)->data = NULL; \ + if (_bytes < KMALLOC_MAX_SIZE) \ + (heap)->data = kmalloc(_bytes, (gfp)); \ + if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \ + (heap)->data = vmalloc(_bytes); \ + (heap)->data; \ +}) + +#define free_heap(heap) \ +do { \ + if (is_vmalloc_addr((heap)->data)) \ + vfree((heap)->data); \ + else \ + kfree((heap)->data); \ + (heap)->data = NULL; \ +} while (0) + +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_sift(h, i, cmp) \ +do { \ + size_t _r, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ + _r = _j * 2 + 1; \ + if (_r + 1 < (h)->used && \ + cmp((h)->data[_r], (h)->data[_r + 1])) \ + _r++; \ + \ + if (cmp((h)->data[_r], (h)->data[_j])) \ + break; \ + heap_swap(h, _r, _j); \ + } \ +} while (0) + +#define heap_sift_down(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp((h)->data[i], (h)->data[p])) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ + _r; \ +}) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + (h)->used--; \ + heap_swap(h, 0, (h)->used); \ + heap_sift(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_peek(h) ((h)->used ? 
(h)->data[0] : NULL) + +#define heap_full(h) ((h)->used == (h)->size) + +#define DECLARE_FIFO(type, name) \ + struct { \ + size_t front, back, size, mask; \ + type *data; \ + } name + +#define fifo_for_each(c, fifo, iter) \ + for (iter = (fifo)->front; \ + c = (fifo)->data[iter], iter != (fifo)->back; \ + iter = (iter + 1) & (fifo)->mask) + +#define __init_fifo(fifo, gfp) \ +({ \ + size_t _allocated_size, _bytes; \ + BUG_ON(!(fifo)->size); \ + \ + _allocated_size = roundup_pow_of_two((fifo)->size + 1); \ + _bytes = _allocated_size * sizeof(*(fifo)->data); \ + \ + (fifo)->mask = _allocated_size - 1; \ + (fifo)->front = (fifo)->back = 0; \ + (fifo)->data = NULL; \ + \ + if (_bytes < KMALLOC_MAX_SIZE) \ + (fifo)->data = kmalloc(_bytes, (gfp)); \ + if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \ + (fifo)->data = vmalloc(_bytes); \ + (fifo)->data; \ +}) + +#define init_fifo_exact(fifo, _size, gfp) \ +({ \ + (fifo)->size = (_size); \ + __init_fifo(fifo, gfp); \ +}) + +#define init_fifo(fifo, _size, gfp) \ +({ \ + (fifo)->size = (_size); \ + if ((fifo)->size > 4) \ + (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \ + __init_fifo(fifo, gfp); \ +}) + +#define free_fifo(fifo) \ +do { \ + if (is_vmalloc_addr((fifo)->data)) \ + vfree((fifo)->data); \ + else \ + kfree((fifo)->data); \ + (fifo)->data = NULL; \ +} while (0) + +#define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask) +#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) + +#define fifo_empty(fifo) (!fifo_used(fifo)) +#define fifo_full(fifo) (!fifo_free(fifo)) + +#define fifo_front(fifo) ((fifo)->data[(fifo)->front]) +#define fifo_back(fifo) \ + ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) + +#define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask) + +#define fifo_push_back(fifo, i) \ +({ \ + bool _r = !fifo_full((fifo)); \ + if (_r) { \ + (fifo)->data[(fifo)->back++] = (i); \ + (fifo)->back &= (fifo)->mask; \ + } \ + _r; \ +}) + +#define fifo_pop_front(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) { \ + (i) = (fifo)->data[(fifo)->front++]; \ + (fifo)->front &= (fifo)->mask; \ + } \ + _r; \ +}) + +#define fifo_push_front(fifo, i) \ +({ \ + bool _r = !fifo_full((fifo)); \ + if (_r) { \ + --(fifo)->front; \ + (fifo)->front &= (fifo)->mask; \ + (fifo)->data[(fifo)->front] = (i); \ + } \ + _r; \ +}) + +#define fifo_pop_back(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) { \ + --(fifo)->back; \ + (fifo)->back &= (fifo)->mask; \ + (i) = (fifo)->data[(fifo)->back] \ + } \ + _r; \ +}) + +#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) +#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) + +#define fifo_swap(l, r) \ +do { \ + swap((l)->front, (r)->front); \ + swap((l)->back, (r)->back); \ + swap((l)->size, (r)->size); \ + swap((l)->mask, (r)->mask); \ + swap((l)->data, (r)->data); \ +} while (0) + +#define fifo_move(dest, src) \ +do { \ + typeof(*((dest)->data)) _t; \ + while (!fifo_full(dest) && \ + fifo_pop(src, _t)) \ + fifo_push(dest, _t); \ +} while (0) + +/* + * Simple array based allocator - preallocates a number of elements and you can + * never allocate more than that, also has no locking. + * + * Handy because if you know you only need a fixed number of elements you don't + * have to worry about memory allocation failure, and sometimes a mempool isn't + * what you want. + * + * We treat the free elements as entries in a singly linked list, and the + * freelist as a stack - allocating and freeing push and pop off the freelist. 
+ */ + +#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ + struct { \ + type *freelist; \ + type data[size]; \ + } name + +#define array_alloc(array) \ +({ \ + typeof((array)->freelist) _ret = (array)->freelist; \ + \ + if (_ret) \ + (array)->freelist = *((typeof((array)->freelist) *) _ret);\ + \ + _ret; \ +}) + +#define array_free(array, ptr) \ +do { \ + typeof((array)->freelist) _ptr = ptr; \ + \ + *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ + (array)->freelist = _ptr; \ +} while (0) + +#define array_allocator_init(array) \ +do { \ + typeof((array)->freelist) _i; \ + \ + BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ + (array)->freelist = NULL; \ + \ + for (_i = (array)->data; \ + _i < (array)->data + ARRAY_SIZE((array)->data); \ + _i++) \ + array_free(array, _i); \ +} while (0) + +#define array_freelist_empty(array) ((array)->freelist == NULL) + +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + +int bch_strtoint_h(const char *, int *); +int bch_strtouint_h(const char *, unsigned int *); +int bch_strtoll_h(const char *, long long *); +int bch_strtoull_h(const char *, unsigned long long *); + +static inline int bch_strtol_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch_strtoint_h(cp, (int *) res); +#else + return bch_strtoll_h(cp, (long long *) res); +#endif +} + +static inline int bch_strtoul_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch_strtouint_h(cp, (unsigned int *) res); +#else + return bch_strtoull_h(cp, (unsigned long long *) res); +#endif +} + +#define strtoi_h(cp, res) \ + (__builtin_types_compatible_p(typeof(*res), int) \ + ? bch_strtoint_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), long) \ + ? bch_strtol_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), long long) \ + ? bch_strtoll_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned int) \ + ? bch_strtouint_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned long) \ + ? bch_strtoul_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned long long)\ + ? bch_strtoull_h(cp, (void *) res) : -EINVAL) + +#define strtoul_safe(cp, var) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = _v; \ + _r; \ +}) + +#define strtoul_safe_clamp(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = clamp_t(typeof(var), _v, min, max); \ + _r; \ +}) + +#define snprint(buf, size, var) \ + snprintf(buf, size, \ + __builtin_types_compatible_p(typeof(var), int) \ + ? "%i\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned) \ + ? "%u\n" : \ + __builtin_types_compatible_p(typeof(var), long) \ + ? "%li\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned long)\ + ? "%lu\n" : \ + __builtin_types_compatible_p(typeof(var), int64_t) \ + ? "%lli\n" : \ + __builtin_types_compatible_p(typeof(var), uint64_t) \ + ? "%llu\n" : \ + __builtin_types_compatible_p(typeof(var), const char *) \ + ? 
"%s\n" : "%i\n", var) + +ssize_t bch_hprint(char *buf, int64_t v); + +bool bch_is_zero(const char *p, size_t n); +int bch_parse_uuid(const char *s, char *uuid); + +ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], + size_t selected); + +ssize_t bch_read_string_list(const char *buf, const char * const list[]); + +struct time_stats { + spinlock_t lock; + /* + * all fields are in nanoseconds, averages are ewmas stored left shifted + * by 8 + */ + uint64_t max_duration; + uint64_t average_duration; + uint64_t average_frequency; + uint64_t last; +}; + +void bch_time_stats_update(struct time_stats *stats, uint64_t time); + +static inline unsigned local_clock_us(void) +{ + return local_clock() >> 10; +} + +#define NSEC_PER_ns 1L +#define NSEC_PER_us NSEC_PER_USEC +#define NSEC_PER_ms NSEC_PER_MSEC +#define NSEC_PER_sec NSEC_PER_SEC + +#define __print_time_stat(stats, name, stat, units) \ + sysfs_print(name ## _ ## stat ## _ ## units, \ + div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) + +#define sysfs_print_time_stats(stats, name, \ + frequency_units, \ + duration_units) \ +do { \ + __print_time_stat(stats, name, \ + average_frequency, frequency_units); \ + __print_time_stat(stats, name, \ + average_duration, duration_units); \ + __print_time_stat(stats, name, \ + max_duration, duration_units); \ + \ + sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ + ? div_s64(local_clock() - (stats)->last, \ + NSEC_PER_ ## frequency_units) \ + : -1LL); \ +} while (0) + +#define sysfs_time_stats_attribute(name, \ + frequency_units, \ + duration_units) \ +read_attribute(name ## _average_frequency_ ## frequency_units); \ +read_attribute(name ## _average_duration_ ## duration_units); \ +read_attribute(name ## _max_duration_ ## duration_units); \ +read_attribute(name ## _last_ ## frequency_units) + +#define sysfs_time_stats_attribute_list(name, \ + frequency_units, \ + duration_units) \ +&sysfs_ ## name ## _average_frequency_ ## frequency_units, \ +&sysfs_ ## name ## _average_duration_ ## duration_units, \ +&sysfs_ ## name ## _max_duration_ ## duration_units, \ +&sysfs_ ## name ## _last_ ## frequency_units, + +#define ewma_add(ewma, val, weight, factor) \ +({ \ + (ewma) *= (weight) - 1; \ + (ewma) += (val) << factor; \ + (ewma) /= (weight); \ + (ewma) >> factor; \ +}) + +struct bch_ratelimit { + /* Next time we want to do some work, in nanoseconds */ + uint64_t next; + + /* + * Rate at which we want to do work, in units per nanosecond + * The units here correspond to the units passed to bch_next_delay() + */ + unsigned rate; +}; + +static inline void bch_ratelimit_reset(struct bch_ratelimit *d) +{ + d->next = local_clock(); +} + +uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done); + +#define __DIV_SAFE(n, d, zero) \ +({ \ + typeof(n) _n = (n); \ + typeof(d) _d = (d); \ + _d ? _n / _d : zero; \ +}) + +#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) + +#define container_of_or_null(ptr, type, member) \ +({ \ + typeof(ptr) _ptr = ptr; \ + _ptr ? container_of(_ptr, type, member) : NULL; \ +}) + +#define RB_INSERT(root, new, member, cmp) \ +({ \ + __label__ dup; \ + struct rb_node **n = &(root)->rb_node, *parent = NULL; \ + typeof(new) this; \ + int res, ret = -1; \ + \ + while (*n) { \ + parent = *n; \ + this = container_of(*n, typeof(*(new)), member); \ + res = cmp(new, this); \ + if (!res) \ + goto dup; \ + n = res < 0 \ + ? 
&(*n)->rb_left \ + : &(*n)->rb_right; \ + } \ + \ + rb_link_node(&(new)->member, parent, n); \ + rb_insert_color(&(new)->member, root); \ + ret = 0; \ +dup: \ + ret; \ +}) + +#define RB_SEARCH(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (!res) { \ + ret = this; \ + break; \ + } \ + n = res < 0 \ + ? n->rb_left \ + : n->rb_right; \ + } \ + ret; \ +}) + +#define RB_GREATER(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (res < 0) { \ + ret = this; \ + n = n->rb_left; \ + } else \ + n = n->rb_right; \ + } \ + ret; \ +}) + +#define RB_FIRST(root, type, member) \ + container_of_or_null(rb_first(root), type, member) + +#define RB_LAST(root, type, member) \ + container_of_or_null(rb_last(root), type, member) + +#define RB_NEXT(ptr, member) \ + container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) + +#define RB_PREV(ptr, member) \ + container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) + +/* Does linear interpolation between powers of two */ +static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) +{ + unsigned fract = x & ~(~0 << fract_bits); + + x >>= fract_bits; + x = 1 << x; + x += (x * fract) >> fract_bits; + + return x; +} + +void bch_bio_map(struct bio *bio, void *base); + +static inline sector_t bdev_sectors(struct block_device *bdev) +{ + return bdev->bd_inode->i_size >> 9; +} + +#define closure_bio_submit(bio, cl, dev) \ +do { \ + closure_get(cl); \ + bch_generic_make_request(bio, &(dev)->bio_split_hook); \ +} while (0) + +uint64_t bch_crc64_update(uint64_t, const void *, size_t); +uint64_t bch_crc64(const void *, size_t); + +#endif /* _BCACHE_UTIL_H */ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c new file mode 100644 index 00000000000..f4300e4c011 --- /dev/null +++ b/drivers/md/bcache/writeback.c @@ -0,0 +1,507 @@ +/* + * background writeback - scan btree for dirty data and write it to the backing + * device + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. 
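+ *
+ * The writeback rate below is driven by a simple proportional-derivative
+ * controller (see __update_writeback_rate()); in outline, with p_term_inverse
+ * and d_term the tunables set up in bch_cached_dev_writeback_init():
+ *
+ *   proportional = (dirty - target) * update_seconds / p_term_inverse
+ *   derivative   = ewma(d(dirty)/dt) * d_term / p_term_inverse
+ *   rate        += proportional + derivative, clamped to [1, NSEC_PER_MSEC]
+ *
+ * Rate increases are skipped while the backing device is not keeping up
+ * with the current rate.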
+ */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "writeback.h" + +#include <linux/delay.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <trace/events/bcache.h> + +/* Rate limiting */ + +static void __update_writeback_rate(struct cached_dev *dc) +{ + struct cache_set *c = dc->disk.c; + uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + uint64_t cache_dirty_target = + div_u64(cache_sectors * dc->writeback_percent, 100); + + int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), + c->cached_dev_sectors); + + /* PD controller */ + + int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); + int64_t derivative = dirty - dc->disk.sectors_dirty_last; + int64_t proportional = dirty - target; + int64_t change; + + dc->disk.sectors_dirty_last = dirty; + + /* Scale to sectors per second */ + + proportional *= dc->writeback_rate_update_seconds; + proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); + + derivative = div_s64(derivative, dc->writeback_rate_update_seconds); + + derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, + (dc->writeback_rate_d_term / + dc->writeback_rate_update_seconds) ?: 1, 0); + + derivative *= dc->writeback_rate_d_term; + derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); + + change = proportional + derivative; + + /* Don't increase writeback rate if the device isn't keeping up */ + if (change > 0 && + time_after64(local_clock(), + dc->writeback_rate.next + NSEC_PER_MSEC)) + change = 0; + + dc->writeback_rate.rate = + clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, + 1, NSEC_PER_MSEC); + + dc->writeback_rate_proportional = proportional; + dc->writeback_rate_derivative = derivative; + dc->writeback_rate_change = change; + dc->writeback_rate_target = target; +} + +static void update_writeback_rate(struct work_struct *work) +{ + struct cached_dev *dc = container_of(to_delayed_work(work), + struct cached_dev, + writeback_rate_update); + + down_read(&dc->writeback_lock); + + if (atomic_read(&dc->has_dirty) && + dc->writeback_percent) + __update_writeback_rate(dc); + + up_read(&dc->writeback_lock); + + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); +} + +static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) +{ + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || + !dc->writeback_percent) + return 0; + + return bch_next_delay(&dc->writeback_rate, sectors); +} + +struct dirty_io { + struct closure cl; + struct cached_dev *dc; + struct bio bio; +}; + +static void dirty_init(struct keybuf_key *w) +{ + struct dirty_io *io = w->private; + struct bio *bio = &io->bio; + + bio_init(bio); + if (!io->dc->writeback_percent) + bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; + bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); + bio->bi_private = w; + bio->bi_io_vec = bio->bi_inline_vecs; + bch_bio_map(bio, NULL); +} + +static void dirty_io_destructor(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + kfree(io); +} + +static void write_dirty_finish(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + struct keybuf_key *w = io->bio.bi_private; + struct cached_dev *dc = io->dc; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, &io->bio, i) + __free_page(bv->bv_page); + + /* This is kind of a dumb way of signalling errors. 
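+	 * dirty_endio() clears KEY_DIRTY on the in-memory key when either the
+	 * read from the cache or the write to the backing device fails, so in
+	 * that case we skip the btree update below and the key stays dirty,
+	 * to be picked up again by a later writeback pass.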
*/ + if (KEY_DIRTY(&w->key)) { + int ret; + unsigned i; + struct keylist keys; + + bch_keylist_init(&keys); + + bkey_copy(keys.top, &w->key); + SET_KEY_DIRTY(keys.top, false); + bch_keylist_push(&keys); + + for (i = 0; i < KEY_PTRS(&w->key); i++) + atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); + + ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); + + if (ret) + trace_bcache_writeback_collision(&w->key); + + atomic_long_inc(ret + ? &dc->disk.c->writeback_keys_failed + : &dc->disk.c->writeback_keys_done); + } + + bch_keybuf_del(&dc->writeback_keys, w); + up(&dc->in_flight); + + closure_return_with_destructor(cl, dirty_io_destructor); +} + +static void dirty_endio(struct bio *bio, int error) +{ + struct keybuf_key *w = bio->bi_private; + struct dirty_io *io = w->private; + + if (error) + SET_KEY_DIRTY(&w->key, false); + + closure_put(&io->cl); +} + +static void write_dirty(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + struct keybuf_key *w = io->bio.bi_private; + + dirty_init(w); + io->bio.bi_rw = WRITE; + io->bio.bi_iter.bi_sector = KEY_START(&w->key); + io->bio.bi_bdev = io->dc->bdev; + io->bio.bi_end_io = dirty_endio; + + closure_bio_submit(&io->bio, cl, &io->dc->disk); + + continue_at(cl, write_dirty_finish, system_wq); +} + +static void read_dirty_endio(struct bio *bio, int error) +{ + struct keybuf_key *w = bio->bi_private; + struct dirty_io *io = w->private; + + bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), + error, "reading dirty data from cache"); + + dirty_endio(bio, error); +} + +static void read_dirty_submit(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + + closure_bio_submit(&io->bio, cl, &io->dc->disk); + + continue_at(cl, write_dirty, system_wq); +} + +static void read_dirty(struct cached_dev *dc) +{ + unsigned delay = 0; + struct keybuf_key *w; + struct dirty_io *io; + struct closure cl; + + closure_init_stack(&cl); + + /* + * XXX: if we error, background writeback just spins. Should use some + * mempools. 
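+	 * (A failed kzalloc() or bio_alloc_pages() below drops the key from
+	 * the keybuf and bails out of the loop, so the next refill finds the
+	 * same dirty keys again.)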
+ */ + + while (!kthread_should_stop()) { + try_to_freeze(); + + w = bch_keybuf_next(&dc->writeback_keys); + if (!w) + break; + + BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); + + if (KEY_START(&w->key) != dc->last_read || + jiffies_to_msecs(delay) > 50) + while (!kthread_should_stop() && delay) + delay = schedule_timeout_uninterruptible(delay); + + dc->last_read = KEY_OFFSET(&w->key); + + io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) + * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; + io->dc = dc; + + dirty_init(w); + io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); + io->bio.bi_bdev = PTR_CACHE(dc->disk.c, + &w->key, 0)->bdev; + io->bio.bi_rw = READ; + io->bio.bi_end_io = read_dirty_endio; + + if (bio_alloc_pages(&io->bio, GFP_KERNEL)) + goto err_free; + + trace_bcache_writeback(&w->key); + + down(&dc->in_flight); + closure_call(&io->cl, read_dirty_submit, NULL, &cl); + + delay = writeback_delay(dc, KEY_SIZE(&w->key)); + } + + if (0) { +err_free: + kfree(w->private); +err: + bch_keybuf_del(&dc->writeback_keys, w); + } + + /* + * Wait for outstanding writeback IOs to finish (and keybuf slots to be + * freed) before refilling again + */ + closure_sync(&cl); +} + +/* Scan for dirty data */ + +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, + uint64_t offset, int nr_sectors) +{ + struct bcache_device *d = c->devices[inode]; + unsigned stripe_offset, stripe, sectors_dirty; + + if (!d) + return; + + stripe = offset_to_stripe(d, offset); + stripe_offset = offset & (d->stripe_size - 1); + + while (nr_sectors) { + int s = min_t(unsigned, abs(nr_sectors), + d->stripe_size - stripe_offset); + + if (nr_sectors < 0) + s = -s; + + if (stripe >= d->nr_stripes) + return; + + sectors_dirty = atomic_add_return(s, + d->stripe_sectors_dirty + stripe); + if (sectors_dirty == d->stripe_size) + set_bit(stripe, d->full_dirty_stripes); + else + clear_bit(stripe, d->full_dirty_stripes); + + nr_sectors -= s; + stripe_offset = 0; + stripe++; + } +} + +static bool dirty_pred(struct keybuf *buf, struct bkey *k) +{ + return KEY_DIRTY(k); +} + +static void refill_full_stripes(struct cached_dev *dc) +{ + struct keybuf *buf = &dc->writeback_keys; + unsigned start_stripe, stripe, next_stripe; + bool wrapped = false; + + stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); + + if (stripe >= dc->disk.nr_stripes) + stripe = 0; + + start_stripe = stripe; + + while (1) { + stripe = find_next_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + if (stripe == dc->disk.nr_stripes) + goto next; + + next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + buf->last_scanned = KEY(dc->disk.id, + stripe * dc->disk.stripe_size, 0); + + bch_refill_keybuf(dc->disk.c, buf, + &KEY(dc->disk.id, + next_stripe * dc->disk.stripe_size, 0), + dirty_pred); + + if (array_freelist_empty(&buf->freelist)) + return; + + stripe = next_stripe; +next: + if (wrapped && stripe > start_stripe) + return; + + if (stripe == dc->disk.nr_stripes) { + stripe = 0; + wrapped = true; + } + } +} + +static bool refill_dirty(struct cached_dev *dc) +{ + struct keybuf *buf = &dc->writeback_keys; + struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + bool searched_from_start = false; + + if (dc->partial_stripes_expensive) { + refill_full_stripes(dc); + if (array_freelist_empty(&buf->freelist)) + return false; + } + + if (bkey_cmp(&buf->last_scanned, &end) >= 0) { + buf->last_scanned = 
KEY(dc->disk.id, 0, 0); + searched_from_start = true; + } + + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + + return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; +} + +static int bch_writeback_thread(void *arg) +{ + struct cached_dev *dc = arg; + bool searched_full_index; + + while (!kthread_should_stop()) { + down_write(&dc->writeback_lock); + if (!atomic_read(&dc->has_dirty) || + (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && + !dc->writeback_running)) { + up_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + try_to_freeze(); + schedule(); + continue; + } + + searched_full_index = refill_dirty(dc); + + if (searched_full_index && + RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { + atomic_set(&dc->has_dirty, 0); + cached_dev_put(dc); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); + } + + up_write(&dc->writeback_lock); + + bch_ratelimit_reset(&dc->writeback_rate); + read_dirty(dc); + + if (searched_full_index) { + unsigned delay = dc->writeback_delay * HZ; + + while (delay && + !kthread_should_stop() && + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) + delay = schedule_timeout_uninterruptible(delay); + } + } + + return 0; +} + +/* Init */ + +struct sectors_dirty_init { + struct btree_op op; + unsigned inode; +}; + +static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, + struct bkey *k) +{ + struct sectors_dirty_init *op = container_of(_op, + struct sectors_dirty_init, op); + if (KEY_INODE(k) > op->inode) + return MAP_DONE; + + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + + return MAP_CONTINUE; +} + +void bch_sectors_dirty_init(struct cached_dev *dc) +{ + struct sectors_dirty_init op; + + bch_btree_op_init(&op.op, -1); + op.inode = dc->disk.id; + + bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, 0); + + dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); +} + +int bch_cached_dev_writeback_init(struct cached_dev *dc) +{ + sema_init(&dc->in_flight, 64); + init_rwsem(&dc->writeback_lock); + bch_keybuf_init(&dc->writeback_keys); + + dc->writeback_metadata = true; + dc->writeback_running = true; + dc->writeback_percent = 10; + dc->writeback_delay = 30; + dc->writeback_rate.rate = 1024; + + dc->writeback_rate_update_seconds = 5; + dc->writeback_rate_d_term = 30; + dc->writeback_rate_p_term_inverse = 6000; + + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, + "bcache_writeback"); + if (IS_ERR(dc->writeback_thread)) + return PTR_ERR(dc->writeback_thread); + + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + + return 0; +} diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h new file mode 100644 index 00000000000..e2f8598937a --- /dev/null +++ b/drivers/md/bcache/writeback.h @@ -0,0 +1,90 @@ +#ifndef _BCACHE_WRITEBACK_H +#define _BCACHE_WRITEBACK_H + +#define CUTOFF_WRITEBACK 40 +#define CUTOFF_WRITEBACK_SYNC 70 + +static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) +{ + uint64_t i, ret = 0; + + for (i = 0; i < d->nr_stripes; i++) + ret += atomic_read(d->stripe_sectors_dirty + i); + + return ret; +} + +static inline unsigned offset_to_stripe(struct bcache_device *d, + uint64_t offset) +{ + do_div(offset, d->stripe_size); + return offset; +} + +static inline bool 
bcache_dev_stripe_dirty(struct cached_dev *dc, + uint64_t offset, + unsigned nr_sectors) +{ + unsigned stripe = offset_to_stripe(&dc->disk, offset); + + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) + return true; + + if (nr_sectors <= dc->disk.stripe_size) + return false; + + nr_sectors -= dc->disk.stripe_size; + stripe++; + } +} + +static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, + unsigned cache_mode, bool would_skip) +{ + unsigned in_use = dc->disk.c->gc_stats.in_use; + + if (cache_mode != CACHE_MODE_WRITEBACK || + test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || + in_use > CUTOFF_WRITEBACK_SYNC) + return false; + + if (dc->partial_stripes_expensive && + bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector, + bio_sectors(bio))) + return true; + + if (would_skip) + return false; + + return bio->bi_rw & REQ_SYNC || + in_use <= CUTOFF_WRITEBACK; +} + +static inline void bch_writeback_queue(struct cached_dev *dc) +{ + wake_up_process(dc->writeback_thread); +} + +static inline void bch_writeback_add(struct cached_dev *dc) +{ + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { + atomic_inc(&dc->count); + + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ + bch_write_bdev_super(dc, NULL); + } + + bch_writeback_queue(dc); + } +} + +void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); + +void bch_sectors_dirty_init(struct cached_dev *dc); +int bch_cached_dev_writeback_init(struct cached_dev *); + +#endif diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index ac89a5deaca..67f8b31e205 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -13,9 +13,9 @@ * Still to do: * * flush after percent set rather than just time based. (maybe both). - * wait if count gets too high, wake when it drops to half. */ +#include <linux/blkdev.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> @@ -26,76 +26,15 @@ #include <linux/file.h> #include <linux/mount.h> #include <linux/buffer_head.h> -#include <linux/raid/md.h> -#include <linux/raid/bitmap.h> +#include <linux/seq_file.h> +#include "md.h" +#include "bitmap.h" -/* debug macros */ - -#define DEBUG 0 - -#if DEBUG -/* these are for debugging purposes only! */ - -/* define one and only one of these */ -#define INJECT_FAULTS_1 0 /* cause bitmap_alloc_page to fail always */ -#define INJECT_FAULTS_2 0 /* cause bitmap file to be kicked when first bit set*/ -#define INJECT_FAULTS_3 0 /* treat bitmap file as kicked at init time */ -#define INJECT_FAULTS_4 0 /* undef */ -#define INJECT_FAULTS_5 0 /* undef */ -#define INJECT_FAULTS_6 0 - -/* if these are defined, the driver will fail! debug only */ -#define INJECT_FATAL_FAULT_1 0 /* fail kmalloc, causing bitmap_create to fail */ -#define INJECT_FATAL_FAULT_2 0 /* undef */ -#define INJECT_FATAL_FAULT_3 0 /* undef */ -#endif - -//#define DPRINTK PRINTK /* set this NULL to avoid verbose debug output */ -#define DPRINTK(x...) do { } while(0) - -#ifndef PRINTK -# if DEBUG > 0 -# define PRINTK(x...) printk(KERN_DEBUG x) -# else -# define PRINTK(x...) -# endif -#endif - -static inline char * bmname(struct bitmap *bitmap) +static inline char *bmname(struct bitmap *bitmap) { return bitmap->mddev ? 
mdname(bitmap->mddev) : "mdX"; } - -/* - * just a placeholder - calls kmalloc for bitmap pages - */ -static unsigned char *bitmap_alloc_page(struct bitmap *bitmap) -{ - unsigned char *page; - -#ifdef INJECT_FAULTS_1 - page = NULL; -#else - page = kmalloc(PAGE_SIZE, GFP_NOIO); -#endif - if (!page) - printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap)); - else - PRINTK("%s: bitmap_alloc_page: allocated page at %p\n", - bmname(bitmap), page); - return page; -} - -/* - * for now just a placeholder -- just calls kfree for bitmap pages - */ -static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page) -{ - PRINTK("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page); - kfree(page); -} - /* * check a page and, if necessary, allocate it (or hijack it if the alloc fails) * @@ -106,18 +45,21 @@ static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page) * if we find our page, we increment the page's refcount so that it stays * allocated while we're using it */ -static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create) +static int bitmap_checkpage(struct bitmap_counts *bitmap, + unsigned long page, int create) +__releases(bitmap->lock) +__acquires(bitmap->lock) { unsigned char *mappage; if (page >= bitmap->pages) { - printk(KERN_ALERT - "%s: invalid bitmap page request: %lu (> %lu)\n", - bmname(bitmap), page, bitmap->pages-1); + /* This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page. + * It is harmless. + */ return -EINVAL; } - if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0; @@ -127,47 +69,37 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat if (!create) return -ENOENT; - spin_unlock_irq(&bitmap->lock); - /* this page has not been allocated yet */ - if ((mappage = bitmap_alloc_page(bitmap)) == NULL) { - PRINTK("%s: bitmap map page allocation failed, hijacking\n", - bmname(bitmap)); + spin_unlock_irq(&bitmap->lock); + mappage = kzalloc(PAGE_SIZE, GFP_NOIO); + spin_lock_irq(&bitmap->lock); + + if (mappage == NULL) { + pr_debug("md/bitmap: map page allocation failed, hijacking\n"); /* failed - set the hijacked flag so that we can use the * pointer as a counter */ - spin_lock_irq(&bitmap->lock); if (!bitmap->bp[page].map) bitmap->bp[page].hijacked = 1; - goto out; - } - - /* got a page */ - - spin_lock_irq(&bitmap->lock); - - /* recheck the page */ - - if (bitmap->bp[page].map || bitmap->bp[page].hijacked) { + } else if (bitmap->bp[page].map || + bitmap->bp[page].hijacked) { /* somebody beat us to getting the page */ - bitmap_free_page(bitmap, mappage); + kfree(mappage); return 0; - } + } else { - /* no page was in place and we have one, so install it */ + /* no page was in place and we have one, so install it */ - memset(mappage, 0, PAGE_SIZE); - bitmap->bp[page].map = mappage; - bitmap->missing_pages--; -out: + bitmap->bp[page].map = mappage; + bitmap->missing_pages--; + } return 0; } - /* if page is completely empty, put it back on the free list, or dealloc it */ /* if page was hijacked, unmark the flag so it might get alloced next time */ /* Note: lock should be held when calling this */ -static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) +static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) { char *ptr; @@ -179,26 +111,15 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ 
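		/* no real page was ever allocated - the counter values were
		 * packed into the ->map pointer field itself - so there is
		 * nothing to free; just reset the flag and the pointer */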
bitmap->bp[page].hijacked = 0; bitmap->bp[page].map = NULL; - return; + } else { + /* normal case, free the page */ + ptr = bitmap->bp[page].map; + bitmap->bp[page].map = NULL; + bitmap->missing_pages++; + kfree(ptr); } - - /* normal case, free the page */ - -#if 0 -/* actually ... let's not. We will probably need the page again exactly when - * memory is tight and we are flusing to disk - */ - return; -#else - ptr = bitmap->bp[page].map; - bitmap->bp[page].map = NULL; - bitmap->missing_pages++; - bitmap_free_page(bitmap, ptr); - return; -#endif } - /* * bitmap file handling - read and write the bitmap file and its superblock */ @@ -208,37 +129,33 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) */ /* IO operations when bitmap is stored near all superblocks */ -static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long index) +static int read_sb_page(struct mddev *mddev, loff_t offset, + struct page *page, + unsigned long index, int size) { /* choose a good rdev and read the page from there */ - mdk_rdev_t *rdev; - struct list_head *tmp; - struct page *page = alloc_page(GFP_KERNEL); + struct md_rdev *rdev; sector_t target; - if (!page) - return ERR_PTR(-ENOMEM); - - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { if (! test_bit(In_sync, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; - target = rdev->sb_start + offset + index * (PAGE_SIZE/512); + target = offset + index * (PAGE_SIZE/512); - if (sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)) { + if (sync_page_io(rdev, target, + roundup(size, bdev_logical_block_size(rdev->bdev)), + page, READ, true)) { page->index = index; - attach_page_buffers(page, NULL); /* so that free_buffer will - * quietly no-op */ - return page; + return 0; } } - return ERR_PTR(-EIO); - + return -EIO; } -static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) +static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) { /* Iterate the disks of an mddev, using rcu to protect access to the * linked list, and raising the refcount of devices we return to ensure @@ -246,22 +163,18 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) * As devices are only added or removed when raid_disk is < 0 and * nr_pending is 0 and In_sync is clear, the entries we return will * still be in the same position on the list when we re-enter - * list_for_each_continue_rcu. + * list_for_each_entry_continue_rcu. */ - struct list_head *pos; rcu_read_lock(); if (rdev == NULL) /* start at the beginning */ - pos = &mddev->disks; + rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set); else { /* release the previous rdev and start from there. 
*/ rdev_dec_pending(rdev, mddev); - pos = &rdev->same_set; } - list_for_each_continue_rcu(pos, &mddev->disks) { - rdev = list_entry(pos, mdk_rdev_t, same_set); + list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && - test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */ atomic_inc(&rdev->nr_pending); @@ -275,44 +188,64 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev) static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { - mdk_rdev_t *rdev = NULL; - mddev_t *mddev = bitmap->mddev; + struct md_rdev *rdev = NULL; + struct block_device *bdev; + struct mddev *mddev = bitmap->mddev; + struct bitmap_storage *store = &bitmap->storage; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - int size = PAGE_SIZE; - if (page->index == bitmap->file_pages-1) - size = roundup(bitmap->last_page_size, - bdev_hardsect_size(rdev->bdev)); - /* Just make sure we aren't corrupting data or - * metadata - */ - if (bitmap->offset < 0) { - /* DATA BITMAP METADATA */ - if (bitmap->offset - + (long)(page->index * (PAGE_SIZE/512)) - + size/512 > 0) - /* bitmap runs in to metadata */ - goto bad_alignment; - if (rdev->data_offset + mddev->size*2 - > rdev->sb_start + bitmap->offset) - /* data runs in to bitmap */ - goto bad_alignment; - } else if (rdev->sb_start < rdev->data_offset) { - /* METADATA BITMAP DATA */ - if (rdev->sb_start - + bitmap->offset - + page->index*(PAGE_SIZE/512) + size/512 - > rdev->data_offset) - /* bitmap runs in to data */ - goto bad_alignment; - } else { - /* DATA METADATA BITMAP - no problems */ - } - md_super_write(mddev, rdev, - rdev->sb_start + bitmap->offset - + page->index * (PAGE_SIZE/512), - size, - page); + int size = PAGE_SIZE; + loff_t offset = mddev->bitmap_info.offset; + + bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; + + if (page->index == store->file_pages-1) { + int last_page_size = store->bytes & (PAGE_SIZE-1); + if (last_page_size == 0) + last_page_size = PAGE_SIZE; + size = roundup(last_page_size, + bdev_logical_block_size(bdev)); + } + /* Just make sure we aren't corrupting data or + * metadata + */ + if (mddev->external) { + /* Bitmap could be anywhere. 
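+			 * We only check that the page being written does not
+			 * overlap the data: reject the write if this page
+			 * starts beyond the start of the data while the
+			 * bitmap itself starts before the end of the data.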
*/ + if (rdev->sb_start + offset + (page->index + * (PAGE_SIZE/512)) + > rdev->data_offset + && + rdev->sb_start + offset + < (rdev->data_offset + mddev->dev_sectors + + (PAGE_SIZE/512))) + goto bad_alignment; + } else if (offset < 0) { + /* DATA BITMAP METADATA */ + if (offset + + (long)(page->index * (PAGE_SIZE/512)) + + size/512 > 0) + /* bitmap runs in to metadata */ + goto bad_alignment; + if (rdev->data_offset + mddev->dev_sectors + > rdev->sb_start + offset) + /* data runs in to bitmap */ + goto bad_alignment; + } else if (rdev->sb_start < rdev->data_offset) { + /* METADATA BITMAP DATA */ + if (rdev->sb_start + + offset + + page->index*(PAGE_SIZE/512) + size/512 + > rdev->data_offset) + /* bitmap runs in to data */ + goto bad_alignment; + } else { + /* DATA METADATA BITMAP - no problems */ + } + md_super_write(mddev, rdev, + rdev->sb_start + offset + + page->index * (PAGE_SIZE/512), + size, + page); } if (wait) @@ -320,7 +253,6 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) return 0; bad_alignment: - rcu_read_unlock(); return -EINVAL; } @@ -332,10 +264,10 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) { struct buffer_head *bh; - if (bitmap->file == NULL) { + if (bitmap->storage.file == NULL) { switch (write_sb_page(bitmap, page, wait)) { case -EINVAL: - bitmap->flags |= BITMAP_WRITE_ERROR; + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); } } else { @@ -345,29 +277,24 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) atomic_inc(&bitmap->pending_writes); set_buffer_locked(bh); set_buffer_mapped(bh); - submit_bh(WRITE, bh); + submit_bh(WRITE | REQ_SYNC, bh); bh = bh->b_this_page; } - if (wait) { + if (wait) wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); - } } - if (bitmap->flags & BITMAP_WRITE_ERROR) + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) bitmap_file_kick(bitmap); } static void end_bitmap_write(struct buffer_head *bh, int uptodate) { struct bitmap *bitmap = bh->b_private; - unsigned long flags; - if (!uptodate) { - spin_lock_irqsave(&bitmap->lock, flags); - bitmap->flags |= BITMAP_WRITE_ERROR; - spin_unlock_irqrestore(&bitmap->lock, flags); - } + if (!uptodate) + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); if (atomic_dec_and_test(&bitmap->pending_writes)) wake_up(&bitmap->write_wait); } @@ -382,8 +309,12 @@ __clear_page_buffers(struct page *page) } static void free_buffers(struct page *page) { - struct buffer_head *bh = page_buffers(page); + struct buffer_head *bh; + + if (!PagePrivate(page)) + return; + bh = page_buffers(page); while (bh) { struct buffer_head *next = bh->b_this_page; free_buffer_head(bh); @@ -400,28 +331,22 @@ static void free_buffers(struct page *page) * This usage is similar to how swap files are handled, and allows us * to write to a file with no concerns of memory allocation failing. 
*/ -static struct page *read_page(struct file *file, unsigned long index, - struct bitmap *bitmap, - unsigned long count) +static int read_page(struct file *file, unsigned long index, + struct bitmap *bitmap, + unsigned long count, + struct page *page) { - struct page *page = NULL; - struct inode *inode = file->f_path.dentry->d_inode; + int ret = 0; + struct inode *inode = file_inode(file); struct buffer_head *bh; sector_t block; - PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE, - (unsigned long long)index << PAGE_SHIFT); - - page = alloc_page(GFP_KERNEL); - if (!page) - page = ERR_PTR(-ENOMEM); - if (IS_ERR(page)) - goto out; + pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, + (unsigned long long)index << PAGE_SHIFT); bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); if (!bh) { - put_page(page); - page = ERR_PTR(-ENOMEM); + ret = -ENOMEM; goto out; } attach_page_buffers(page, bh); @@ -433,8 +358,7 @@ static struct page *read_page(struct file *file, unsigned long index, bh->b_blocknr = bmap(inode, block); if (bh->b_blocknr == 0) { /* Cannot use this file! */ - free_buffers(page); - page = ERR_PTR(-EINVAL); + ret = -EINVAL; goto out; } bh->b_bdev = inode->i_sb->s_bdev; @@ -457,17 +381,15 @@ static struct page *read_page(struct file *file, unsigned long index, wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); - if (bitmap->flags & BITMAP_WRITE_ERROR) { - free_buffers(page); - page = ERR_PTR(-EIO); - } + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + ret = -EIO; out: - if (IS_ERR(page)) - printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n", + if (ret) + printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT, - PTR_ERR(page)); - return page; + ret); + return ret; } /* @@ -478,25 +400,30 @@ out: void bitmap_update_sb(struct bitmap *bitmap) { bitmap_super_t *sb; - unsigned long flags; if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ return; - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->sb_page) { /* no superblock */ - spin_unlock_irqrestore(&bitmap->lock, flags); + if (bitmap->mddev->bitmap_info.external) return; - } - spin_unlock_irqrestore(&bitmap->lock, flags); - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + if (!bitmap->storage.sb_page) /* no superblock */ + return; + sb = kmap_atomic(bitmap->storage.sb_page); sb->events = cpu_to_le64(bitmap->mddev->events); - if (bitmap->mddev->events < bitmap->events_cleared) { + if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */ bitmap->events_cleared = bitmap->mddev->events; - sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - } - kunmap_atomic(sb, KM_USER0); - write_page(bitmap, bitmap->sb_page, 1); + sb->events_cleared = cpu_to_le64(bitmap->events_cleared); + sb->state = cpu_to_le32(bitmap->flags); + /* Just in case these have been changed via sysfs: */ + sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); + sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); + /* This might have been changed by a reshape */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); + sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> + bitmap_info.space); + kunmap_atomic(sb); + write_page(bitmap, bitmap->storage.sb_page, 1); } /* print out the bitmap file superblock */ @@ -504,9 +431,9 @@ void bitmap_print_sb(struct bitmap *bitmap) { 
bitmap_super_t *sb; - if (!bitmap || !bitmap->sb_page) + if (!bitmap || !bitmap->storage.sb_page) return; - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(bitmap->storage.sb_page); printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); @@ -525,7 +452,76 @@ void bitmap_print_sb(struct bitmap *bitmap) printk(KERN_DEBUG " sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); +} + +/* + * bitmap_new_disk_sb + * @bitmap + * + * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb + * reads and verifies the on-disk bitmap superblock and populates bitmap_info. + * This function verifies 'bitmap_info' and populates the on-disk bitmap + * structure, which is to be written to disk. + * + * Returns: 0 on success, -Exxx on error + */ +static int bitmap_new_disk_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + unsigned long chunksize, daemon_sleep, write_behind; + + bitmap->storage.sb_page = alloc_page(GFP_KERNEL); + if (bitmap->storage.sb_page == NULL) + return -ENOMEM; + bitmap->storage.sb_page->index = 0; + + sb = kmap_atomic(bitmap->storage.sb_page); + + sb->magic = cpu_to_le32(BITMAP_MAGIC); + sb->version = cpu_to_le32(BITMAP_MAJOR_HI); + + chunksize = bitmap->mddev->bitmap_info.chunksize; + BUG_ON(!chunksize); + if (!is_power_of_2(chunksize)) { + kunmap_atomic(sb); + printk(KERN_ERR "bitmap chunksize not a power of 2\n"); + return -EINVAL; + } + sb->chunksize = cpu_to_le32(chunksize); + + daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; + if (!daemon_sleep || + (daemon_sleep < 1) || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { + printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); + daemon_sleep = 5 * HZ; + } + sb->daemon_sleep = cpu_to_le32(daemon_sleep); + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + + /* + * FIXME: write_behind for RAID1. If not specified, what + * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. + */ + write_behind = bitmap->mddev->bitmap_info.max_write_behind; + if (write_behind > COUNTER_MAX) + write_behind = COUNTER_MAX / 2; + sb->write_behind = cpu_to_le32(write_behind); + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + + /* keep the array size field of the bitmap superblock up to date */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + + memcpy(sb->uuid, bitmap->mddev->uuid, 16); + + set_bit(BITMAP_STALE, &bitmap->flags); + sb->state = cpu_to_le32(bitmap->flags); + bitmap->events_cleared = bitmap->mddev->events; + sb->events_cleared = cpu_to_le64(bitmap->mddev->events); + + kunmap_atomic(sb); + + return 0; } /* read the superblock from the bitmap file and initialize some bitmap fields */ @@ -535,28 +531,45 @@ static int bitmap_read_sb(struct bitmap *bitmap) bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; unsigned long long events; + unsigned long sectors_reserved = 0; int err = -EINVAL; - + struct page *sb_page; + + if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { + chunksize = 128 * 1024 * 1024; + daemon_sleep = 5 * HZ; + write_behind = 0; + set_bit(BITMAP_STALE, &bitmap->flags); + err = 0; + goto out_no_sb; + } /* page 0 is the superblock, read it... 
*/ - if (bitmap->file) { - loff_t isize = i_size_read(bitmap->file->f_mapping->host); + sb_page = alloc_page(GFP_KERNEL); + if (!sb_page) + return -ENOMEM; + bitmap->storage.sb_page = sb_page; + + if (bitmap->storage.file) { + loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; - bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); + err = read_page(bitmap->storage.file, 0, + bitmap, bytes, sb_page); } else { - bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0); + err = read_sb_page(bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + sb_page, + 0, sizeof(bitmap_super_t)); } - if (IS_ERR(bitmap->sb_page)) { - err = PTR_ERR(bitmap->sb_page); - bitmap->sb_page = NULL; + if (err) return err; - } - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); + sb = kmap_atomic(sb_page); chunksize = le32_to_cpu(sb->chunksize); - daemon_sleep = le32_to_cpu(sb->daemon_sleep); + daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); + sectors_reserved = le32_to_cpu(sb->sectors_reserved); /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -564,11 +577,11 @@ static int bitmap_read_sb(struct bitmap *bitmap) else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) reason = "unrecognized superblock version"; - else if (chunksize < PAGE_SIZE) + else if (chunksize < 512) reason = "bitmap chunksize too small"; - else if ((1 << ffz(~chunksize)) != chunksize) + else if (!is_power_of_2(chunksize)) reason = "bitmap chunksize not a power of 2"; - else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) + else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) reason = "daemon sleep period out of range"; else if (write_behind > COUNTER_MAX) reason = "write-behind limit out of range (0 - 16383)"; @@ -581,159 +594,169 @@ static int bitmap_read_sb(struct bitmap *bitmap) /* keep the array size field of the bitmap superblock up to date */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); - if (!bitmap->mddev->persistent) - goto success; - - /* - * if we have a persistent array superblock, compare the - * bitmap's UUID and event counter to the mddev's - */ - if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { - printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n", - bmname(bitmap)); - goto out; - } - events = le64_to_cpu(sb->events); - if (events < bitmap->mddev->events) { - printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " - "-- forcing full recovery\n", bmname(bitmap), events, - (unsigned long long) bitmap->mddev->events); - sb->state |= cpu_to_le32(BITMAP_STALE); + if (bitmap->mddev->persistent) { + /* + * We have a persistent array superblock, so compare the + * bitmap's UUID and event counter to the mddev's + */ + if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { + printk(KERN_INFO + "%s: bitmap superblock UUID mismatch\n", + bmname(bitmap)); + goto out; + } + events = le64_to_cpu(sb->events); + if (events < bitmap->mddev->events) { + printk(KERN_INFO + "%s: bitmap file is out of date (%llu < %llu) " + "-- forcing full recovery\n", + bmname(bitmap), events, + (unsigned long long) bitmap->mddev->events); + set_bit(BITMAP_STALE, &bitmap->flags); + } } -success: + /* assign fields using values from superblock */ - bitmap->chunksize = chunksize; - bitmap->daemon_sleep = daemon_sleep; - bitmap->daemon_lastrun = jiffies; - 
bitmap->max_write_behind = write_behind; bitmap->flags |= le32_to_cpu(sb->state); if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) - bitmap->flags |= BITMAP_HOSTENDIAN; + set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); bitmap->events_cleared = le64_to_cpu(sb->events_cleared); - if (sb->state & cpu_to_le32(BITMAP_STALE)) - bitmap->events_cleared = bitmap->mddev->events; err = 0; out: - kunmap_atomic(sb, KM_USER0); + kunmap_atomic(sb); +out_no_sb: + if (test_bit(BITMAP_STALE, &bitmap->flags)) + bitmap->events_cleared = bitmap->mddev->events; + bitmap->mddev->bitmap_info.chunksize = chunksize; + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + if (bitmap->mddev->bitmap_info.space == 0 || + bitmap->mddev->bitmap_info.space > sectors_reserved) + bitmap->mddev->bitmap_info.space = sectors_reserved; if (err) bitmap_print_sb(bitmap); return err; } -enum bitmap_mask_op { - MASK_SET, - MASK_UNSET -}; - -/* record the state of the bitmap in the superblock. Return the old value */ -static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, - enum bitmap_mask_op op) -{ - bitmap_super_t *sb; - unsigned long flags; - int old; - - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->sb_page) { /* can't set the state */ - spin_unlock_irqrestore(&bitmap->lock, flags); - return 0; - } - spin_unlock_irqrestore(&bitmap->lock, flags); - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); - old = le32_to_cpu(sb->state) & bits; - switch (op) { - case MASK_SET: sb->state |= cpu_to_le32(bits); - break; - case MASK_UNSET: sb->state &= cpu_to_le32(~bits); - break; - default: BUG(); - } - kunmap_atomic(sb, KM_USER0); - return old; -} - /* * general bitmap file operations */ +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. 
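+ *
+ * With an internal superblock the bit for chunk N therefore lives
+ * sizeof(bitmap_super_t) * 8 + N bits into the file, which is what
+ * file_page_index() and file_page_offset() below compute:
+ *
+ *   page   = (N + sizeof(bitmap_super_t) * 8) >> PAGE_BIT_SHIFT
+ *   offset = (N + sizeof(bitmap_super_t) * 8) & (PAGE_BITS - 1)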
+ */ /* calculate the index of the page that contains this bit */ -static inline unsigned long file_page_index(unsigned long chunk) +static inline unsigned long file_page_index(struct bitmap_storage *store, + unsigned long chunk) { - return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT; + if (store->sb_page) + chunk += sizeof(bitmap_super_t) << 3; + return chunk >> PAGE_BIT_SHIFT; } /* calculate the (bit) offset of this bit within a page */ -static inline unsigned long file_page_offset(unsigned long chunk) +static inline unsigned long file_page_offset(struct bitmap_storage *store, + unsigned long chunk) { - return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1); + if (store->sb_page) + chunk += sizeof(bitmap_super_t) << 3; + return chunk & (PAGE_BITS - 1); } /* * return a pointer to the page in the filemap that contains the given bit * - * this lookup is complicated by the fact that the bitmap sb might be exactly - * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page - * 0 or page 1 */ -static inline struct page *filemap_get_page(struct bitmap *bitmap, - unsigned long chunk) +static inline struct page *filemap_get_page(struct bitmap_storage *store, + unsigned long chunk) { - if (file_page_index(chunk) >= bitmap->file_pages) return NULL; - return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; + if (file_page_index(store, chunk) >= store->file_pages) + return NULL; + return store->filemap[file_page_index(store, chunk)]; } +static int bitmap_storage_alloc(struct bitmap_storage *store, + unsigned long chunks, int with_super) +{ + int pnum; + unsigned long num_pages; + unsigned long bytes; + + bytes = DIV_ROUND_UP(chunks, 8); + if (with_super) + bytes += sizeof(bitmap_super_t); + + num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); + + store->filemap = kmalloc(sizeof(struct page *) + * num_pages, GFP_KERNEL); + if (!store->filemap) + return -ENOMEM; + + if (with_super && !store->sb_page) { + store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (store->sb_page == NULL) + return -ENOMEM; + store->sb_page->index = 0; + } + pnum = 0; + if (store->sb_page) { + store->filemap[0] = store->sb_page; + pnum = 1; + } + for ( ; pnum < num_pages; pnum++) { + store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!store->filemap[pnum]) { + store->file_pages = pnum; + return -ENOMEM; + } + store->filemap[pnum]->index = pnum; + } + store->file_pages = pnum; + + /* We need 4 bits per page, rounded up to a multiple + * of sizeof(unsigned long) */ + store->filemap_attr = kzalloc( + roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), + GFP_KERNEL); + if (!store->filemap_attr) + return -ENOMEM; + + store->bytes = bytes; + + return 0; +} -static void bitmap_file_unmap(struct bitmap *bitmap) +static void bitmap_file_unmap(struct bitmap_storage *store) { struct page **map, *sb_page; - unsigned long *attr; int pages; - unsigned long flags; + struct file *file; - spin_lock_irqsave(&bitmap->lock, flags); - map = bitmap->filemap; - bitmap->filemap = NULL; - attr = bitmap->filemap_attr; - bitmap->filemap_attr = NULL; - pages = bitmap->file_pages; - bitmap->file_pages = 0; - sb_page = bitmap->sb_page; - bitmap->sb_page = NULL; - spin_unlock_irqrestore(&bitmap->lock, flags); + file = store->file; + map = store->filemap; + pages = store->file_pages; + sb_page = store->sb_page; while (pages--) - if (map[pages]->index != 0) /* 0 is sb_page, release it below */ + if (map[pages] != sb_page) /* 0 is sb_page, release it below */ free_buffers(map[pages]); kfree(map); - kfree(attr); + 
kfree(store->filemap_attr); if (sb_page) free_buffers(sb_page); -} - -static void bitmap_file_put(struct bitmap *bitmap) -{ - struct file *file; - unsigned long flags; - - spin_lock_irqsave(&bitmap->lock, flags); - file = bitmap->file; - bitmap->file = NULL; - spin_unlock_irqrestore(&bitmap->lock, flags); - - if (file) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - bitmap_file_unmap(bitmap); if (file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); invalidate_mapping_pages(inode->i_mapping, 0, -1); fput(file); } } - /* * bitmap_file_kick - if an error occurs while manipulating the bitmap file * then it is no longer reliable, so we stop using it and we mark the file @@ -743,15 +766,14 @@ static void bitmap_file_kick(struct bitmap *bitmap) { char *path, *ptr = NULL; - if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) { + if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { bitmap_update_sb(bitmap); - if (bitmap->file) { + if (bitmap->storage.file) { path = kmalloc(PAGE_SIZE, GFP_KERNEL); if (path) - ptr = d_path(&bitmap->file->f_path, path, - PAGE_SIZE); - + ptr = d_path(&bitmap->storage.file->f_path, + path, PAGE_SIZE); printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n", @@ -763,36 +785,39 @@ static void bitmap_file_kick(struct bitmap *bitmap) "%s: disabling internal bitmap due to errors\n", bmname(bitmap)); } - - bitmap_file_put(bitmap); - - return; } enum bitmap_page_attr { - BITMAP_PAGE_DIRTY = 0, // there are set bits that need to be synced - BITMAP_PAGE_CLEAN = 1, // there are bits that might need to be cleared - BITMAP_PAGE_NEEDWRITE=2, // there are cleared bits that need to be synced + BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */ + BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned. + * i.e. counter is 1 or 2. 
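+				    * The daemon promotes PENDING pages to
+				    * NEEDWRITE before flushing them;
+				    * bitmap_unplug() writes out DIRTY and
+				    * NEEDWRITE pages.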
*/ + BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ }; -static inline void set_page_attr(struct bitmap *bitmap, struct page *page, - enum bitmap_page_attr attr) +static inline void set_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) { - __set_bit((page->index<<2) + attr, bitmap->filemap_attr); + set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); } -static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, - enum bitmap_page_attr attr) +static inline void clear_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) { - __clear_bit((page->index<<2) + attr, bitmap->filemap_attr); + clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); } -static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, - enum bitmap_page_attr attr) +static inline int test_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) { - return test_bit((page->index<<2) + attr, bitmap->filemap_attr); + return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); } +static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) +{ + return test_and_clear_bit((pnum<<2) + attr, + bitmap->storage.filemap_attr); +} /* * bitmap_file_set_bit -- called before performing a write to the md device * to set (and eventually sync) a particular bit in the bitmap file @@ -805,28 +830,46 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) unsigned long bit; struct page *page; void *kaddr; - unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); + unsigned long chunk = block >> bitmap->counts.chunkshift; - if (!bitmap->filemap) { + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) return; - } + bit = file_page_offset(&bitmap->storage, chunk); - page = filemap_get_page(bitmap, chunk); - if (!page) return; - bit = file_page_offset(chunk); - - /* set the bit */ - kaddr = kmap_atomic(page, KM_USER0); - if (bitmap->flags & BITMAP_HOSTENDIAN) + /* set the bit */ + kaddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set_bit(bit, kaddr); else - ext2_set_bit(bit, kaddr); - kunmap_atomic(kaddr, KM_USER0); - PRINTK("set file bit %lu page %lu\n", bit, page->index); - + set_bit_le(bit, kaddr); + kunmap_atomic(kaddr); + pr_debug("set file bit %lu page %lu\n", bit, page->index); /* record page number so it gets flushed to disk when unplug occurs */ - set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); + set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY); +} +static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *paddr; + unsigned long chunk = block >> bitmap->counts.chunkshift; + + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) + return; + bit = file_page_offset(&bitmap->storage, chunk); + paddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + clear_bit(bit, paddr); + else + clear_bit_le(bit, paddr); + kunmap_atomic(paddr); + if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { + set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); + bitmap->allclean = 0; + } } /* this gets called when the md device is ready to unplug its underlying @@ -834,44 +877,40 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) * sync the dirty pages of the bitmap file to disk */ void bitmap_unplug(struct bitmap *bitmap) { - unsigned long i, flags; + unsigned long i; int dirty, 
need_write; - struct page *page; int wait = 0; - if (!bitmap) + if (!bitmap || !bitmap->storage.filemap || + test_bit(BITMAP_STALE, &bitmap->flags)) return; /* look at each page to see if there are any set bits that need to be * flushed out to disk */ - for (i = 0; i < bitmap->file_pages; i++) { - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->filemap) { - spin_unlock_irqrestore(&bitmap->lock, flags); + for (i = 0; i < bitmap->storage.file_pages; i++) { + if (!bitmap->storage.filemap) return; + dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); + need_write = test_and_clear_page_attr(bitmap, i, + BITMAP_PAGE_NEEDWRITE); + if (dirty || need_write) { + clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); + write_page(bitmap, bitmap->storage.filemap[i], 0); } - page = bitmap->filemap[i]; - dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); - need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); - clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); - clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); if (dirty) wait = 1; - spin_unlock_irqrestore(&bitmap->lock, flags); - - if (dirty | need_write) - write_page(bitmap, page, 0); } if (wait) { /* if any writes were performed, we need to wait on them */ - if (bitmap->file) + if (bitmap->storage.file) wait_event(bitmap->write_wait, atomic_read(&bitmap->pending_writes)==0); else md_super_wait(bitmap->mddev); } - if (bitmap->flags & BITMAP_WRITE_ERROR) + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) bitmap_file_kick(bitmap); } +EXPORT_SYMBOL(bitmap_unplug); static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); /* * bitmap_init_from_disk -- called at bitmap_create time to initialize @@ -888,140 +927,117 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) { unsigned long i, chunks, index, oldindex, bit; - struct page *page = NULL, *oldpage = NULL; - unsigned long num_pages, bit_cnt = 0; + struct page *page = NULL; + unsigned long bit_cnt = 0; struct file *file; - unsigned long bytes, offset; + unsigned long offset; int outofdate; int ret = -ENOSPC; void *paddr; + struct bitmap_storage *store = &bitmap->storage; - chunks = bitmap->chunks; - file = bitmap->file; + chunks = bitmap->counts.chunks; + file = store->file; - BUG_ON(!file && !bitmap->offset); + if (!file && !bitmap->mddev->bitmap_info.offset) { + /* No permanent bitmap - fill with '1s'. 
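+		 * Every chunk gets an in-memory counter, and chunks at or
+		 * beyond the resync position are flagged as needing sync.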
*/ + store->filemap = NULL; + store->file_pages = 0; + for (i = 0; i < chunks ; i++) { + /* if the disk bit is set, set the memory bit */ + int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift) + >= start); + bitmap_set_memory_bits(bitmap, + (sector_t)i << bitmap->counts.chunkshift, + needed); + } + return 0; + } -#ifdef INJECT_FAULTS_3 - outofdate = 1; -#else - outofdate = bitmap->flags & BITMAP_STALE; -#endif + outofdate = test_bit(BITMAP_STALE, &bitmap->flags); if (outofdate) printk(KERN_INFO "%s: bitmap file is out of date, doing full " "recovery\n", bmname(bitmap)); - bytes = (chunks + 7) / 8; - - num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE; - - if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { + if (file && i_size_read(file->f_mapping->host) < store->bytes) { printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", - bmname(bitmap), - (unsigned long) i_size_read(file->f_mapping->host), - bytes + sizeof(bitmap_super_t)); + bmname(bitmap), + (unsigned long) i_size_read(file->f_mapping->host), + store->bytes); goto err; } - ret = -ENOMEM; - - bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); - if (!bitmap->filemap) - goto err; - - /* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */ - bitmap->filemap_attr = kzalloc( - roundup( DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), - GFP_KERNEL); - if (!bitmap->filemap_attr) - goto err; - oldindex = ~0L; + offset = 0; + if (!bitmap->mddev->bitmap_info.external) + offset = sizeof(bitmap_super_t); for (i = 0; i < chunks; i++) { int b; - index = file_page_index(i); - bit = file_page_offset(i); + index = file_page_index(&bitmap->storage, i); + bit = file_page_offset(&bitmap->storage, i); if (index != oldindex) { /* this is a new page, read it in */ int count; /* unmap the old page, we're done with it */ - if (index == num_pages-1) - count = bytes + sizeof(bitmap_super_t) - - index * PAGE_SIZE; + if (index == store->file_pages-1) + count = store->bytes - index * PAGE_SIZE; else count = PAGE_SIZE; - if (index == 0) { - /* - * if we're here then the superblock page - * contains some bits (PAGE_SIZE != sizeof sb) - * we've already read it in, so just use it - */ - page = bitmap->sb_page; - offset = sizeof(bitmap_super_t); - } else if (file) { - page = read_page(file, index, bitmap, count); - offset = 0; - } else { - page = read_sb_page(bitmap->mddev, bitmap->offset, index); - offset = 0; - } - if (IS_ERR(page)) { /* read error */ - ret = PTR_ERR(page); + page = store->filemap[index]; + if (file) + ret = read_page(file, index, bitmap, + count, page); + else + ret = read_sb_page( + bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + page, + index, count); + + if (ret) goto err; - } oldindex = index; - oldpage = page; if (outofdate) { /* * if bitmap is out of date, dirty the - * whole page and write it out + * whole page and write it out */ - paddr = kmap_atomic(page, KM_USER0); + paddr = kmap_atomic(page); memset(paddr + offset, 0xff, PAGE_SIZE - offset); - kunmap_atomic(paddr, KM_USER0); + kunmap_atomic(paddr); write_page(bitmap, page, 1); ret = -EIO; - if (bitmap->flags & BITMAP_WRITE_ERROR) { - /* release, page not in filemap yet */ - put_page(page); + if (test_bit(BITMAP_WRITE_ERROR, + &bitmap->flags)) goto err; - } } - - bitmap->filemap[bitmap->file_pages++] = page; - bitmap->last_page_size = count; } - paddr = kmap_atomic(page, KM_USER0); - if (bitmap->flags & BITMAP_HOSTENDIAN) + paddr = kmap_atomic(page); + if 
(test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) b = test_bit(bit, paddr); else - b = ext2_test_bit(bit, paddr); - kunmap_atomic(paddr, KM_USER0); + b = test_bit_le(bit, paddr); + kunmap_atomic(paddr); if (b) { /* if the disk bit is set, set the memory bit */ - bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap), - ((i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) >= start) - ); + int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift + >= start); + bitmap_set_memory_bits(bitmap, + (sector_t)i << bitmap->counts.chunkshift, + needed); bit_cnt++; - set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } - } - - /* everything went OK */ - ret = 0; - bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); - - if (bit_cnt) { /* Kick recovery if any bits were set */ - set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); - md_wakeup_thread(bitmap->mddev->thread); + offset = 0; } printk(KERN_INFO "%s: bitmap initialized from disk: " - "read %lu/%lu pages, set %lu bits\n", - bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt); + "read %lu pages, set %lu of %lu bits\n", + bmname(bitmap), store->file_pages, + bit_cnt, chunks); return 0; @@ -1038,25 +1054,39 @@ void bitmap_write_all(struct bitmap *bitmap) */ int i; - for (i=0; i < bitmap->file_pages; i++) - set_page_attr(bitmap, bitmap->filemap[i], + if (!bitmap || !bitmap->storage.filemap) + return; + if (bitmap->storage.file) + /* Only one copy, so nothing needed */ + return; + + for (i = 0; i < bitmap->storage.file_pages; i++) + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); + bitmap->allclean = 0; } - -static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) +static void bitmap_count_page(struct bitmap_counts *bitmap, + sector_t offset, int inc) { - sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); + sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; bitmap->bp[page].count += inc; -/* - if (page == 0) printk("count page 0, offset %llu: %d gives %d\n", - (unsigned long long)offset, inc, bitmap->bp[page].count); -*/ bitmap_checkfree(bitmap, page); } -static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, - sector_t offset, int *blocks, + +static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) +{ + sector_t chunk = offset >> bitmap->chunkshift; + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + struct bitmap_page *bp = &bitmap->bp[page]; + + if (!bp->pending) + bp->pending = 1; +} + +static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, + sector_t offset, sector_t *blocks, int create); /* @@ -1064,189 +1094,198 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, * out to disk */ -void bitmap_daemon_work(struct bitmap *bitmap) +void bitmap_daemon_work(struct mddev *mddev) { + struct bitmap *bitmap; unsigned long j; - unsigned long flags; - struct page *page = NULL, *lastpage = NULL; - int blocks; - void *paddr; + unsigned long nextpage; + sector_t blocks; + struct bitmap_counts *counts; - if (bitmap == NULL) + /* Use a mutex to guard daemon_work against + * bitmap_destroy. 
+ */ + mutex_lock(&mddev->bitmap_info.mutex); + bitmap = mddev->bitmap; + if (bitmap == NULL) { + mutex_unlock(&mddev->bitmap_info.mutex); return; - if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ)) + } + if (time_before(jiffies, bitmap->daemon_lastrun + + mddev->bitmap_info.daemon_sleep)) goto done; bitmap->daemon_lastrun = jiffies; if (bitmap->allclean) { - bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - return; + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + goto done; } bitmap->allclean = 1; - for (j = 0; j < bitmap->chunks; j++) { - bitmap_counter_t *bmc; - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->filemap) { - /* error or shutdown */ - spin_unlock_irqrestore(&bitmap->lock, flags); - break; + /* Any file-page which is PENDING now needs to be written. + * So set NEEDWRITE now, then after we make any last-minute changes + * we will write it. + */ + for (j = 0; j < bitmap->storage.file_pages; j++) + if (test_and_clear_page_attr(bitmap, j, + BITMAP_PAGE_PENDING)) + set_page_attr(bitmap, j, + BITMAP_PAGE_NEEDWRITE); + + if (bitmap->need_sync && + mddev->bitmap_info.external == 0) { + /* Arrange for superblock update as well as + * other changes */ + bitmap_super_t *sb; + bitmap->need_sync = 0; + if (bitmap->storage.filemap) { + sb = kmap_atomic(bitmap->storage.sb_page); + sb->events_cleared = + cpu_to_le64(bitmap->events_cleared); + kunmap_atomic(sb); + set_page_attr(bitmap, 0, + BITMAP_PAGE_NEEDWRITE); } + } + /* Now look at the bitmap counters and if any are '2' or '1', + * decrement and handle accordingly. + */ + counts = &bitmap->counts; + spin_lock_irq(&counts->lock); + nextpage = 0; + for (j = 0; j < counts->chunks; j++) { + bitmap_counter_t *bmc; + sector_t block = (sector_t)j << counts->chunkshift; - page = filemap_get_page(bitmap, j); - - if (page != lastpage) { - /* skip this page unless it's marked as needing cleaning */ - if (!test_page_attr(bitmap, page, BITMAP_PAGE_CLEAN)) { - int need_write = test_page_attr(bitmap, page, - BITMAP_PAGE_NEEDWRITE); - if (need_write) - clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); - - spin_unlock_irqrestore(&bitmap->lock, flags); - if (need_write) { - write_page(bitmap, page, 0); - bitmap->allclean = 0; - } + if (j == nextpage) { + nextpage += PAGE_COUNTER_RATIO; + if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { + j |= PAGE_COUNTER_MASK; continue; } - - /* grab the new page, sync and release the old */ - if (lastpage != NULL) { - if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { - clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); - spin_unlock_irqrestore(&bitmap->lock, flags); - write_page(bitmap, lastpage, 0); - } else { - set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); - spin_unlock_irqrestore(&bitmap->lock, flags); - } - } else - spin_unlock_irqrestore(&bitmap->lock, flags); - lastpage = page; - - /* We are possibly going to clear some bits, so make - * sure that events_cleared is up-to-date. 
- */ - if (bitmap->need_sync) { - bitmap_super_t *sb; - bitmap->need_sync = 0; - sb = kmap_atomic(bitmap->sb_page, KM_USER0); - sb->events_cleared = - cpu_to_le64(bitmap->events_cleared); - kunmap_atomic(sb, KM_USER0); - write_page(bitmap, bitmap->sb_page, 1); - } - spin_lock_irqsave(&bitmap->lock, flags); - clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); + counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; } - bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap), - &blocks, 0); - if (bmc) { -/* - if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc); -*/ - if (*bmc) - bitmap->allclean = 0; + bmc = bitmap_get_counter(counts, + block, + &blocks, 0); - if (*bmc == 2) { - *bmc=1; /* maybe clear the bit next time */ - set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); - } else if (*bmc == 1) { - /* we can clear the bit */ - *bmc = 0; - bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap), - -1); - - /* clear the bit */ - paddr = kmap_atomic(page, KM_USER0); - if (bitmap->flags & BITMAP_HOSTENDIAN) - clear_bit(file_page_offset(j), paddr); - else - ext2_clear_bit(file_page_offset(j), paddr); - kunmap_atomic(paddr, KM_USER0); - } + if (!bmc) { + j |= PAGE_COUNTER_MASK; + continue; + } + if (*bmc == 1 && !bitmap->need_sync) { + /* We can clear the bit */ + *bmc = 0; + bitmap_count_page(counts, block, -1); + bitmap_file_clear_bit(bitmap, block); + } else if (*bmc && *bmc <= 2) { + *bmc = 1; + bitmap_set_pending(counts, block); + bitmap->allclean = 0; } - spin_unlock_irqrestore(&bitmap->lock, flags); } - - /* now sync the final page */ - if (lastpage != NULL) { - spin_lock_irqsave(&bitmap->lock, flags); - if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { - clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); - spin_unlock_irqrestore(&bitmap->lock, flags); - write_page(bitmap, lastpage, 0); - } else { - set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); - spin_unlock_irqrestore(&bitmap->lock, flags); + spin_unlock_irq(&counts->lock); + + /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. + * DIRTY pages need to be written by bitmap_unplug so it can wait + * for them. + * If we find any DIRTY page we stop there and let bitmap_unplug + * handle all the rest. This is important in the case where + * the first blocking holds the superblock and it has been updated. + * We mustn't write any other blocks before the superblock. + */ + for (j = 0; + j < bitmap->storage.file_pages + && !test_bit(BITMAP_STALE, &bitmap->flags); + j++) { + + if (test_page_attr(bitmap, j, + BITMAP_PAGE_DIRTY)) + /* bitmap_unplug will handle the rest */ + break; + if (test_and_clear_page_attr(bitmap, j, + BITMAP_PAGE_NEEDWRITE)) { + write_page(bitmap, bitmap->storage.filemap[j], 0); } } done: if (bitmap->allclean == 0) - bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ; + mddev->thread->timeout = + mddev->bitmap_info.daemon_sleep; + mutex_unlock(&mddev->bitmap_info.mutex); } -static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, - sector_t offset, int *blocks, +static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, + sector_t offset, sector_t *blocks, int create) +__releases(bitmap->lock) +__acquires(bitmap->lock) { /* If 'create', we might release the lock and reclaim it. * The lock must have been taken with interrupts enabled. * If !create, we don't release the lock. 
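The locking contract described here is the one the in-tree callers follow: take counts.lock with interrupts disabled, look up the counter, adjust it, then unlock. A minimal illustrative sketch, not part of the patch itself ('bitmap' and 'offset' stand for whatever the caller already has):

        sector_t blocks;
        bitmap_counter_t *bmc;

        spin_lock_irq(&bitmap->counts.lock);
        bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
        if (bmc)
                (*bmc)++;       /* e.g. account one more pending write */
        spin_unlock_irq(&bitmap->counts.lock);

bitmap_startwrite() and bitmap_endwrite() below are the real users of exactly this pattern.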
*/ - sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); + sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; sector_t csize; + int err; - if (bitmap_checkpage(bitmap, page, create) < 0) { - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); - *blocks = csize - (offset & (csize- 1)); + err = bitmap_checkpage(bitmap, page, create); + + if (bitmap->bp[page].hijacked || + bitmap->bp[page].map == NULL) + csize = ((sector_t)1) << (bitmap->chunkshift + + PAGE_COUNTER_SHIFT - 1); + else + csize = ((sector_t)1) << bitmap->chunkshift; + *blocks = csize - (offset & (csize - 1)); + + if (err < 0) return NULL; - } + /* now locked ... */ if (bitmap->bp[page].hijacked) { /* hijacked pointer */ /* should we use the first or second counter field * of the hijacked pointer? */ int hi = (pageoff > PAGE_COUNTER_MASK); - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + - PAGE_COUNTER_SHIFT - 1); - *blocks = csize - (offset & (csize- 1)); return &((bitmap_counter_t *) &bitmap->bp[page].map)[hi]; - } else { /* page is allocated */ - csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); - *blocks = csize - (offset & (csize- 1)); + } else /* page is allocated */ return (bitmap_counter_t *) &(bitmap->bp[page].map[pageoff]); - } } int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) { - if (!bitmap) return 0; + if (!bitmap) + return 0; if (behind) { + int bw; atomic_inc(&bitmap->behind_writes); - PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", - atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + bw = atomic_read(&bitmap->behind_writes); + if (bw > bitmap->behind_writes_used) + bitmap->behind_writes_used = bw; + + pr_debug("inc write-behind count %d/%lu\n", + bw, bitmap->mddev->bitmap_info.max_write_behind); } while (sectors) { - int blocks; + sector_t blocks; bitmap_counter_t *bmc; - spin_lock_irq(&bitmap->lock); - bmc = bitmap_get_counter(bitmap, offset, &blocks, 1); + spin_lock_irq(&bitmap->counts.lock); + bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); if (!bmc) { - spin_unlock_irq(&bitmap->lock); + spin_unlock_irq(&bitmap->counts.lock); return 0; } - if (unlikely((*bmc & COUNTER_MAX) == COUNTER_MAX)) { + if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { DEFINE_WAIT(__wait); /* note that it is safe to do the prepare_to_wait * after the test as long as we do it before dropping @@ -1254,18 +1293,16 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect */ prepare_to_wait(&bitmap->overflow_wait, &__wait, TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&bitmap->lock); - blk_unplug(bitmap->mddev->queue); + spin_unlock_irq(&bitmap->counts.lock); schedule(); finish_wait(&bitmap->overflow_wait, &__wait); continue; } - switch(*bmc) { + switch (*bmc) { case 0: bitmap_file_set_bit(bitmap, offset); - bitmap_count_page(bitmap,offset, 1); - blk_plug_device_unlocked(bitmap->mddev->queue); + bitmap_count_page(&bitmap->counts, offset, 1); /* fall through */ case 1: *bmc = 2; @@ -1273,67 +1310,73 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect (*bmc)++; - spin_unlock_irq(&bitmap->lock); + spin_unlock_irq(&bitmap->counts.lock); offset += blocks; if (sectors > blocks) sectors -= blocks; - else sectors = 0; + else + sectors = 0; } - bitmap->allclean = 0; return 0; } +EXPORT_SYMBOL(bitmap_startwrite); void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long 
sectors, int success, int behind) { - if (!bitmap) return; + if (!bitmap) + return; if (behind) { - atomic_dec(&bitmap->behind_writes); - PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", - atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + if (atomic_dec_and_test(&bitmap->behind_writes)) + wake_up(&bitmap->behind_wait); + pr_debug("dec write-behind count %d/%lu\n", + atomic_read(&bitmap->behind_writes), + bitmap->mddev->bitmap_info.max_write_behind); } while (sectors) { - int blocks; + sector_t blocks; unsigned long flags; bitmap_counter_t *bmc; - spin_lock_irqsave(&bitmap->lock, flags); - bmc = bitmap_get_counter(bitmap, offset, &blocks, 0); + spin_lock_irqsave(&bitmap->counts.lock, flags); + bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); if (!bmc) { - spin_unlock_irqrestore(&bitmap->lock, flags); + spin_unlock_irqrestore(&bitmap->counts.lock, flags); return; } - if (success && + if (success && !bitmap->mddev->degraded && bitmap->events_cleared < bitmap->mddev->events) { bitmap->events_cleared = bitmap->mddev->events; bitmap->need_sync = 1; + sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); } - if (!success && ! (*bmc & NEEDED_MASK)) + if (!success && !NEEDED(*bmc)) *bmc |= NEEDED_MASK; - if ((*bmc & COUNTER_MAX) == COUNTER_MAX) + if (COUNTER(*bmc) == COUNTER_MAX) wake_up(&bitmap->overflow_wait); (*bmc)--; if (*bmc <= 2) { - set_page_attr(bitmap, - filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), - BITMAP_PAGE_CLEAN); + bitmap_set_pending(&bitmap->counts, offset); + bitmap->allclean = 0; } - spin_unlock_irqrestore(&bitmap->lock, flags); + spin_unlock_irqrestore(&bitmap->counts.lock, flags); offset += blocks; if (sectors > blocks) sectors -= blocks; - else sectors = 0; + else + sectors = 0; } } +EXPORT_SYMBOL(bitmap_endwrite); -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, - int degraded) +static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, + int degraded) { bitmap_counter_t *bmc; int rv; @@ -1341,8 +1384,8 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, *blocks = 1024; return 1; /* always resync if no bitmap */ } - spin_lock_irq(&bitmap->lock); - bmc = bitmap_get_counter(bitmap, offset, blocks, 0); + spin_lock_irq(&bitmap->counts.lock); + bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); rv = 0; if (bmc) { /* locked */ @@ -1356,29 +1399,48 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, } } } - spin_unlock_irq(&bitmap->lock); - bitmap->allclean = 0; + spin_unlock_irq(&bitmap->counts.lock); + return rv; +} + +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, + int degraded) +{ + /* bitmap_start_sync must always report on multiples of whole + * pages, otherwise resync (which is very PAGE_SIZE based) will + * get confused. + * So call __bitmap_start_sync repeatedly (if needed) until + * At least PAGE_SIZE>>9 blocks are covered. + * Return the 'or' of the result. 
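With 4 KiB pages PAGE_SIZE>>9 is 8, so the wrapper keeps calling __bitmap_start_sync() until at least eight 512-byte blocks are covered and ORs the individual results. A rough sketch of how a resync loop in a personality might consume it ('sector_nr' and 'still_degraded' are placeholder names, not taken from this patch):

        sector_t sync_blocks;

        if (!bitmap_start_sync(bitmap, sector_nr, &sync_blocks, still_degraded)) {
                /* nothing in this window needs resync; skip it whole */
                sector_nr += sync_blocks;
        }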
+ */ + int rv = 0; + sector_t blocks1; + + *blocks = 0; + while (*blocks < (PAGE_SIZE>>9)) { + rv |= __bitmap_start_sync(bitmap, offset, + &blocks1, degraded); + offset += blocks1; + *blocks += blocks1; + } return rv; } +EXPORT_SYMBOL(bitmap_start_sync); -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) { bitmap_counter_t *bmc; unsigned long flags; -/* - if (offset == 0) printk("bitmap_end_sync 0 (%d)\n", aborted); -*/ if (bitmap == NULL) { + + if (bitmap == NULL) { *blocks = 1024; return; } - spin_lock_irqsave(&bitmap->lock, flags); - bmc = bitmap_get_counter(bitmap, offset, blocks, 0); + spin_lock_irqsave(&bitmap->counts.lock, flags); + bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc == NULL) goto unlock; /* locked */ -/* - if (offset == 0) printk("bitmap_end sync found 0x%x, blocks %d\n", *bmc, *blocks); -*/ if (RESYNC(*bmc)) { *bmc &= ~RESYNC_MASK; @@ -1386,16 +1448,15 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int ab *bmc |= NEEDED_MASK; else { if (*bmc <= 2) { - set_page_attr(bitmap, - filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), - BITMAP_PAGE_CLEAN); + bitmap_set_pending(&bitmap->counts, offset); + bitmap->allclean = 0; } } } unlock: - spin_unlock_irqrestore(&bitmap->lock, flags); - bitmap->allclean = 0; + spin_unlock_irqrestore(&bitmap->counts.lock, flags); } +EXPORT_SYMBOL(bitmap_end_sync); void bitmap_close_sync(struct bitmap *bitmap) { @@ -1404,7 +1465,7 @@ void bitmap_close_sync(struct bitmap *bitmap) * RESYNC bit wherever it is still on */ sector_t sector = 0; - int blocks; + sector_t blocks; if (!bitmap) return; while (sector < bitmap->mddev->resync_max_sectors) { @@ -1412,11 +1473,12 @@ void bitmap_close_sync(struct bitmap *bitmap) sector += blocks; } } +EXPORT_SYMBOL(bitmap_close_sync); void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) { sector_t s = 0; - int blocks; + sector_t blocks; if (!bitmap) return; @@ -1425,44 +1487,46 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) return; } if (time_before(jiffies, (bitmap->last_end_sync - + bitmap->daemon_sleep * HZ))) + + bitmap->mddev->bitmap_info.daemon_sleep))) return; wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); - sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); + bitmap->mddev->curr_resync_completed = sector; + set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); + sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { bitmap_end_sync(bitmap, s, &blocks, 0); s += blocks; } bitmap->last_end_sync = jiffies; + sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); } +EXPORT_SYMBOL(bitmap_cond_end_sync); static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) { /* For each chunk covered by any of these sectors, set the - * counter to 1 and set resync_needed. They should all + * counter to 2 and possibly set resync_needed. They should all * be 0 at this point */ - int secs; + sector_t secs; bitmap_counter_t *bmc; - spin_lock_irq(&bitmap->lock); - bmc = bitmap_get_counter(bitmap, offset, &secs, 1); + spin_lock_irq(&bitmap->counts.lock); + bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1); if (!bmc) { - spin_unlock_irq(&bitmap->lock); + spin_unlock_irq(&bitmap->counts.lock); return; } - if (! 
*bmc) { - struct page *page; - *bmc = 1 | (needed?NEEDED_MASK:0); - bitmap_count_page(bitmap, offset, 1); - page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); - set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); + if (!*bmc) { + *bmc = 2 | (needed ? NEEDED_MASK : 0); + bitmap_count_page(&bitmap->counts, offset, 1); + bitmap_set_pending(&bitmap->counts, offset); + bitmap->allclean = 0; } - spin_unlock_irq(&bitmap->lock); - bitmap->allclean = 0; + spin_unlock_irq(&bitmap->counts.lock); } /* dirty the memory and file bits for bitmap chunks "s" to "e" */ @@ -1471,19 +1535,25 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) unsigned long chunk; for (chunk = s; chunk <= e; chunk++) { - sector_t sec = chunk << CHUNK_BLOCK_SHIFT(bitmap); + sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; bitmap_set_memory_bits(bitmap, sec, 1); bitmap_file_set_bit(bitmap, sec); + if (sec < bitmap->mddev->recovery_cp) + /* We are asserting that the array is dirty, + * so move the recovery_cp address back so + * that it is obvious that it is dirty + */ + bitmap->mddev->recovery_cp = sec; } } /* * flush out any pending updates */ -void bitmap_flush(mddev_t *mddev) +void bitmap_flush(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; - int sleep; + long sleep; if (!bitmap) /* there was no bitmap */ return; @@ -1491,12 +1561,13 @@ void bitmap_flush(mddev_t *mddev) /* run the daemon_work three time to ensure everything is flushed * that can be */ - sleep = bitmap->daemon_sleep; - bitmap->daemon_sleep = 0; - bitmap_daemon_work(bitmap); - bitmap_daemon_work(bitmap); - bitmap_daemon_work(bitmap); - bitmap->daemon_sleep = sleep; + sleep = mddev->bitmap_info.daemon_sleep * 2; + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); bitmap_update_sb(bitmap); } @@ -1511,11 +1582,15 @@ static void bitmap_free(struct bitmap *bitmap) if (!bitmap) /* there was no bitmap */ return; - /* release the bitmap file and kill the daemon */ - bitmap_file_put(bitmap); + /* Shouldn't be needed - but just in case.... 
*/ + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes) == 0); + + /* release the bitmap file */ + bitmap_file_unmap(&bitmap->storage); - bp = bitmap->bp; - pages = bitmap->pages; + bp = bitmap->counts.bp; + pages = bitmap->counts.pages; /* free all allocated memory */ @@ -1526,17 +1601,23 @@ static void bitmap_free(struct bitmap *bitmap) kfree(bp); kfree(bitmap); } -void bitmap_destroy(mddev_t *mddev) + +void bitmap_destroy(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) /* there was no bitmap */ return; + mutex_lock(&mddev->bitmap_info.mutex); mddev->bitmap = NULL; /* disconnect from the md device */ + mutex_unlock(&mddev->bitmap_info.mutex); if (mddev->thread) mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + if (bitmap->sysfs_can_clear) + sysfs_put(bitmap->sysfs_can_clear); + bitmap_free(bitmap); } @@ -1544,106 +1625,641 @@ void bitmap_destroy(mddev_t *mddev) * initialize the bitmap structure * if this returns an error, bitmap_destroy must be called to do clean up */ -int bitmap_create(mddev_t *mddev) +int bitmap_create(struct mddev *mddev) { struct bitmap *bitmap; - unsigned long blocks = mddev->resync_max_sectors; - unsigned long chunks; - unsigned long pages; - struct file *file = mddev->bitmap_file; + sector_t blocks = mddev->resync_max_sectors; + struct file *file = mddev->bitmap_info.file; int err; - sector_t start; + struct kernfs_node *bm = NULL; BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); - if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ - return 0; - - BUG_ON(file && mddev->bitmap_offset); + BUG_ON(file && mddev->bitmap_info.offset); bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return -ENOMEM; - spin_lock_init(&bitmap->lock); + spin_lock_init(&bitmap->counts.lock); atomic_set(&bitmap->pending_writes, 0); init_waitqueue_head(&bitmap->write_wait); init_waitqueue_head(&bitmap->overflow_wait); + init_waitqueue_head(&bitmap->behind_wait); bitmap->mddev = mddev; - bitmap->file = file; - bitmap->offset = mddev->bitmap_offset; + if (mddev->kobj.sd) + bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); + if (bm) { + bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); + sysfs_put(bm); + } else + bitmap->sysfs_can_clear = NULL; + + bitmap->storage.file = file; if (file) { get_file(file); - do_sync_mapping_range(file->f_mapping, 0, LLONG_MAX, - SYNC_FILE_RANGE_WAIT_BEFORE | - SYNC_FILE_RANGE_WRITE | - SYNC_FILE_RANGE_WAIT_AFTER); + /* As future accesses to this file will use bmap, + * and bypass the page cache, we must sync the file + * first. + */ + vfs_fsync(file, 1); + } + /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ + if (!mddev->bitmap_info.external) { + /* + * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is + * instructing us to create a new on-disk bitmap instance. + */ + if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags)) + err = bitmap_new_disk_sb(bitmap); + else + err = bitmap_read_sb(bitmap); + } else { + err = 0; + if (mddev->bitmap_info.chunksize == 0 || + mddev->bitmap_info.daemon_sleep == 0) + /* chunksize and time_base need to be + * set first. 
*/ + err = -EINVAL; } - /* read superblock from bitmap file (this sets bitmap->chunksize) */ - err = bitmap_read_sb(bitmap); if (err) goto error; - bitmap->chunkshift = ffz(~bitmap->chunksize); + bitmap->daemon_lastrun = jiffies; + err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); + if (err) + goto error; - /* now that chunksize and chunkshift are set, we can use these macros */ - chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) / - CHUNK_BLOCK_RATIO(bitmap); - pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; + printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", + bitmap->counts.pages, bmname(bitmap)); - BUG_ON(!pages); + mddev->bitmap = bitmap; + return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; - bitmap->chunks = chunks; - bitmap->pages = pages; - bitmap->missing_pages = pages; - bitmap->counter_bits = COUNTER_BITS; + error: + bitmap_free(bitmap); + return err; +} - bitmap->syncchunk = ~0UL; +int bitmap_load(struct mddev *mddev) +{ + int err = 0; + sector_t start = 0; + sector_t sector = 0; + struct bitmap *bitmap = mddev->bitmap; -#ifdef INJECT_FATAL_FAULT_1 - bitmap->bp = NULL; -#else - bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); -#endif - err = -ENOMEM; - if (!bitmap->bp) - goto error; + if (!bitmap) + goto out; + + /* Clear out old bitmap info first: Either there is none, or we + * are resuming after someone else has possibly changed things, + * so we should forget old cached info. + * All chunks should be clean, but some might need_sync. + */ + while (sector < mddev->resync_max_sectors) { + sector_t blocks; + bitmap_start_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + bitmap_close_sync(bitmap); - /* now that we have some pages available, initialize the in-memory - * bitmap from the on-disk bitmap */ - start = 0; if (mddev->degraded == 0 || bitmap->events_cleared == mddev->events) - /* no need to keep dirty bits to optimise a re-add of a missing device */ + /* no need to keep dirty bits to optimise a + * re-add of a missing device */ start = mddev->recovery_cp; + + mutex_lock(&mddev->bitmap_info.mutex); err = bitmap_init_from_disk(bitmap, start); + mutex_unlock(&mddev->bitmap_info.mutex); if (err) - goto error; - - printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", - pages, bmname(bitmap)); + goto out; + clear_bit(BITMAP_STALE, &bitmap->flags); - mddev->bitmap = bitmap; + /* Kick recovery in case any bits were set */ + set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); - mddev->thread->timeout = bitmap->daemon_sleep * HZ; + mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + md_wakeup_thread(mddev->thread); bitmap_update_sb(bitmap); - return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0; - - error: - bitmap_free(bitmap); + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + err = -EIO; +out: return err; } +EXPORT_SYMBOL_GPL(bitmap_load); + +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +{ + unsigned long chunk_kb; + struct bitmap_counts *counts; + + if (!bitmap) + return; + + counts = &bitmap->counts; + + chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " + "%lu%s chunk", + counts->pages - counts->missing_pages, + counts->pages, + (counts->pages - counts->missing_pages) + << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, + chunk_kb ? 
"KB" : "B"); + if (bitmap->storage.file) { + seq_printf(seq, ", file: "); + seq_path(seq, &bitmap->storage.file->f_path, " \t\n"); + } + + seq_printf(seq, "\n"); +} + +int bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, int init) +{ + /* If chunk_size is 0, choose an appropriate chunk size. + * Then possibly allocate new storage space. + * Then quiesce, copy bits, replace bitmap, and re-start + * + * This function is called both to set up the initial bitmap + * and to resize the bitmap while the array is active. + * If this happens as a result of the array being resized, + * chunksize will be zero, and we need to choose a suitable + * chunksize, otherwise we use what we are given. + */ + struct bitmap_storage store; + struct bitmap_counts old_counts; + unsigned long chunks; + sector_t block; + sector_t old_blocks, new_blocks; + int chunkshift; + int ret = 0; + long pages; + struct bitmap_page *new_bp; + + if (chunksize == 0) { + /* If there is enough space, leave the chunk size unchanged, + * else increase by factor of two until there is enough space. + */ + long bytes; + long space = bitmap->mddev->bitmap_info.space; + + if (space == 0) { + /* We don't know how much space there is, so limit + * to current size - in sectors. + */ + bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8); + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); + space = DIV_ROUND_UP(bytes, 512); + bitmap->mddev->bitmap_info.space = space; + } + chunkshift = bitmap->counts.chunkshift; + chunkshift--; + do { + /* 'chunkshift' is shift from block size to chunk size */ + chunkshift++; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); + bytes = DIV_ROUND_UP(chunks, 8); + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); + } while (bytes > (space << 9)); + } else + chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; + + chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); + memset(&store, 0, sizeof(store)); + if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) + ret = bitmap_storage_alloc(&store, chunks, + !bitmap->mddev->bitmap_info.external); + if (ret) + goto err; + + pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); + + new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL); + ret = -ENOMEM; + if (!new_bp) { + bitmap_file_unmap(&store); + goto err; + } + + if (!init) + bitmap->mddev->pers->quiesce(bitmap->mddev, 1); + + store.file = bitmap->storage.file; + bitmap->storage.file = NULL; + + if (store.sb_page && bitmap->storage.sb_page) + memcpy(page_address(store.sb_page), + page_address(bitmap->storage.sb_page), + sizeof(bitmap_super_t)); + bitmap_file_unmap(&bitmap->storage); + bitmap->storage = store; + + old_counts = bitmap->counts; + bitmap->counts.bp = new_bp; + bitmap->counts.pages = pages; + bitmap->counts.missing_pages = pages; + bitmap->counts.chunkshift = chunkshift; + bitmap->counts.chunks = chunks; + bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift + + BITMAP_BLOCK_SHIFT); + + blocks = min(old_counts.chunks << old_counts.chunkshift, + chunks << chunkshift); + + spin_lock_irq(&bitmap->counts.lock); + for (block = 0; block < blocks; ) { + bitmap_counter_t *bmc_old, *bmc_new; + int set; + + bmc_old = bitmap_get_counter(&old_counts, block, + &old_blocks, 0); + set = bmc_old && NEEDED(*bmc_old); + + if (set) { + bmc_new = bitmap_get_counter(&bitmap->counts, block, + &new_blocks, 1); + if (*bmc_new == 0) { + /* need to set on-disk bits too. 
*/ + sector_t end = block + new_blocks; + sector_t start = block >> chunkshift; + start <<= chunkshift; + while (start < end) { + bitmap_file_set_bit(bitmap, block); + start += 1 << chunkshift; + } + *bmc_new = 2; + bitmap_count_page(&bitmap->counts, + block, 1); + bitmap_set_pending(&bitmap->counts, + block); + } + *bmc_new |= NEEDED_MASK; + if (new_blocks < old_blocks) + old_blocks = new_blocks; + } + block += old_blocks; + } + + if (!init) { + int i; + while (block < (chunks << chunkshift)) { + bitmap_counter_t *bmc; + bmc = bitmap_get_counter(&bitmap->counts, block, + &new_blocks, 1); + if (bmc) { + /* new space. It needs to be resynced, so + * we set NEEDED_MASK. + */ + if (*bmc == 0) { + *bmc = NEEDED_MASK | 2; + bitmap_count_page(&bitmap->counts, + block, 1); + bitmap_set_pending(&bitmap->counts, + block); + } + } + block += new_blocks; + } + for (i = 0; i < bitmap->storage.file_pages; i++) + set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); + } + spin_unlock_irq(&bitmap->counts.lock); + + if (!init) { + bitmap_unplug(bitmap); + bitmap->mddev->pers->quiesce(bitmap->mddev, 0); + } + ret = 0; +err: + return ret; +} +EXPORT_SYMBOL_GPL(bitmap_resize); + +static ssize_t +location_show(struct mddev *mddev, char *page) +{ + ssize_t len; + if (mddev->bitmap_info.file) + len = sprintf(page, "file"); + else if (mddev->bitmap_info.offset) + len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); + else + len = sprintf(page, "none"); + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +location_store(struct mddev *mddev, const char *buf, size_t len) +{ + + if (mddev->pers) { + if (!mddev->pers->quiesce) + return -EBUSY; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + } + + if (mddev->bitmap || mddev->bitmap_info.file || + mddev->bitmap_info.offset) { + /* bitmap already configured. Only option is to clear it */ + if (strncmp(buf, "none", 4) != 0) + return -EBUSY; + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } + mddev->bitmap_info.offset = 0; + if (mddev->bitmap_info.file) { + struct file *f = mddev->bitmap_info.file; + mddev->bitmap_info.file = NULL; + fput(f); + } + } else { + /* No bitmap, OK to set a location */ + long long offset; + if (strncmp(buf, "none", 4) == 0) + /* nothing to be done */; + else if (strncmp(buf, "file:", 5) == 0) { + /* Not supported yet */ + return -EINVAL; + } else { + int rv; + if (buf[0] == '+') + rv = kstrtoll(buf+1, 10, &offset); + else + rv = kstrtoll(buf, 10, &offset); + if (rv) + return rv; + if (offset == 0) + return -EINVAL; + if (mddev->bitmap_info.external == 0 && + mddev->major_version == 0 && + offset != mddev->bitmap_info.default_offset) + return -EINVAL; + mddev->bitmap_info.offset = offset; + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + rv = bitmap_create(mddev); + if (!rv) + rv = bitmap_load(mddev); + if (rv) { + bitmap_destroy(mddev); + mddev->bitmap_info.offset = 0; + } + mddev->pers->quiesce(mddev, 0); + if (rv) + return rv; + } + } + } + if (!mddev->external) { + /* Ensure new bitmap info is stored in + * metadata promptly. + */ + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + } + return len; +} + +static struct md_sysfs_entry bitmap_location = +__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); + +/* 'bitmap/space' is the space available at 'location' for the + * bitmap. This allows the kernel to know when it is safe to + * resize the bitmap to match a resized array. 
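To get a feel for the numbers involved: the sizing rules in bitmap_resize() above give, for a 2 TiB array (2^32 512-byte blocks) with 64 MiB bitmap chunks and an internal superblock, a space requirement of only 9 sectors. A worked sketch using the same helpers (the constants are illustrative, not from the patch):

        sector_t blocks = 1ULL << 32;               /* 2 TiB in 512-byte blocks */
        int chunkshift = 26 - BITMAP_BLOCK_SHIFT;   /* 64 MiB chunks -> 17      */
        unsigned long chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
        long bytes = DIV_ROUND_UP(chunks, 8) + sizeof(bitmap_super_t);
        long space = DIV_ROUND_UP(bytes, 512);      /* (4096 + 256) / 512 -> 9  */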
+ */ +static ssize_t +space_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.space); +} + +static ssize_t +space_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long sectors; + int rv; + + rv = kstrtoul(buf, 10, §ors); + if (rv) + return rv; + + if (sectors == 0) + return -EINVAL; + + if (mddev->bitmap && + sectors < (mddev->bitmap->storage.bytes + 511) >> 9) + return -EFBIG; /* Bitmap is too big for this small space */ + + /* could make sure it isn't too big, but that isn't really + * needed - user-space should be careful. + */ + mddev->bitmap_info.space = sectors; + return len; +} + +static struct md_sysfs_entry bitmap_space = +__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store); + +static ssize_t +timeout_show(struct mddev *mddev, char *page) +{ + ssize_t len; + unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; + unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; + + len = sprintf(page, "%lu", secs); + if (jifs) + len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +timeout_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* timeout can be set at any time */ + unsigned long timeout; + int rv = strict_strtoul_scaled(buf, &timeout, 4); + if (rv) + return rv; + + /* just to make sure we don't overflow... */ + if (timeout >= LONG_MAX / HZ) + return -EINVAL; + + timeout = timeout * HZ / 10000; + + if (timeout >= MAX_SCHEDULE_TIMEOUT) + timeout = MAX_SCHEDULE_TIMEOUT-1; + if (timeout < 1) + timeout = 1; + mddev->bitmap_info.daemon_sleep = timeout; + if (mddev->thread) { + /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then + * the bitmap is all clean and we don't need to + * adjust the timeout right now + */ + if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { + mddev->thread->timeout = timeout; + md_wakeup_thread(mddev->thread); + } + } + return len; +} + +static struct md_sysfs_entry bitmap_timeout = +__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); + +static ssize_t +backlog_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); +} + +static ssize_t +backlog_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long backlog; + int rv = kstrtoul(buf, 10, &backlog); + if (rv) + return rv; + if (backlog > COUNTER_MAX) + return -EINVAL; + mddev->bitmap_info.max_write_behind = backlog; + return len; +} + +static struct md_sysfs_entry bitmap_backlog = +__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); + +static ssize_t +chunksize_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); +} + +static ssize_t +chunksize_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* Can only be changed when no bitmap is active */ + int rv; + unsigned long csize; + if (mddev->bitmap) + return -EBUSY; + rv = kstrtoul(buf, 10, &csize); + if (rv) + return rv; + if (csize < 512 || + !is_power_of_2(csize)) + return -EINVAL; + mddev->bitmap_info.chunksize = csize; + return len; +} + +static struct md_sysfs_entry bitmap_chunksize = +__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); + +static ssize_t metadata_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%s\n", (mddev->bitmap_info.external + ? 
"external" : "internal")); +} + +static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap || + mddev->bitmap_info.file || + mddev->bitmap_info.offset) + return -EBUSY; + if (strncmp(buf, "external", 8) == 0) + mddev->bitmap_info.external = 1; + else if (strncmp(buf, "internal", 8) == 0) + mddev->bitmap_info.external = 0; + else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_metadata = +__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); + +static ssize_t can_clear_show(struct mddev *mddev, char *page) +{ + int len; + if (mddev->bitmap) + len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? + "false" : "true")); + else + len = sprintf(page, "\n"); + return len; +} + +static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap == NULL) + return -ENOENT; + if (strncmp(buf, "false", 5) == 0) + mddev->bitmap->need_sync = 1; + else if (strncmp(buf, "true", 4) == 0) { + if (mddev->degraded) + return -EBUSY; + mddev->bitmap->need_sync = 0; + } else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_can_clear = +__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); + +static ssize_t +behind_writes_used_show(struct mddev *mddev, char *page) +{ + if (mddev->bitmap == NULL) + return sprintf(page, "0\n"); + return sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); +} + +static ssize_t +behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap) + mddev->bitmap->behind_writes_used = 0; + return len; +} + +static struct md_sysfs_entry max_backlog_used = +__ATTR(max_backlog_used, S_IRUGO | S_IWUSR, + behind_writes_used_show, behind_writes_used_reset); + +static struct attribute *md_bitmap_attrs[] = { + &bitmap_location.attr, + &bitmap_space.attr, + &bitmap_timeout.attr, + &bitmap_backlog.attr, + &bitmap_chunksize.attr, + &bitmap_metadata.attr, + &bitmap_can_clear.attr, + &max_backlog_used.attr, + NULL +}; +struct attribute_group md_bitmap_group = { + .name = "bitmap", + .attrs = md_bitmap_attrs, +}; -/* the bitmap API -- for raid personalities */ -EXPORT_SYMBOL(bitmap_startwrite); -EXPORT_SYMBOL(bitmap_endwrite); -EXPORT_SYMBOL(bitmap_start_sync); -EXPORT_SYMBOL(bitmap_end_sync); -EXPORT_SYMBOL(bitmap_unplug); -EXPORT_SYMBOL(bitmap_close_sync); -EXPORT_SYMBOL(bitmap_cond_end_sync); diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h new file mode 100644 index 00000000000..30210b9c4ef --- /dev/null +++ b/drivers/md/bitmap.h @@ -0,0 +1,265 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. 
+ * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared as well, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SHIFT 9 + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ + BITMAP_HOSTENDIAN =15, +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __le32 magic; /* 0 BITMAP_MAGIC */ + __le32 version; /* 4 the bitmap major for now, could change... 
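The flag bits and the 14-bit count described above can be picked apart with the NEEDED/RESYNC/COUNTER helpers defined in this header. A small illustration (the value is made up for the example):

        bitmap_counter_t c = 0x8002;

        NEEDED(c);      /* non-zero: a resync of this chunk is still wanted      */
        RESYNC(c);      /* zero: no resync is currently active on this chunk     */
        COUNTER(c);     /* 2: one pending write plus the lazily-set on-disk bit, */
                        /* matching the dirtying scheme described above          */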
*/ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __le64 events; /* 24 event counter for the bitmap (1)*/ + __le64 events_cleared;/*32 event counter when last bit cleared (2) */ + __le64 sync_size; /* 40 the size of the md device's sync range(3) */ + __le32 state; /* 48 bitmap state information */ + __le32 chunksize; /* 52 the bitmap chunk size in bytes */ + __le32 daemon_sleep; /* 56 seconds between disk flushes */ + __le32 write_behind; /* 60 number of outstanding write-behind writes */ + __le32 sectors_reserved; /* 64 number of 512-byte sectors that are + * reserved for the bitmap. */ + + __u8 pad[256 - 68]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. + */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * If any counter in this page is '1' or '2' - and so could be + * cleared then that page is marked as 'pending' + */ + unsigned int pending:1; + /* + * count of dirty bits on the page + */ + unsigned int count:30; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + + struct bitmap_counts { + spinlock_t lock; + struct bitmap_page *bp; + unsigned long pages; /* total number of pages + * in the bitmap */ + unsigned long missing_pages; /* number of pages + * not yet allocated */ + unsigned long chunkshift; /* chunksize = 2^chunkshift + * (for bitops) */ + unsigned long chunks; /* Total number of data + * chunks for the array */ + } counts; + + struct mddev *mddev; /* the md device that the bitmap is for */ + + __u64 events_cleared; + int need_sync; + + struct bitmap_storage { + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap + * file superblock */ + struct page **filemap; /* list of cache pages for + * the file */ + unsigned long *filemap_attr; /* attributes associated + * w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file*/ + unsigned long bytes; /* total bytes in the bitmap */ + } storage; + + unsigned long flags; + + int allclean; + + atomic_t behind_writes; + unsigned long behind_writes_used; /* highest actual value at runtime */ + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ + + atomic_t pending_writes; /* pending writes to the bitmap file */ + 
wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + wait_queue_head_t behind_wait; + + struct kernfs_node *sysfs_can_clear; +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(struct mddev *mddev); +int bitmap_load(struct mddev *mddev); +void bitmap_flush(struct mddev *mddev); +void bitmap_destroy(struct mddev *mddev); + +void bitmap_print_sb(struct bitmap *bitmap); +void bitmap_update_sb(struct bitmap *bitmap); +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); + +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); + +void bitmap_unplug(struct bitmap *bitmap); +void bitmap_daemon_work(struct mddev *mddev); + +int bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, int init); +#endif + +#endif diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h deleted file mode 100644 index d4509be0fe6..00000000000 --- a/drivers/md/dm-bio-list.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (C) 2004 Red Hat UK Ltd. - * - * This file is released under the GPL. - */ - -#ifndef DM_BIO_LIST_H -#define DM_BIO_LIST_H - -#include <linux/bio.h> - -#ifdef CONFIG_BLOCK - -struct bio_list { - struct bio *head; - struct bio *tail; -}; - -static inline int bio_list_empty(const struct bio_list *bl) -{ - return bl->head == NULL; -} - -static inline void bio_list_init(struct bio_list *bl) -{ - bl->head = bl->tail = NULL; -} - -#define bio_list_for_each(bio, bl) \ - for (bio = (bl)->head; bio; bio = bio->bi_next) - -static inline unsigned bio_list_size(const struct bio_list *bl) -{ - unsigned sz = 0; - struct bio *bio; - - bio_list_for_each(bio, bl) - sz++; - - return sz; -} - -static inline void bio_list_add(struct bio_list *bl, struct bio *bio) -{ - bio->bi_next = NULL; - - if (bl->tail) - bl->tail->bi_next = bio; - else - bl->head = bio; - - bl->tail = bio; -} - -static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) -{ - if (!bl2->head) - return; - - if (bl->tail) - bl->tail->bi_next = bl2->head; - else - bl->head = bl2->head; - - bl->tail = bl2->tail; -} - -static inline void bio_list_merge_head(struct bio_list *bl, - struct bio_list *bl2) -{ - if (!bl2->head) - return; - - if (bl->head) - bl2->tail->bi_next = bl->head; - else - bl->tail = bl2->tail; - - bl->head = bl2->head; -} - -static inline struct bio *bio_list_pop(struct bio_list *bl) -{ - struct bio *bio = bl->head; - - if (bio) { - bl->head = bl->head->bi_next; - if (!bl->head) - bl->tail = NULL; - - bio->bi_next = NULL; - } - - return bio; -} - -static inline struct bio *bio_list_get(struct bio_list *bl) -{ - struct bio *bio = bl->head; - - bl->head = bl->tail = NULL; - - return bio; -} - -#endif /* CONFIG_BLOCK */ -#endif diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c new file mode 100644 index 00000000000..f752d12081f --- /dev/null +++ 
b/drivers/md/dm-bio-prison.c @@ -0,0 +1,408 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-bio-prison.h" + +#include <linux/spinlock.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/slab.h> + +/*----------------------------------------------------------------*/ + +struct bucket { + spinlock_t lock; + struct hlist_head cells; +}; + +struct dm_bio_prison { + mempool_t *cell_pool; + + unsigned nr_buckets; + unsigned hash_mask; + struct bucket *buckets; +}; + +/*----------------------------------------------------------------*/ + +static uint32_t calc_nr_buckets(unsigned nr_cells) +{ + uint32_t n = 128; + + nr_cells /= 4; + nr_cells = min(nr_cells, 8192u); + + while (n < nr_cells) + n <<= 1; + + return n; +} + +static struct kmem_cache *_cell_cache; + +static void init_bucket(struct bucket *b) +{ + spin_lock_init(&b->lock); + INIT_HLIST_HEAD(&b->cells); +} + +/* + * @nr_cells should be the number of cells you want in use _concurrently_. + * Don't confuse it with the number of distinct keys. + */ +struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) +{ + unsigned i; + uint32_t nr_buckets = calc_nr_buckets(nr_cells); + size_t len = sizeof(struct dm_bio_prison) + + (sizeof(struct bucket) * nr_buckets); + struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL); + + if (!prison) + return NULL; + + prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); + if (!prison->cell_pool) { + kfree(prison); + return NULL; + } + + prison->nr_buckets = nr_buckets; + prison->hash_mask = nr_buckets - 1; + prison->buckets = (struct bucket *) (prison + 1); + for (i = 0; i < nr_buckets; i++) + init_bucket(prison->buckets + i); + + return prison; +} +EXPORT_SYMBOL_GPL(dm_bio_prison_create); + +void dm_bio_prison_destroy(struct dm_bio_prison *prison) +{ + mempool_destroy(prison->cell_pool); + kfree(prison); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_destroy); + +struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, gfp_t gfp) +{ + return mempool_alloc(prison->cell_pool, gfp); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell); + +void dm_bio_prison_free_cell(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell) +{ + mempool_free(cell, prison->cell_pool); +} +EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell); + +static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key) +{ + const unsigned long BIG_PRIME = 4294967291UL; + uint64_t hash = key->block * BIG_PRIME; + + return (uint32_t) (hash & prison->hash_mask); +} + +static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs) +{ + return (lhs->virtual == rhs->virtual) && + (lhs->dev == rhs->dev) && + (lhs->block == rhs->block); +} + +static struct bucket *get_bucket(struct dm_bio_prison *prison, + struct dm_cell_key *key) +{ + return prison->buckets + hash_key(prison, key); +} + +static struct dm_bio_prison_cell *__search_bucket(struct bucket *b, + struct dm_cell_key *key) +{ + struct dm_bio_prison_cell *cell; + + hlist_for_each_entry(cell, &b->cells, list) + if (keys_equal(&cell->key, key)) + return cell; + + return NULL; +} + +static void __setup_new_cell(struct bucket *b, + struct dm_cell_key *key, + struct bio *holder, + struct dm_bio_prison_cell *cell) +{ + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = holder; + bio_list_init(&cell->bios); + hlist_add_head(&cell->list, &b->cells); +} + +static int __bio_detain(struct bucket *b, + struct dm_cell_key *key, + struct bio 
*inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + struct dm_bio_prison_cell *cell; + + cell = __search_bucket(b, key); + if (cell) { + if (inmate) + bio_list_add(&cell->bios, inmate); + *cell_result = cell; + return 1; + } + + __setup_new_cell(b, key, inmate, cell_prealloc); + *cell_result = cell_prealloc; + return 0; +} + +static int bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + int r; + unsigned long flags; + struct bucket *b = get_bucket(prison, key); + + spin_lock_irqsave(&b->lock, flags); + r = __bio_detain(b, key, inmate, cell_prealloc, cell_result); + spin_unlock_irqrestore(&b->lock, flags); + + return r; +} + +int dm_bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + return bio_detain(prison, key, inmate, cell_prealloc, cell_result); +} +EXPORT_SYMBOL_GPL(dm_bio_detain); + +int dm_get_cell(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result) +{ + return bio_detain(prison, key, NULL, cell_prealloc, cell_result); +} +EXPORT_SYMBOL_GPL(dm_get_cell); + +/* + * @inmates must have been initialised prior to this call + */ +static void __cell_release(struct dm_bio_prison_cell *cell, + struct bio_list *inmates) +{ + hlist_del(&cell->list); + + if (inmates) { + if (cell->holder) + bio_list_add(inmates, cell->holder); + bio_list_merge(inmates, &cell->bios); + } +} + +void dm_cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *bios) +{ + unsigned long flags; + struct bucket *b = get_bucket(prison, &cell->key); + + spin_lock_irqsave(&b->lock, flags); + __cell_release(cell, bios); + spin_unlock_irqrestore(&b->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_release); + +/* + * Sometimes we don't want the holder, just the additional bios. 
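A rough sketch of the detain/release cycle these helpers are built for (error handling and bio re-submission are omitted; 'blk' is a placeholder block number, not from the patch):

        struct dm_bio_prison_cell *prealloc, *cell;
        struct dm_cell_key key = { .virtual = 0, .dev = 0, .block = blk };
        struct bio_list bios;

        prealloc = dm_bio_prison_alloc_cell(prison, GFP_NOIO);
        if (dm_bio_detain(prison, &key, bio, prealloc, &cell)) {
                /* cell already held; our bio was queued as an inmate */
                dm_bio_prison_free_cell(prison, prealloc);
        } else {
                /* we are the holder: do the work for 'blk', then free everyone */
                bio_list_init(&bios);
                dm_cell_release(prison, cell, &bios);
                dm_bio_prison_free_cell(prison, cell);
                /* re-issue or complete everything now sitting on 'bios' */
        }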
+ */ +static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, + struct bio_list *inmates) +{ + hlist_del(&cell->list); + bio_list_merge(inmates, &cell->bios); +} + +void dm_cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *inmates) +{ + unsigned long flags; + struct bucket *b = get_bucket(prison, &cell->key); + + spin_lock_irqsave(&b->lock, flags); + __cell_release_no_holder(cell, inmates); + spin_unlock_irqrestore(&b->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); + +void dm_cell_error(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, int error) +{ + struct bio_list bios; + struct bio *bio; + + bio_list_init(&bios); + dm_cell_release(prison, cell, &bios); + + while ((bio = bio_list_pop(&bios))) + bio_endio(bio, error); +} +EXPORT_SYMBOL_GPL(dm_cell_error); + +/*----------------------------------------------------------------*/ + +#define DEFERRED_SET_SIZE 64 + +struct dm_deferred_entry { + struct dm_deferred_set *ds; + unsigned count; + struct list_head work_items; +}; + +struct dm_deferred_set { + spinlock_t lock; + unsigned current_entry; + unsigned sweeper; + struct dm_deferred_entry entries[DEFERRED_SET_SIZE]; +}; + +struct dm_deferred_set *dm_deferred_set_create(void) +{ + int i; + struct dm_deferred_set *ds; + + ds = kmalloc(sizeof(*ds), GFP_KERNEL); + if (!ds) + return NULL; + + spin_lock_init(&ds->lock); + ds->current_entry = 0; + ds->sweeper = 0; + for (i = 0; i < DEFERRED_SET_SIZE; i++) { + ds->entries[i].ds = ds; + ds->entries[i].count = 0; + INIT_LIST_HEAD(&ds->entries[i].work_items); + } + + return ds; +} +EXPORT_SYMBOL_GPL(dm_deferred_set_create); + +void dm_deferred_set_destroy(struct dm_deferred_set *ds) +{ + kfree(ds); +} +EXPORT_SYMBOL_GPL(dm_deferred_set_destroy); + +struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds) +{ + unsigned long flags; + struct dm_deferred_entry *entry; + + spin_lock_irqsave(&ds->lock, flags); + entry = ds->entries + ds->current_entry; + entry->count++; + spin_unlock_irqrestore(&ds->lock, flags); + + return entry; +} +EXPORT_SYMBOL_GPL(dm_deferred_entry_inc); + +static unsigned ds_next(unsigned index) +{ + return (index + 1) % DEFERRED_SET_SIZE; +} + +static void __sweep(struct dm_deferred_set *ds, struct list_head *head) +{ + while ((ds->sweeper != ds->current_entry) && + !ds->entries[ds->sweeper].count) { + list_splice_init(&ds->entries[ds->sweeper].work_items, head); + ds->sweeper = ds_next(ds->sweeper); + } + + if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->sweeper].count) + list_splice_init(&ds->entries[ds->sweeper].work_items, head); +} + +void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head) +{ + unsigned long flags; + + spin_lock_irqsave(&entry->ds->lock, flags); + BUG_ON(!entry->count); + --entry->count; + __sweep(entry->ds, head); + spin_unlock_irqrestore(&entry->ds->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_deferred_entry_dec); + +/* + * Returns 1 if deferred or 0 if no pending items to delay job. 
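+ *
+ * Rough usage sketch (illustrative only; the variable names are examples):
+ * a reader brackets its I/O with
+ *
+ *	entry = dm_deferred_entry_inc(ds);
+ *	...
+ *	dm_deferred_entry_dec(entry, &completed_work);
+ *
+ * and then processes anything spliced onto completed_work, while a
+ * writer that must wait for those readers queues itself with
+ * dm_deferred_set_add_work(ds, &job_list) and only runs the job
+ * immediately if that returns 0.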
+ */ +int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) +{ + int r = 1; + unsigned long flags; + unsigned next_entry; + + spin_lock_irqsave(&ds->lock, flags); + if ((ds->sweeper == ds->current_entry) && + !ds->entries[ds->current_entry].count) + r = 0; + else { + list_add(work, &ds->entries[ds->current_entry].work_items); + next_entry = ds_next(ds->current_entry); + if (!ds->entries[next_entry].count) + ds->current_entry = next_entry; + } + spin_unlock_irqrestore(&ds->lock, flags); + + return r; +} +EXPORT_SYMBOL_GPL(dm_deferred_set_add_work); + +/*----------------------------------------------------------------*/ + +static int __init dm_bio_prison_init(void) +{ + _cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0); + if (!_cell_cache) + return -ENOMEM; + + return 0; +} + +static void __exit dm_bio_prison_exit(void) +{ + kmem_cache_destroy(_cell_cache); + _cell_cache = NULL; +} + +/* + * module hooks + */ +module_init(dm_bio_prison_init); +module_exit(dm_bio_prison_exit); + +MODULE_DESCRIPTION(DM_NAME " bio prison"); +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h new file mode 100644 index 00000000000..6805a142b75 --- /dev/null +++ b/drivers/md/dm-bio-prison.h @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2011-2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_BIO_PRISON_H +#define DM_BIO_PRISON_H + +#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ +#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ + +#include <linux/list.h> +#include <linux/bio.h> + +/*----------------------------------------------------------------*/ + +/* + * Sometimes we can't deal with a bio straight away. We put them in prison + * where they can't cause any mischief. Bios are put in a cell identified + * by a key, multiple bios can be in the same cell. When the cell is + * subsequently unlocked the bios become available. + */ +struct dm_bio_prison; + +/* FIXME: this needs to be more abstract */ +struct dm_cell_key { + int virtual; + dm_thin_id dev; + dm_block_t block; +}; + +/* + * Treat this as opaque, only in header so callers can manage allocation + * themselves. + */ +struct dm_bio_prison_cell { + struct hlist_node list; + struct dm_cell_key key; + struct bio *holder; + struct bio_list bios; +}; + +struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells); +void dm_bio_prison_destroy(struct dm_bio_prison *prison); + +/* + * These two functions just wrap a mempool. This is a transitory step: + * Eventually all bio prison clients should manage their own cell memory. + * + * Like mempool_alloc(), dm_bio_prison_alloc_cell() can only fail if called + * in interrupt context or passed GFP_NOWAIT. + */ +struct dm_bio_prison_cell *dm_bio_prison_alloc_cell(struct dm_bio_prison *prison, + gfp_t gfp); +void dm_bio_prison_free_cell(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell); + +/* + * Creates, or retrieves a cell for the given key. + * + * Returns 1 if pre-existing cell returned, zero if new cell created using + * @cell_prealloc. + */ +int dm_get_cell(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result); + +/* + * An atomic op that combines retrieving a cell, and adding a bio to it. + * + * Returns 1 if the cell was already held, 0 if @inmate is the new holder. 
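+ *
+ * A minimal call sequence, sketched with illustrative variable names:
+ *
+ *	prealloc = dm_bio_prison_alloc_cell(prison, GFP_NOIO);
+ *	if (dm_bio_detain(prison, &key, bio, prealloc, &cell))
+ *		dm_bio_prison_free_cell(prison, prealloc);
+ *
+ * i.e. when an existing cell is returned the preallocated cell is not
+ * consumed and the caller frees it; when 0 is returned @inmate holds
+ * the new cell until dm_cell_release() or dm_cell_error() is called.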
+ */ +int dm_bio_detain(struct dm_bio_prison *prison, + struct dm_cell_key *key, + struct bio *inmate, + struct dm_bio_prison_cell *cell_prealloc, + struct dm_bio_prison_cell **cell_result); + +void dm_cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *bios); +void dm_cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, + struct bio_list *inmates); +void dm_cell_error(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, int error); + +/*----------------------------------------------------------------*/ + +/* + * We use the deferred set to keep track of pending reads to shared blocks. + * We do this to ensure the new mapping caused by a write isn't performed + * until these prior reads have completed. Otherwise the insertion of the + * new mapping could free the old block that the read bios are mapped to. + */ + +struct dm_deferred_set; +struct dm_deferred_entry; + +struct dm_deferred_set *dm_deferred_set_create(void); +void dm_deferred_set_destroy(struct dm_deferred_set *ds); + +struct dm_deferred_entry *dm_deferred_entry_inc(struct dm_deferred_set *ds); +void dm_deferred_entry_dec(struct dm_deferred_entry *entry, struct list_head *head); +int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index d3ec217847d..dd364611156 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -16,30 +16,25 @@ * functions in this file help the target record and restore the * original bio state. */ + struct dm_bio_details { - sector_t bi_sector; struct block_device *bi_bdev; - unsigned int bi_size; - unsigned short bi_idx; unsigned long bi_flags; + struct bvec_iter bi_iter; }; static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { - bd->bi_sector = bio->bi_sector; bd->bi_bdev = bio->bi_bdev; - bd->bi_size = bio->bi_size; - bd->bi_idx = bio->bi_idx; bd->bi_flags = bio->bi_flags; + bd->bi_iter = bio->bi_iter; } static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { - bio->bi_sector = bd->bi_sector; bio->bi_bdev = bd->bi_bdev; - bio->bi_size = bd->bi_size; - bio->bi_idx = bd->bi_idx; bio->bi_flags = bd->bi_flags; + bio->bi_iter = bd->bi_iter; } #endif diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c new file mode 100644 index 00000000000..d724459860d --- /dev/null +++ b/drivers/md/dm-bufio.c @@ -0,0 +1,1867 @@ +/* + * Copyright (C) 2009-2011 Red Hat, Inc. + * + * Author: Mikulas Patocka <mpatocka@redhat.com> + * + * This file is released under the GPL. + */ + +#include "dm-bufio.h" + +#include <linux/device-mapper.h> +#include <linux/dm-io.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/shrinker.h> +#include <linux/module.h> + +#define DM_MSG_PREFIX "bufio" + +/* + * Memory management policy: + * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory + * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower). + * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers. + * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT + * dirty buffers. 
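+ *
+ * As a rough illustration of the defaults below: on a machine with
+ * 4 GiB of directly mapped memory the total cache would be capped at
+ * about 2% of that, i.e. roughly 80 MiB (or at 25% of vmalloc space,
+ * whichever is lower), split between clients, with background
+ * writeback kicking in once about 75% of a client's share is dirty.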
+ */ +#define DM_BUFIO_MIN_BUFFERS 8 + +#define DM_BUFIO_MEMORY_PERCENT 2 +#define DM_BUFIO_VMALLOC_PERCENT 25 +#define DM_BUFIO_WRITEBACK_PERCENT 75 + +/* + * Check buffer ages in this interval (seconds) + */ +#define DM_BUFIO_WORK_TIMER_SECS 10 + +/* + * Free buffers when they are older than this (seconds) + */ +#define DM_BUFIO_DEFAULT_AGE_SECS 60 + +/* + * The number of bvec entries that are embedded directly in the buffer. + * If the chunk size is larger, dm-io is used to do the io. + */ +#define DM_BUFIO_INLINE_VECS 16 + +/* + * Buffer hash + */ +#define DM_BUFIO_HASH_BITS 20 +#define DM_BUFIO_HASH(block) \ + ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \ + ((1 << DM_BUFIO_HASH_BITS) - 1)) + +/* + * Don't try to use kmem_cache_alloc for blocks larger than this. + * For explanation, see alloc_buffer_data below. + */ +#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1) +#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1)) + +/* + * dm_buffer->list_mode + */ +#define LIST_CLEAN 0 +#define LIST_DIRTY 1 +#define LIST_SIZE 2 + +/* + * Linking of buffers: + * All buffers are linked to cache_hash with their hash_list field. + * + * Clean buffers that are not being written (B_WRITING not set) + * are linked to lru[LIST_CLEAN] with their lru_list field. + * + * Dirty and clean buffers that are being written are linked to + * lru[LIST_DIRTY] with their lru_list field. When the write + * finishes, the buffer cannot be relinked immediately (because we + * are in an interrupt context and relinking requires process + * context), so some clean-not-writing buffers can be held on + * dirty_lru too. They are later added to lru in the process + * context. + */ +struct dm_bufio_client { + struct mutex lock; + + struct list_head lru[LIST_SIZE]; + unsigned long n_buffers[LIST_SIZE]; + + struct block_device *bdev; + unsigned block_size; + unsigned char sectors_per_block_bits; + unsigned char pages_per_block_bits; + unsigned char blocks_per_page_bits; + unsigned aux_size; + void (*alloc_callback)(struct dm_buffer *); + void (*write_callback)(struct dm_buffer *); + + struct dm_io_client *dm_io; + + struct list_head reserved_buffers; + unsigned need_reserved_buffers; + + unsigned minimum_buffers; + + struct hlist_head *cache_hash; + wait_queue_head_t free_buffer_wait; + + int async_write_error; + + struct list_head client_list; + struct shrinker shrinker; +}; + +/* + * Buffer state bits. + */ +#define B_READING 0 +#define B_WRITING 1 +#define B_DIRTY 2 + +/* + * Describes how the block was allocated: + * kmem_cache_alloc(), __get_free_pages() or vmalloc(). + * See the comment at alloc_buffer_data. 
+ */ +enum data_mode { + DATA_MODE_SLAB = 0, + DATA_MODE_GET_FREE_PAGES = 1, + DATA_MODE_VMALLOC = 2, + DATA_MODE_LIMIT = 3 +}; + +struct dm_buffer { + struct hlist_node hash_list; + struct list_head lru_list; + sector_t block; + void *data; + enum data_mode data_mode; + unsigned char list_mode; /* LIST_* */ + unsigned hold_count; + int read_error; + int write_error; + unsigned long state; + unsigned long last_accessed; + struct dm_bufio_client *c; + struct list_head write_list; + struct bio bio; + struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS]; +}; + +/*----------------------------------------------------------------*/ + +static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT]; +static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT]; + +static inline int dm_bufio_cache_index(struct dm_bufio_client *c) +{ + unsigned ret = c->blocks_per_page_bits - 1; + + BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches)); + + return ret; +} + +#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)]) +#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)]) + +#define dm_bufio_in_request() (!!current->bio_list) + +static void dm_bufio_lock(struct dm_bufio_client *c) +{ + mutex_lock_nested(&c->lock, dm_bufio_in_request()); +} + +static int dm_bufio_trylock(struct dm_bufio_client *c) +{ + return mutex_trylock(&c->lock); +} + +static void dm_bufio_unlock(struct dm_bufio_client *c) +{ + mutex_unlock(&c->lock); +} + +/* + * FIXME Move to sched.h? + */ +#ifdef CONFIG_PREEMPT_VOLUNTARY +# define dm_bufio_cond_resched() \ +do { \ + if (unlikely(need_resched())) \ + _cond_resched(); \ +} while (0) +#else +# define dm_bufio_cond_resched() do { } while (0) +#endif + +/*----------------------------------------------------------------*/ + +/* + * Default cache size: available memory divided by the ratio. + */ +static unsigned long dm_bufio_default_cache_size; + +/* + * Total cache size set by the user. + */ +static unsigned long dm_bufio_cache_size; + +/* + * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change + * at any time. If it disagrees, the user has changed cache size. + */ +static unsigned long dm_bufio_cache_size_latch; + +static DEFINE_SPINLOCK(param_spinlock); + +/* + * Buffers are freed after this timeout + */ +static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; + +static unsigned long dm_bufio_peak_allocated; +static unsigned long dm_bufio_allocated_kmem_cache; +static unsigned long dm_bufio_allocated_get_free_pages; +static unsigned long dm_bufio_allocated_vmalloc; +static unsigned long dm_bufio_current_allocated; + +/*----------------------------------------------------------------*/ + +/* + * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count + */ +static unsigned long dm_bufio_cache_size_per_client; + +/* + * The current number of clients. + */ +static int dm_bufio_client_count; + +/* + * The list of all clients. 
+ */ +static LIST_HEAD(dm_bufio_all_clients); + +/* + * This mutex protects dm_bufio_cache_size_latch, + * dm_bufio_cache_size_per_client and dm_bufio_client_count + */ +static DEFINE_MUTEX(dm_bufio_clients_lock); + +/*----------------------------------------------------------------*/ + +static void adjust_total_allocated(enum data_mode data_mode, long diff) +{ + static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { + &dm_bufio_allocated_kmem_cache, + &dm_bufio_allocated_get_free_pages, + &dm_bufio_allocated_vmalloc, + }; + + spin_lock(¶m_spinlock); + + *class_ptr[data_mode] += diff; + + dm_bufio_current_allocated += diff; + + if (dm_bufio_current_allocated > dm_bufio_peak_allocated) + dm_bufio_peak_allocated = dm_bufio_current_allocated; + + spin_unlock(¶m_spinlock); +} + +/* + * Change the number of clients and recalculate per-client limit. + */ +static void __cache_size_refresh(void) +{ + BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); + BUG_ON(dm_bufio_client_count < 0); + + dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size); + + /* + * Use default if set to 0 and report the actual cache size used. + */ + if (!dm_bufio_cache_size_latch) { + (void)cmpxchg(&dm_bufio_cache_size, 0, + dm_bufio_default_cache_size); + dm_bufio_cache_size_latch = dm_bufio_default_cache_size; + } + + dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch / + (dm_bufio_client_count ? : 1); +} + +/* + * Allocating buffer data. + * + * Small buffers are allocated with kmem_cache, to use space optimally. + * + * For large buffers, we choose between get_free_pages and vmalloc. + * Each has advantages and disadvantages. + * + * __get_free_pages can randomly fail if the memory is fragmented. + * __vmalloc won't randomly fail, but vmalloc space is limited (it may be + * as low as 128M) so using it for caching is not appropriate. + * + * If the allocation may fail we use __get_free_pages. Memory fragmentation + * won't have a fatal effect here, but it just causes flushes of some other + * buffers and more I/O will be performed. Don't use __get_free_pages if it + * always fails (i.e. order >= MAX_ORDER). + * + * If the allocation shouldn't fail we use __vmalloc. This is only for the + * initial reserve allocation, so there's no risk of wasting all vmalloc + * space. + */ +static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, + enum data_mode *data_mode) +{ + unsigned noio_flag; + void *ptr; + + if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) { + *data_mode = DATA_MODE_SLAB; + return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask); + } + + if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT && + gfp_mask & __GFP_NORETRY) { + *data_mode = DATA_MODE_GET_FREE_PAGES; + return (void *)__get_free_pages(gfp_mask, + c->pages_per_block_bits); + } + + *data_mode = DATA_MODE_VMALLOC; + + /* + * __vmalloc allocates the data pages and auxiliary structures with + * gfp_flags that were specified, but pagetables are always allocated + * with GFP_KERNEL, no matter what was specified as gfp_mask. + * + * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that + * all allocations done by this process (including pagetables) are done + * as if GFP_NOIO was specified. + */ + + if (gfp_mask & __GFP_NORETRY) + noio_flag = memalloc_noio_save(); + + ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL); + + if (gfp_mask & __GFP_NORETRY) + memalloc_noio_restore(noio_flag); + + return ptr; +} + +/* + * Free buffer's data. 
+ */ +static void free_buffer_data(struct dm_bufio_client *c, + void *data, enum data_mode data_mode) +{ + switch (data_mode) { + case DATA_MODE_SLAB: + kmem_cache_free(DM_BUFIO_CACHE(c), data); + break; + + case DATA_MODE_GET_FREE_PAGES: + free_pages((unsigned long)data, c->pages_per_block_bits); + break; + + case DATA_MODE_VMALLOC: + vfree(data); + break; + + default: + DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d", + data_mode); + BUG(); + } +} + +/* + * Allocate buffer and its data. + */ +static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) +{ + struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size, + gfp_mask); + + if (!b) + return NULL; + + b->c = c; + + b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode); + if (!b->data) { + kfree(b); + return NULL; + } + + adjust_total_allocated(b->data_mode, (long)c->block_size); + + return b; +} + +/* + * Free buffer and its data. + */ +static void free_buffer(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + adjust_total_allocated(b->data_mode, -(long)c->block_size); + + free_buffer_data(c, b->data, b->data_mode); + kfree(b); +} + +/* + * Link buffer to the hash list and clean or dirty queue. + */ +static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) +{ + struct dm_bufio_client *c = b->c; + + c->n_buffers[dirty]++; + b->block = block; + b->list_mode = dirty; + list_add(&b->lru_list, &c->lru[dirty]); + hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]); + b->last_accessed = jiffies; +} + +/* + * Unlink buffer from the hash list and dirty or clean queue. + */ +static void __unlink_buffer(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + BUG_ON(!c->n_buffers[b->list_mode]); + + c->n_buffers[b->list_mode]--; + hlist_del(&b->hash_list); + list_del(&b->lru_list); +} + +/* + * Place the buffer to the head of dirty or clean LRU queue. + */ +static void __relink_lru(struct dm_buffer *b, int dirty) +{ + struct dm_bufio_client *c = b->c; + + BUG_ON(!c->n_buffers[b->list_mode]); + + c->n_buffers[b->list_mode]--; + c->n_buffers[dirty]++; + b->list_mode = dirty; + list_move(&b->lru_list, &c->lru[dirty]); +} + +/*---------------------------------------------------------------- + * Submit I/O on the buffer. + * + * Bio interface is faster but it has some problems: + * the vector list is limited (increasing this limit increases + * memory-consumption per buffer, so it is not viable); + * + * the memory must be direct-mapped, not vmalloced; + * + * the I/O driver can reject requests spuriously if it thinks that + * the requests are too big for the device or if they cross a + * controller-defined memory boundary. + * + * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and + * it is not vmalloced, try using the bio interface. + * + * If the buffer is big, if it is vmalloced or if the underlying device + * rejects the bio because it is too large, use dm-io layer to do the I/O. + * The dm-io layer splits the I/O into multiple requests, avoiding the above + * shortcomings. + *--------------------------------------------------------------*/ + +/* + * dm-io completion routine. It just calls b->bio.bi_end_io, pretending + * that the request was handled directly with bio interface. + */ +static void dmio_complete(unsigned long error, void *context) +{ + struct dm_buffer *b = context; + + b->bio.bi_end_io(&b->bio, error ? 
-EIO : 0); +} + +static void use_dmio(struct dm_buffer *b, int rw, sector_t block, + bio_end_io_t *end_io) +{ + int r; + struct dm_io_request io_req = { + .bi_rw = rw, + .notify.fn = dmio_complete, + .notify.context = b, + .client = b->c->dm_io, + }; + struct dm_io_region region = { + .bdev = b->c->bdev, + .sector = block << b->c->sectors_per_block_bits, + .count = b->c->block_size >> SECTOR_SHIFT, + }; + + if (b->data_mode != DATA_MODE_VMALLOC) { + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = b->data; + } else { + io_req.mem.type = DM_IO_VMA; + io_req.mem.ptr.vma = b->data; + } + + b->bio.bi_end_io = end_io; + + r = dm_io(&io_req, 1, ®ion, NULL); + if (r) + end_io(&b->bio, r); +} + +static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, + bio_end_io_t *end_io) +{ + char *ptr; + int len; + + bio_init(&b->bio); + b->bio.bi_io_vec = b->bio_vec; + b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; + b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; + b->bio.bi_bdev = b->c->bdev; + b->bio.bi_end_io = end_io; + + /* + * We assume that if len >= PAGE_SIZE ptr is page-aligned. + * If len < PAGE_SIZE the buffer doesn't cross page boundary. + */ + ptr = b->data; + len = b->c->block_size; + + if (len >= PAGE_SIZE) + BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1)); + else + BUG_ON((unsigned long)ptr & (len - 1)); + + do { + if (!bio_add_page(&b->bio, virt_to_page(ptr), + len < PAGE_SIZE ? len : PAGE_SIZE, + virt_to_phys(ptr) & (PAGE_SIZE - 1))) { + BUG_ON(b->c->block_size <= PAGE_SIZE); + use_dmio(b, rw, block, end_io); + return; + } + + len -= PAGE_SIZE; + ptr += PAGE_SIZE; + } while (len > 0); + + submit_bio(rw, &b->bio); +} + +static void submit_io(struct dm_buffer *b, int rw, sector_t block, + bio_end_io_t *end_io) +{ + if (rw == WRITE && b->c->write_callback) + b->c->write_callback(b); + + if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE && + b->data_mode != DATA_MODE_VMALLOC) + use_inline_bio(b, rw, block, end_io); + else + use_dmio(b, rw, block, end_io); +} + +/*---------------------------------------------------------------- + * Writing dirty buffers + *--------------------------------------------------------------*/ + +/* + * The endio routine for write. + * + * Set the error, clear B_WRITING bit and wake anyone who was waiting on + * it. + */ +static void write_endio(struct bio *bio, int error) +{ + struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); + + b->write_error = error; + if (unlikely(error)) { + struct dm_bufio_client *c = b->c; + (void)cmpxchg(&c->async_write_error, 0, error); + } + + BUG_ON(!test_bit(B_WRITING, &b->state)); + + smp_mb__before_atomic(); + clear_bit(B_WRITING, &b->state); + smp_mb__after_atomic(); + + wake_up_bit(&b->state, B_WRITING); +} + +/* + * This function is called when wait_on_bit is actually waiting. + */ +static int do_io_schedule(void *word) +{ + io_schedule(); + + return 0; +} + +/* + * Initiate a write on a dirty buffer, but don't wait for it. + * + * - If the buffer is not dirty, exit. + * - If there some previous write going on, wait for it to finish (we can't + * have two writes on the same buffer simultaneously). + * - Submit our write and don't wait on it. We set B_WRITING indicating + * that there is a write in progress. 
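+ *
+ * If a @write_list is supplied the buffer is only queued on it; the
+ * caller later submits the queued writes with __flush_write_list(),
+ * under a single block plug, instead of issuing them one by one here.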
+ */ +static void __write_dirty_buffer(struct dm_buffer *b, + struct list_head *write_list) +{ + if (!test_bit(B_DIRTY, &b->state)) + return; + + clear_bit(B_DIRTY, &b->state); + wait_on_bit_lock(&b->state, B_WRITING, + do_io_schedule, TASK_UNINTERRUPTIBLE); + + if (!write_list) + submit_io(b, WRITE, b->block, write_endio); + else + list_add_tail(&b->write_list, write_list); +} + +static void __flush_write_list(struct list_head *write_list) +{ + struct blk_plug plug; + blk_start_plug(&plug); + while (!list_empty(write_list)) { + struct dm_buffer *b = + list_entry(write_list->next, struct dm_buffer, write_list); + list_del(&b->write_list); + submit_io(b, WRITE, b->block, write_endio); + dm_bufio_cond_resched(); + } + blk_finish_plug(&plug); +} + +/* + * Wait until any activity on the buffer finishes. Possibly write the + * buffer if it is dirty. When this function finishes, there is no I/O + * running on the buffer and the buffer is not dirty. + */ +static void __make_buffer_clean(struct dm_buffer *b) +{ + BUG_ON(b->hold_count); + + if (!b->state) /* fast case */ + return; + + wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); + __write_dirty_buffer(b, NULL); + wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); +} + +/* + * Find some buffer that is not held by anybody, clean it, unlink it and + * return it. + */ +static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) +{ + struct dm_buffer *b; + + list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) { + BUG_ON(test_bit(B_WRITING, &b->state)); + BUG_ON(test_bit(B_DIRTY, &b->state)); + + if (!b->hold_count) { + __make_buffer_clean(b); + __unlink_buffer(b); + return b; + } + dm_bufio_cond_resched(); + } + + list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { + BUG_ON(test_bit(B_READING, &b->state)); + + if (!b->hold_count) { + __make_buffer_clean(b); + __unlink_buffer(b); + return b; + } + dm_bufio_cond_resched(); + } + + return NULL; +} + +/* + * Wait until some other threads free some buffer or release hold count on + * some buffer. + * + * This function is entered with c->lock held, drops it and regains it + * before exiting. + */ +static void __wait_for_free_buffer(struct dm_bufio_client *c) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&c->free_buffer_wait, &wait); + set_task_state(current, TASK_UNINTERRUPTIBLE); + dm_bufio_unlock(c); + + io_schedule(); + + set_task_state(current, TASK_RUNNING); + remove_wait_queue(&c->free_buffer_wait, &wait); + + dm_bufio_lock(c); +} + +enum new_flag { + NF_FRESH = 0, + NF_READ = 1, + NF_GET = 2, + NF_PREFETCH = 3 +}; + +/* + * Allocate a new buffer. If the allocation is not possible, wait until + * some other thread frees a buffer. + * + * May drop the lock and regain it. + */ +static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) +{ + struct dm_buffer *b; + + /* + * dm-bufio is resistant to allocation failures (it just keeps + * one buffer reserved in cases all the allocations fail). + * So set flags to not try too hard: + * GFP_NOIO: don't recurse into the I/O layer + * __GFP_NORETRY: don't retry and rather return failure + * __GFP_NOMEMALLOC: don't use emergency reserves + * __GFP_NOWARN: don't print a warning in case of failure + * + * For debugging, if we set the cache size to 1, no new buffers will + * be allocated. 
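+ *
+ * The loop below therefore falls back step by step: try a fresh
+ * allocation with the relaxed flags above, give up straight away for
+ * prefetches, then take a buffer from the reserved list, then evict an
+ * unclaimed buffer, and finally sleep until another thread frees one.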
+ */ + while (1) { + if (dm_bufio_cache_size_latch != 1) { + b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (b) + return b; + } + + if (nf == NF_PREFETCH) + return NULL; + + if (!list_empty(&c->reserved_buffers)) { + b = list_entry(c->reserved_buffers.next, + struct dm_buffer, lru_list); + list_del(&b->lru_list); + c->need_reserved_buffers++; + + return b; + } + + b = __get_unclaimed_buffer(c); + if (b) + return b; + + __wait_for_free_buffer(c); + } +} + +static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf) +{ + struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf); + + if (!b) + return NULL; + + if (c->alloc_callback) + c->alloc_callback(b); + + return b; +} + +/* + * Free a buffer and wake other threads waiting for free buffers. + */ +static void __free_buffer_wake(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + if (!c->need_reserved_buffers) + free_buffer(b); + else { + list_add(&b->lru_list, &c->reserved_buffers); + c->need_reserved_buffers--; + } + + wake_up(&c->free_buffer_wait); +} + +static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, + struct list_head *write_list) +{ + struct dm_buffer *b, *tmp; + + list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { + BUG_ON(test_bit(B_READING, &b->state)); + + if (!test_bit(B_DIRTY, &b->state) && + !test_bit(B_WRITING, &b->state)) { + __relink_lru(b, LIST_CLEAN); + continue; + } + + if (no_wait && test_bit(B_WRITING, &b->state)) + return; + + __write_dirty_buffer(b, write_list); + dm_bufio_cond_resched(); + } +} + +/* + * Get writeback threshold and buffer limit for a given client. + */ +static void __get_memory_limit(struct dm_bufio_client *c, + unsigned long *threshold_buffers, + unsigned long *limit_buffers) +{ + unsigned long buffers; + + if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) { + mutex_lock(&dm_bufio_clients_lock); + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + } + + buffers = dm_bufio_cache_size_per_client >> + (c->sectors_per_block_bits + SECTOR_SHIFT); + + if (buffers < c->minimum_buffers) + buffers = c->minimum_buffers; + + *limit_buffers = buffers; + *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; +} + +/* + * Check if we're over watermark. + * If we are over threshold_buffers, start freeing buffers. + * If we're over "limit_buffers", block until we get under the limit. + */ +static void __check_watermark(struct dm_bufio_client *c, + struct list_head *write_list) +{ + unsigned long threshold_buffers, limit_buffers; + + __get_memory_limit(c, &threshold_buffers, &limit_buffers); + + while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] > + limit_buffers) { + + struct dm_buffer *b = __get_unclaimed_buffer(c); + + if (!b) + return; + + __free_buffer_wake(b); + dm_bufio_cond_resched(); + } + + if (c->n_buffers[LIST_DIRTY] > threshold_buffers) + __write_dirty_buffers_async(c, 1, write_list); +} + +/* + * Find a buffer in the hash. 
+ */ +static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) +{ + struct dm_buffer *b; + + hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)], + hash_list) { + dm_bufio_cond_resched(); + if (b->block == block) + return b; + } + + return NULL; +} + +/*---------------------------------------------------------------- + * Getting a buffer + *--------------------------------------------------------------*/ + +static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, + enum new_flag nf, int *need_submit, + struct list_head *write_list) +{ + struct dm_buffer *b, *new_b = NULL; + + *need_submit = 0; + + b = __find(c, block); + if (b) + goto found_buffer; + + if (nf == NF_GET) + return NULL; + + new_b = __alloc_buffer_wait(c, nf); + if (!new_b) + return NULL; + + /* + * We've had a period where the mutex was unlocked, so need to + * recheck the hash table. + */ + b = __find(c, block); + if (b) { + __free_buffer_wake(new_b); + goto found_buffer; + } + + __check_watermark(c, write_list); + + b = new_b; + b->hold_count = 1; + b->read_error = 0; + b->write_error = 0; + __link_buffer(b, block, LIST_CLEAN); + + if (nf == NF_FRESH) { + b->state = 0; + return b; + } + + b->state = 1 << B_READING; + *need_submit = 1; + + return b; + +found_buffer: + if (nf == NF_PREFETCH) + return NULL; + /* + * Note: it is essential that we don't wait for the buffer to be + * read if dm_bufio_get function is used. Both dm_bufio_get and + * dm_bufio_prefetch can be used in the driver request routine. + * If the user called both dm_bufio_prefetch and dm_bufio_get on + * the same buffer, it would deadlock if we waited. + */ + if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state))) + return NULL; + + b->hold_count++; + __relink_lru(b, test_bit(B_DIRTY, &b->state) || + test_bit(B_WRITING, &b->state)); + return b; +} + +/* + * The endio routine for reading: set the error, clear the bit and wake up + * anyone waiting on the buffer. + */ +static void read_endio(struct bio *bio, int error) +{ + struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); + + b->read_error = error; + + BUG_ON(!test_bit(B_READING, &b->state)); + + smp_mb__before_atomic(); + clear_bit(B_READING, &b->state); + smp_mb__after_atomic(); + + wake_up_bit(&b->state, B_READING); +} + +/* + * A common routine for dm_bufio_new and dm_bufio_read. Operation of these + * functions is similar except that dm_bufio_new doesn't read the + * buffer from the disk (assuming that the caller overwrites all the data + * and uses dm_bufio_mark_buffer_dirty to write new data back). 
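+ *
+ * From a caller's point of view this boils down to (sketch only;
+ * "data", "b" and "block" are example names):
+ *
+ *	data = dm_bufio_read(c, block, &b);
+ *	if (IS_ERR(data))
+ *		return PTR_ERR(data);
+ *	...
+ *	dm_bufio_release(b);
+ *
+ * with dm_bufio_mark_buffer_dirty(b) called before the release if the
+ * data was modified.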
+ */ +static void *new_read(struct dm_bufio_client *c, sector_t block, + enum new_flag nf, struct dm_buffer **bp) +{ + int need_submit; + struct dm_buffer *b; + + LIST_HEAD(write_list); + + dm_bufio_lock(c); + b = __bufio_new(c, block, nf, &need_submit, &write_list); + dm_bufio_unlock(c); + + __flush_write_list(&write_list); + + if (!b) + return b; + + if (need_submit) + submit_io(b, READ, b->block, read_endio); + + wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); + + if (b->read_error) { + int error = b->read_error; + + dm_bufio_release(b); + + return ERR_PTR(error); + } + + *bp = b; + + return b->data; +} + +void *dm_bufio_get(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp) +{ + return new_read(c, block, NF_GET, bp); +} +EXPORT_SYMBOL_GPL(dm_bufio_get); + +void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp) +{ + BUG_ON(dm_bufio_in_request()); + + return new_read(c, block, NF_READ, bp); +} +EXPORT_SYMBOL_GPL(dm_bufio_read); + +void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp) +{ + BUG_ON(dm_bufio_in_request()); + + return new_read(c, block, NF_FRESH, bp); +} +EXPORT_SYMBOL_GPL(dm_bufio_new); + +void dm_bufio_prefetch(struct dm_bufio_client *c, + sector_t block, unsigned n_blocks) +{ + struct blk_plug plug; + + LIST_HEAD(write_list); + + BUG_ON(dm_bufio_in_request()); + + blk_start_plug(&plug); + dm_bufio_lock(c); + + for (; n_blocks--; block++) { + int need_submit; + struct dm_buffer *b; + b = __bufio_new(c, block, NF_PREFETCH, &need_submit, + &write_list); + if (unlikely(!list_empty(&write_list))) { + dm_bufio_unlock(c); + blk_finish_plug(&plug); + __flush_write_list(&write_list); + blk_start_plug(&plug); + dm_bufio_lock(c); + } + if (unlikely(b != NULL)) { + dm_bufio_unlock(c); + + if (need_submit) + submit_io(b, READ, b->block, read_endio); + dm_bufio_release(b); + + dm_bufio_cond_resched(); + + if (!n_blocks) + goto flush_plug; + dm_bufio_lock(c); + } + } + + dm_bufio_unlock(c); + +flush_plug: + blk_finish_plug(&plug); +} +EXPORT_SYMBOL_GPL(dm_bufio_prefetch); + +void dm_bufio_release(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + dm_bufio_lock(c); + + BUG_ON(!b->hold_count); + + b->hold_count--; + if (!b->hold_count) { + wake_up(&c->free_buffer_wait); + + /* + * If there were errors on the buffer, and the buffer is not + * to be written, free the buffer. There is no point in caching + * invalid buffer. 
+ */ + if ((b->read_error || b->write_error) && + !test_bit(B_READING, &b->state) && + !test_bit(B_WRITING, &b->state) && + !test_bit(B_DIRTY, &b->state)) { + __unlink_buffer(b); + __free_buffer_wake(b); + } + } + + dm_bufio_unlock(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_release); + +void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) +{ + struct dm_bufio_client *c = b->c; + + dm_bufio_lock(c); + + BUG_ON(test_bit(B_READING, &b->state)); + + if (!test_and_set_bit(B_DIRTY, &b->state)) + __relink_lru(b, LIST_DIRTY); + + dm_bufio_unlock(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty); + +void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) +{ + LIST_HEAD(write_list); + + BUG_ON(dm_bufio_in_request()); + + dm_bufio_lock(c); + __write_dirty_buffers_async(c, 0, &write_list); + dm_bufio_unlock(c); + __flush_write_list(&write_list); +} +EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); + +/* + * For performance, it is essential that the buffers are written asynchronously + * and simultaneously (so that the block layer can merge the writes) and then + * waited upon. + * + * Finally, we flush hardware disk cache. + */ +int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) +{ + int a, f; + unsigned long buffers_processed = 0; + struct dm_buffer *b, *tmp; + + LIST_HEAD(write_list); + + dm_bufio_lock(c); + __write_dirty_buffers_async(c, 0, &write_list); + dm_bufio_unlock(c); + __flush_write_list(&write_list); + dm_bufio_lock(c); + +again: + list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { + int dropped_lock = 0; + + if (buffers_processed < c->n_buffers[LIST_DIRTY]) + buffers_processed++; + + BUG_ON(test_bit(B_READING, &b->state)); + + if (test_bit(B_WRITING, &b->state)) { + if (buffers_processed < c->n_buffers[LIST_DIRTY]) { + dropped_lock = 1; + b->hold_count++; + dm_bufio_unlock(c); + wait_on_bit(&b->state, B_WRITING, + do_io_schedule, + TASK_UNINTERRUPTIBLE); + dm_bufio_lock(c); + b->hold_count--; + } else + wait_on_bit(&b->state, B_WRITING, + do_io_schedule, + TASK_UNINTERRUPTIBLE); + } + + if (!test_bit(B_DIRTY, &b->state) && + !test_bit(B_WRITING, &b->state)) + __relink_lru(b, LIST_CLEAN); + + dm_bufio_cond_resched(); + + /* + * If we dropped the lock, the list is no longer consistent, + * so we must restart the search. + * + * In the most common case, the buffer just processed is + * relinked to the clean list, so we won't loop scanning the + * same buffer again and again. + * + * This may livelock if there is another thread simultaneously + * dirtying buffers, so we count the number of buffers walked + * and if it exceeds the total number of buffers, it means that + * someone is doing some writes simultaneously with us. In + * this case, stop, dropping the lock. + */ + if (dropped_lock) + goto again; + } + wake_up(&c->free_buffer_wait); + dm_bufio_unlock(c); + + a = xchg(&c->async_write_error, 0); + f = dm_bufio_issue_flush(c); + if (a) + return a; + + return f; +} +EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers); + +/* + * Use dm-io to send and empty barrier flush the device. 
+ */ +int dm_bufio_issue_flush(struct dm_bufio_client *c) +{ + struct dm_io_request io_req = { + .bi_rw = WRITE_FLUSH, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = c->dm_io, + }; + struct dm_io_region io_reg = { + .bdev = c->bdev, + .sector = 0, + .count = 0, + }; + + BUG_ON(dm_bufio_in_request()); + + return dm_io(&io_req, 1, &io_reg, NULL); +} +EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); + +/* + * We first delete any other buffer that may be at that new location. + * + * Then, we write the buffer to the original location if it was dirty. + * + * Then, if we are the only one who is holding the buffer, relink the buffer + * in the hash queue for the new location. + * + * If there was someone else holding the buffer, we write it to the new + * location but not relink it, because that other user needs to have the buffer + * at the same place. + */ +void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block) +{ + struct dm_bufio_client *c = b->c; + struct dm_buffer *new; + + BUG_ON(dm_bufio_in_request()); + + dm_bufio_lock(c); + +retry: + new = __find(c, new_block); + if (new) { + if (new->hold_count) { + __wait_for_free_buffer(c); + goto retry; + } + + /* + * FIXME: Is there any point waiting for a write that's going + * to be overwritten in a bit? + */ + __make_buffer_clean(new); + __unlink_buffer(new); + __free_buffer_wake(new); + } + + BUG_ON(!b->hold_count); + BUG_ON(test_bit(B_READING, &b->state)); + + __write_dirty_buffer(b, NULL); + if (b->hold_count == 1) { + wait_on_bit(&b->state, B_WRITING, + do_io_schedule, TASK_UNINTERRUPTIBLE); + set_bit(B_DIRTY, &b->state); + __unlink_buffer(b); + __link_buffer(b, new_block, LIST_DIRTY); + } else { + sector_t old_block; + wait_on_bit_lock(&b->state, B_WRITING, + do_io_schedule, TASK_UNINTERRUPTIBLE); + /* + * Relink buffer to "new_block" so that write_callback + * sees "new_block" as a block number. + * After the write, link the buffer back to old_block. + * All this must be done in bufio lock, so that block number + * change isn't visible to other threads. + */ + old_block = b->block; + __unlink_buffer(b); + __link_buffer(b, new_block, b->list_mode); + submit_io(b, WRITE, new_block, write_endio); + wait_on_bit(&b->state, B_WRITING, + do_io_schedule, TASK_UNINTERRUPTIBLE); + __unlink_buffer(b); + __link_buffer(b, old_block, b->list_mode); + } + + dm_bufio_unlock(c); + dm_bufio_release(b); +} +EXPORT_SYMBOL_GPL(dm_bufio_release_move); + +/* + * Free the given buffer. + * + * This is just a hint, if the buffer is in use or dirty, this function + * does nothing. 
+ */ +void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) +{ + struct dm_buffer *b; + + dm_bufio_lock(c); + + b = __find(c, block); + if (b && likely(!b->hold_count) && likely(!b->state)) { + __unlink_buffer(b); + __free_buffer_wake(b); + } + + dm_bufio_unlock(c); +} +EXPORT_SYMBOL(dm_bufio_forget); + +void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) +{ + c->minimum_buffers = n; +} +EXPORT_SYMBOL(dm_bufio_set_minimum_buffers); + +unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) +{ + return c->block_size; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); + +sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) +{ + return i_size_read(c->bdev->bd_inode) >> + (SECTOR_SHIFT + c->sectors_per_block_bits); +} +EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); + +sector_t dm_bufio_get_block_number(struct dm_buffer *b) +{ + return b->block; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_block_number); + +void *dm_bufio_get_block_data(struct dm_buffer *b) +{ + return b->data; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_block_data); + +void *dm_bufio_get_aux_data(struct dm_buffer *b) +{ + return b + 1; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data); + +struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b) +{ + return b->c; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_client); + +static void drop_buffers(struct dm_bufio_client *c) +{ + struct dm_buffer *b; + int i; + + BUG_ON(dm_bufio_in_request()); + + /* + * An optimization so that the buffers are not written one-by-one. + */ + dm_bufio_write_dirty_buffers_async(c); + + dm_bufio_lock(c); + + while ((b = __get_unclaimed_buffer(c))) + __free_buffer_wake(b); + + for (i = 0; i < LIST_SIZE; i++) + list_for_each_entry(b, &c->lru[i], lru_list) + DMERR("leaked buffer %llx, hold count %u, list %d", + (unsigned long long)b->block, b->hold_count, i); + + for (i = 0; i < LIST_SIZE; i++) + BUG_ON(!list_empty(&c->lru[i])); + + dm_bufio_unlock(c); +} + +/* + * Test if the buffer is unused and too old, and commit it. + * At if noio is set, we must not do any I/O because we hold + * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to + * different bufio client. 
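+ *
+ * In other words: when called without __GFP_IO we only reap buffers
+ * that are already clean and idle, so that no I/O is started here.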
+ */ +static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, + unsigned long max_jiffies) +{ + if (jiffies - b->last_accessed < max_jiffies) + return 0; + + if (!(gfp & __GFP_IO)) { + if (test_bit(B_READING, &b->state) || + test_bit(B_WRITING, &b->state) || + test_bit(B_DIRTY, &b->state)) + return 0; + } + + if (b->hold_count) + return 0; + + __make_buffer_clean(b); + __unlink_buffer(b); + __free_buffer_wake(b); + + return 1; +} + +static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, + gfp_t gfp_mask) +{ + int l; + struct dm_buffer *b, *tmp; + long freed = 0; + + for (l = 0; l < LIST_SIZE; l++) { + list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { + freed += __cleanup_old_buffer(b, gfp_mask, 0); + if (!--nr_to_scan) + break; + } + dm_bufio_cond_resched(); + } + return freed; +} + +static unsigned long +dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + struct dm_bufio_client *c; + unsigned long freed; + + c = container_of(shrink, struct dm_bufio_client, shrinker); + if (sc->gfp_mask & __GFP_IO) + dm_bufio_lock(c); + else if (!dm_bufio_trylock(c)) + return SHRINK_STOP; + + freed = __scan(c, sc->nr_to_scan, sc->gfp_mask); + dm_bufio_unlock(c); + return freed; +} + +static unsigned long +dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + struct dm_bufio_client *c; + unsigned long count; + + c = container_of(shrink, struct dm_bufio_client, shrinker); + if (sc->gfp_mask & __GFP_IO) + dm_bufio_lock(c); + else if (!dm_bufio_trylock(c)) + return 0; + + count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; + dm_bufio_unlock(c); + return count; +} + +/* + * Create the buffering interface + */ +struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size, + unsigned reserved_buffers, unsigned aux_size, + void (*alloc_callback)(struct dm_buffer *), + void (*write_callback)(struct dm_buffer *)) +{ + int r; + struct dm_bufio_client *c; + unsigned i; + + BUG_ON(block_size < 1 << SECTOR_SHIFT || + (block_size & (block_size - 1))); + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) { + r = -ENOMEM; + goto bad_client; + } + c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS); + if (!c->cache_hash) { + r = -ENOMEM; + goto bad_hash; + } + + c->bdev = bdev; + c->block_size = block_size; + c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT; + c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ? + ffs(block_size) - 1 - PAGE_SHIFT : 0; + c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ? 
+ PAGE_SHIFT - (ffs(block_size) - 1) : 0); + + c->aux_size = aux_size; + c->alloc_callback = alloc_callback; + c->write_callback = write_callback; + + for (i = 0; i < LIST_SIZE; i++) { + INIT_LIST_HEAD(&c->lru[i]); + c->n_buffers[i] = 0; + } + + for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) + INIT_HLIST_HEAD(&c->cache_hash[i]); + + mutex_init(&c->lock); + INIT_LIST_HEAD(&c->reserved_buffers); + c->need_reserved_buffers = reserved_buffers; + + c->minimum_buffers = DM_BUFIO_MIN_BUFFERS; + + init_waitqueue_head(&c->free_buffer_wait); + c->async_write_error = 0; + + c->dm_io = dm_io_client_create(); + if (IS_ERR(c->dm_io)) { + r = PTR_ERR(c->dm_io); + goto bad_dm_io; + } + + mutex_lock(&dm_bufio_clients_lock); + if (c->blocks_per_page_bits) { + if (!DM_BUFIO_CACHE_NAME(c)) { + DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size); + if (!DM_BUFIO_CACHE_NAME(c)) { + r = -ENOMEM; + mutex_unlock(&dm_bufio_clients_lock); + goto bad_cache; + } + } + + if (!DM_BUFIO_CACHE(c)) { + DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c), + c->block_size, + c->block_size, 0, NULL); + if (!DM_BUFIO_CACHE(c)) { + r = -ENOMEM; + mutex_unlock(&dm_bufio_clients_lock); + goto bad_cache; + } + } + } + mutex_unlock(&dm_bufio_clients_lock); + + while (c->need_reserved_buffers) { + struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL); + + if (!b) { + r = -ENOMEM; + goto bad_buffer; + } + __free_buffer_wake(b); + } + + mutex_lock(&dm_bufio_clients_lock); + dm_bufio_client_count++; + list_add(&c->client_list, &dm_bufio_all_clients); + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + + c->shrinker.count_objects = dm_bufio_shrink_count; + c->shrinker.scan_objects = dm_bufio_shrink_scan; + c->shrinker.seeks = 1; + c->shrinker.batch = 0; + register_shrinker(&c->shrinker); + + return c; + +bad_buffer: +bad_cache: + while (!list_empty(&c->reserved_buffers)) { + struct dm_buffer *b = list_entry(c->reserved_buffers.next, + struct dm_buffer, lru_list); + list_del(&b->lru_list); + free_buffer(b); + } + dm_io_client_destroy(c->dm_io); +bad_dm_io: + vfree(c->cache_hash); +bad_hash: + kfree(c); +bad_client: + return ERR_PTR(r); +} +EXPORT_SYMBOL_GPL(dm_bufio_client_create); + +/* + * Free the buffering interface. + * It is required that there are no references on any buffers. 
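+ *
+ * For reference, a typical client lifecycle looks roughly like this
+ * (the 4096-byte block size and NULL callbacks are just an example):
+ *
+ *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
+ *	...
+ *	dm_bufio_write_dirty_buffers(c);
+ *	dm_bufio_client_destroy(c);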
+ */ +void dm_bufio_client_destroy(struct dm_bufio_client *c) +{ + unsigned i; + + drop_buffers(c); + + unregister_shrinker(&c->shrinker); + + mutex_lock(&dm_bufio_clients_lock); + + list_del(&c->client_list); + dm_bufio_client_count--; + __cache_size_refresh(); + + mutex_unlock(&dm_bufio_clients_lock); + + for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) + BUG_ON(!hlist_empty(&c->cache_hash[i])); + + BUG_ON(c->need_reserved_buffers); + + while (!list_empty(&c->reserved_buffers)) { + struct dm_buffer *b = list_entry(c->reserved_buffers.next, + struct dm_buffer, lru_list); + list_del(&b->lru_list); + free_buffer(b); + } + + for (i = 0; i < LIST_SIZE; i++) + if (c->n_buffers[i]) + DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]); + + for (i = 0; i < LIST_SIZE; i++) + BUG_ON(c->n_buffers[i]); + + dm_io_client_destroy(c->dm_io); + vfree(c->cache_hash); + kfree(c); +} +EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); + +static void cleanup_old_buffers(void) +{ + unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age); + struct dm_bufio_client *c; + + if (max_age > ULONG_MAX / HZ) + max_age = ULONG_MAX / HZ; + + mutex_lock(&dm_bufio_clients_lock); + list_for_each_entry(c, &dm_bufio_all_clients, client_list) { + if (!dm_bufio_trylock(c)) + continue; + + while (!list_empty(&c->lru[LIST_CLEAN])) { + struct dm_buffer *b; + b = list_entry(c->lru[LIST_CLEAN].prev, + struct dm_buffer, lru_list); + if (!__cleanup_old_buffer(b, 0, max_age * HZ)) + break; + dm_bufio_cond_resched(); + } + + dm_bufio_unlock(c); + dm_bufio_cond_resched(); + } + mutex_unlock(&dm_bufio_clients_lock); +} + +static struct workqueue_struct *dm_bufio_wq; +static struct delayed_work dm_bufio_work; + +static void work_fn(struct work_struct *w) +{ + cleanup_old_buffers(); + + queue_delayed_work(dm_bufio_wq, &dm_bufio_work, + DM_BUFIO_WORK_TIMER_SECS * HZ); +} + +/*---------------------------------------------------------------- + * Module setup + *--------------------------------------------------------------*/ + +/* + * This is called only once for the whole dm_bufio module. + * It initializes memory limit. + */ +static int __init dm_bufio_init(void) +{ + __u64 mem; + + dm_bufio_allocated_kmem_cache = 0; + dm_bufio_allocated_get_free_pages = 0; + dm_bufio_allocated_vmalloc = 0; + dm_bufio_current_allocated = 0; + + memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); + memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); + + mem = (__u64)((totalram_pages - totalhigh_pages) * + DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT; + + if (mem > ULONG_MAX) + mem = ULONG_MAX; + +#ifdef CONFIG_MMU + /* + * Get the size of vmalloc space the same way as VMALLOC_TOTAL + * in fs/proc/internal.h + */ + if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100) + mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100; +#endif + + dm_bufio_default_cache_size = mem; + + mutex_lock(&dm_bufio_clients_lock); + __cache_size_refresh(); + mutex_unlock(&dm_bufio_clients_lock); + + dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache"); + if (!dm_bufio_wq) + return -ENOMEM; + + INIT_DELAYED_WORK(&dm_bufio_work, work_fn); + queue_delayed_work(dm_bufio_wq, &dm_bufio_work, + DM_BUFIO_WORK_TIMER_SECS * HZ); + + return 0; +} + +/* + * This is called once when unloading the dm_bufio module. 
+ */ +static void __exit dm_bufio_exit(void) +{ + int bug = 0; + int i; + + cancel_delayed_work_sync(&dm_bufio_work); + destroy_workqueue(dm_bufio_wq); + + for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) { + struct kmem_cache *kc = dm_bufio_caches[i]; + + if (kc) + kmem_cache_destroy(kc); + } + + for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++) + kfree(dm_bufio_cache_names[i]); + + if (dm_bufio_client_count) { + DMCRIT("%s: dm_bufio_client_count leaked: %d", + __func__, dm_bufio_client_count); + bug = 1; + } + + if (dm_bufio_current_allocated) { + DMCRIT("%s: dm_bufio_current_allocated leaked: %lu", + __func__, dm_bufio_current_allocated); + bug = 1; + } + + if (dm_bufio_allocated_get_free_pages) { + DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu", + __func__, dm_bufio_allocated_get_free_pages); + bug = 1; + } + + if (dm_bufio_allocated_vmalloc) { + DMCRIT("%s: dm_bufio_vmalloc leaked: %lu", + __func__, dm_bufio_allocated_vmalloc); + bug = 1; + } + + if (bug) + BUG(); +} + +module_init(dm_bufio_init) +module_exit(dm_bufio_exit) + +module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); + +module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); + +module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory"); + +module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO); +MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc"); + +module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO); +MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages"); + +module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO); +MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc"); + +module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO); +MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache"); + +MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>"); +MODULE_DESCRIPTION(DM_NAME " buffered I/O library"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h new file mode 100644 index 00000000000..c096779a729 --- /dev/null +++ b/drivers/md/dm-bufio.h @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2009-2011 Red Hat, Inc. + * + * Author: Mikulas Patocka <mpatocka@redhat.com> + * + * This file is released under the GPL. + */ + +#ifndef DM_BUFIO_H +#define DM_BUFIO_H + +#include <linux/blkdev.h> +#include <linux/types.h> + +/*----------------------------------------------------------------*/ + +struct dm_bufio_client; +struct dm_buffer; + +/* + * Create a buffered IO cache on a given device + */ +struct dm_bufio_client * +dm_bufio_client_create(struct block_device *bdev, unsigned block_size, + unsigned reserved_buffers, unsigned aux_size, + void (*alloc_callback)(struct dm_buffer *), + void (*write_callback)(struct dm_buffer *)); + +/* + * Release a buffered IO cache. + */ +void dm_bufio_client_destroy(struct dm_bufio_client *c); + +/* + * WARNING: to avoid deadlocks, these conditions are observed: + * + * - At most one thread can hold at most "reserved_buffers" simultaneously. 
+ * - Each other threads can hold at most one buffer. + * - Threads which call only dm_bufio_get can hold unlimited number of + * buffers. + */ + +/* + * Read a given block from disk. Returns pointer to data. Returns a + * pointer to dm_buffer that can be used to release the buffer or to make + * it dirty. + */ +void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp); + +/* + * Like dm_bufio_read, but return buffer from cache, don't read + * it. If the buffer is not in the cache, return NULL. + */ +void *dm_bufio_get(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp); + +/* + * Like dm_bufio_read, but don't read anything from the disk. It is + * expected that the caller initializes the buffer and marks it dirty. + */ +void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, + struct dm_buffer **bp); + +/* + * Prefetch the specified blocks to the cache. + * The function starts to read the blocks and returns without waiting for + * I/O to finish. + */ +void dm_bufio_prefetch(struct dm_bufio_client *c, + sector_t block, unsigned n_blocks); + +/* + * Release a reference obtained with dm_bufio_{read,get,new}. The data + * pointer and dm_buffer pointer is no longer valid after this call. + */ +void dm_bufio_release(struct dm_buffer *b); + +/* + * Mark a buffer dirty. It should be called after the buffer is modified. + * + * In case of memory pressure, the buffer may be written after + * dm_bufio_mark_buffer_dirty, but before dm_bufio_write_dirty_buffers. So + * dm_bufio_write_dirty_buffers guarantees that the buffer is on-disk but + * the actual writing may occur earlier. + */ +void dm_bufio_mark_buffer_dirty(struct dm_buffer *b); + +/* + * Initiate writing of dirty buffers, without waiting for completion. + */ +void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c); + +/* + * Write all dirty buffers. Guarantees that all dirty buffers created prior + * to this call are on disk when this call exits. + */ +int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c); + +/* + * Send an empty write barrier to the device to flush hardware disk cache. + */ +int dm_bufio_issue_flush(struct dm_bufio_client *c); + +/* + * Like dm_bufio_release but also move the buffer to the new + * block. dm_bufio_write_dirty_buffers is needed to commit the new block. + */ +void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block); + +/* + * Free the given buffer. + * This is just a hint, if the buffer is in use or dirty, this function + * does nothing. + */ +void dm_bufio_forget(struct dm_bufio_client *c, sector_t block); + +/* + * Set the minimum number of buffers before cleanup happens. + */ +void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n); + +unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); +sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); +sector_t dm_bufio_get_block_number(struct dm_buffer *b); +void *dm_bufio_get_block_data(struct dm_buffer *b); +void *dm_bufio_get_aux_data(struct dm_buffer *b); +struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c new file mode 100644 index 00000000000..6c9049c51b2 --- /dev/null +++ b/drivers/md/dm-builtin.c @@ -0,0 +1,48 @@ +#include "dm.h" + +/* + * The kobject release method must not be placed in the module itself, + * otherwise we are subject to module unload races. 
+ * + * The release method is called when the last reference to the kobject is + * dropped. It may be called by any other kernel code that drops the last + * reference. + * + * The release method suffers from module unload race. We may prevent the + * module from being unloaded at the start of the release method (using + * increased module reference count or synchronizing against the release + * method), however there is no way to prevent the module from being + * unloaded at the end of the release method. + * + * If this code were placed in the dm module, the following race may + * happen: + * 1. Some other process takes a reference to dm kobject + * 2. The user issues ioctl function to unload the dm device + * 3. dm_sysfs_exit calls kobject_put, however the object is not released + * because of the other reference taken at step 1 + * 4. dm_sysfs_exit waits on the completion + * 5. The other process that took the reference in step 1 drops it, + * dm_kobject_release is called from this process + * 6. dm_kobject_release calls complete() + * 7. a reschedule happens before dm_kobject_release returns + * 8. dm_sysfs_exit continues, the dm device is unloaded, module reference + * count is decremented + * 9. The user unloads the dm module + * 10. The other process that was rescheduled in step 7 continues to run, + * it is now executing code in unloaded module, so it crashes + * + * Note that if the process that takes the foreign reference to dm kobject + * has a low priority and the system is sufficiently loaded with + * higher-priority processes that prevent the low-priority process from + * being scheduled long enough, this bug may really happen. + * + * In order to fix this module unload race, we place the release method + * into a helper code that is compiled directly into the kernel. + */ + +void dm_kobject_release(struct kobject *kobj) +{ + complete(dm_get_completion_from_kobject(kobj)); +} + +EXPORT_SYMBOL(dm_kobject_release); diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h new file mode 100644 index 00000000000..aac0e2df06b --- /dev/null +++ b/drivers/md/dm-cache-block-types.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_BLOCK_TYPES_H +#define DM_CACHE_BLOCK_TYPES_H + +#include "persistent-data/dm-block-manager.h" + +/*----------------------------------------------------------------*/ + +/* + * It's helpful to get sparse to differentiate between indexes into the + * origin device, indexes into the cache device, and indexes into the + * discard bitset. + */ + +typedef dm_block_t __bitwise__ dm_oblock_t; +typedef uint32_t __bitwise__ dm_cblock_t; + +static inline dm_oblock_t to_oblock(dm_block_t b) +{ + return (__force dm_oblock_t) b; +} + +static inline dm_block_t from_oblock(dm_oblock_t b) +{ + return (__force dm_block_t) b; +} + +static inline dm_cblock_t to_cblock(uint32_t b) +{ + return (__force dm_cblock_t) b; +} + +static inline uint32_t from_cblock(dm_cblock_t b) +{ + return (__force uint32_t) b; +} + +#endif /* DM_CACHE_BLOCK_TYPES_H */ diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c new file mode 100644 index 00000000000..d2899e7eb3a --- /dev/null +++ b/drivers/md/dm-cache-metadata.c @@ -0,0 +1,1299 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#include "dm-cache-metadata.h" + +#include "persistent-data/dm-array.h" +#include "persistent-data/dm-bitset.h" +#include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-space-map-disk.h" +#include "persistent-data/dm-transaction-manager.h" + +#include <linux/device-mapper.h> + +/*----------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "cache metadata" + +#define CACHE_SUPERBLOCK_MAGIC 06142003 +#define CACHE_SUPERBLOCK_LOCATION 0 + +/* + * defines a range of metadata versions that this module can handle. + */ +#define MIN_CACHE_VERSION 1 +#define MAX_CACHE_VERSION 1 + +#define CACHE_METADATA_CACHE_SIZE 64 + +/* + * 3 for btree insert + + * 2 for btree lookup used within space map + */ +#define CACHE_MAX_CONCURRENT_LOCKS 5 +#define SPACE_MAP_ROOT_SIZE 128 + +enum superblock_flag_bits { + /* for spotting crashes that would invalidate the dirty bitset */ + CLEAN_SHUTDOWN, +}; + +/* + * Each mapping from cache block -> origin block carries a set of flags. + */ +enum mapping_bits { + /* + * A valid mapping. Because we're using an array we clear this + * flag for an non existant mapping. + */ + M_VALID = 1, + + /* + * The data on the cache is different from that on the origin. + */ + M_DIRTY = 2 +}; + +struct cache_disk_superblock { + __le32 csum; + __le32 flags; + __le64 blocknr; + + __u8 uuid[16]; + __le64 magic; + __le32 version; + + __u8 policy_name[CACHE_POLICY_NAME_SIZE]; + __le32 policy_hint_size; + + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + __le64 mapping_root; + __le64 hint_root; + + __le64 discard_root; + __le64 discard_block_size; + __le64 discard_nr_blocks; + + __le32 data_block_size; + __le32 metadata_block_size; + __le32 cache_blocks; + + __le32 compat_flags; + __le32 compat_ro_flags; + __le32 incompat_flags; + + __le32 read_hits; + __le32 read_misses; + __le32 write_hits; + __le32 write_misses; + + __le32 policy_version[CACHE_POLICY_VERSION_SIZE]; +} __packed; + +struct dm_cache_metadata { + struct block_device *bdev; + struct dm_block_manager *bm; + struct dm_space_map *metadata_sm; + struct dm_transaction_manager *tm; + + struct dm_array_info info; + struct dm_array_info hint_info; + struct dm_disk_bitset discard_info; + + struct rw_semaphore root_lock; + dm_block_t root; + dm_block_t hint_root; + dm_block_t discard_root; + + sector_t discard_block_size; + dm_oblock_t discard_nr_blocks; + + sector_t data_block_size; + dm_cblock_t cache_blocks; + bool changed:1; + bool clean_when_opened:1; + + char policy_name[CACHE_POLICY_NAME_SIZE]; + unsigned policy_version[CACHE_POLICY_VERSION_SIZE]; + size_t policy_hint_size; + struct dm_cache_statistics stats; + + /* + * Reading the space map root can fail, so we read it into this + * buffer before the superblock is locked and updated. 
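+ * (__save_sm_root() fills this buffer and __copy_sm_root() then copies it
+ * into the on-disk superblock.)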
+ */ + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; +}; + +/*------------------------------------------------------------------- + * superblock validator + *-----------------------------------------------------------------*/ + +#define SUPERBLOCK_CSUM_XOR 9031977 + +static void sb_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t sb_block_size) +{ + struct cache_disk_superblock *disk_super = dm_block_data(b); + + disk_super->blocknr = cpu_to_le64(dm_block_location(b)); + disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); +} + +static int check_metadata_version(struct cache_disk_superblock *disk_super) +{ + uint32_t metadata_version = le32_to_cpu(disk_super->version); + if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) { + DMERR("Cache metadata version %u found, but only versions between %u and %u supported.", + metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION); + return -EINVAL; + } + + return 0; +} + +static int sb_check(struct dm_block_validator *v, + struct dm_block *b, + size_t sb_block_size) +{ + struct cache_disk_superblock *disk_super = dm_block_data(b); + __le32 csum_le; + + if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { + DMERR("sb_check failed: blocknr %llu: wanted %llu", + le64_to_cpu(disk_super->blocknr), + (unsigned long long)dm_block_location(b)); + return -ENOTBLK; + } + + if (le64_to_cpu(disk_super->magic) != CACHE_SUPERBLOCK_MAGIC) { + DMERR("sb_check failed: magic %llu: wanted %llu", + le64_to_cpu(disk_super->magic), + (unsigned long long)CACHE_SUPERBLOCK_MAGIC); + return -EILSEQ; + } + + csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); + if (csum_le != disk_super->csum) { + DMERR("sb_check failed: csum %u: wanted %u", + le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); + return -EILSEQ; + } + + return check_metadata_version(disk_super); +} + +static struct dm_block_validator sb_validator = { + .name = "superblock", + .prepare_for_write = sb_prepare_for_write, + .check = sb_check +}; + +/*----------------------------------------------------------------*/ + +static int superblock_read_lock(struct dm_cache_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_read_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock_zero(struct dm_cache_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock_zero(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock(struct dm_cache_metadata *cmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +/*----------------------------------------------------------------*/ + +static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result) +{ + int r; + unsigned i; + struct dm_block *b; + __le64 *data_le, zero = cpu_to_le64(0); + unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64); + + /* + * We can't use a validator here - it may be all zeroes. 
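+ * Passing a NULL validator to dm_bm_read_lock() below means no checksum or
+ * magic checks are applied to the block.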
+ */ + r = dm_bm_read_lock(bm, CACHE_SUPERBLOCK_LOCATION, NULL, &b); + if (r) + return r; + + data_le = dm_block_data(b); + *result = true; + for (i = 0; i < sb_block_size; i++) { + if (data_le[i] != zero) { + *result = false; + break; + } + } + + return dm_bm_unlock(b); +} + +static void __setup_mapping_info(struct dm_cache_metadata *cmd) +{ + struct dm_btree_value_type vt; + + vt.context = NULL; + vt.size = sizeof(__le64); + vt.inc = NULL; + vt.dec = NULL; + vt.equal = NULL; + dm_array_info_init(&cmd->info, cmd->tm, &vt); + + if (cmd->policy_hint_size) { + vt.size = sizeof(__le32); + dm_array_info_init(&cmd->hint_info, cmd->tm, &vt); + } +} + +static int __save_sm_root(struct dm_cache_metadata *cmd) +{ + int r; + size_t metadata_len; + + r = dm_sm_root_size(cmd->metadata_sm, &metadata_len); + if (r < 0) + return r; + + return dm_sm_copy_root(cmd->metadata_sm, &cmd->metadata_space_map_root, + metadata_len); +} + +static void __copy_sm_root(struct dm_cache_metadata *cmd, + struct cache_disk_superblock *disk_super) +{ + memcpy(&disk_super->metadata_space_map_root, + &cmd->metadata_space_map_root, + sizeof(cmd->metadata_space_map_root)); +} + +static int __write_initial_superblock(struct dm_cache_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct cache_disk_superblock *disk_super; + sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT; + + /* FIXME: see if we can lose the max sectors limit */ + if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS) + bdev_size = DM_CACHE_METADATA_MAX_SECTORS; + + r = dm_tm_pre_commit(cmd->tm); + if (r < 0) + return r; + + /* + * dm_sm_copy_root() can fail. So we need to do it before we start + * updating the superblock. + */ + r = __save_sm_root(cmd); + if (r) + return r; + + r = superblock_lock_zero(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + disk_super->flags = 0; + memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); + disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); + disk_super->version = cpu_to_le32(MAX_CACHE_VERSION); + memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); + memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); + disk_super->policy_hint_size = 0; + + __copy_sm_root(cmd, disk_super); + + disk_super->mapping_root = cpu_to_le64(cmd->root); + disk_super->hint_root = cpu_to_le64(cmd->hint_root); + disk_super->discard_root = cpu_to_le64(cmd->discard_root); + disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); + disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); + disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); + disk_super->cache_blocks = cpu_to_le32(0); + + disk_super->read_hits = cpu_to_le32(0); + disk_super->read_misses = cpu_to_le32(0); + disk_super->write_hits = cpu_to_le32(0); + disk_super->write_misses = cpu_to_le32(0); + + return dm_tm_commit(cmd->tm, sblock); +} + +static int __format_metadata(struct dm_cache_metadata *cmd) +{ + int r; + + r = dm_tm_create_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + &cmd->tm, &cmd->metadata_sm); + if (r < 0) { + DMERR("tm_create_with_sm failed"); + return r; + } + + __setup_mapping_info(cmd); + + r = dm_array_empty(&cmd->info, &cmd->root); + if (r < 0) + goto bad; + + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); + + r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root); + if (r < 0) + goto bad; + + cmd->discard_block_size = 0; 
+ cmd->discard_nr_blocks = 0; + + r = __write_initial_superblock(cmd); + if (r) + goto bad; + + cmd->clean_when_opened = true; + return 0; + +bad: + dm_tm_destroy(cmd->tm); + dm_sm_destroy(cmd->metadata_sm); + + return r; +} + +static int __check_incompat_features(struct cache_disk_superblock *disk_super, + struct dm_cache_metadata *cmd) +{ + uint32_t features; + + features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; + if (features) { + DMERR("could not access metadata due to unsupported optional features (%lx).", + (unsigned long)features); + return -EINVAL; + } + + /* + * Check for read-only metadata to skip the following RDWR checks. + */ + if (get_disk_ro(cmd->bdev->bd_disk)) + return 0; + + features = le32_to_cpu(disk_super->compat_ro_flags) & ~DM_CACHE_FEATURE_COMPAT_RO_SUPP; + if (features) { + DMERR("could not access metadata RDWR due to unsupported optional features (%lx).", + (unsigned long)features); + return -EINVAL; + } + + return 0; +} + +static int __open_metadata(struct dm_cache_metadata *cmd) +{ + int r; + struct dm_block *sblock; + struct cache_disk_superblock *disk_super; + unsigned long sb_flags; + + r = superblock_read_lock(cmd, &sblock); + if (r < 0) { + DMERR("couldn't read lock superblock"); + return r; + } + + disk_super = dm_block_data(sblock); + + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk_super->data_block_size) != cmd->data_block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk_super->data_block_size), + (unsigned long long)cmd->data_block_size); + r = -EINVAL; + goto bad; + } + + r = __check_incompat_features(disk_super, cmd); + if (r < 0) + goto bad; + + r = dm_tm_open_with_sm(cmd->bm, CACHE_SUPERBLOCK_LOCATION, + disk_super->metadata_space_map_root, + sizeof(disk_super->metadata_space_map_root), + &cmd->tm, &cmd->metadata_sm); + if (r < 0) { + DMERR("tm_open_with_sm failed"); + goto bad; + } + + __setup_mapping_info(cmd); + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); + sb_flags = le32_to_cpu(disk_super->flags); + cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); + return dm_bm_unlock(sblock); + +bad: + dm_bm_unlock(sblock); + return r; +} + +static int __open_or_format_metadata(struct dm_cache_metadata *cmd, + bool format_device) +{ + int r; + bool unformatted = false; + + r = __superblock_all_zeroes(cmd->bm, &unformatted); + if (r) + return r; + + if (unformatted) + return format_device ? 
__format_metadata(cmd) : -EPERM; + + return __open_metadata(cmd); +} + +static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, + bool may_format_device) +{ + int r; + cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE, + CACHE_METADATA_CACHE_SIZE, + CACHE_MAX_CONCURRENT_LOCKS); + if (IS_ERR(cmd->bm)) { + DMERR("could not create block manager"); + return PTR_ERR(cmd->bm); + } + + r = __open_or_format_metadata(cmd, may_format_device); + if (r) + dm_block_manager_destroy(cmd->bm); + + return r; +} + +static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd) +{ + dm_sm_destroy(cmd->metadata_sm); + dm_tm_destroy(cmd->tm); + dm_block_manager_destroy(cmd->bm); +} + +typedef unsigned long (*flags_mutator)(unsigned long); + +static void update_flags(struct cache_disk_superblock *disk_super, + flags_mutator mutator) +{ + uint32_t sb_flags = mutator(le32_to_cpu(disk_super->flags)); + disk_super->flags = cpu_to_le32(sb_flags); +} + +static unsigned long set_clean_shutdown(unsigned long flags) +{ + set_bit(CLEAN_SHUTDOWN, &flags); + return flags; +} + +static unsigned long clear_clean_shutdown(unsigned long flags) +{ + clear_bit(CLEAN_SHUTDOWN, &flags); + return flags; +} + +static void read_superblock_fields(struct dm_cache_metadata *cmd, + struct cache_disk_superblock *disk_super) +{ + cmd->root = le64_to_cpu(disk_super->mapping_root); + cmd->hint_root = le64_to_cpu(disk_super->hint_root); + cmd->discard_root = le64_to_cpu(disk_super->discard_root); + cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size); + cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks)); + cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); + cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); + strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); + cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]); + cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]); + cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]); + cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size); + + cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits); + cmd->stats.read_misses = le32_to_cpu(disk_super->read_misses); + cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits); + cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses); + + cmd->changed = false; +} + +/* + * The mutator updates the superblock flags. + */ +static int __begin_transaction_flags(struct dm_cache_metadata *cmd, + flags_mutator mutator) +{ + int r; + struct cache_disk_superblock *disk_super; + struct dm_block *sblock; + + r = superblock_lock(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + update_flags(disk_super, mutator); + read_superblock_fields(cmd, disk_super); + dm_bm_unlock(sblock); + + return dm_bm_flush(cmd->bm); +} + +static int __begin_transaction(struct dm_cache_metadata *cmd) +{ + int r; + struct cache_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We re-read the superblock every time. Shouldn't need to do this + * really. 
+ */ + r = superblock_read_lock(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + read_superblock_fields(cmd, disk_super); + dm_bm_unlock(sblock); + + return 0; +} + +static int __commit_transaction(struct dm_cache_metadata *cmd, + flags_mutator mutator) +{ + int r; + struct cache_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We need to know if the cache_disk_superblock exceeds a 512-byte sector. + */ + BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512); + + r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, + &cmd->discard_root); + if (r) + return r; + + r = dm_tm_pre_commit(cmd->tm); + if (r < 0) + return r; + + r = __save_sm_root(cmd); + if (r) + return r; + + r = superblock_lock(cmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + + if (mutator) + update_flags(disk_super, mutator); + + disk_super->mapping_root = cpu_to_le64(cmd->root); + disk_super->hint_root = cpu_to_le64(cmd->hint_root); + disk_super->discard_root = cpu_to_le64(cmd->discard_root); + disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); + disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); + disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); + strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); + disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); + disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); + disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); + + disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); + disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); + disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits); + disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses); + __copy_sm_root(cmd, disk_super); + + return dm_tm_commit(cmd->tm, sblock); +} + +/*----------------------------------------------------------------*/ + +/* + * The mappings are held in a dm-array that has 64-bit values stored in + * little-endian format. The index is the cblock, the high 48bits of the + * value are the oblock and the low 16 bit the flags. 
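+ * For example, origin block 5 with flags (M_VALID | M_DIRTY) is stored by
+ * pack_value() below as cpu_to_le64((5 << 16) | 3) = cpu_to_le64(0x50003);
+ * unpack_value() reverses this.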
+ */ +#define FLAGS_MASK ((1 << 16) - 1) + +static __le64 pack_value(dm_oblock_t block, unsigned flags) +{ + uint64_t value = from_oblock(block); + value <<= 16; + value = value | (flags & FLAGS_MASK); + return cpu_to_le64(value); +} + +static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags) +{ + uint64_t value = le64_to_cpu(value_le); + uint64_t b = value >> 16; + *block = to_oblock(b); + *flags = value & FLAGS_MASK; +} + +/*----------------------------------------------------------------*/ + +struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size) +{ + int r; + struct dm_cache_metadata *cmd; + + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + if (!cmd) { + DMERR("could not allocate metadata struct"); + return NULL; + } + + init_rwsem(&cmd->root_lock); + cmd->bdev = bdev; + cmd->data_block_size = data_block_size; + cmd->cache_blocks = 0; + cmd->policy_hint_size = policy_hint_size; + cmd->changed = true; + + r = __create_persistent_data_objects(cmd, may_format_device); + if (r) { + kfree(cmd); + return ERR_PTR(r); + } + + r = __begin_transaction_flags(cmd, clear_clean_shutdown); + if (r < 0) { + dm_cache_metadata_close(cmd); + return ERR_PTR(r); + } + + return cmd; +} + +void dm_cache_metadata_close(struct dm_cache_metadata *cmd) +{ + __destroy_persistent_data_objects(cmd); + kfree(cmd); +} + +/* + * Checks that the given cache block is either unmapped or clean. + */ +static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, + bool *result) +{ + int r; + __le64 value; + dm_oblock_t ob; + unsigned flags; + + r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value); + if (r) { + DMERR("block_unmapped_or_clean failed"); + return r; + } + + unpack_value(value, &ob, &flags); + *result = !((flags & M_VALID) && (flags & M_DIRTY)); + + return 0; +} + +static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + int r; + *result = true; + + while (begin != end) { + r = block_unmapped_or_clean(cmd, begin, result); + if (r) + return r; + + if (!*result) { + DMERR("cache block %llu is dirty", + (unsigned long long) from_cblock(begin)); + return 0; + } + + begin = to_cblock(from_cblock(begin) + 1); + } + + return 0; +} + +int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) +{ + int r; + bool clean; + __le64 null_mapping = pack_value(0, 0); + + down_write(&cmd->root_lock); + __dm_bless_for_disk(&null_mapping); + + if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) { + r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean); + if (r) { + __dm_unbless_for_disk(&null_mapping); + goto out; + } + + if (!clean) { + DMERR("unable to shrink cache due to dirty blocks"); + r = -EINVAL; + __dm_unbless_for_disk(&null_mapping); + goto out; + } + } + + r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), + from_cblock(new_cache_size), + &null_mapping, &cmd->root); + if (!r) + cmd->cache_blocks = new_cache_size; + cmd->changed = true; + +out: + up_write(&cmd->root_lock); + + return r; +} + +int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, + sector_t discard_block_size, + dm_oblock_t new_nr_entries) +{ + int r; + + down_write(&cmd->root_lock); + r = dm_bitset_resize(&cmd->discard_info, + cmd->discard_root, + from_oblock(cmd->discard_nr_blocks), + from_oblock(new_nr_entries), + false, 
&cmd->discard_root); + if (!r) { + cmd->discard_block_size = discard_block_size; + cmd->discard_nr_blocks = new_nr_entries; + } + + cmd->changed = true; + up_write(&cmd->root_lock); + + return r; +} + +static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b) +{ + return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root, + from_oblock(b), &cmd->discard_root); +} + +static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b) +{ + return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root, + from_oblock(b), &cmd->discard_root); +} + +static int __is_discarded(struct dm_cache_metadata *cmd, dm_oblock_t b, + bool *is_discarded) +{ + return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root, + from_oblock(b), &cmd->discard_root, + is_discarded); +} + +static int __discard(struct dm_cache_metadata *cmd, + dm_oblock_t dblock, bool discard) +{ + int r; + + r = (discard ? __set_discard : __clear_discard)(cmd, dblock); + if (r) + return r; + + cmd->changed = true; + return 0; +} + +int dm_cache_set_discard(struct dm_cache_metadata *cmd, + dm_oblock_t dblock, bool discard) +{ + int r; + + down_write(&cmd->root_lock); + r = __discard(cmd, dblock, discard); + up_write(&cmd->root_lock); + + return r; +} + +static int __load_discards(struct dm_cache_metadata *cmd, + load_discard_fn fn, void *context) +{ + int r = 0; + dm_block_t b; + bool discard; + + for (b = 0; b < from_oblock(cmd->discard_nr_blocks); b++) { + dm_oblock_t dblock = to_oblock(b); + + if (cmd->clean_when_opened) { + r = __is_discarded(cmd, dblock, &discard); + if (r) + return r; + } else + discard = false; + + r = fn(context, cmd->discard_block_size, dblock, discard); + if (r) + break; + } + + return r; +} + +int dm_cache_load_discards(struct dm_cache_metadata *cmd, + load_discard_fn fn, void *context) +{ + int r; + + down_read(&cmd->root_lock); + r = __load_discards(cmd, fn, context); + up_read(&cmd->root_lock); + + return r; +} + +dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd) +{ + dm_cblock_t r; + + down_read(&cmd->root_lock); + r = cmd->cache_blocks; + up_read(&cmd->root_lock); + + return r; +} + +static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock) +{ + int r; + __le64 value = pack_value(0, 0); + + __dm_bless_for_disk(&value); + r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), + &value, &cmd->root); + if (r) + return r; + + cmd->changed = true; + return 0; +} + +int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock) +{ + int r; + + down_write(&cmd->root_lock); + r = __remove(cmd, cblock); + up_write(&cmd->root_lock); + + return r; +} + +static int __insert(struct dm_cache_metadata *cmd, + dm_cblock_t cblock, dm_oblock_t oblock) +{ + int r; + __le64 value = pack_value(oblock, M_VALID); + __dm_bless_for_disk(&value); + + r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), + &value, &cmd->root); + if (r) + return r; + + cmd->changed = true; + return 0; +} + +int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, + dm_cblock_t cblock, dm_oblock_t oblock) +{ + int r; + + down_write(&cmd->root_lock); + r = __insert(cmd, cblock, oblock); + up_write(&cmd->root_lock); + + return r; +} + +struct thunk { + load_mapping_fn fn; + void *context; + + struct dm_cache_metadata *cmd; + bool respect_dirty_flags; + bool hints_valid; +}; + +static bool policy_unchanged(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy) +{ + const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned 
*policy_version = dm_cache_policy_get_version(policy); + size_t policy_hint_size = dm_cache_policy_get_hint_size(policy); + + /* + * Ensure policy names match. + */ + if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name))) + return false; + + /* + * Ensure policy major versions match. + */ + if (cmd->policy_version[0] != policy_version[0]) + return false; + + /* + * Ensure policy hint sizes match. + */ + if (cmd->policy_hint_size != policy_hint_size) + return false; + + return true; +} + +static bool hints_array_initialized(struct dm_cache_metadata *cmd) +{ + return cmd->hint_root && cmd->policy_hint_size; +} + +static bool hints_array_available(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy) +{ + return cmd->clean_when_opened && policy_unchanged(cmd, policy) && + hints_array_initialized(cmd); +} + +static int __load_mapping(void *context, uint64_t cblock, void *leaf) +{ + int r = 0; + bool dirty; + __le64 value; + __le32 hint_value = 0; + dm_oblock_t oblock; + unsigned flags; + struct thunk *thunk = context; + struct dm_cache_metadata *cmd = thunk->cmd; + + memcpy(&value, leaf, sizeof(value)); + unpack_value(value, &oblock, &flags); + + if (flags & M_VALID) { + if (thunk->hints_valid) { + r = dm_array_get_value(&cmd->hint_info, cmd->hint_root, + cblock, &hint_value); + if (r && r != -ENODATA) + return r; + } + + dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true; + r = thunk->fn(thunk->context, oblock, to_cblock(cblock), + dirty, le32_to_cpu(hint_value), thunk->hints_valid); + } + + return r; +} + +static int __load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, + load_mapping_fn fn, void *context) +{ + struct thunk thunk; + + thunk.fn = fn; + thunk.context = context; + + thunk.cmd = cmd; + thunk.respect_dirty_flags = cmd->clean_when_opened; + thunk.hints_valid = hints_array_available(cmd, policy); + + return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk); +} + +int dm_cache_load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, + load_mapping_fn fn, void *context) +{ + int r; + + down_read(&cmd->root_lock); + r = __load_mappings(cmd, policy, fn, context); + up_read(&cmd->root_lock); + + return r; +} + +static int __dump_mapping(void *context, uint64_t cblock, void *leaf) +{ + int r = 0; + __le64 value; + dm_oblock_t oblock; + unsigned flags; + + memcpy(&value, leaf, sizeof(value)); + unpack_value(value, &oblock, &flags); + + return r; +} + +static int __dump_mappings(struct dm_cache_metadata *cmd) +{ + return dm_array_walk(&cmd->info, cmd->root, __dump_mapping, NULL); +} + +void dm_cache_dump(struct dm_cache_metadata *cmd) +{ + down_read(&cmd->root_lock); + __dump_mappings(cmd); + up_read(&cmd->root_lock); +} + +int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd) +{ + int r; + + down_read(&cmd->root_lock); + r = cmd->changed; + up_read(&cmd->root_lock); + + return r; +} + +static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty) +{ + int r; + unsigned flags; + dm_oblock_t oblock; + __le64 value; + + r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(cblock), &value); + if (r) + return r; + + unpack_value(value, &oblock, &flags); + + if (((flags & M_DIRTY) && dirty) || (!(flags & M_DIRTY) && !dirty)) + /* nothing to be done */ + return 0; + + value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? 
M_DIRTY : 0)); + __dm_bless_for_disk(&value); + + r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock), + &value, &cmd->root); + if (r) + return r; + + cmd->changed = true; + return 0; + +} + +int dm_cache_set_dirty(struct dm_cache_metadata *cmd, + dm_cblock_t cblock, bool dirty) +{ + int r; + + down_write(&cmd->root_lock); + r = __dirty(cmd, cblock, dirty); + up_write(&cmd->root_lock); + + return r; +} + +void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, + struct dm_cache_statistics *stats) +{ + down_read(&cmd->root_lock); + *stats = cmd->stats; + up_read(&cmd->root_lock); +} + +void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, + struct dm_cache_statistics *stats) +{ + down_write(&cmd->root_lock); + cmd->stats = *stats; + up_write(&cmd->root_lock); +} + +int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown) +{ + int r; + flags_mutator mutator = (clean_shutdown ? set_clean_shutdown : + clear_clean_shutdown); + + down_write(&cmd->root_lock); + r = __commit_transaction(cmd, mutator); + if (r) + goto out; + + r = __begin_transaction(cmd); + +out: + up_write(&cmd->root_lock); + return r; +} + +int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&cmd->root_lock); + r = dm_sm_get_nr_free(cmd->metadata_sm, result); + up_read(&cmd->root_lock); + + return r; +} + +int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&cmd->root_lock); + r = dm_sm_get_nr_blocks(cmd->metadata_sm, result); + up_read(&cmd->root_lock); + + return r; +} + +/*----------------------------------------------------------------*/ + +static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) +{ + int r; + __le32 value; + size_t hint_size; + const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned *policy_version = dm_cache_policy_get_version(policy); + + if (!policy_name[0] || + (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) + return -EINVAL; + + if (!policy_unchanged(cmd, policy)) { + strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); + memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); + + hint_size = dm_cache_policy_get_hint_size(policy); + if (!hint_size) + return 0; /* short-circuit hints initialization */ + cmd->policy_hint_size = hint_size; + + if (cmd->hint_root) { + r = dm_array_del(&cmd->hint_info, cmd->hint_root); + if (r) + return r; + } + + r = dm_array_empty(&cmd->hint_info, &cmd->hint_root); + if (r) + return r; + + value = cpu_to_le32(0); + __dm_bless_for_disk(&value); + r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0, + from_cblock(cmd->cache_blocks), + &value, &cmd->hint_root); + if (r) + return r; + } + + return 0; +} + +static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, uint32_t hint) +{ + struct dm_cache_metadata *cmd = context; + __le32 value = cpu_to_le32(hint); + int r; + + __dm_bless_for_disk(&value); + + r = dm_array_set_value(&cmd->hint_info, cmd->hint_root, + from_cblock(cblock), &value, &cmd->hint_root); + cmd->changed = true; + + return r; +} + +static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) +{ + int r; + + r = begin_hints(cmd, policy); + if (r) { + DMERR("begin_hints failed"); + return r; + } + + return policy_walk_mappings(policy, save_hint, cmd); +} + +int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy 
*policy) +{ + int r; + + down_write(&cmd->root_lock); + r = write_hints(cmd, policy); + up_write(&cmd->root_lock); + + return r; +} + +int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) +{ + return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); +} diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h new file mode 100644 index 00000000000..cd70a78623a --- /dev/null +++ b/drivers/md/dm-cache-metadata.h @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_METADATA_H +#define DM_CACHE_METADATA_H + +#include "dm-cache-block-types.h" +#include "dm-cache-policy-internal.h" + +/*----------------------------------------------------------------*/ + +#define DM_CACHE_METADATA_BLOCK_SIZE 4096 + +/* FIXME: remove this restriction */ +/* + * The metadata device is currently limited in size. + * + * We have one block of index, which can hold 255 index entries. Each + * index entry contains allocation info about 16k metadata blocks. + */ +#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) + +/* + * A metadata device larger than 16GB triggers a warning. + */ +#define DM_CACHE_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) + +/*----------------------------------------------------------------*/ + +/* + * Ext[234]-style compat feature flags. + * + * A new feature which old metadata will still be compatible with should + * define a DM_CACHE_FEATURE_COMPAT_* flag (rarely useful). + * + * A new feature that is not compatible with old code should define a + * DM_CACHE_FEATURE_INCOMPAT_* flag and guard the relevant code with + * that flag. + * + * A new feature that is not compatible with old code accessing the + * metadata RDWR should define a DM_CACHE_FEATURE_RO_COMPAT_* flag and + * guard the relevant code with that flag. + * + * As these various flags are defined they should be added to the + * following masks. + */ +#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL +#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL +#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL + +/* + * Reopens or creates a new, empty metadata volume. + * Returns an ERR_PTR on failure. + */ +struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size); + +void dm_cache_metadata_close(struct dm_cache_metadata *cmd); + +/* + * The metadata needs to know how many cache blocks there are. We don't + * care about the origin, assuming the core target is giving us valid + * origin blocks to map to. 
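+ *
+ * Note that shrinking the cache is refused (-EINVAL) unless every cache block
+ * beyond the new size is unmapped or clean.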
+ */ +int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size); +dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd); + +int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, + sector_t discard_block_size, + dm_oblock_t new_nr_entries); + +typedef int (*load_discard_fn)(void *context, sector_t discard_block_size, + dm_oblock_t dblock, bool discarded); +int dm_cache_load_discards(struct dm_cache_metadata *cmd, + load_discard_fn fn, void *context); + +int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_oblock_t dblock, bool discard); + +int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock); +int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock); +int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd); + +typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock, + dm_cblock_t cblock, bool dirty, + uint32_t hint, bool hint_valid); +int dm_cache_load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, + load_mapping_fn fn, + void *context); + +int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty); + +struct dm_cache_statistics { + uint32_t read_hits; + uint32_t read_misses; + uint32_t write_hits; + uint32_t write_misses; +}; + +void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, + struct dm_cache_statistics *stats); +void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, + struct dm_cache_statistics *stats); + +int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown); + +int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, + dm_block_t *result); + +int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, + dm_block_t *result); + +void dm_cache_dump(struct dm_cache_metadata *cmd); + +/* + * The policy is invited to save a 32bit hint value for every cblock (eg, + * for a hit count). These are stored against the policy name. If + * policies are changed, then hints will be lost. If the machine crashes, + * hints will be lost. + * + * The hints are indexed by the cblock, but many policies will not + * neccessarily have a fast way of accessing efficiently via cblock. So + * rather than querying the policy for each cblock, we let it walk its data + * structures and fill in the hints in whatever order it wishes. + */ +int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p); + +/* + * Query method. Are all the blocks in the cache clean? + */ +int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); + +/*----------------------------------------------------------------*/ + +#endif /* DM_CACHE_METADATA_H */ diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c new file mode 100644 index 00000000000..b04d1f904d0 --- /dev/null +++ b/drivers/md/dm-cache-policy-cleaner.c @@ -0,0 +1,467 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * writeback cache policy supporting flushing out dirty cache blocks. + * + * This file is released under the GPL. + */ + +#include "dm-cache-policy.h" +#include "dm.h" + +#include <linux/hash.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +/*----------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "cache cleaner" + +/* Cache entry struct. 
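+ * One entry is preallocated per cache block; each lives on exactly one of the
+ * policy's free/clean/clean_pending/dirty lists and is hashed by oblock.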
*/ +struct wb_cache_entry { + struct list_head list; + struct hlist_node hlist; + + dm_oblock_t oblock; + dm_cblock_t cblock; + bool dirty:1; + bool pending:1; +}; + +struct hash { + struct hlist_head *table; + dm_block_t hash_bits; + unsigned nr_buckets; +}; + +struct policy { + struct dm_cache_policy policy; + spinlock_t lock; + + struct list_head free; + struct list_head clean; + struct list_head clean_pending; + struct list_head dirty; + + /* + * We know exactly how many cblocks will be needed, + * so we can allocate them up front. + */ + dm_cblock_t cache_size, nr_cblocks_allocated; + struct wb_cache_entry *cblocks; + struct hash chash; +}; + +/*----------------------------------------------------------------------------*/ + +/* + * Low-level functions. + */ +static unsigned next_power(unsigned n, unsigned min) +{ + return roundup_pow_of_two(max(n, min)); +} + +static struct policy *to_policy(struct dm_cache_policy *p) +{ + return container_of(p, struct policy, policy); +} + +static struct list_head *list_pop(struct list_head *q) +{ + struct list_head *r = q->next; + + list_del(r); + + return r; +} + +/*----------------------------------------------------------------------------*/ + +/* Allocate/free various resources. */ +static int alloc_hash(struct hash *hash, unsigned elts) +{ + hash->nr_buckets = next_power(elts >> 4, 16); + hash->hash_bits = ffs(hash->nr_buckets) - 1; + hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); + + return hash->table ? 0 : -ENOMEM; +} + +static void free_hash(struct hash *hash) +{ + vfree(hash->table); +} + +static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size) +{ + int r = -ENOMEM; + + p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size)); + if (p->cblocks) { + unsigned u = from_cblock(cache_size); + + while (u--) + list_add(&p->cblocks[u].list, &p->free); + + p->nr_cblocks_allocated = 0; + + /* Cache entries hash. */ + r = alloc_hash(&p->chash, from_cblock(cache_size)); + if (r) + vfree(p->cblocks); + } + + return r; +} + +static void free_cache_blocks_and_hash(struct policy *p) +{ + free_hash(&p->chash); + vfree(p->cblocks); +} + +static struct wb_cache_entry *alloc_cache_entry(struct policy *p) +{ + struct wb_cache_entry *e; + + BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size)); + + e = list_entry(list_pop(&p->free), struct wb_cache_entry, list); + p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1); + + return e; +} + +/*----------------------------------------------------------------------------*/ + +/* Hash functions (lookup, insert, remove). */ +static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock) +{ + struct hash *hash = &p->chash; + unsigned h = hash_64(from_oblock(oblock), hash->hash_bits); + struct wb_cache_entry *cur; + struct hlist_head *bucket = &hash->table[h]; + + hlist_for_each_entry(cur, bucket, hlist) { + if (cur->oblock == oblock) { + /* Move upfront bucket for faster access. 
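+ * (The entry is deleted and re-added at the head of its bucket, so recently
+ * hit entries are found first on later lookups.)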
*/ + hlist_del(&cur->hlist); + hlist_add_head(&cur->hlist, bucket); + return cur; + } + } + + return NULL; +} + +static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e) +{ + unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits); + + hlist_add_head(&e->hlist, &p->chash.table[h]); +} + +static void remove_cache_hash_entry(struct wb_cache_entry *e) +{ + hlist_del(&e->hlist); +} + +/* Public interface (see dm-cache-policy.h */ +static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock, + bool can_block, bool can_migrate, bool discarded_oblock, + struct bio *bio, struct policy_result *result) +{ + struct policy *p = to_policy(pe); + struct wb_cache_entry *e; + unsigned long flags; + + result->op = POLICY_MISS; + + if (can_block) + spin_lock_irqsave(&p->lock, flags); + + else if (!spin_trylock_irqsave(&p->lock, flags)) + return -EWOULDBLOCK; + + e = lookup_cache_entry(p, oblock); + if (e) { + result->op = POLICY_HIT; + result->cblock = e->cblock; + + } + + spin_unlock_irqrestore(&p->lock, flags); + + return 0; +} + +static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock) +{ + int r; + struct policy *p = to_policy(pe); + struct wb_cache_entry *e; + unsigned long flags; + + if (!spin_trylock_irqsave(&p->lock, flags)) + return -EWOULDBLOCK; + + e = lookup_cache_entry(p, oblock); + if (e) { + *cblock = e->cblock; + r = 0; + + } else + r = -ENOENT; + + spin_unlock_irqrestore(&p->lock, flags); + + return r; +} + +static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set) +{ + struct policy *p = to_policy(pe); + struct wb_cache_entry *e; + + e = lookup_cache_entry(p, oblock); + BUG_ON(!e); + + if (set) { + if (!e->dirty) { + e->dirty = true; + list_move(&e->list, &p->dirty); + } + + } else { + if (e->dirty) { + e->pending = false; + e->dirty = false; + list_move(&e->list, &p->clean); + } + } +} + +static void wb_set_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) +{ + struct policy *p = to_policy(pe); + unsigned long flags; + + spin_lock_irqsave(&p->lock, flags); + __set_clear_dirty(pe, oblock, true); + spin_unlock_irqrestore(&p->lock, flags); +} + +static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) +{ + struct policy *p = to_policy(pe); + unsigned long flags; + + spin_lock_irqsave(&p->lock, flags); + __set_clear_dirty(pe, oblock, false); + spin_unlock_irqrestore(&p->lock, flags); +} + +static void add_cache_entry(struct policy *p, struct wb_cache_entry *e) +{ + insert_cache_hash_entry(p, e); + if (e->dirty) + list_add(&e->list, &p->dirty); + else + list_add(&e->list, &p->clean); +} + +static int wb_load_mapping(struct dm_cache_policy *pe, + dm_oblock_t oblock, dm_cblock_t cblock, + uint32_t hint, bool hint_valid) +{ + int r; + struct policy *p = to_policy(pe); + struct wb_cache_entry *e = alloc_cache_entry(p); + + if (e) { + e->cblock = cblock; + e->oblock = oblock; + e->dirty = false; /* blocks default to clean */ + add_cache_entry(p, e); + r = 0; + + } else + r = -ENOMEM; + + return r; +} + +static void wb_destroy(struct dm_cache_policy *pe) +{ + struct policy *p = to_policy(pe); + + free_cache_blocks_and_hash(p); + kfree(p); +} + +static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock) +{ + struct wb_cache_entry *r = lookup_cache_entry(p, oblock); + + BUG_ON(!r); + + remove_cache_hash_entry(r); + list_del(&r->list); + + return r; +} + +static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock) +{ + struct 
policy *p = to_policy(pe); + struct wb_cache_entry *e; + unsigned long flags; + + spin_lock_irqsave(&p->lock, flags); + e = __wb_force_remove_mapping(p, oblock); + list_add_tail(&e->list, &p->free); + BUG_ON(!from_cblock(p->nr_cblocks_allocated)); + p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1); + spin_unlock_irqrestore(&p->lock, flags); +} + +static void wb_force_mapping(struct dm_cache_policy *pe, + dm_oblock_t current_oblock, dm_oblock_t oblock) +{ + struct policy *p = to_policy(pe); + struct wb_cache_entry *e; + unsigned long flags; + + spin_lock_irqsave(&p->lock, flags); + e = __wb_force_remove_mapping(p, current_oblock); + e->oblock = oblock; + add_cache_entry(p, e); + spin_unlock_irqrestore(&p->lock, flags); +} + +static struct wb_cache_entry *get_next_dirty_entry(struct policy *p) +{ + struct list_head *l; + struct wb_cache_entry *r; + + if (list_empty(&p->dirty)) + return NULL; + + l = list_pop(&p->dirty); + r = container_of(l, struct wb_cache_entry, list); + list_add(l, &p->clean_pending); + + return r; +} + +static int wb_writeback_work(struct dm_cache_policy *pe, + dm_oblock_t *oblock, + dm_cblock_t *cblock) +{ + int r = -ENOENT; + struct policy *p = to_policy(pe); + struct wb_cache_entry *e; + unsigned long flags; + + spin_lock_irqsave(&p->lock, flags); + + e = get_next_dirty_entry(p); + if (e) { + *oblock = e->oblock; + *cblock = e->cblock; + r = 0; + } + + spin_unlock_irqrestore(&p->lock, flags); + + return r; +} + +static dm_cblock_t wb_residency(struct dm_cache_policy *pe) +{ + return to_policy(pe)->nr_cblocks_allocated; +} + +/* Init the policy plugin interface function pointers. */ +static void init_policy_functions(struct policy *p) +{ + p->policy.destroy = wb_destroy; + p->policy.map = wb_map; + p->policy.lookup = wb_lookup; + p->policy.set_dirty = wb_set_dirty; + p->policy.clear_dirty = wb_clear_dirty; + p->policy.load_mapping = wb_load_mapping; + p->policy.walk_mappings = NULL; + p->policy.remove_mapping = wb_remove_mapping; + p->policy.writeback_work = wb_writeback_work; + p->policy.force_mapping = wb_force_mapping; + p->policy.residency = wb_residency; + p->policy.tick = NULL; +} + +static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + int r; + struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL); + + if (!p) + return NULL; + + init_policy_functions(p); + INIT_LIST_HEAD(&p->free); + INIT_LIST_HEAD(&p->clean); + INIT_LIST_HEAD(&p->clean_pending); + INIT_LIST_HEAD(&p->dirty); + + p->cache_size = cache_size; + spin_lock_init(&p->lock); + + /* Allocate cache entry structs and add them to free list. 
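+ * alloc_cache_blocks_with_hash() reserves one wb_cache_entry per cache block
+ * up front, together with the hash table used by lookup_cache_entry().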
*/ + r = alloc_cache_blocks_with_hash(p, cache_size); + if (!r) + return &p->policy; + + kfree(p); + + return NULL; +} +/*----------------------------------------------------------------------------*/ + +static struct dm_cache_policy_type wb_policy_type = { + .name = "cleaner", + .version = {1, 0, 0}, + .hint_size = 0, + .owner = THIS_MODULE, + .create = wb_create +}; + +static int __init wb_init(void) +{ + int r = dm_cache_policy_register(&wb_policy_type); + + if (r < 0) + DMERR("register failed %d", r); + else + DMINFO("version %u.%u.%u loaded", + wb_policy_type.version[0], + wb_policy_type.version[1], + wb_policy_type.version[2]); + + return r; +} + +static void __exit wb_exit(void) +{ + dm_cache_policy_unregister(&wb_policy_type); +} + +module_init(wb_init); +module_exit(wb_exit); + +MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("cleaner cache policy"); diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h new file mode 100644 index 00000000000..2256a1f24f7 --- /dev/null +++ b/drivers/md/dm-cache-policy-internal.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_POLICY_INTERNAL_H +#define DM_CACHE_POLICY_INTERNAL_H + +#include "dm-cache-policy.h" + +/*----------------------------------------------------------------*/ + +/* + * Little inline functions that simplify calling the policy methods. + */ +static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock, + bool can_block, bool can_migrate, bool discarded_oblock, + struct bio *bio, struct policy_result *result) +{ + return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result); +} + +static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) +{ + BUG_ON(!p->lookup); + return p->lookup(p, oblock, cblock); +} + +static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + if (p->set_dirty) + p->set_dirty(p, oblock); +} + +static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + if (p->clear_dirty) + p->clear_dirty(p, oblock); +} + +static inline int policy_load_mapping(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t cblock, + uint32_t hint, bool hint_valid) +{ + return p->load_mapping(p, oblock, cblock, hint, hint_valid); +} + +static inline int policy_walk_mappings(struct dm_cache_policy *p, + policy_walk_fn fn, void *context) +{ + return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0; +} + +static inline int policy_writeback_work(struct dm_cache_policy *p, + dm_oblock_t *oblock, + dm_cblock_t *cblock) +{ + return p->writeback_work ? 
p->writeback_work(p, oblock, cblock) : -ENOENT; +} + +static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + p->remove_mapping(p, oblock); +} + +static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + return p->remove_cblock(p, cblock); +} + +static inline void policy_force_mapping(struct dm_cache_policy *p, + dm_oblock_t current_oblock, dm_oblock_t new_oblock) +{ + return p->force_mapping(p, current_oblock, new_oblock); +} + +static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) +{ + return p->residency(p); +} + +static inline void policy_tick(struct dm_cache_policy *p) +{ + if (p->tick) + return p->tick(p); +} + +static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen) +{ + ssize_t sz = 0; + if (p->emit_config_values) + return p->emit_config_values(p, result, maxlen); + + DMEMIT("0"); + return 0; +} + +static inline int policy_set_config_value(struct dm_cache_policy *p, + const char *key, const char *value) +{ + return p->set_config_value ? p->set_config_value(p, key, value) : -EINVAL; +} + +/*----------------------------------------------------------------*/ + +/* + * Creates a new cache policy given a policy name, a cache size, an origin size and the block size. + */ +struct dm_cache_policy *dm_cache_policy_create(const char *name, dm_cblock_t cache_size, + sector_t origin_size, sector_t block_size); + +/* + * Destroys the policy. This drops references to the policy module as well + * as calling it's destroy method. So always use this rather than calling + * the policy->destroy method directly. + */ +void dm_cache_policy_destroy(struct dm_cache_policy *p); + +/* + * In case we've forgotten. + */ +const char *dm_cache_policy_get_name(struct dm_cache_policy *p); + +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p); + +size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p); + +/*----------------------------------------------------------------*/ + +#endif /* DM_CACHE_POLICY_INTERNAL_H */ diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c new file mode 100644 index 00000000000..0e385e40909 --- /dev/null +++ b/drivers/md/dm-cache-policy-mq.c @@ -0,0 +1,1333 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-cache-policy.h" +#include "dm.h" + +#include <linux/hash.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#define DM_MSG_PREFIX "cache-policy-mq" + +static struct kmem_cache *mq_entry_cache; + +/*----------------------------------------------------------------*/ + +static unsigned next_power(unsigned n, unsigned min) +{ + return roundup_pow_of_two(max(n, min)); +} + +/*----------------------------------------------------------------*/ + +/* + * Large, sequential ios are probably better left on the origin device since + * spindles tend to have good bandwidth. + * + * The io_tracker tries to spot when the io is in one of these sequential + * modes. + * + * Two thresholds to switch between random and sequential io mode are defaulting + * as follows and can be adjusted via the constructor and message interfaces. 
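+ * With the defaults below, a run of 512 sequential I/Os switches the tracker
+ * to sequential mode and a run of 4 random I/Os switches it back (see
+ * iot_update_stats() and iot_check_for_pattern_switch()).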
+ */ +#define RANDOM_THRESHOLD_DEFAULT 4 +#define SEQUENTIAL_THRESHOLD_DEFAULT 512 + +enum io_pattern { + PATTERN_SEQUENTIAL, + PATTERN_RANDOM +}; + +struct io_tracker { + enum io_pattern pattern; + + unsigned nr_seq_samples; + unsigned nr_rand_samples; + unsigned thresholds[2]; + + dm_oblock_t last_end_oblock; +}; + +static void iot_init(struct io_tracker *t, + int sequential_threshold, int random_threshold) +{ + t->pattern = PATTERN_RANDOM; + t->nr_seq_samples = 0; + t->nr_rand_samples = 0; + t->last_end_oblock = 0; + t->thresholds[PATTERN_RANDOM] = random_threshold; + t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold; +} + +static enum io_pattern iot_pattern(struct io_tracker *t) +{ + return t->pattern; +} + +static void iot_update_stats(struct io_tracker *t, struct bio *bio) +{ + if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1) + t->nr_seq_samples++; + else { + /* + * Just one non-sequential IO is enough to reset the + * counters. + */ + if (t->nr_seq_samples) { + t->nr_seq_samples = 0; + t->nr_rand_samples = 0; + } + + t->nr_rand_samples++; + } + + t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1); +} + +static void iot_check_for_pattern_switch(struct io_tracker *t) +{ + switch (t->pattern) { + case PATTERN_SEQUENTIAL: + if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) { + t->pattern = PATTERN_RANDOM; + t->nr_seq_samples = t->nr_rand_samples = 0; + } + break; + + case PATTERN_RANDOM: + if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) { + t->pattern = PATTERN_SEQUENTIAL; + t->nr_seq_samples = t->nr_rand_samples = 0; + } + break; + } +} + +static void iot_examine_bio(struct io_tracker *t, struct bio *bio) +{ + iot_update_stats(t, bio); + iot_check_for_pattern_switch(t); +} + +/*----------------------------------------------------------------*/ + + +/* + * This queue is divided up into different levels. Allowing us to push + * entries to the back of any of the levels. Think of it as a partially + * sorted queue. + */ +#define NR_QUEUE_LEVELS 16u + +struct queue { + struct list_head qs[NR_QUEUE_LEVELS]; +}; + +static void queue_init(struct queue *q) +{ + unsigned i; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) + INIT_LIST_HEAD(q->qs + i); +} + +/* + * Checks to see if the queue is empty. + * FIXME: reduce cpu usage. + */ +static bool queue_empty(struct queue *q) +{ + unsigned i; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) + if (!list_empty(q->qs + i)) + return false; + + return true; +} + +/* + * Insert an entry to the back of the given level. + */ +static void queue_push(struct queue *q, unsigned level, struct list_head *elt) +{ + list_add_tail(elt, q->qs + level); +} + +static void queue_remove(struct list_head *elt) +{ + list_del(elt); +} + +/* + * Shifts all regions down one level. This has no effect on the order of + * the queue. + */ +static void queue_shift_down(struct queue *q) +{ + unsigned level; + + for (level = 1; level < NR_QUEUE_LEVELS; level++) + list_splice_init(q->qs + level, q->qs + level - 1); +} + +/* + * Gives us the oldest entry of the lowest popoulated level. If the first + * level is emptied then we shift down one level. + */ +static struct list_head *queue_pop(struct queue *q) +{ + unsigned level; + struct list_head *r; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + if (!list_empty(q->qs + level)) { + r = q->qs[level].next; + list_del(r); + + /* have we just emptied the bottom level? 
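+ * (q->qs decays to &q->qs[0], so this tests whether the pop emptied level 0.)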
*/ + if (level == 0 && list_empty(q->qs)) + queue_shift_down(q); + + return r; + } + + return NULL; +} + +static struct list_head *list_pop(struct list_head *lh) +{ + struct list_head *r = lh->next; + + BUG_ON(!r); + list_del_init(r); + + return r; +} + +/*----------------------------------------------------------------*/ + +/* + * Describes a cache entry. Used in both the cache and the pre_cache. + */ +struct entry { + struct hlist_node hlist; + struct list_head list; + dm_oblock_t oblock; + + /* + * FIXME: pack these better + */ + bool dirty:1; + unsigned hit_count; + unsigned generation; + unsigned tick; +}; + +/* + * Rather than storing the cblock in an entry, we allocate all entries in + * an array, and infer the cblock from the entry position. + * + * Free entries are linked together into a list. + */ +struct entry_pool { + struct entry *entries, *entries_end; + struct list_head free; + unsigned nr_allocated; +}; + +static int epool_init(struct entry_pool *ep, unsigned nr_entries) +{ + unsigned i; + + ep->entries = vzalloc(sizeof(struct entry) * nr_entries); + if (!ep->entries) + return -ENOMEM; + + ep->entries_end = ep->entries + nr_entries; + + INIT_LIST_HEAD(&ep->free); + for (i = 0; i < nr_entries; i++) + list_add(&ep->entries[i].list, &ep->free); + + ep->nr_allocated = 0; + + return 0; +} + +static void epool_exit(struct entry_pool *ep) +{ + vfree(ep->entries); +} + +static struct entry *alloc_entry(struct entry_pool *ep) +{ + struct entry *e; + + if (list_empty(&ep->free)) + return NULL; + + e = list_entry(list_pop(&ep->free), struct entry, list); + INIT_LIST_HEAD(&e->list); + INIT_HLIST_NODE(&e->hlist); + ep->nr_allocated++; + + return e; +} + +/* + * This assumes the cblock hasn't already been allocated. + */ +static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) +{ + struct entry *e = ep->entries + from_cblock(cblock); + + list_del_init(&e->list); + INIT_HLIST_NODE(&e->hlist); + ep->nr_allocated++; + + return e; +} + +static void free_entry(struct entry_pool *ep, struct entry *e) +{ + BUG_ON(!ep->nr_allocated); + ep->nr_allocated--; + INIT_HLIST_NODE(&e->hlist); + list_add(&e->list, &ep->free); +} + +/* + * Returns NULL if the entry is free. + */ +static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock) +{ + struct entry *e = ep->entries + from_cblock(cblock); + return !hlist_unhashed(&e->hlist) ? e : NULL; +} + +static bool epool_empty(struct entry_pool *ep) +{ + return list_empty(&ep->free); +} + +static bool in_pool(struct entry_pool *ep, struct entry *e) +{ + return e >= ep->entries && e < ep->entries_end; +} + +static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e) +{ + return to_cblock(e - ep->entries); +} + +/*----------------------------------------------------------------*/ + +struct mq_policy { + struct dm_cache_policy policy; + + /* protects everything */ + struct mutex lock; + dm_cblock_t cache_size; + struct io_tracker tracker; + + /* + * Entries come from two pools, one of pre-cache entries, and one + * for the cache proper. + */ + struct entry_pool pre_cache_pool; + struct entry_pool cache_pool; + + /* + * We maintain three queues of entries. The cache proper, + * consisting of a clean and dirty queue, contains the currently + * active mappings. Whereas the pre_cache tracks blocks that + * are being hit frequently and potential candidates for promotion + * to the cache. 
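The entry_pool above avoids storing a cache-block number in each entry: all entries live in one array, free ones are threaded onto a list, and the cblock is recovered by pointer arithmetic. Below is a minimal user-space sketch of that idea, using calloc in place of vzalloc; the pool size in main() is arbitrary.

#include <stdio.h>
#include <stdlib.h>

struct entry {
	struct entry *next_free;	/* free-list link */
	unsigned long oblock;		/* payload */
};

struct pool {
	struct entry *entries;
	struct entry *free_list;	/* head of the free list */
	unsigned nr;
};

static int pool_init(struct pool *p, unsigned nr)
{
	p->entries = calloc(nr, sizeof(*p->entries));
	if (!p->entries)
		return -1;
	p->nr = nr;
	p->free_list = NULL;
	for (unsigned i = 0; i < nr; i++) {
		p->entries[i].next_free = p->free_list;
		p->free_list = &p->entries[i];
	}
	return 0;
}

static struct entry *pool_alloc(struct pool *p)
{
	struct entry *e = p->free_list;

	if (e)
		p->free_list = e->next_free;
	return e;
}

/* The cache-block number is just the entry's position in the array. */
static unsigned infer_cblock(struct pool *p, struct entry *e)
{
	return (unsigned)(e - p->entries);
}

int main(void)
{
	struct pool p;

	if (pool_init(&p, 16))
		return 1;
	printf("allocated entry maps to cblock %u\n",
	       infer_cblock(&p, pool_alloc(&p)));
	free(p.entries);
	return 0;
}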
+ */ + struct queue pre_cache; + struct queue cache_clean; + struct queue cache_dirty; + + /* + * Keeps track of time, incremented by the core. We use this to + * avoid attributing multiple hits within the same tick. + * + * Access to tick_protected should be done with the spin lock held. + * It's copied to tick at the start of the map function (within the + * mutex). + */ + spinlock_t tick_lock; + unsigned tick_protected; + unsigned tick; + + /* + * A count of the number of times the map function has been called + * and found an entry in the pre_cache or cache. Currently used to + * calculate the generation. + */ + unsigned hit_count; + + /* + * A generation is a longish period that is used to trigger some + * book keeping effects. eg, decrementing hit counts on entries. + * This is needed to allow the cache to evolve as io patterns + * change. + */ + unsigned generation; + unsigned generation_period; /* in lookups (will probably change) */ + + /* + * Entries in the pre_cache whose hit count passes the promotion + * threshold move to the cache proper. Working out the correct + * value for the promotion_threshold is crucial to this policy. + */ + unsigned promote_threshold; + + unsigned discard_promote_adjustment; + unsigned read_promote_adjustment; + unsigned write_promote_adjustment; + + /* + * The hash table allows us to quickly find an entry by origin + * block. Both pre_cache and cache entries are in here. + */ + unsigned nr_buckets; + dm_block_t hash_bits; + struct hlist_head *table; +}; + +#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 +#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 +#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 + +/*----------------------------------------------------------------*/ + +/* + * Simple hash table implementation. Should replace with the standard hash + * table that's making its way upstream. + */ +static void hash_insert(struct mq_policy *mq, struct entry *e) +{ + unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits); + + hlist_add_head(&e->hlist, mq->table + h); +} + +static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock) +{ + unsigned h = hash_64(from_oblock(oblock), mq->hash_bits); + struct hlist_head *bucket = mq->table + h; + struct entry *e; + + hlist_for_each_entry(e, bucket, hlist) + if (e->oblock == oblock) { + hlist_del(&e->hlist); + hlist_add_head(&e->hlist, bucket); + return e; + } + + return NULL; +} + +static void hash_remove(struct entry *e) +{ + hlist_del(&e->hlist); +} + +/*----------------------------------------------------------------*/ + +static bool any_free_cblocks(struct mq_policy *mq) +{ + return !epool_empty(&mq->cache_pool); +} + +static bool any_clean_cblocks(struct mq_policy *mq) +{ + return !queue_empty(&mq->cache_clean); +} + +/*----------------------------------------------------------------*/ + +/* + * Now we get to the meat of the policy. This section deals with deciding + * when to to add entries to the pre_cache and cache, and move between + * them. + */ + +/* + * The queue level is based on the log2 of the hit count. + */ +static unsigned queue_level(struct entry *e) +{ + return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); +} + +static bool in_cache(struct mq_policy *mq, struct entry *e) +{ + return in_pool(&mq->cache_pool, e); +} + +/* + * Inserts the entry into the pre_cache or the cache. Ensures the cache + * block is marked as allocated if necc. Inserts into the hash table. + * Sets the tick which records when the entry was last moved about. 
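push() above picks a queue level from the log2 of the entry's hit count, so frequently hit entries land in higher levels and are popped later, while queue_pop() always drains the lowest populated level first. The following user-space sketch shows just that level selection and pop order; the shift-down step and the hash table are omitted, and the hit counts in main() are made up.

#include <stdio.h>
#include <string.h>

#define NR_LEVELS 16u

struct node {
	struct node *next;
	unsigned hit_count;
};

struct mlqueue {
	struct node *head[NR_LEVELS];
	struct node *tail[NR_LEVELS];
};

/* min(ilog2(hit_count), NR_LEVELS - 1) */
static unsigned level_for(unsigned hit_count)
{
	unsigned lvl = 0;

	while (hit_count >>= 1)
		lvl++;
	return lvl < NR_LEVELS ? lvl : NR_LEVELS - 1;
}

static void push(struct mlqueue *q, struct node *n)
{
	unsigned lvl = level_for(n->hit_count);

	n->next = NULL;
	if (q->tail[lvl])
		q->tail[lvl]->next = n;
	else
		q->head[lvl] = n;
	q->tail[lvl] = n;
}

/* Oldest entry of the lowest populated level. */
static struct node *pop(struct mlqueue *q)
{
	for (unsigned lvl = 0; lvl < NR_LEVELS; lvl++) {
		struct node *n = q->head[lvl];

		if (n) {
			q->head[lvl] = n->next;
			if (!q->head[lvl])
				q->tail[lvl] = NULL;
			return n;
		}
	}
	return NULL;
}

int main(void)
{
	struct mlqueue q;
	struct node a = { .hit_count = 1 }, b = { .hit_count = 40 }, c = { .hit_count = 3 };

	memset(&q, 0, sizeof(q));
	push(&q, &b);
	push(&q, &a);
	push(&q, &c);

	/* a (level 0) comes out before c (level 1) before b (level 5) */
	for (struct node *n = pop(&q); n; n = pop(&q))
		printf("popped hit_count=%u\n", n->hit_count);
	return 0;
}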
+ */ +static void push(struct mq_policy *mq, struct entry *e) +{ + e->tick = mq->tick; + hash_insert(mq, e); + + if (in_cache(mq, e)) + queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean, + queue_level(e), &e->list); + else + queue_push(&mq->pre_cache, queue_level(e), &e->list); +} + +/* + * Removes an entry from pre_cache or cache. Removes from the hash table. + */ +static void del(struct mq_policy *mq, struct entry *e) +{ + queue_remove(&e->list); + hash_remove(e); +} + +/* + * Like del, except it removes the first entry in the queue (ie. the least + * recently used). + */ +static struct entry *pop(struct mq_policy *mq, struct queue *q) +{ + struct entry *e; + struct list_head *h = queue_pop(q); + + if (!h) + return NULL; + + e = container_of(h, struct entry, list); + hash_remove(e); + + return e; +} + +/* + * Has this entry already been updated? + */ +static bool updated_this_tick(struct mq_policy *mq, struct entry *e) +{ + return mq->tick == e->tick; +} + +/* + * The promotion threshold is adjusted every generation. As are the counts + * of the entries. + * + * At the moment the threshold is taken by averaging the hit counts of some + * of the entries in the cache (the first 20 entries across all levels in + * ascending order, giving preference to the clean entries at each level). + * + * We can be much cleverer than this though. For example, each promotion + * could bump up the threshold helping to prevent churn. Much more to do + * here. + */ + +#define MAX_TO_AVERAGE 20 + +static void check_generation(struct mq_policy *mq) +{ + unsigned total = 0, nr = 0, count = 0, level; + struct list_head *head; + struct entry *e; + + if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) { + mq->hit_count = 0; + mq->generation++; + + for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { + head = mq->cache_clean.qs + level; + list_for_each_entry(e, head, list) { + nr++; + total += e->hit_count; + + if (++count >= MAX_TO_AVERAGE) + break; + } + + head = mq->cache_dirty.qs + level; + list_for_each_entry(e, head, list) { + nr++; + total += e->hit_count; + + if (++count >= MAX_TO_AVERAGE) + break; + } + } + + mq->promote_threshold = nr ? total / nr : 1; + if (mq->promote_threshold * nr < total) + mq->promote_threshold++; + } +} + +/* + * Whenever we use an entry we bump up it's hit counter, and push it to the + * back to it's current level. + */ +static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) +{ + if (updated_this_tick(mq, e)) + return; + + e->hit_count++; + mq->hit_count++; + check_generation(mq); + + /* generation adjustment, to stop the counts increasing forever. */ + /* FIXME: divide? */ + /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */ + e->generation = mq->generation; + + del(mq, e); + push(mq, e); +} + +/* + * Demote the least recently used entry from the cache to the pre_cache. + * Returns the new cache entry to use, and the old origin block it was + * mapped to. + * + * We drop the hit count on the demoted entry back to 1 to stop it bouncing + * straight back into the cache if it's subsequently hit. There are + * various options here, and more experimentation would be good: + * + * - just forget about the demoted entry completely (ie. don't insert it + into the pre_cache). + * - divide the hit count rather that setting to some hard coded value. + * - set the hit count to a hard coded value other than 1, eg, is it better + * if it goes in at level 2? 
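check_generation(), a little earlier, settles the promotion threshold by averaging the hit counts of a small sample of cached entries and rounding the division up, defaulting to 1 when the sample is empty. A minimal restatement of that arithmetic, with invented sample values:

#include <stdio.h>

/* Average of the sampled hit counts, rounded up; 1 if the sample is empty. */
static unsigned promote_threshold(const unsigned *hits, unsigned nr)
{
	unsigned total = 0, t;

	for (unsigned i = 0; i < nr; i++)
		total += hits[i];

	t = nr ? total / nr : 1;
	if (t * nr < total)	/* round up if the division truncated */
		t++;
	return t;
}

int main(void)
{
	unsigned hits[] = { 3, 4, 4, 6 };	/* 17 / 4 rounds up to 5 */

	printf("threshold = %u\n", promote_threshold(hits, 4));
	return 0;
}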
+ */ +static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) +{ + struct entry *demoted = pop(mq, &mq->cache_clean); + + if (!demoted) + /* + * We could get a block from mq->cache_dirty, but that + * would add extra latency to the triggering bio as it + * waits for the writeback. Better to not promote this + * time and hope there's a clean block next time this block + * is hit. + */ + return -ENOSPC; + + *oblock = demoted->oblock; + free_entry(&mq->cache_pool, demoted); + + /* + * We used to put the demoted block into the pre-cache, but I think + * it's simpler to just let it work it's way up from zero again. + * Stops blocks flickering in and out of the cache. + */ + + return 0; +} + +/* + * We modify the basic promotion_threshold depending on the specific io. + * + * If the origin block has been discarded then there's no cost to copy it + * to the cache. + * + * We bias towards reads, since they can be demoted at no cost if they + * haven't been dirtied. + */ +static unsigned adjusted_promote_threshold(struct mq_policy *mq, + bool discarded_oblock, int data_dir) +{ + if (data_dir == READ) + return mq->promote_threshold + mq->read_promote_adjustment; + + if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { + /* + * We don't need to do any copying at all, so give this a + * very low threshold. + */ + return mq->discard_promote_adjustment; + } + + return mq->promote_threshold + mq->write_promote_adjustment; +} + +static bool should_promote(struct mq_policy *mq, struct entry *e, + bool discarded_oblock, int data_dir) +{ + return e->hit_count >= + adjusted_promote_threshold(mq, discarded_oblock, data_dir); +} + +static int cache_entry_found(struct mq_policy *mq, + struct entry *e, + struct policy_result *result) +{ + requeue_and_update_tick(mq, e); + + if (in_cache(mq, e)) { + result->op = POLICY_HIT; + result->cblock = infer_cblock(&mq->cache_pool, e); + } + + return 0; +} + +/* + * Moves an entry from the pre_cache to the cache. The main work is + * finding which cache block to use. 
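adjusted_promote_threshold() above encodes the promotion bias in one place: reads get a modest penalty, writes a larger one, and a write to a discarded origin block (which needs no copy) is promoted almost for free whenever a free or clean cache block is available. The sketch below restates that decision as a plain function; the numeric defaults mirror the DEFAULT_*_PROMOTE_ADJUSTMENT values defined earlier in the file, and everything else is illustrative.

#include <stdbool.h>
#include <stdio.h>

enum dir { DIR_READ, DIR_WRITE };

struct policy_cfg {
	unsigned promote_threshold;
	unsigned discard_adj;	/* default 1 */
	unsigned read_adj;	/* default 4 */
	unsigned write_adj;	/* default 8 */
};

static unsigned adjusted_threshold(const struct policy_cfg *c, bool discarded,
				   bool have_free_or_clean_cblock, enum dir d)
{
	if (d == DIR_READ)
		return c->promote_threshold + c->read_adj;

	/* discarded origin block: promotion needs no copy, so make it cheap */
	if (discarded && have_free_or_clean_cblock)
		return c->discard_adj;

	return c->promote_threshold + c->write_adj;
}

int main(void)
{
	struct policy_cfg c = { .promote_threshold = 5,
				.discard_adj = 1, .read_adj = 4, .write_adj = 8 };

	printf("read: %u, write: %u, discarded write: %u\n",
	       adjusted_threshold(&c, false, true, DIR_READ),
	       adjusted_threshold(&c, false, true, DIR_WRITE),
	       adjusted_threshold(&c, true, true, DIR_WRITE));
	return 0;
}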
+ */ +static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, + struct policy_result *result) +{ + int r; + struct entry *new_e; + + /* Ensure there's a free cblock in the cache */ + if (epool_empty(&mq->cache_pool)) { + result->op = POLICY_REPLACE; + r = demote_cblock(mq, &result->old_oblock); + if (r) { + result->op = POLICY_MISS; + return 0; + } + } else + result->op = POLICY_NEW; + + new_e = alloc_entry(&mq->cache_pool); + BUG_ON(!new_e); + + new_e->oblock = e->oblock; + new_e->dirty = false; + new_e->hit_count = e->hit_count; + new_e->generation = e->generation; + new_e->tick = e->tick; + + del(mq, e); + free_entry(&mq->pre_cache_pool, e); + push(mq, new_e); + + result->cblock = infer_cblock(&mq->cache_pool, new_e); + + return 0; +} + +static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, + bool can_migrate, bool discarded_oblock, + int data_dir, struct policy_result *result) +{ + int r = 0; + bool updated = updated_this_tick(mq, e); + + if ((!discarded_oblock && updated) || + !should_promote(mq, e, discarded_oblock, data_dir)) { + requeue_and_update_tick(mq, e); + result->op = POLICY_MISS; + + } else if (!can_migrate) + r = -EWOULDBLOCK; + + else { + requeue_and_update_tick(mq, e); + r = pre_cache_to_cache(mq, e, result); + } + + return r; +} + +static void insert_in_pre_cache(struct mq_policy *mq, + dm_oblock_t oblock) +{ + struct entry *e = alloc_entry(&mq->pre_cache_pool); + + if (!e) + /* + * There's no spare entry structure, so we grab the least + * used one from the pre_cache. + */ + e = pop(mq, &mq->pre_cache); + + if (unlikely(!e)) { + DMWARN("couldn't pop from pre cache"); + return; + } + + e->dirty = false; + e->oblock = oblock; + e->hit_count = 1; + e->generation = mq->generation; + push(mq, e); +} + +static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, + struct policy_result *result) +{ + int r; + struct entry *e; + + if (epool_empty(&mq->cache_pool)) { + result->op = POLICY_REPLACE; + r = demote_cblock(mq, &result->old_oblock); + if (unlikely(r)) { + result->op = POLICY_MISS; + insert_in_pre_cache(mq, oblock); + return; + } + + /* + * This will always succeed, since we've just demoted. + */ + e = alloc_entry(&mq->cache_pool); + BUG_ON(!e); + + } else { + e = alloc_entry(&mq->cache_pool); + result->op = POLICY_NEW; + } + + e->oblock = oblock; + e->dirty = false; + e->hit_count = 1; + e->generation = mq->generation; + push(mq, e); + + result->cblock = infer_cblock(&mq->cache_pool, e); +} + +static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, + bool can_migrate, bool discarded_oblock, + int data_dir, struct policy_result *result) +{ + if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) { + if (can_migrate) + insert_in_cache(mq, oblock, result); + else + return -EWOULDBLOCK; + } else { + insert_in_pre_cache(mq, oblock); + result->op = POLICY_MISS; + } + + return 0; +} + +/* + * Looks the oblock up in the hash table, then decides whether to put in + * pre_cache, or cache etc. 
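insert_in_cache() and pre_cache_to_cache() above share one branch: with a free entry in the cache pool the outcome is POLICY_NEW, otherwise a clean block must be demoted first (POLICY_REPLACE), and if only dirty victims remain the promotion is skipped (POLICY_MISS). A condensed sketch of that choice, with the surrounding types stripped away:

#include <stdio.h>

enum op { OP_MISS, OP_NEW, OP_REPLACE };

/*
 * have_free_entry: the cache pool is not exhausted.
 * have_clean_victim: a clean block we could demote without a writeback.
 */
static enum op choose_op(int have_free_entry, int have_clean_victim)
{
	if (have_free_entry)
		return OP_NEW;		/* just copy into a free cache block */
	if (have_clean_victim)
		return OP_REPLACE;	/* demote the victim, then copy */
	return OP_MISS;			/* only dirty victims: don't promote now */
}

int main(void)
{
	printf("%d %d %d\n",
	       choose_op(1, 0), choose_op(0, 1), choose_op(0, 0));
	return 0;
}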
+ */ +static int map(struct mq_policy *mq, dm_oblock_t oblock, + bool can_migrate, bool discarded_oblock, + int data_dir, struct policy_result *result) +{ + int r = 0; + struct entry *e = hash_lookup(mq, oblock); + + if (e && in_cache(mq, e)) + r = cache_entry_found(mq, e, result); + + else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) + result->op = POLICY_MISS; + + else if (e) + r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, + data_dir, result); + + else + r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, + data_dir, result); + + if (r == -EWOULDBLOCK) + result->op = POLICY_MISS; + + return r; +} + +/*----------------------------------------------------------------*/ + +/* + * Public interface, via the policy struct. See dm-cache-policy.h for a + * description of these. + */ + +static struct mq_policy *to_mq_policy(struct dm_cache_policy *p) +{ + return container_of(p, struct mq_policy, policy); +} + +static void mq_destroy(struct dm_cache_policy *p) +{ + struct mq_policy *mq = to_mq_policy(p); + + vfree(mq->table); + epool_exit(&mq->cache_pool); + epool_exit(&mq->pre_cache_pool); + kfree(mq); +} + +static void copy_tick(struct mq_policy *mq) +{ + unsigned long flags; + + spin_lock_irqsave(&mq->tick_lock, flags); + mq->tick = mq->tick_protected; + spin_unlock_irqrestore(&mq->tick_lock, flags); +} + +static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock, + bool can_block, bool can_migrate, bool discarded_oblock, + struct bio *bio, struct policy_result *result) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + + result->op = POLICY_MISS; + + if (can_block) + mutex_lock(&mq->lock); + else if (!mutex_trylock(&mq->lock)) + return -EWOULDBLOCK; + + copy_tick(mq); + + iot_examine_bio(&mq->tracker, bio); + r = map(mq, oblock, can_migrate, discarded_oblock, + bio_data_dir(bio), result); + + mutex_unlock(&mq->lock); + + return r; +} + +static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + struct entry *e; + + if (!mutex_trylock(&mq->lock)) + return -EWOULDBLOCK; + + e = hash_lookup(mq, oblock); + if (e && in_cache(mq, e)) { + *cblock = infer_cblock(&mq->cache_pool, e); + r = 0; + } else + r = -ENOENT; + + mutex_unlock(&mq->lock); + + return r; +} + +static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set) +{ + struct entry *e; + + e = hash_lookup(mq, oblock); + BUG_ON(!e || !in_cache(mq, e)); + + del(mq, e); + e->dirty = set; + push(mq, e); +} + +static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + __mq_set_clear_dirty(mq, oblock, true); + mutex_unlock(&mq->lock); +} + +static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + __mq_set_clear_dirty(mq, oblock, false); + mutex_unlock(&mq->lock); +} + +static int mq_load_mapping(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t cblock, + uint32_t hint, bool hint_valid) +{ + struct mq_policy *mq = to_mq_policy(p); + struct entry *e; + + e = alloc_particular_entry(&mq->cache_pool, cblock); + e->oblock = oblock; + e->dirty = false; /* this gets corrected in a minute */ + e->hit_count = hint_valid ? 
hint : 1; + e->generation = mq->generation; + push(mq, e); + + return 0; +} + +static int mq_save_hints(struct mq_policy *mq, struct queue *q, + policy_walk_fn fn, void *context) +{ + int r; + unsigned level; + struct entry *e; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_for_each_entry(e, q->qs + level, list) { + r = fn(context, infer_cblock(&mq->cache_pool, e), + e->oblock, e->hit_count); + if (r) + return r; + } + + return 0; +} + +static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, + void *context) +{ + struct mq_policy *mq = to_mq_policy(p); + int r = 0; + + mutex_lock(&mq->lock); + + r = mq_save_hints(mq, &mq->cache_clean, fn, context); + if (!r) + r = mq_save_hints(mq, &mq->cache_dirty, fn, context); + + mutex_unlock(&mq->lock); + + return r; +} + +static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) +{ + struct entry *e; + + e = hash_lookup(mq, oblock); + BUG_ON(!e || !in_cache(mq, e)); + + del(mq, e); + free_entry(&mq->cache_pool, e); +} + +static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + __remove_mapping(mq, oblock); + mutex_unlock(&mq->lock); +} + +static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock) +{ + struct entry *e = epool_find(&mq->cache_pool, cblock); + + if (!e) + return -ENODATA; + + del(mq, e); + free_entry(&mq->cache_pool, e); + + return 0; +} + +static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + r = __remove_cblock(mq, cblock); + mutex_unlock(&mq->lock); + + return r; +} + +static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, + dm_cblock_t *cblock) +{ + struct entry *e = pop(mq, &mq->cache_dirty); + + if (!e) + return -ENODATA; + + *oblock = e->oblock; + *cblock = infer_cblock(&mq->cache_pool, e); + e->dirty = false; + push(mq, e); + + return 0; +} + +static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, + dm_cblock_t *cblock) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + r = __mq_writeback_work(mq, oblock, cblock); + mutex_unlock(&mq->lock); + + return r; +} + +static void __force_mapping(struct mq_policy *mq, + dm_oblock_t current_oblock, dm_oblock_t new_oblock) +{ + struct entry *e = hash_lookup(mq, current_oblock); + + if (e && in_cache(mq, e)) { + del(mq, e); + e->oblock = new_oblock; + e->dirty = true; + push(mq, e); + } +} + +static void mq_force_mapping(struct dm_cache_policy *p, + dm_oblock_t current_oblock, dm_oblock_t new_oblock) +{ + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + __force_mapping(mq, current_oblock, new_oblock); + mutex_unlock(&mq->lock); +} + +static dm_cblock_t mq_residency(struct dm_cache_policy *p) +{ + dm_cblock_t r; + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + r = to_cblock(mq->cache_pool.nr_allocated); + mutex_unlock(&mq->lock); + + return r; +} + +static void mq_tick(struct dm_cache_policy *p) +{ + struct mq_policy *mq = to_mq_policy(p); + unsigned long flags; + + spin_lock_irqsave(&mq->tick_lock, flags); + mq->tick_protected++; + spin_unlock_irqrestore(&mq->tick_lock, flags); +} + +static int mq_set_config_value(struct dm_cache_policy *p, + const char *key, const char *value) +{ + struct mq_policy *mq = to_mq_policy(p); + unsigned long tmp; + + if (kstrtoul(value, 10, &tmp)) + return -EINVAL; + + if (!strcasecmp(key, 
"random_threshold")) { + mq->tracker.thresholds[PATTERN_RANDOM] = tmp; + + } else if (!strcasecmp(key, "sequential_threshold")) { + mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp; + + } else if (!strcasecmp(key, "discard_promote_adjustment")) + mq->discard_promote_adjustment = tmp; + + else if (!strcasecmp(key, "read_promote_adjustment")) + mq->read_promote_adjustment = tmp; + + else if (!strcasecmp(key, "write_promote_adjustment")) + mq->write_promote_adjustment = tmp; + + else + return -EINVAL; + + return 0; +} + +static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen) +{ + ssize_t sz = 0; + struct mq_policy *mq = to_mq_policy(p); + + DMEMIT("10 random_threshold %u " + "sequential_threshold %u " + "discard_promote_adjustment %u " + "read_promote_adjustment %u " + "write_promote_adjustment %u", + mq->tracker.thresholds[PATTERN_RANDOM], + mq->tracker.thresholds[PATTERN_SEQUENTIAL], + mq->discard_promote_adjustment, + mq->read_promote_adjustment, + mq->write_promote_adjustment); + + return 0; +} + +/* Init the policy plugin interface function pointers. */ +static void init_policy_functions(struct mq_policy *mq) +{ + mq->policy.destroy = mq_destroy; + mq->policy.map = mq_map; + mq->policy.lookup = mq_lookup; + mq->policy.set_dirty = mq_set_dirty; + mq->policy.clear_dirty = mq_clear_dirty; + mq->policy.load_mapping = mq_load_mapping; + mq->policy.walk_mappings = mq_walk_mappings; + mq->policy.remove_mapping = mq_remove_mapping; + mq->policy.remove_cblock = mq_remove_cblock; + mq->policy.writeback_work = mq_writeback_work; + mq->policy.force_mapping = mq_force_mapping; + mq->policy.residency = mq_residency; + mq->policy.tick = mq_tick; + mq->policy.emit_config_values = mq_emit_config_values; + mq->policy.set_config_value = mq_set_config_value; +} + +static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); + + if (!mq) + return NULL; + + init_policy_functions(mq); + iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); + mq->cache_size = cache_size; + + if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) { + DMERR("couldn't initialize pool of pre-cache entries"); + goto bad_pre_cache_init; + } + + if (epool_init(&mq->cache_pool, from_cblock(cache_size))) { + DMERR("couldn't initialize pool of cache entries"); + goto bad_cache_init; + } + + mq->tick_protected = 0; + mq->tick = 0; + mq->hit_count = 0; + mq->generation = 0; + mq->promote_threshold = 0; + mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; + mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; + mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; + mutex_init(&mq->lock); + spin_lock_init(&mq->tick_lock); + + queue_init(&mq->pre_cache); + queue_init(&mq->cache_clean); + queue_init(&mq->cache_dirty); + + mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); + + mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); + mq->hash_bits = ffs(mq->nr_buckets) - 1; + mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets); + if (!mq->table) + goto bad_alloc_table; + + return &mq->policy; + +bad_alloc_table: + epool_exit(&mq->cache_pool); +bad_cache_init: + epool_exit(&mq->pre_cache_pool); +bad_pre_cache_init: + kfree(mq); + + return NULL; +} + +/*----------------------------------------------------------------*/ + +static struct dm_cache_policy_type mq_policy_type = { + .name = "mq", 
+ .version = {1, 2, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = mq_create +}; + +static struct dm_cache_policy_type default_policy_type = { + .name = "default", + .version = {1, 2, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = mq_create, + .real = &mq_policy_type +}; + +static int __init mq_init(void) +{ + int r; + + mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry", + sizeof(struct entry), + __alignof__(struct entry), + 0, NULL); + if (!mq_entry_cache) + goto bad; + + r = dm_cache_policy_register(&mq_policy_type); + if (r) { + DMERR("register failed %d", r); + goto bad_register_mq; + } + + r = dm_cache_policy_register(&default_policy_type); + if (!r) { + DMINFO("version %u.%u.%u loaded", + mq_policy_type.version[0], + mq_policy_type.version[1], + mq_policy_type.version[2]); + return 0; + } + + DMERR("register failed (as default) %d", r); + + dm_cache_policy_unregister(&mq_policy_type); +bad_register_mq: + kmem_cache_destroy(mq_entry_cache); +bad: + return -ENOMEM; +} + +static void __exit mq_exit(void) +{ + dm_cache_policy_unregister(&mq_policy_type); + dm_cache_policy_unregister(&default_policy_type); + + kmem_cache_destroy(mq_entry_cache); +} + +module_init(mq_init); +module_exit(mq_exit); + +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("mq cache policy"); + +MODULE_ALIAS("dm-cache-default"); diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c new file mode 100644 index 00000000000..c1a3cee99b4 --- /dev/null +++ b/drivers/md/dm-cache-policy.c @@ -0,0 +1,173 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-cache-policy-internal.h" +#include "dm.h" + +#include <linux/module.h> +#include <linux/slab.h> + +/*----------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "cache-policy" + +static DEFINE_SPINLOCK(register_lock); +static LIST_HEAD(register_list); + +static struct dm_cache_policy_type *__find_policy(const char *name) +{ + struct dm_cache_policy_type *t; + + list_for_each_entry(t, ®ister_list, list) + if (!strcmp(t->name, name)) + return t; + + return NULL; +} + +static struct dm_cache_policy_type *__get_policy_once(const char *name) +{ + struct dm_cache_policy_type *t = __find_policy(name); + + if (t && !try_module_get(t->owner)) { + DMWARN("couldn't get module %s", name); + t = ERR_PTR(-EINVAL); + } + + return t; +} + +static struct dm_cache_policy_type *get_policy_once(const char *name) +{ + struct dm_cache_policy_type *t; + + spin_lock(®ister_lock); + t = __get_policy_once(name); + spin_unlock(®ister_lock); + + return t; +} + +static struct dm_cache_policy_type *get_policy(const char *name) +{ + struct dm_cache_policy_type *t; + + t = get_policy_once(name); + if (IS_ERR(t)) + return NULL; + + if (t) + return t; + + request_module("dm-cache-%s", name); + + t = get_policy_once(name); + if (IS_ERR(t)) + return NULL; + + return t; +} + +static void put_policy(struct dm_cache_policy_type *t) +{ + module_put(t->owner); +} + +int dm_cache_policy_register(struct dm_cache_policy_type *type) +{ + int r; + + /* One size fits all for now */ + if (type->hint_size != 0 && type->hint_size != 4) { + DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size); + return -EINVAL; + } + + spin_lock(®ister_lock); + if (__find_policy(type->name)) { + DMWARN("attempt to register policy under duplicate name %s", type->name); + r = -EINVAL; + } else { + 
list_add(&type->list, ®ister_list); + r = 0; + } + spin_unlock(®ister_lock); + + return r; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_register); + +void dm_cache_policy_unregister(struct dm_cache_policy_type *type) +{ + spin_lock(®ister_lock); + list_del_init(&type->list); + spin_unlock(®ister_lock); +} +EXPORT_SYMBOL_GPL(dm_cache_policy_unregister); + +struct dm_cache_policy *dm_cache_policy_create(const char *name, + dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + struct dm_cache_policy *p = NULL; + struct dm_cache_policy_type *type; + + type = get_policy(name); + if (!type) { + DMWARN("unknown policy type"); + return ERR_PTR(-EINVAL); + } + + p = type->create(cache_size, origin_size, cache_block_size); + if (!p) { + put_policy(type); + return ERR_PTR(-ENOMEM); + } + p->private = type; + + return p; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_create); + +void dm_cache_policy_destroy(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + p->destroy(p); + put_policy(t); +} +EXPORT_SYMBOL_GPL(dm_cache_policy_destroy); + +const char *dm_cache_policy_get_name(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + /* if t->real is set then an alias was used (e.g. "default") */ + if (t->real) + return t->real->name; + + return t->name; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_name); + +const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + return t->version; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_version); + +size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p) +{ + struct dm_cache_policy_type *t = p->private; + + return t->hint_size; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h new file mode 100644 index 00000000000..f50fe360c54 --- /dev/null +++ b/drivers/md/dm-cache-policy.h @@ -0,0 +1,249 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_POLICY_H +#define DM_CACHE_POLICY_H + +#include "dm-cache-block-types.h" + +#include <linux/device-mapper.h> + +/*----------------------------------------------------------------*/ + +/* FIXME: make it clear which methods are optional. Get debug policy to + * double check this at start. + */ + +/* + * The cache policy makes the important decisions about which blocks get to + * live on the faster cache device. + * + * When the core target has to remap a bio it calls the 'map' method of the + * policy. This returns an instruction telling the core target what to do. + * + * POLICY_HIT: + * That block is in the cache. Remap to the cache and carry on. + * + * POLICY_MISS: + * This block is on the origin device. Remap and carry on. + * + * POLICY_NEW: + * This block is currently on the origin device, but the policy wants to + * move it. The core should: + * + * - hold any further io to this origin block + * - copy the origin to the given cache block + * - release all the held blocks + * - remap the original block to the cache + * + * POLICY_REPLACE: + * This block is currently on the origin device. The policy wants to + * move it to the cache, with the added complication that the destination + * cache block needs a writeback first. 
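The policy register implemented just above is a lock-protected list keyed by name; duplicate names are refused, and get_policy() retries the lookup once after request_module() so a policy such as "mq" can be demand-loaded. Below is a user-space sketch of the register and lookup half only, using a pthread mutex in place of the spinlock and omitting module reference counting; the error value and names are illustrative.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct policy_type {
	const char *name;
	struct policy_type *next;
};

static struct policy_type *registry;
static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;

static struct policy_type *__find(const char *name)
{
	for (struct policy_type *t = registry; t; t = t->next)
		if (!strcmp(t->name, name))
			return t;
	return NULL;
}

static int policy_register(struct policy_type *t)
{
	int r = 0;

	pthread_mutex_lock(&registry_lock);
	if (__find(t->name))
		r = -1;			/* duplicate name */
	else {
		t->next = registry;
		registry = t;
	}
	pthread_mutex_unlock(&registry_lock);
	return r;
}

int main(void)
{
	struct policy_type mq = { .name = "mq" }, dup = { .name = "mq" };

	printf("register mq: %d\n", policy_register(&mq));
	printf("register duplicate: %d\n", policy_register(&dup));
	return 0;
}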
The core should: + * + * - hold any further io to this origin block + * - hold any further io to the origin block that's being written back + * - writeback + * - copy new block to cache + * - release held blocks + * - remap bio to cache and reissue. + * + * Should the core run into trouble while processing a POLICY_NEW or + * POLICY_REPLACE instruction it will roll back the policies mapping using + * remove_mapping() or force_mapping(). These methods must not fail. This + * approach avoids having transactional semantics in the policy (ie, the + * core informing the policy when a migration is complete), and hence makes + * it easier to write new policies. + * + * In general policy methods should never block, except in the case of the + * map function when can_migrate is set. So be careful to implement using + * bounded, preallocated memory. + */ +enum policy_operation { + POLICY_HIT, + POLICY_MISS, + POLICY_NEW, + POLICY_REPLACE +}; + +/* + * This is the instruction passed back to the core target. + */ +struct policy_result { + enum policy_operation op; + dm_oblock_t old_oblock; /* POLICY_REPLACE */ + dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ +}; + +typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock, + dm_oblock_t oblock, uint32_t hint); + +/* + * The cache policy object. Just a bunch of methods. It is envisaged that + * this structure will be embedded in a bigger, policy specific structure + * (ie. use container_of()). + */ +struct dm_cache_policy { + + /* + * FIXME: make it clear which methods are optional, and which may + * block. + */ + + /* + * Destroys this object. + */ + void (*destroy)(struct dm_cache_policy *p); + + /* + * See large comment above. + * + * oblock - the origin block we're interested in. + * + * can_block - indicates whether the current thread is allowed to + * block. -EWOULDBLOCK returned if it can't and would. + * + * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE + * instructions. If denied and the policy would have + * returned one of these instructions it should + * return -EWOULDBLOCK. + * + * discarded_oblock - indicates whether the whole origin block is + * in a discarded state (FIXME: better to tell the + * policy about this sooner, so it can recycle that + * cache block if it wants.) + * bio - the bio that triggered this call. + * result - gets filled in with the instruction. + * + * May only return 0, or -EWOULDBLOCK (if !can_migrate) + */ + int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, + bool can_block, bool can_migrate, bool discarded_oblock, + struct bio *bio, struct policy_result *result); + + /* + * Sometimes we want to see if a block is in the cache, without + * triggering any update of stats. (ie. it's not a real hit). + * + * Must not block. + * + * Returns 0 if in cache, -ENOENT if not, < 0 for other errors + * (-EWOULDBLOCK would be typical). + */ + int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); + + void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); + void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); + + /* + * Called when a cache target is first created. Used to load a + * mapping from the metadata device into the policy. + */ + int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, + dm_cblock_t cblock, uint32_t hint, bool hint_valid); + + int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn, + void *context); + + /* + * Override functions used on the error paths of the core target. 
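The map() contract above returns an instruction rather than performing the remap itself. As a rough illustration of how a caller might consume struct policy_result, here is a hedged sketch in which the printed actions stand in for the real remap and migration machinery of the core target; none of the helper behaviour shown is taken from the patch.

#include <stdio.h>

enum policy_operation { POLICY_HIT, POLICY_MISS, POLICY_NEW, POLICY_REPLACE };

struct policy_result {
	enum policy_operation op;
	unsigned long old_oblock;	/* valid for POLICY_REPLACE */
	unsigned long cblock;		/* valid for HIT, NEW, REPLACE */
};

/* Placeholder actions standing in for the real remap/migration code. */
static void act_on(const struct policy_result *r, unsigned long oblock)
{
	switch (r->op) {
	case POLICY_HIT:
		printf("remap oblock %lu to cblock %lu\n", oblock, r->cblock);
		break;
	case POLICY_MISS:
		printf("remap oblock %lu to the origin\n", oblock);
		break;
	case POLICY_NEW:
		printf("promote oblock %lu into cblock %lu\n", oblock, r->cblock);
		break;
	case POLICY_REPLACE:
		printf("demote oblock %lu, then promote %lu into cblock %lu\n",
		       r->old_oblock, oblock, r->cblock);
		break;
	}
}

int main(void)
{
	struct policy_result r = { .op = POLICY_REPLACE, .old_oblock = 7, .cblock = 3 };

	act_on(&r, 42);
	return 0;
}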
+ * They must succeed. + */ + void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); + void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, + dm_oblock_t new_oblock); + + /* + * This is called via the invalidate_cblocks message. It is + * possible the particular cblock has already been removed due to a + * write io in passthrough mode. In which case this should return + * -ENODATA. + */ + int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); + + /* + * Provide a dirty block to be written back by the core target. + * + * Returns: + * + * 0 and @cblock,@oblock: block to write back provided + * + * -ENODATA: no dirty blocks available + */ + int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); + + /* + * How full is the cache? + */ + dm_cblock_t (*residency)(struct dm_cache_policy *p); + + /* + * Because of where we sit in the block layer, we can be asked to + * map a lot of little bios that are all in the same block (no + * queue merging has occurred). To stop the policy being fooled by + * these the core target sends regular tick() calls to the policy. + * The policy should only count an entry as hit once per tick. + */ + void (*tick)(struct dm_cache_policy *p); + + /* + * Configuration. + */ + int (*emit_config_values)(struct dm_cache_policy *p, + char *result, unsigned maxlen); + int (*set_config_value)(struct dm_cache_policy *p, + const char *key, const char *value); + + /* + * Book keeping ptr for the policy register, not for general use. + */ + void *private; +}; + +/*----------------------------------------------------------------*/ + +/* + * We maintain a little register of the different policy types. + */ +#define CACHE_POLICY_NAME_SIZE 16 +#define CACHE_POLICY_VERSION_SIZE 3 + +struct dm_cache_policy_type { + /* For use by the register code only. */ + struct list_head list; + + /* + * Policy writers should fill in these fields. The name field is + * what gets passed on the target line to select your policy. + */ + char name[CACHE_POLICY_NAME_SIZE]; + unsigned version[CACHE_POLICY_VERSION_SIZE]; + + /* + * For use by an alias dm_cache_policy_type to point to the + * real dm_cache_policy_type. + */ + struct dm_cache_policy_type *real; + + /* + * Policies may store a hint for each each cache block. + * Currently the size of this hint must be 0 or 4 bytes but we + * expect to relax this in future. + */ + size_t hint_size; + + struct module *owner; + struct dm_cache_policy *(*create)(dm_cblock_t cache_size, + sector_t origin_size, + sector_t block_size); +}; + +int dm_cache_policy_register(struct dm_cache_policy_type *type); +void dm_cache_policy_unregister(struct dm_cache_policy_type *type); + +/*----------------------------------------------------------------*/ + +#endif /* DM_CACHE_POLICY_H */ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c new file mode 100644 index 00000000000..2c63326638b --- /dev/null +++ b/drivers/md/dm-cache-target.c @@ -0,0 +1,3121 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This file is released under the GPL. 
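One small detail from the policy-type definition above: an alias type such as "default" sets the real pointer to the concrete type, and name lookups follow it so status lines report the real policy name. A few illustrative lines make that resolution explicit; the types here are reduced stand-ins, not the kernel structures.

#include <stdio.h>

struct policy_type {
	const char *name;
	const struct policy_type *real;	/* set for aliases such as "default" */
};

static const char *policy_name(const struct policy_type *t)
{
	return t->real ? t->real->name : t->name;
}

int main(void)
{
	struct policy_type mq = { .name = "mq", .real = NULL };
	struct policy_type def = { .name = "default", .real = &mq };

	printf("%s resolves to %s\n", def.name, policy_name(&def));
	return 0;
}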
+ */ + +#include "dm.h" +#include "dm-bio-prison.h" +#include "dm-bio-record.h" +#include "dm-cache-metadata.h" + +#include <linux/dm-io.h> +#include <linux/dm-kcopyd.h> +#include <linux/init.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#define DM_MSG_PREFIX "cache" + +DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, + "A percentage of time allocated for copying to and/or from cache"); + +/*----------------------------------------------------------------*/ + +/* + * Glossary: + * + * oblock: index of an origin block + * cblock: index of a cache block + * promotion: movement of a block from origin to cache + * demotion: movement of a block from cache to origin + * migration: movement of a block between the origin and cache device, + * either direction + */ + +/*----------------------------------------------------------------*/ + +static size_t bitset_size_in_bytes(unsigned nr_entries) +{ + return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); +} + +static unsigned long *alloc_bitset(unsigned nr_entries) +{ + size_t s = bitset_size_in_bytes(nr_entries); + return vzalloc(s); +} + +static void clear_bitset(void *bitset, unsigned nr_entries) +{ + size_t s = bitset_size_in_bytes(nr_entries); + memset(bitset, 0, s); +} + +static void free_bitset(unsigned long *bits) +{ + vfree(bits); +} + +/*----------------------------------------------------------------*/ + +/* + * There are a couple of places where we let a bio run, but want to do some + * work before calling its endio function. We do this by temporarily + * changing the endio fn. + */ +struct dm_hook_info { + bio_end_io_t *bi_end_io; + void *bi_private; +}; + +static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, + bio_end_io_t *bi_end_io, void *bi_private) +{ + h->bi_end_io = bio->bi_end_io; + h->bi_private = bio->bi_private; + + bio->bi_end_io = bi_end_io; + bio->bi_private = bi_private; +} + +static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) +{ + bio->bi_end_io = h->bi_end_io; + bio->bi_private = h->bi_private; + + /* + * Must bump bi_remaining to allow bio to complete with + * restored bi_end_io. + */ + atomic_inc(&bio->bi_remaining); +} + +/*----------------------------------------------------------------*/ + +#define PRISON_CELLS 1024 +#define MIGRATION_POOL_SIZE 128 +#define COMMIT_PERIOD HZ +#define MIGRATION_COUNT_WINDOW 10 + +/* + * The block size of the device holding cache data must be + * between 32KB and 1GB. + */ +#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT) +#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) + +/* + * FIXME: the cache is read/write for the time being. + */ +enum cache_metadata_mode { + CM_WRITE, /* metadata may be changed */ + CM_READ_ONLY, /* metadata may not be changed */ +}; + +enum cache_io_mode { + /* + * Data is written to cached blocks only. These blocks are marked + * dirty. If you lose the cache device you will lose data. + * Potential performance increase for both reads and writes. + */ + CM_IO_WRITEBACK, + + /* + * Data is written to both cache and origin. Blocks are never + * dirty. Potential performance benfit for reads only. + */ + CM_IO_WRITETHROUGH, + + /* + * A degraded mode useful for various cache coherency situations + * (eg, rolling back snapshots). Reads and writes always go to the + * origin. If a write goes to a cached oblock, then the cache + * block is invalidated. 
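The dirty and discard bitsets used by the target are sized in whole unsigned longs, rounding the entry count up before multiplying. Here is a stand-alone rendering of that sizing plus a simple set/test pair, using calloc in place of vzalloc; the block count in main() is arbitrary.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_ULONG (CHAR_BIT * sizeof(unsigned long))

static size_t bitset_size_in_bytes(unsigned nr_entries)
{
	size_t nr_words = (nr_entries + BITS_PER_ULONG - 1) / BITS_PER_ULONG;

	return nr_words * sizeof(unsigned long);
}

static void set_bit_(unsigned nr, unsigned long *bits)
{
	bits[nr / BITS_PER_ULONG] |= 1UL << (nr % BITS_PER_ULONG);
}

static int test_bit_(unsigned nr, const unsigned long *bits)
{
	return !!(bits[nr / BITS_PER_ULONG] & (1UL << (nr % BITS_PER_ULONG)));
}

int main(void)
{
	unsigned nr_blocks = 1000;
	unsigned long *dirty = calloc(1, bitset_size_in_bytes(nr_blocks));

	if (!dirty)
		return 1;
	set_bit_(42, dirty);
	printf("bytes=%zu, bit 42=%d, bit 43=%d\n",
	       bitset_size_in_bytes(nr_blocks),
	       test_bit_(42, dirty), test_bit_(43, dirty));
	free(dirty);
	return 0;
}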
+ */ + CM_IO_PASSTHROUGH +}; + +struct cache_features { + enum cache_metadata_mode mode; + enum cache_io_mode io_mode; +}; + +struct cache_stats { + atomic_t read_hit; + atomic_t read_miss; + atomic_t write_hit; + atomic_t write_miss; + atomic_t demotion; + atomic_t promotion; + atomic_t copies_avoided; + atomic_t cache_cell_clash; + atomic_t commit_count; + atomic_t discard_count; +}; + +/* + * Defines a range of cblocks, begin to (end - 1) are in the range. end is + * the one-past-the-end value. + */ +struct cblock_range { + dm_cblock_t begin; + dm_cblock_t end; +}; + +struct invalidation_request { + struct list_head list; + struct cblock_range *cblocks; + + atomic_t complete; + int err; + + wait_queue_head_t result_wait; +}; + +struct cache { + struct dm_target *ti; + struct dm_target_callbacks callbacks; + + struct dm_cache_metadata *cmd; + + /* + * Metadata is written to this device. + */ + struct dm_dev *metadata_dev; + + /* + * The slower of the two data devices. Typically a spindle. + */ + struct dm_dev *origin_dev; + + /* + * The faster of the two data devices. Typically an SSD. + */ + struct dm_dev *cache_dev; + + /* + * Size of the origin device in _complete_ blocks and native sectors. + */ + dm_oblock_t origin_blocks; + sector_t origin_sectors; + + /* + * Size of the cache device in blocks. + */ + dm_cblock_t cache_size; + + /* + * Fields for converting from sectors to blocks. + */ + uint32_t sectors_per_block; + int sectors_per_block_shift; + + spinlock_t lock; + struct bio_list deferred_bios; + struct bio_list deferred_flush_bios; + struct bio_list deferred_writethrough_bios; + struct list_head quiesced_migrations; + struct list_head completed_migrations; + struct list_head need_commit_migrations; + sector_t migration_threshold; + wait_queue_head_t migration_wait; + atomic_t nr_migrations; + + wait_queue_head_t quiescing_wait; + atomic_t quiescing; + atomic_t quiescing_ack; + + /* + * cache_size entries, dirty if set + */ + atomic_t nr_dirty; + unsigned long *dirty_bitset; + + /* + * origin_blocks entries, discarded if set. + */ + dm_oblock_t discard_nr_blocks; + unsigned long *discard_bitset; + + /* + * Rather than reconstructing the table line for the status we just + * save it and regurgitate. + */ + unsigned nr_ctr_args; + const char **ctr_args; + + struct dm_kcopyd_client *copier; + struct workqueue_struct *wq; + struct work_struct worker; + + struct delayed_work waker; + unsigned long last_commit_jiffies; + + struct dm_bio_prison *prison; + struct dm_deferred_set *all_io_ds; + + mempool_t *migration_pool; + struct dm_cache_migration *next_migration; + + struct dm_cache_policy *policy; + unsigned policy_nr_args; + + bool need_tick_bio:1; + bool sized:1; + bool invalidate:1; + bool commit_requested:1; + bool loaded_mappings:1; + bool loaded_discards:1; + + /* + * Cache features such as write-through. + */ + struct cache_features features; + + struct cache_stats stats; + + /* + * Invalidation fields. + */ + spinlock_t invalidation_lock; + struct list_head invalidation_requests; +}; + +struct per_bio_data { + bool tick:1; + unsigned req_nr:2; + struct dm_deferred_entry *all_io_entry; + struct dm_hook_info hook_info; + + /* + * writethrough fields. These MUST remain at the end of this + * structure and the 'cache' member must be the first as it + * is used to determine the offset of the writethrough fields. 
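As the comment above explains, the writethrough-only members sit at the end of per_bio_data so the writeback path can allocate a shorter per-bio area whose size is simply the offset of the first writethrough member. A small demonstration of that offsetof() trick, with the field list trimmed down to stand-ins:

#include <stddef.h>
#include <stdio.h>

struct per_bio_data {
	/* fields every io mode needs */
	unsigned tick:1;
	unsigned req_nr:2;
	void *all_io_entry;

	/* writethrough-only fields: must stay at the end */
	void *cache;
	unsigned long cblock;
};

#define PB_DATA_SIZE_WB offsetof(struct per_bio_data, cache)
#define PB_DATA_SIZE_WT sizeof(struct per_bio_data)

int main(void)
{
	printf("writeback per-bio data: %zu bytes, writethrough: %zu bytes\n",
	       PB_DATA_SIZE_WB, PB_DATA_SIZE_WT);
	return 0;
}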
+ */ + struct cache *cache; + dm_cblock_t cblock; + struct dm_bio_details bio_details; +}; + +struct dm_cache_migration { + struct list_head list; + struct cache *cache; + + unsigned long start_jiffies; + dm_oblock_t old_oblock; + dm_oblock_t new_oblock; + dm_cblock_t cblock; + + bool err:1; + bool writeback:1; + bool demote:1; + bool promote:1; + bool requeue_holder:1; + bool invalidate:1; + + struct dm_bio_prison_cell *old_ocell; + struct dm_bio_prison_cell *new_ocell; +}; + +/* + * Processing a bio in the worker thread may require these memory + * allocations. We prealloc to avoid deadlocks (the same worker thread + * frees them back to the mempool). + */ +struct prealloc { + struct dm_cache_migration *mg; + struct dm_bio_prison_cell *cell1; + struct dm_bio_prison_cell *cell2; +}; + +static void wake_worker(struct cache *cache) +{ + queue_work(cache->wq, &cache->worker); +} + +/*----------------------------------------------------------------*/ + +static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) +{ + /* FIXME: change to use a local slab. */ + return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); +} + +static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) +{ + dm_bio_prison_free_cell(cache->prison, cell); +} + +static int prealloc_data_structs(struct cache *cache, struct prealloc *p) +{ + if (!p->mg) { + p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); + if (!p->mg) + return -ENOMEM; + } + + if (!p->cell1) { + p->cell1 = alloc_prison_cell(cache); + if (!p->cell1) + return -ENOMEM; + } + + if (!p->cell2) { + p->cell2 = alloc_prison_cell(cache); + if (!p->cell2) + return -ENOMEM; + } + + return 0; +} + +static void prealloc_free_structs(struct cache *cache, struct prealloc *p) +{ + if (p->cell2) + free_prison_cell(cache, p->cell2); + + if (p->cell1) + free_prison_cell(cache, p->cell1); + + if (p->mg) + mempool_free(p->mg, cache->migration_pool); +} + +static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) +{ + struct dm_cache_migration *mg = p->mg; + + BUG_ON(!mg); + p->mg = NULL; + + return mg; +} + +/* + * You must have a cell within the prealloc struct to return. If not this + * function will BUG() rather than returning NULL. + */ +static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) +{ + struct dm_bio_prison_cell *r = NULL; + + if (p->cell1) { + r = p->cell1; + p->cell1 = NULL; + + } else if (p->cell2) { + r = p->cell2; + p->cell2 = NULL; + } else + BUG(); + + return r; +} + +/* + * You can't have more than two cells in a prealloc struct. BUG() will be + * called if you try and overfill. + */ +static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) +{ + if (!p->cell2) + p->cell2 = cell; + + else if (!p->cell1) + p->cell1 = cell; + + else + BUG(); +} + +/*----------------------------------------------------------------*/ + +static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) +{ + key->virtual = 0; + key->dev = 0; + key->block = from_oblock(oblock); +} + +/* + * The caller hands in a preallocated cell, and a free function for it. + * The cell will be freed if there's an error, or if it wasn't used because + * a cell with that key already exists. 
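The prealloc structure above lets the worker take its worst-case allocations (one migration and two cells) before it starts work, so it never has to allocate while holding cells. A user-space sketch of the two-slot get/put discipline, with assert() standing in for BUG_ON() and error handling kept minimal:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct cell { int dummy; };

struct prealloc {
	struct cell *cell1;
	struct cell *cell2;
};

static struct cell *prealloc_get_cell(struct prealloc *p)
{
	struct cell *r = NULL;

	if (p->cell1) {
		r = p->cell1;
		p->cell1 = NULL;
	} else if (p->cell2) {
		r = p->cell2;
		p->cell2 = NULL;
	} else {
		assert(0);	/* caller must have preallocated */
	}
	return r;
}

static void prealloc_put_cell(struct prealloc *p, struct cell *c)
{
	if (!p->cell2)
		p->cell2 = c;
	else if (!p->cell1)
		p->cell1 = c;
	else
		assert(0);	/* only two slots */
}

int main(void)
{
	struct prealloc p = { malloc(sizeof(struct cell)), malloc(sizeof(struct cell)) };
	struct cell *c;

	if (!p.cell1 || !p.cell2)
		return 1;
	c = prealloc_get_cell(&p);
	printf("got a preallocated cell: %p\n", (void *)c);
	prealloc_put_cell(&p, c);	/* unused cell goes back into the struct */
	free(p.cell1);
	free(p.cell2);
	return 0;
}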
+ */ +typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); + +static int bio_detain(struct cache *cache, dm_oblock_t oblock, + struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, + cell_free_fn free_fn, void *free_context, + struct dm_bio_prison_cell **cell_result) +{ + int r; + struct dm_cell_key key; + + build_key(oblock, &key); + r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); + if (r) + free_fn(free_context, cell_prealloc); + + return r; +} + +static int get_cell(struct cache *cache, + dm_oblock_t oblock, + struct prealloc *structs, + struct dm_bio_prison_cell **cell_result) +{ + int r; + struct dm_cell_key key; + struct dm_bio_prison_cell *cell_prealloc; + + cell_prealloc = prealloc_get_cell(structs); + + build_key(oblock, &key); + r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); + if (r) + prealloc_put_cell(structs, cell_prealloc); + + return r; +} + +/*----------------------------------------------------------------*/ + +static bool is_dirty(struct cache *cache, dm_cblock_t b) +{ + return test_bit(from_cblock(b), cache->dirty_bitset); +} + +static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) +{ + if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { + atomic_inc(&cache->nr_dirty); + policy_set_dirty(cache->policy, oblock); + } +} + +static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) +{ + if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { + policy_clear_dirty(cache->policy, oblock); + if (atomic_dec_return(&cache->nr_dirty) == 0) + dm_table_event(cache->ti->table); + } +} + +/*----------------------------------------------------------------*/ + +static bool block_size_is_power_of_two(struct cache *cache) +{ + return cache->sectors_per_block_shift >= 0; +} + +/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */ +#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6 +__always_inline +#endif +static dm_block_t block_div(dm_block_t b, uint32_t n) +{ + do_div(b, n); + + return b; +} + +static void set_discard(struct cache *cache, dm_oblock_t b) +{ + unsigned long flags; + + atomic_inc(&cache->stats.discard_count); + + spin_lock_irqsave(&cache->lock, flags); + set_bit(from_oblock(b), cache->discard_bitset); + spin_unlock_irqrestore(&cache->lock, flags); +} + +static void clear_discard(struct cache *cache, dm_oblock_t b) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + clear_bit(from_oblock(b), cache->discard_bitset); + spin_unlock_irqrestore(&cache->lock, flags); +} + +static bool is_discarded(struct cache *cache, dm_oblock_t b) +{ + int r; + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + r = test_bit(from_oblock(b), cache->discard_bitset); + spin_unlock_irqrestore(&cache->lock, flags); + + return r; +} + +static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) +{ + int r; + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + r = test_bit(from_oblock(b), cache->discard_bitset); + spin_unlock_irqrestore(&cache->lock, flags); + + return r; +} + +/*----------------------------------------------------------------*/ + +static void load_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + dm_cache_metadata_get_stats(cache->cmd, &stats); + atomic_set(&cache->stats.read_hit, stats.read_hits); + atomic_set(&cache->stats.read_miss, stats.read_misses); + atomic_set(&cache->stats.write_hit, stats.write_hits); + 
atomic_set(&cache->stats.write_miss, stats.write_misses); +} + +static void save_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + stats.read_hits = atomic_read(&cache->stats.read_hit); + stats.read_misses = atomic_read(&cache->stats.read_miss); + stats.write_hits = atomic_read(&cache->stats.write_hit); + stats.write_misses = atomic_read(&cache->stats.write_miss); + + dm_cache_metadata_set_stats(cache->cmd, &stats); +} + +/*---------------------------------------------------------------- + * Per bio data + *--------------------------------------------------------------*/ + +/* + * If using writeback, leave out struct per_bio_data's writethrough fields. + */ +#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) +#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) + +static bool writethrough_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITETHROUGH; +} + +static bool writeback_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITEBACK; +} + +static bool passthrough_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_PASSTHROUGH; +} + +static size_t get_per_bio_data_size(struct cache *cache) +{ + return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; +} + +static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) +{ + struct per_bio_data *pb = dm_per_bio_data(bio, data_size); + BUG_ON(!pb); + return pb; +} + +static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) +{ + struct per_bio_data *pb = get_per_bio_data(bio, data_size); + + pb->tick = false; + pb->req_nr = dm_bio_get_target_bio_nr(bio); + pb->all_io_entry = NULL; + + return pb; +} + +/*---------------------------------------------------------------- + * Remapping + *--------------------------------------------------------------*/ +static void remap_to_origin(struct cache *cache, struct bio *bio) +{ + bio->bi_bdev = cache->origin_dev->bdev; +} + +static void remap_to_cache(struct cache *cache, struct bio *bio, + dm_cblock_t cblock) +{ + sector_t bi_sector = bio->bi_iter.bi_sector; + sector_t block = from_cblock(cblock); + + bio->bi_bdev = cache->cache_dev->bdev; + if (!block_size_is_power_of_two(cache)) + bio->bi_iter.bi_sector = + (block * cache->sectors_per_block) + + sector_div(bi_sector, cache->sectors_per_block); + else + bio->bi_iter.bi_sector = + (block << cache->sectors_per_block_shift) | + (bi_sector & (cache->sectors_per_block - 1)); +} + +static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + spin_lock_irqsave(&cache->lock, flags); + if (cache->need_tick_bio && + !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) { + pb->tick = true; + cache->need_tick_bio = false; + } + spin_unlock_irqrestore(&cache->lock, flags); +} + +static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, + dm_oblock_t oblock) +{ + check_if_tick_bio_needed(cache, bio); + remap_to_origin(cache, bio); + if (bio_data_dir(bio) == WRITE) + clear_discard(cache, oblock); +} + +static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + check_if_tick_bio_needed(cache, bio); + remap_to_cache(cache, bio, cblock); + if (bio_data_dir(bio) == WRITE) { + set_dirty(cache, oblock, cblock); + clear_discard(cache, oblock); + } +} + +static dm_oblock_t get_bio_block(struct cache 
*cache, struct bio *bio) +{ + sector_t block_nr = bio->bi_iter.bi_sector; + + if (!block_size_is_power_of_two(cache)) + (void) sector_div(block_nr, cache->sectors_per_block); + else + block_nr >>= cache->sectors_per_block_shift; + + return to_oblock(block_nr); +} + +static int bio_triggers_commit(struct cache *cache, struct bio *bio) +{ + return bio->bi_rw & (REQ_FLUSH | REQ_FUA); +} + +static void issue(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + + if (!bio_triggers_commit(cache, bio)) { + generic_make_request(bio); + return; + } + + /* + * Batch together any bios that trigger commits and then issue a + * single commit for them in do_worker(). + */ + spin_lock_irqsave(&cache->lock, flags); + cache->commit_requested = true; + bio_list_add(&cache->deferred_flush_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); +} + +static void defer_writethrough_bio(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_writethrough_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void writethrough_endio(struct bio *bio, int err) +{ + struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); + + dm_unhook_bio(&pb->hook_info, bio); + + if (err) { + bio_endio(bio, err); + return; + } + + dm_bio_restore(&pb->bio_details, bio); + remap_to_cache(pb->cache, bio, pb->cblock); + + /* + * We can't issue this bio directly, since we're in interrupt + * context. So it gets put on a bio list for processing by the + * worker thread. + */ + defer_writethrough_bio(pb->cache, bio); +} + +/* + * When running in writethrough mode we need to send writes to clean blocks + * to both the cache and origin devices. In future we'd like to clone the + * bio and send them in parallel, but for now we're doing them in + * series as this is easier. + */ +static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); + + pb->cache = cache; + pb->cblock = cblock; + dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); + dm_bio_record(&pb->bio_details, bio); + + remap_to_origin_clear_discard(pb->cache, bio, oblock); +} + +/*---------------------------------------------------------------- + * Migration processing + * + * Migration covers moving data from the origin device to the cache, or + * vice versa. + *--------------------------------------------------------------*/ +static void free_migration(struct dm_cache_migration *mg) +{ + mempool_free(mg, mg->cache->migration_pool); +} + +static void inc_nr_migrations(struct cache *cache) +{ + atomic_inc(&cache->nr_migrations); +} + +static void dec_nr_migrations(struct cache *cache) +{ + atomic_dec(&cache->nr_migrations); + + /* + * Wake the worker in case we're suspending the target. + */ + wake_up(&cache->migration_wait); +} + +static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, + bool holder) +{ + (holder ? 
dm_cell_release : dm_cell_release_no_holder) + (cache->prison, cell, &cache->deferred_bios); + free_prison_cell(cache, cell); +} + +static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, + bool holder) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + __cell_defer(cache, cell, holder); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void cleanup_migration(struct dm_cache_migration *mg) +{ + struct cache *cache = mg->cache; + free_migration(mg); + dec_nr_migrations(cache); +} + +static void migration_failure(struct dm_cache_migration *mg) +{ + struct cache *cache = mg->cache; + + if (mg->writeback) { + DMWARN_LIMIT("writeback failed; couldn't copy block"); + set_dirty(cache, mg->old_oblock, mg->cblock); + cell_defer(cache, mg->old_ocell, false); + + } else if (mg->demote) { + DMWARN_LIMIT("demotion failed; couldn't copy block"); + policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); + + cell_defer(cache, mg->old_ocell, mg->promote ? false : true); + if (mg->promote) + cell_defer(cache, mg->new_ocell, true); + } else { + DMWARN_LIMIT("promotion failed; couldn't copy block"); + policy_remove_mapping(cache->policy, mg->new_oblock); + cell_defer(cache, mg->new_ocell, true); + } + + cleanup_migration(mg); +} + +static void migration_success_pre_commit(struct dm_cache_migration *mg) +{ + unsigned long flags; + struct cache *cache = mg->cache; + + if (mg->writeback) { + cell_defer(cache, mg->old_ocell, false); + clear_dirty(cache, mg->old_oblock, mg->cblock); + cleanup_migration(mg); + return; + + } else if (mg->demote) { + if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) { + DMWARN_LIMIT("demotion failed; couldn't update on disk metadata"); + policy_force_mapping(cache->policy, mg->new_oblock, + mg->old_oblock); + if (mg->promote) + cell_defer(cache, mg->new_ocell, true); + cleanup_migration(mg); + return; + } + } else { + if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) { + DMWARN_LIMIT("promotion failed; couldn't update on disk metadata"); + policy_remove_mapping(cache->policy, mg->new_oblock); + cleanup_migration(mg); + return; + } + } + + spin_lock_irqsave(&cache->lock, flags); + list_add_tail(&mg->list, &cache->need_commit_migrations); + cache->commit_requested = true; + spin_unlock_irqrestore(&cache->lock, flags); +} + +static void migration_success_post_commit(struct dm_cache_migration *mg) +{ + unsigned long flags; + struct cache *cache = mg->cache; + + if (mg->writeback) { + DMWARN("writeback unexpectedly triggered commit"); + return; + + } else if (mg->demote) { + cell_defer(cache, mg->old_ocell, mg->promote ? 
false : true); + + if (mg->promote) { + mg->demote = false; + + spin_lock_irqsave(&cache->lock, flags); + list_add_tail(&mg->list, &cache->quiesced_migrations); + spin_unlock_irqrestore(&cache->lock, flags); + + } else { + if (mg->invalidate) + policy_remove_mapping(cache->policy, mg->old_oblock); + cleanup_migration(mg); + } + + } else { + if (mg->requeue_holder) + cell_defer(cache, mg->new_ocell, true); + else { + bio_endio(mg->new_ocell->holder, 0); + cell_defer(cache, mg->new_ocell, false); + } + clear_dirty(cache, mg->new_oblock, mg->cblock); + cleanup_migration(mg); + } +} + +static void copy_complete(int read_err, unsigned long write_err, void *context) +{ + unsigned long flags; + struct dm_cache_migration *mg = (struct dm_cache_migration *) context; + struct cache *cache = mg->cache; + + if (read_err || write_err) + mg->err = true; + + spin_lock_irqsave(&cache->lock, flags); + list_add_tail(&mg->list, &cache->completed_migrations); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void issue_copy_real(struct dm_cache_migration *mg) +{ + int r; + struct dm_io_region o_region, c_region; + struct cache *cache = mg->cache; + sector_t cblock = from_cblock(mg->cblock); + + o_region.bdev = cache->origin_dev->bdev; + o_region.count = cache->sectors_per_block; + + c_region.bdev = cache->cache_dev->bdev; + c_region.sector = cblock * cache->sectors_per_block; + c_region.count = cache->sectors_per_block; + + if (mg->writeback || mg->demote) { + /* demote */ + o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; + r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); + } else { + /* promote */ + o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; + r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); + } + + if (r < 0) { + DMERR_LIMIT("issuing migration failed"); + migration_failure(mg); + } +} + +static void overwrite_endio(struct bio *bio, int err) +{ + struct dm_cache_migration *mg = bio->bi_private; + struct cache *cache = mg->cache; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + unsigned long flags; + + dm_unhook_bio(&pb->hook_info, bio); + + if (err) + mg->err = true; + + mg->requeue_holder = false; + + spin_lock_irqsave(&cache->lock, flags); + list_add_tail(&mg->list, &cache->completed_migrations); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) +{ + size_t pb_data_size = get_per_bio_data_size(mg->cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); + remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); + generic_make_request(bio); +} + +static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); +} + +static void avoid_copy(struct dm_cache_migration *mg) +{ + atomic_inc(&mg->cache->stats.copies_avoided); + migration_success_pre_commit(mg); +} + +static void issue_copy(struct dm_cache_migration *mg) +{ + bool avoid; + struct cache *cache = mg->cache; + + if (mg->writeback || mg->demote) + avoid = !is_dirty(cache, mg->cblock) || + is_discarded_oblock(cache, mg->old_oblock); + else { + struct bio *bio = mg->new_ocell->holder; + + avoid = 
is_discarded_oblock(cache, mg->new_oblock); + + if (!avoid && bio_writes_complete_block(cache, bio)) { + issue_overwrite(mg, bio); + return; + } + } + + avoid ? avoid_copy(mg) : issue_copy_real(mg); +} + +static void complete_migration(struct dm_cache_migration *mg) +{ + if (mg->err) + migration_failure(mg); + else + migration_success_pre_commit(mg); +} + +static void process_migrations(struct cache *cache, struct list_head *head, + void (*fn)(struct dm_cache_migration *)) +{ + unsigned long flags; + struct list_head list; + struct dm_cache_migration *mg, *tmp; + + INIT_LIST_HEAD(&list); + spin_lock_irqsave(&cache->lock, flags); + list_splice_init(head, &list); + spin_unlock_irqrestore(&cache->lock, flags); + + list_for_each_entry_safe(mg, tmp, &list, list) + fn(mg); +} + +static void __queue_quiesced_migration(struct dm_cache_migration *mg) +{ + list_add_tail(&mg->list, &mg->cache->quiesced_migrations); +} + +static void queue_quiesced_migration(struct dm_cache_migration *mg) +{ + unsigned long flags; + struct cache *cache = mg->cache; + + spin_lock_irqsave(&cache->lock, flags); + __queue_quiesced_migration(mg); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) +{ + unsigned long flags; + struct dm_cache_migration *mg, *tmp; + + spin_lock_irqsave(&cache->lock, flags); + list_for_each_entry_safe(mg, tmp, work, list) + __queue_quiesced_migration(mg); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void check_for_quiesced_migrations(struct cache *cache, + struct per_bio_data *pb) +{ + struct list_head work; + + if (!pb->all_io_entry) + return; + + INIT_LIST_HEAD(&work); + if (pb->all_io_entry) + dm_deferred_entry_dec(pb->all_io_entry, &work); + + if (!list_empty(&work)) + queue_quiesced_migrations(cache, &work); +} + +static void quiesce_migration(struct dm_cache_migration *mg) +{ + if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) + queue_quiesced_migration(mg); +} + +static void promote(struct cache *cache, struct prealloc *structs, + dm_oblock_t oblock, dm_cblock_t cblock, + struct dm_bio_prison_cell *cell) +{ + struct dm_cache_migration *mg = prealloc_get_migration(structs); + + mg->err = false; + mg->writeback = false; + mg->demote = false; + mg->promote = true; + mg->requeue_holder = true; + mg->invalidate = false; + mg->cache = cache; + mg->new_oblock = oblock; + mg->cblock = cblock; + mg->old_ocell = NULL; + mg->new_ocell = cell; + mg->start_jiffies = jiffies; + + inc_nr_migrations(cache); + quiesce_migration(mg); +} + +static void writeback(struct cache *cache, struct prealloc *structs, + dm_oblock_t oblock, dm_cblock_t cblock, + struct dm_bio_prison_cell *cell) +{ + struct dm_cache_migration *mg = prealloc_get_migration(structs); + + mg->err = false; + mg->writeback = true; + mg->demote = false; + mg->promote = false; + mg->requeue_holder = true; + mg->invalidate = false; + mg->cache = cache; + mg->old_oblock = oblock; + mg->cblock = cblock; + mg->old_ocell = cell; + mg->new_ocell = NULL; + mg->start_jiffies = jiffies; + + inc_nr_migrations(cache); + quiesce_migration(mg); +} + +static void demote_then_promote(struct cache *cache, struct prealloc *structs, + dm_oblock_t old_oblock, dm_oblock_t new_oblock, + dm_cblock_t cblock, + struct dm_bio_prison_cell *old_ocell, + struct dm_bio_prison_cell *new_ocell) +{ + struct dm_cache_migration *mg = prealloc_get_migration(structs); + + mg->err = false; + mg->writeback = false; + 
mg->demote = true; + mg->promote = true; + mg->requeue_holder = true; + mg->invalidate = false; + mg->cache = cache; + mg->old_oblock = old_oblock; + mg->new_oblock = new_oblock; + mg->cblock = cblock; + mg->old_ocell = old_ocell; + mg->new_ocell = new_ocell; + mg->start_jiffies = jiffies; + + inc_nr_migrations(cache); + quiesce_migration(mg); +} + +/* + * Invalidate a cache entry. No writeback occurs; any changes in the cache + * block are thrown away. + */ +static void invalidate(struct cache *cache, struct prealloc *structs, + dm_oblock_t oblock, dm_cblock_t cblock, + struct dm_bio_prison_cell *cell) +{ + struct dm_cache_migration *mg = prealloc_get_migration(structs); + + mg->err = false; + mg->writeback = false; + mg->demote = true; + mg->promote = false; + mg->requeue_holder = true; + mg->invalidate = true; + mg->cache = cache; + mg->old_oblock = oblock; + mg->cblock = cblock; + mg->old_ocell = cell; + mg->new_ocell = NULL; + mg->start_jiffies = jiffies; + + inc_nr_migrations(cache); + quiesce_migration(mg); +} + +/*---------------------------------------------------------------- + * bio processing + *--------------------------------------------------------------*/ +static void defer_bio(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void process_flush_bio(struct cache *cache, struct bio *bio) +{ + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + BUG_ON(bio->bi_iter.bi_size); + if (!pb->req_nr) + remap_to_origin(cache, bio); + else + remap_to_cache(cache, bio, 0); + + issue(cache, bio); +} + +/* + * People generally discard large parts of a device, eg, the whole device + * when formatting. Splitting these large discards up into cache block + * sized ios and then quiescing (always neccessary for discard) takes too + * long. + * + * We keep it simple, and allow any size of discard to come in, and just + * mark off blocks on the discard bitset. No passdown occurs! + * + * To implement passdown we need to change the bio_prison such that a cell + * can have a key that spans many blocks. + */ +static void process_discard_bio(struct cache *cache, struct bio *bio) +{ + dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector, + cache->sectors_per_block); + dm_block_t end_block = bio_end_sector(bio); + dm_block_t b; + + end_block = block_div(end_block, cache->sectors_per_block); + + for (b = start_block; b < end_block; b++) + set_discard(cache, to_oblock(b)); + + bio_endio(bio, 0); +} + +static bool spare_migration_bandwidth(struct cache *cache) +{ + sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) * + cache->sectors_per_block; + return current_volume < cache->migration_threshold; +} + +static void inc_hit_counter(struct cache *cache, struct bio *bio) +{ + atomic_inc(bio_data_dir(bio) == READ ? + &cache->stats.read_hit : &cache->stats.write_hit); +} + +static void inc_miss_counter(struct cache *cache, struct bio *bio) +{ + atomic_inc(bio_data_dir(bio) == READ ? 
+ &cache->stats.read_miss : &cache->stats.write_miss); +} + +static void issue_cache_bio(struct cache *cache, struct bio *bio, + struct per_bio_data *pb, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_cache_dirty(cache, bio, oblock, cblock); + issue(cache, bio); +} + +static void process_bio(struct cache *cache, struct prealloc *structs, + struct bio *bio) +{ + int r; + bool release_cell = true; + dm_oblock_t block = get_bio_block(cache, bio); + struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; + struct policy_result lookup_result; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + bool discarded_block = is_discarded_oblock(cache, block); + bool passthrough = passthrough_mode(&cache->features); + bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); + + /* + * Check to see if that block is currently migrating. + */ + cell_prealloc = prealloc_get_cell(structs); + r = bio_detain(cache, block, bio, cell_prealloc, + (cell_free_fn) prealloc_put_cell, + structs, &new_ocell); + if (r > 0) + return; + + r = policy_map(cache->policy, block, true, can_migrate, discarded_block, + bio, &lookup_result); + + if (r == -EWOULDBLOCK) + /* migration has been denied */ + lookup_result.op = POLICY_MISS; + + switch (lookup_result.op) { + case POLICY_HIT: + if (passthrough) { + inc_miss_counter(cache, bio); + + /* + * Passthrough always maps to the origin, + * invalidating any cache blocks that are written + * to. + */ + + if (bio_data_dir(bio) == WRITE) { + atomic_inc(&cache->stats.demotion); + invalidate(cache, structs, block, lookup_result.cblock, new_ocell); + release_cell = false; + + } else { + /* FIXME: factor out issue_origin() */ + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_origin_clear_discard(cache, bio, block); + issue(cache, bio); + } + } else { + inc_hit_counter(cache, bio); + + if (bio_data_dir(bio) == WRITE && + writethrough_mode(&cache->features) && + !is_dirty(cache, lookup_result.cblock)) { + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + issue(cache, bio); + } else + issue_cache_bio(cache, bio, pb, block, lookup_result.cblock); + } + + break; + + case POLICY_MISS: + inc_miss_counter(cache, bio); + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_origin_clear_discard(cache, bio, block); + issue(cache, bio); + break; + + case POLICY_NEW: + atomic_inc(&cache->stats.promotion); + promote(cache, structs, block, lookup_result.cblock, new_ocell); + release_cell = false; + break; + + case POLICY_REPLACE: + cell_prealloc = prealloc_get_cell(structs); + r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc, + (cell_free_fn) prealloc_put_cell, + structs, &old_ocell); + if (r > 0) { + /* + * We have to be careful to avoid lock inversion of + * the cells. So we back off, and wait for the + * old_ocell to become free. 
+ */ + policy_force_mapping(cache->policy, block, + lookup_result.old_oblock); + atomic_inc(&cache->stats.cache_cell_clash); + break; + } + atomic_inc(&cache->stats.demotion); + atomic_inc(&cache->stats.promotion); + + demote_then_promote(cache, structs, lookup_result.old_oblock, + block, lookup_result.cblock, + old_ocell, new_ocell); + release_cell = false; + break; + + default: + DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__, + (unsigned) lookup_result.op); + bio_io_error(bio); + } + + if (release_cell) + cell_defer(cache, new_ocell, false); +} + +static int need_commit_due_to_time(struct cache *cache) +{ + return jiffies < cache->last_commit_jiffies || + jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; +} + +static int commit_if_needed(struct cache *cache) +{ + int r = 0; + + if ((cache->commit_requested || need_commit_due_to_time(cache)) && + dm_cache_changed_this_transaction(cache->cmd)) { + atomic_inc(&cache->stats.commit_count); + cache->commit_requested = false; + r = dm_cache_commit(cache->cmd, false); + cache->last_commit_jiffies = jiffies; + } + + return r; +} + +static void process_deferred_bios(struct cache *cache) +{ + unsigned long flags; + struct bio_list bios; + struct bio *bio; + struct prealloc structs; + + memset(&structs, 0, sizeof(structs)); + bio_list_init(&bios); + + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&bios, &cache->deferred_bios); + bio_list_init(&cache->deferred_bios); + spin_unlock_irqrestore(&cache->lock, flags); + + while (!bio_list_empty(&bios)) { + /* + * If we've got no free migration structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. + */ + if (prealloc_data_structs(cache, &structs)) { + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&cache->deferred_bios, &bios); + spin_unlock_irqrestore(&cache->lock, flags); + break; + } + + bio = bio_list_pop(&bios); + + if (bio->bi_rw & REQ_FLUSH) + process_flush_bio(cache, bio); + else if (bio->bi_rw & REQ_DISCARD) + process_discard_bio(cache, bio); + else + process_bio(cache, &structs, bio); + } + + prealloc_free_structs(cache, &structs); +} + +static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) +{ + unsigned long flags; + struct bio_list bios; + struct bio *bio; + + bio_list_init(&bios); + + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&bios, &cache->deferred_flush_bios); + bio_list_init(&cache->deferred_flush_bios); + spin_unlock_irqrestore(&cache->lock, flags); + + while ((bio = bio_list_pop(&bios))) + submit_bios ? 
generic_make_request(bio) : bio_io_error(bio); +} + +static void process_deferred_writethrough_bios(struct cache *cache) +{ + unsigned long flags; + struct bio_list bios; + struct bio *bio; + + bio_list_init(&bios); + + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&bios, &cache->deferred_writethrough_bios); + bio_list_init(&cache->deferred_writethrough_bios); + spin_unlock_irqrestore(&cache->lock, flags); + + while ((bio = bio_list_pop(&bios))) + generic_make_request(bio); +} + +static void writeback_some_dirty_blocks(struct cache *cache) +{ + int r = 0; + dm_oblock_t oblock; + dm_cblock_t cblock; + struct prealloc structs; + struct dm_bio_prison_cell *old_ocell; + + memset(&structs, 0, sizeof(structs)); + + while (spare_migration_bandwidth(cache)) { + if (prealloc_data_structs(cache, &structs)) + break; + + r = policy_writeback_work(cache->policy, &oblock, &cblock); + if (r) + break; + + r = get_cell(cache, oblock, &structs, &old_ocell); + if (r) { + policy_set_dirty(cache->policy, oblock); + break; + } + + writeback(cache, &structs, oblock, cblock, old_ocell); + } + + prealloc_free_structs(cache, &structs); +} + +/*---------------------------------------------------------------- + * Invalidations. + * Dropping something from the cache *without* writing back. + *--------------------------------------------------------------*/ + +static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) +{ + int r = 0; + uint64_t begin = from_cblock(req->cblocks->begin); + uint64_t end = from_cblock(req->cblocks->end); + + while (begin != end) { + r = policy_remove_cblock(cache->policy, to_cblock(begin)); + if (!r) { + r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); + if (r) + break; + + } else if (r == -ENODATA) { + /* harmless, already unmapped */ + r = 0; + + } else { + DMERR("policy_remove_cblock failed"); + break; + } + + begin++; + } + + cache->commit_requested = true; + + req->err = r; + atomic_set(&req->complete, 1); + + wake_up(&req->result_wait); +} + +static void process_invalidation_requests(struct cache *cache) +{ + struct list_head list; + struct invalidation_request *req, *tmp; + + INIT_LIST_HEAD(&list); + spin_lock(&cache->invalidation_lock); + list_splice_init(&cache->invalidation_requests, &list); + spin_unlock(&cache->invalidation_lock); + + list_for_each_entry_safe (req, tmp, &list, list) + process_invalidation_request(cache, req); +} + +/*---------------------------------------------------------------- + * Main worker loop + *--------------------------------------------------------------*/ +static bool is_quiescing(struct cache *cache) +{ + return atomic_read(&cache->quiescing); +} + +static void ack_quiescing(struct cache *cache) +{ + if (is_quiescing(cache)) { + atomic_inc(&cache->quiescing_ack); + wake_up(&cache->quiescing_wait); + } +} + +static void wait_for_quiescing_ack(struct cache *cache) +{ + wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); +} + +static void start_quiescing(struct cache *cache) +{ + atomic_inc(&cache->quiescing); + wait_for_quiescing_ack(cache); +} + +static void stop_quiescing(struct cache *cache) +{ + atomic_set(&cache->quiescing, 0); + atomic_set(&cache->quiescing_ack, 0); +} + +static void wait_for_migrations(struct cache *cache) +{ + wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations)); +} + +static void stop_worker(struct cache *cache) +{ + cancel_delayed_work(&cache->waker); + flush_workqueue(cache->wq); +} + +static void requeue_deferred_io(struct 
cache *cache) +{ + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + bio_list_merge(&bios, &cache->deferred_bios); + bio_list_init(&cache->deferred_bios); + + while ((bio = bio_list_pop(&bios))) + bio_endio(bio, DM_ENDIO_REQUEUE); +} + +static int more_work(struct cache *cache) +{ + if (is_quiescing(cache)) + return !list_empty(&cache->quiesced_migrations) || + !list_empty(&cache->completed_migrations) || + !list_empty(&cache->need_commit_migrations); + else + return !bio_list_empty(&cache->deferred_bios) || + !bio_list_empty(&cache->deferred_flush_bios) || + !bio_list_empty(&cache->deferred_writethrough_bios) || + !list_empty(&cache->quiesced_migrations) || + !list_empty(&cache->completed_migrations) || + !list_empty(&cache->need_commit_migrations) || + cache->invalidate; +} + +static void do_worker(struct work_struct *ws) +{ + struct cache *cache = container_of(ws, struct cache, worker); + + do { + if (!is_quiescing(cache)) { + writeback_some_dirty_blocks(cache); + process_deferred_writethrough_bios(cache); + process_deferred_bios(cache); + process_invalidation_requests(cache); + } + + process_migrations(cache, &cache->quiesced_migrations, issue_copy); + process_migrations(cache, &cache->completed_migrations, complete_migration); + + if (commit_if_needed(cache)) { + process_deferred_flush_bios(cache, false); + + /* + * FIXME: rollback metadata or just go into a + * failure mode and error everything + */ + } else { + process_deferred_flush_bios(cache, true); + process_migrations(cache, &cache->need_commit_migrations, + migration_success_post_commit); + } + + ack_quiescing(cache); + + } while (more_work(cache)); +} + +/* + * We want to commit periodically so that not too much + * unwritten metadata builds up. + */ +static void do_waker(struct work_struct *ws) +{ + struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); + policy_tick(cache->policy); + wake_worker(cache); + queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); +} + +/*----------------------------------------------------------------*/ + +static int is_congested(struct dm_dev *dev, int bdi_bits) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + return bdi_congested(&q->backing_dev_info, bdi_bits); +} + +static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) +{ + struct cache *cache = container_of(cb, struct cache, callbacks); + + return is_congested(cache->origin_dev, bdi_bits) || + is_congested(cache->cache_dev, bdi_bits); +} + +/*---------------------------------------------------------------- + * Target methods + *--------------------------------------------------------------*/ + +/* + * This function gets called on the error paths of the constructor, so we + * have to cope with a partially initialised struct. 
+ */ +static void destroy(struct cache *cache) +{ + unsigned i; + + if (cache->next_migration) + mempool_free(cache->next_migration, cache->migration_pool); + + if (cache->migration_pool) + mempool_destroy(cache->migration_pool); + + if (cache->all_io_ds) + dm_deferred_set_destroy(cache->all_io_ds); + + if (cache->prison) + dm_bio_prison_destroy(cache->prison); + + if (cache->wq) + destroy_workqueue(cache->wq); + + if (cache->dirty_bitset) + free_bitset(cache->dirty_bitset); + + if (cache->discard_bitset) + free_bitset(cache->discard_bitset); + + if (cache->copier) + dm_kcopyd_client_destroy(cache->copier); + + if (cache->cmd) + dm_cache_metadata_close(cache->cmd); + + if (cache->metadata_dev) + dm_put_device(cache->ti, cache->metadata_dev); + + if (cache->origin_dev) + dm_put_device(cache->ti, cache->origin_dev); + + if (cache->cache_dev) + dm_put_device(cache->ti, cache->cache_dev); + + if (cache->policy) + dm_cache_policy_destroy(cache->policy); + + for (i = 0; i < cache->nr_ctr_args ; i++) + kfree(cache->ctr_args[i]); + kfree(cache->ctr_args); + + kfree(cache); +} + +static void cache_dtr(struct dm_target *ti) +{ + struct cache *cache = ti->private; + + destroy(cache); +} + +static sector_t get_dev_size(struct dm_dev *dev) +{ + return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; +} + +/*----------------------------------------------------------------*/ + +/* + * Construct a cache device mapping. + * + * cache <metadata dev> <cache dev> <origin dev> <block size> + * <#feature args> [<feature arg>]* + * <policy> <#policy args> [<policy arg>]* + * + * metadata dev : fast device holding the persistent metadata + * cache dev : fast device holding cached data blocks + * origin dev : slow device holding original data blocks + * block size : cache unit size in sectors + * + * #feature args : number of feature arguments passed + * feature args : writethrough. (The default is writeback.) + * + * policy : the replacement policy to use + * #policy args : an even number of policy arguments corresponding + * to key/value pairs passed to the policy + * policy args : key/value pairs passed to the policy + * E.g. 'sequential_threshold 1024' + * See cache-policies.txt for details. + * + * Optional feature arguments are: + * writethrough : write through caching that prohibits cache block + * content from being different from origin block content. + * Without this argument, the default behaviour is to write + * back cache block contents later for performance reasons, + * so they may differ from the corresponding origin blocks. 
+ */ +struct cache_args { + struct dm_target *ti; + + struct dm_dev *metadata_dev; + + struct dm_dev *cache_dev; + sector_t cache_sectors; + + struct dm_dev *origin_dev; + sector_t origin_sectors; + + uint32_t block_size; + + const char *policy_name; + int policy_argc; + const char **policy_argv; + + struct cache_features features; +}; + +static void destroy_cache_args(struct cache_args *ca) +{ + if (ca->metadata_dev) + dm_put_device(ca->ti, ca->metadata_dev); + + if (ca->cache_dev) + dm_put_device(ca->ti, ca->cache_dev); + + if (ca->origin_dev) + dm_put_device(ca->ti, ca->origin_dev); + + kfree(ca); +} + +static bool at_least_one_arg(struct dm_arg_set *as, char **error) +{ + if (!as->argc) { + *error = "Insufficient args"; + return false; + } + + return true; +} + +static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + int r; + sector_t metadata_dev_size; + char b[BDEVNAME_SIZE]; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, + &ca->metadata_dev); + if (r) { + *error = "Error opening metadata device"; + return r; + } + + metadata_dev_size = get_dev_size(ca->metadata_dev); + if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING) + DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", + bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); + + return 0; +} + +static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + int r; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, + &ca->cache_dev); + if (r) { + *error = "Error opening cache device"; + return r; + } + ca->cache_sectors = get_dev_size(ca->cache_dev); + + return 0; +} + +static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + int r; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, + &ca->origin_dev); + if (r) { + *error = "Error opening origin device"; + return r; + } + + ca->origin_sectors = get_dev_size(ca->origin_dev); + if (ca->ti->len > ca->origin_sectors) { + *error = "Device size larger than cached device"; + return -EINVAL; + } + + return 0; +} + +static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + unsigned long block_size; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size || + block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || + block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || + block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { + *error = "Invalid data block size"; + return -EINVAL; + } + + if (block_size > ca->cache_sectors) { + *error = "Data block size is larger than the cache device"; + return -EINVAL; + } + + ca->block_size = block_size; + + return 0; +} + +static void init_features(struct cache_features *cf) +{ + cf->mode = CM_WRITE; + cf->io_mode = CM_IO_WRITEBACK; +} + +static int parse_features(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + static struct dm_arg _args[] = { + {0, 1, "Invalid number of cache feature arguments"}, + }; + + int r; + unsigned argc; + const char *arg; + struct cache_features *cf = &ca->features; + + init_features(cf); + + r = dm_read_arg_group(_args, as, &argc, error); + if (r) + return -EINVAL; + + while (argc--) { + arg = dm_shift_arg(as); + + if 
(!strcasecmp(arg, "writeback")) + cf->io_mode = CM_IO_WRITEBACK; + + else if (!strcasecmp(arg, "writethrough")) + cf->io_mode = CM_IO_WRITETHROUGH; + + else if (!strcasecmp(arg, "passthrough")) + cf->io_mode = CM_IO_PASSTHROUGH; + + else { + *error = "Unrecognised cache feature requested"; + return -EINVAL; + } + } + + return 0; +} + +static int parse_policy(struct cache_args *ca, struct dm_arg_set *as, + char **error) +{ + static struct dm_arg _args[] = { + {0, 1024, "Invalid number of policy arguments"}, + }; + + int r; + + if (!at_least_one_arg(as, error)) + return -EINVAL; + + ca->policy_name = dm_shift_arg(as); + + r = dm_read_arg_group(_args, as, &ca->policy_argc, error); + if (r) + return -EINVAL; + + ca->policy_argv = (const char **)as->argv; + dm_consume_args(as, ca->policy_argc); + + return 0; +} + +static int parse_cache_args(struct cache_args *ca, int argc, char **argv, + char **error) +{ + int r; + struct dm_arg_set as; + + as.argc = argc; + as.argv = argv; + + r = parse_metadata_dev(ca, &as, error); + if (r) + return r; + + r = parse_cache_dev(ca, &as, error); + if (r) + return r; + + r = parse_origin_dev(ca, &as, error); + if (r) + return r; + + r = parse_block_size(ca, &as, error); + if (r) + return r; + + r = parse_features(ca, &as, error); + if (r) + return r; + + r = parse_policy(ca, &as, error); + if (r) + return r; + + return 0; +} + +/*----------------------------------------------------------------*/ + +static struct kmem_cache *migration_cache; + +#define NOT_CORE_OPTION 1 + +static int process_config_option(struct cache *cache, const char *key, const char *value) +{ + unsigned long tmp; + + if (!strcasecmp(key, "migration_threshold")) { + if (kstrtoul(value, 10, &tmp)) + return -EINVAL; + + cache->migration_threshold = tmp; + return 0; + } + + return NOT_CORE_OPTION; +} + +static int set_config_value(struct cache *cache, const char *key, const char *value) +{ + int r = process_config_option(cache, key, value); + + if (r == NOT_CORE_OPTION) + r = policy_set_config_value(cache->policy, key, value); + + if (r) + DMWARN("bad config value for %s: %s", key, value); + + return r; +} + +static int set_config_values(struct cache *cache, int argc, const char **argv) +{ + int r = 0; + + if (argc & 1) { + DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs."); + return -EINVAL; + } + + while (argc) { + r = set_config_value(cache, argv[0], argv[1]); + if (r) + break; + + argc -= 2; + argv += 2; + } + + return r; +} + +static int create_cache_policy(struct cache *cache, struct cache_args *ca, + char **error) +{ + struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, + cache->cache_size, + cache->origin_sectors, + cache->sectors_per_block); + if (IS_ERR(p)) { + *error = "Error creating cache's policy"; + return PTR_ERR(p); + } + cache->policy = p; + + return 0; +} + +#define DEFAULT_MIGRATION_THRESHOLD 2048 + +static int cache_create(struct cache_args *ca, struct cache **result) +{ + int r = 0; + char **error = &ca->ti->error; + struct cache *cache; + struct dm_target *ti = ca->ti; + dm_block_t origin_blocks; + struct dm_cache_metadata *cmd; + bool may_format = ca->features.mode == CM_WRITE; + + cache = kzalloc(sizeof(*cache), GFP_KERNEL); + if (!cache) + return -ENOMEM; + + cache->ti = ca->ti; + ti->private = cache; + ti->num_flush_bios = 2; + ti->flush_supported = true; + + ti->num_discard_bios = 1; + ti->discards_supported = true; + ti->discard_zeroes_data_unsupported = true; + /* Discard bios must be split on a block boundary */ + 
ti->split_discard_bios = true; + + cache->features = ca->features; + ti->per_bio_data_size = get_per_bio_data_size(cache); + + cache->callbacks.congested_fn = cache_is_congested; + dm_table_add_target_callbacks(ti->table, &cache->callbacks); + + cache->metadata_dev = ca->metadata_dev; + cache->origin_dev = ca->origin_dev; + cache->cache_dev = ca->cache_dev; + + ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; + + /* FIXME: factor out this whole section */ + origin_blocks = cache->origin_sectors = ca->origin_sectors; + origin_blocks = block_div(origin_blocks, ca->block_size); + cache->origin_blocks = to_oblock(origin_blocks); + + cache->sectors_per_block = ca->block_size; + if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) { + r = -EINVAL; + goto bad; + } + + if (ca->block_size & (ca->block_size - 1)) { + dm_block_t cache_size = ca->cache_sectors; + + cache->sectors_per_block_shift = -1; + cache_size = block_div(cache_size, ca->block_size); + cache->cache_size = to_cblock(cache_size); + } else { + cache->sectors_per_block_shift = __ffs(ca->block_size); + cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); + } + + r = create_cache_policy(cache, ca, error); + if (r) + goto bad; + + cache->policy_nr_args = ca->policy_argc; + cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD; + + r = set_config_values(cache, ca->policy_argc, ca->policy_argv); + if (r) { + *error = "Error setting cache policy's config values"; + goto bad; + } + + cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, + ca->block_size, may_format, + dm_cache_policy_get_hint_size(cache->policy)); + if (IS_ERR(cmd)) { + *error = "Error creating metadata object"; + r = PTR_ERR(cmd); + goto bad; + } + cache->cmd = cmd; + + if (passthrough_mode(&cache->features)) { + bool all_clean; + + r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); + if (r) { + *error = "dm_cache_metadata_all_clean() failed"; + goto bad; + } + + if (!all_clean) { + *error = "Cannot enter passthrough mode unless all blocks are clean"; + r = -EINVAL; + goto bad; + } + } + + spin_lock_init(&cache->lock); + bio_list_init(&cache->deferred_bios); + bio_list_init(&cache->deferred_flush_bios); + bio_list_init(&cache->deferred_writethrough_bios); + INIT_LIST_HEAD(&cache->quiesced_migrations); + INIT_LIST_HEAD(&cache->completed_migrations); + INIT_LIST_HEAD(&cache->need_commit_migrations); + atomic_set(&cache->nr_migrations, 0); + init_waitqueue_head(&cache->migration_wait); + + init_waitqueue_head(&cache->quiescing_wait); + atomic_set(&cache->quiescing, 0); + atomic_set(&cache->quiescing_ack, 0); + + r = -ENOMEM; + atomic_set(&cache->nr_dirty, 0); + cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); + if (!cache->dirty_bitset) { + *error = "could not allocate dirty bitset"; + goto bad; + } + clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); + + cache->discard_nr_blocks = cache->origin_blocks; + cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks)); + if (!cache->discard_bitset) { + *error = "could not allocate discard bitset"; + goto bad; + } + clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks)); + + cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(cache->copier)) { + *error = "could not create kcopyd client"; + r = PTR_ERR(cache->copier); + goto bad; + } + + cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + if (!cache->wq) { + *error = "could not create workqueue for metadata 
object"; + goto bad; + } + INIT_WORK(&cache->worker, do_worker); + INIT_DELAYED_WORK(&cache->waker, do_waker); + cache->last_commit_jiffies = jiffies; + + cache->prison = dm_bio_prison_create(PRISON_CELLS); + if (!cache->prison) { + *error = "could not create bio prison"; + goto bad; + } + + cache->all_io_ds = dm_deferred_set_create(); + if (!cache->all_io_ds) { + *error = "could not create all_io deferred set"; + goto bad; + } + + cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, + migration_cache); + if (!cache->migration_pool) { + *error = "Error creating cache's migration mempool"; + goto bad; + } + + cache->next_migration = NULL; + + cache->need_tick_bio = true; + cache->sized = false; + cache->invalidate = false; + cache->commit_requested = false; + cache->loaded_mappings = false; + cache->loaded_discards = false; + + load_stats(cache); + + atomic_set(&cache->stats.demotion, 0); + atomic_set(&cache->stats.promotion, 0); + atomic_set(&cache->stats.copies_avoided, 0); + atomic_set(&cache->stats.cache_cell_clash, 0); + atomic_set(&cache->stats.commit_count, 0); + atomic_set(&cache->stats.discard_count, 0); + + spin_lock_init(&cache->invalidation_lock); + INIT_LIST_HEAD(&cache->invalidation_requests); + + *result = cache; + return 0; + +bad: + destroy(cache); + return r; +} + +static int copy_ctr_args(struct cache *cache, int argc, const char **argv) +{ + unsigned i; + const char **copy; + + copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL); + if (!copy) + return -ENOMEM; + for (i = 0; i < argc; i++) { + copy[i] = kstrdup(argv[i], GFP_KERNEL); + if (!copy[i]) { + while (i--) + kfree(copy[i]); + kfree(copy); + return -ENOMEM; + } + } + + cache->nr_ctr_args = argc; + cache->ctr_args = copy; + + return 0; +} + +static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int r = -EINVAL; + struct cache_args *ca; + struct cache *cache = NULL; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) { + ti->error = "Error allocating memory for cache"; + return -ENOMEM; + } + ca->ti = ti; + + r = parse_cache_args(ca, argc, argv, &ti->error); + if (r) + goto out; + + r = cache_create(ca, &cache); + if (r) + goto out; + + r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); + if (r) { + destroy(cache); + goto out; + } + + ti->private = cache; + +out: + destroy_cache_args(ca); + return r; +} + +static int cache_map(struct dm_target *ti, struct bio *bio) +{ + struct cache *cache = ti->private; + + int r; + dm_oblock_t block = get_bio_block(cache, bio); + size_t pb_data_size = get_per_bio_data_size(cache); + bool can_migrate = false; + bool discarded_block; + struct dm_bio_prison_cell *cell; + struct policy_result lookup_result; + struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); + + if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { + /* + * This can only occur if the io goes to a partial block at + * the end of the origin device. We don't cache these. + * Just remap to the origin and carry on. + */ + remap_to_origin(cache, bio); + return DM_MAPIO_REMAPPED; + } + + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { + defer_bio(cache, bio); + return DM_MAPIO_SUBMITTED; + } + + /* + * Check to see if that block is currently migrating. 
+ */ + cell = alloc_prison_cell(cache); + if (!cell) { + defer_bio(cache, bio); + return DM_MAPIO_SUBMITTED; + } + + r = bio_detain(cache, block, bio, cell, + (cell_free_fn) free_prison_cell, + cache, &cell); + if (r) { + if (r < 0) + defer_bio(cache, bio); + + return DM_MAPIO_SUBMITTED; + } + + discarded_block = is_discarded_oblock(cache, block); + + r = policy_map(cache->policy, block, false, can_migrate, discarded_block, + bio, &lookup_result); + if (r == -EWOULDBLOCK) { + cell_defer(cache, cell, true); + return DM_MAPIO_SUBMITTED; + + } else if (r) { + DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } + + r = DM_MAPIO_REMAPPED; + switch (lookup_result.op) { + case POLICY_HIT: + if (passthrough_mode(&cache->features)) { + if (bio_data_dir(bio) == WRITE) { + /* + * We need to invalidate this block, so + * defer for the worker thread. + */ + cell_defer(cache, cell, true); + r = DM_MAPIO_SUBMITTED; + + } else { + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + inc_miss_counter(cache, bio); + remap_to_origin_clear_discard(cache, bio, block); + + cell_defer(cache, cell, false); + } + + } else { + inc_hit_counter(cache, bio); + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + + if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && + !is_dirty(cache, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else + remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + + cell_defer(cache, cell, false); + } + break; + + case POLICY_MISS: + inc_miss_counter(cache, bio); + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + + if (pb->req_nr != 0) { + /* + * This is a duplicate writethrough io that is no + * longer needed because the block has been demoted. 
+ */ + bio_endio(bio, 0); + cell_defer(cache, cell, false); + return DM_MAPIO_SUBMITTED; + } else { + remap_to_origin_clear_discard(cache, bio, block); + cell_defer(cache, cell, false); + } + break; + + default: + DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, + (unsigned) lookup_result.op); + bio_io_error(bio); + r = DM_MAPIO_SUBMITTED; + } + + return r; +} + +static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) +{ + struct cache *cache = ti->private; + unsigned long flags; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + if (pb->tick) { + policy_tick(cache->policy); + + spin_lock_irqsave(&cache->lock, flags); + cache->need_tick_bio = true; + spin_unlock_irqrestore(&cache->lock, flags); + } + + check_for_quiesced_migrations(cache, pb); + + return 0; +} + +static int write_dirty_bitset(struct cache *cache) +{ + unsigned i, r; + + for (i = 0; i < from_cblock(cache->cache_size); i++) { + r = dm_cache_set_dirty(cache->cmd, to_cblock(i), + is_dirty(cache, to_cblock(i))); + if (r) + return r; + } + + return 0; +} + +static int write_discard_bitset(struct cache *cache) +{ + unsigned i, r; + + r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block, + cache->origin_blocks); + if (r) { + DMERR("could not resize on-disk discard bitset"); + return r; + } + + for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) { + r = dm_cache_set_discard(cache->cmd, to_oblock(i), + is_discarded(cache, to_oblock(i))); + if (r) + return r; + } + + return 0; +} + +/* + * returns true on success + */ +static bool sync_metadata(struct cache *cache) +{ + int r1, r2, r3, r4; + + r1 = write_dirty_bitset(cache); + if (r1) + DMERR("could not write dirty bitset"); + + r2 = write_discard_bitset(cache); + if (r2) + DMERR("could not write discard bitset"); + + save_stats(cache); + + r3 = dm_cache_write_hints(cache->cmd, cache->policy); + if (r3) + DMERR("could not write hints"); + + /* + * If writing the above metadata failed, we still commit, but don't + * set the clean shutdown flag. This will effectively force every + * dirty bit to be set on reload. + */ + r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3); + if (r4) + DMERR("could not write cache metadata. 
Data loss may occur."); + + return !r1 && !r2 && !r3 && !r4; +} + +static void cache_postsuspend(struct dm_target *ti) +{ + struct cache *cache = ti->private; + + start_quiescing(cache); + wait_for_migrations(cache); + stop_worker(cache); + requeue_deferred_io(cache); + stop_quiescing(cache); + + (void) sync_metadata(cache); +} + +static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + int r; + struct cache *cache = context; + + r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); + if (r) + return r; + + if (dirty) + set_dirty(cache, oblock, cblock); + else + clear_dirty(cache, oblock, cblock); + + return 0; +} + +static int load_discard(void *context, sector_t discard_block_size, + dm_oblock_t oblock, bool discard) +{ + struct cache *cache = context; + + if (discard) + set_discard(cache, oblock); + else + clear_discard(cache, oblock); + + return 0; +} + +static dm_cblock_t get_cache_dev_size(struct cache *cache) +{ + sector_t size = get_dev_size(cache->cache_dev); + (void) sector_div(size, cache->sectors_per_block); + return to_cblock(size); +} + +static bool can_resize(struct cache *cache, dm_cblock_t new_size) +{ + if (from_cblock(new_size) > from_cblock(cache->cache_size)) + return true; + + /* + * We can't drop a dirty block when shrinking the cache. + */ + while (from_cblock(new_size) < from_cblock(cache->cache_size)) { + new_size = to_cblock(from_cblock(new_size) + 1); + if (is_dirty(cache, new_size)) { + DMERR("unable to shrink cache; cache block %llu is dirty", + (unsigned long long) from_cblock(new_size)); + return false; + } + } + + return true; +} + +static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) +{ + int r; + + r = dm_cache_resize(cache->cmd, new_size); + if (r) { + DMERR("could not resize cache metadata"); + return r; + } + + cache->cache_size = new_size; + + return 0; +} + +static int cache_preresume(struct dm_target *ti) +{ + int r = 0; + struct cache *cache = ti->private; + dm_cblock_t csize = get_cache_dev_size(cache); + + /* + * Check to see if the cache has resized. 
+ */ + if (!cache->sized) { + r = resize_cache_dev(cache, csize); + if (r) + return r; + + cache->sized = true; + + } else if (csize != cache->cache_size) { + if (!can_resize(cache, csize)) + return -EINVAL; + + r = resize_cache_dev(cache, csize); + if (r) + return r; + } + + if (!cache->loaded_mappings) { + r = dm_cache_load_mappings(cache->cmd, cache->policy, + load_mapping, cache); + if (r) { + DMERR("could not load cache mappings"); + return r; + } + + cache->loaded_mappings = true; + } + + if (!cache->loaded_discards) { + r = dm_cache_load_discards(cache->cmd, load_discard, cache); + if (r) { + DMERR("could not load origin discards"); + return r; + } + + cache->loaded_discards = true; + } + + return r; +} + +static void cache_resume(struct dm_target *ti) +{ + struct cache *cache = ti->private; + + cache->need_tick_bio = true; + do_waker(&cache->waker.work); +} + +/* + * Status format: + * + * <metadata block size> <#used metadata blocks>/<#total metadata blocks> + * <cache block size> <#used cache blocks>/<#total cache blocks> + * <#read hits> <#read misses> <#write hits> <#write misses> + * <#demotions> <#promotions> <#dirty> + * <#features> <features>* + * <#core args> <core args> + * <policy name> <#policy args> <policy args>* + */ +static void cache_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + int r = 0; + unsigned i; + ssize_t sz = 0; + dm_block_t nr_free_blocks_metadata = 0; + dm_block_t nr_blocks_metadata = 0; + char buf[BDEVNAME_SIZE]; + struct cache *cache = ti->private; + dm_cblock_t residency; + + switch (type) { + case STATUSTYPE_INFO: + /* Commit to ensure statistics aren't out-of-date */ + if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) { + r = dm_cache_commit(cache->cmd, false); + if (r) + DMERR("could not commit metadata for accurate status"); + } + + r = dm_cache_get_free_metadata_block_count(cache->cmd, + &nr_free_blocks_metadata); + if (r) { + DMERR("could not get metadata free block count"); + goto err; + } + + r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata); + if (r) { + DMERR("could not get metadata device size"); + goto err; + } + + residency = policy_residency(cache->policy); + + DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", + (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), + (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), + (unsigned long long)nr_blocks_metadata, + cache->sectors_per_block, + (unsigned long long) from_cblock(residency), + (unsigned long long) from_cblock(cache->cache_size), + (unsigned) atomic_read(&cache->stats.read_hit), + (unsigned) atomic_read(&cache->stats.read_miss), + (unsigned) atomic_read(&cache->stats.write_hit), + (unsigned) atomic_read(&cache->stats.write_miss), + (unsigned) atomic_read(&cache->stats.demotion), + (unsigned) atomic_read(&cache->stats.promotion), + (unsigned long) atomic_read(&cache->nr_dirty)); + + if (writethrough_mode(&cache->features)) + DMEMIT("1 writethrough "); + + else if (passthrough_mode(&cache->features)) + DMEMIT("1 passthrough "); + + else if (writeback_mode(&cache->features)) + DMEMIT("1 writeback "); + + else { + DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode); + goto err; + } + + DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); + + DMEMIT("%s ", dm_cache_policy_get_name(cache->policy)); + if (sz < maxlen) { + r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz); + if (r) + 
DMERR("policy_emit_config_values returned %d", r); + } + + break; + + case STATUSTYPE_TABLE: + format_dev_t(buf, cache->metadata_dev->bdev->bd_dev); + DMEMIT("%s ", buf); + format_dev_t(buf, cache->cache_dev->bdev->bd_dev); + DMEMIT("%s ", buf); + format_dev_t(buf, cache->origin_dev->bdev->bd_dev); + DMEMIT("%s", buf); + + for (i = 0; i < cache->nr_ctr_args - 1; i++) + DMEMIT(" %s", cache->ctr_args[i]); + if (cache->nr_ctr_args) + DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]); + } + + return; + +err: + DMEMIT("Error"); +} + +/* + * A cache block range can take two forms: + * + * i) A single cblock, eg. '3456' + * ii) A begin and end cblock with dots between, eg. 123-234 + */ +static int parse_cblock_range(struct cache *cache, const char *str, + struct cblock_range *result) +{ + char dummy; + uint64_t b, e; + int r; + + /* + * Try and parse form (ii) first. + */ + r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); + if (r < 0) + return r; + + if (r == 2) { + result->begin = to_cblock(b); + result->end = to_cblock(e); + return 0; + } + + /* + * That didn't work, try form (i). + */ + r = sscanf(str, "%llu%c", &b, &dummy); + if (r < 0) + return r; + + if (r == 1) { + result->begin = to_cblock(b); + result->end = to_cblock(from_cblock(result->begin) + 1u); + return 0; + } + + DMERR("invalid cblock range '%s'", str); + return -EINVAL; +} + +static int validate_cblock_range(struct cache *cache, struct cblock_range *range) +{ + uint64_t b = from_cblock(range->begin); + uint64_t e = from_cblock(range->end); + uint64_t n = from_cblock(cache->cache_size); + + if (b >= n) { + DMERR("begin cblock out of range: %llu >= %llu", b, n); + return -EINVAL; + } + + if (e > n) { + DMERR("end cblock out of range: %llu > %llu", e, n); + return -EINVAL; + } + + if (b >= e) { + DMERR("invalid cblock range: %llu >= %llu", b, e); + return -EINVAL; + } + + return 0; +} + +static int request_invalidation(struct cache *cache, struct cblock_range *range) +{ + struct invalidation_request req; + + INIT_LIST_HEAD(&req.list); + req.cblocks = range; + atomic_set(&req.complete, 0); + req.err = 0; + init_waitqueue_head(&req.result_wait); + + spin_lock(&cache->invalidation_lock); + list_add(&req.list, &cache->invalidation_requests); + spin_unlock(&cache->invalidation_lock); + wake_worker(cache); + + wait_event(req.result_wait, atomic_read(&req.complete)); + return req.err; +} + +static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, + const char **cblock_ranges) +{ + int r = 0; + unsigned i; + struct cblock_range range; + + if (!passthrough_mode(&cache->features)) { + DMERR("cache has to be in passthrough mode for invalidation"); + return -EPERM; + } + + for (i = 0; i < count; i++) { + r = parse_cblock_range(cache, cblock_ranges[i], &range); + if (r) + break; + + r = validate_cblock_range(cache, &range); + if (r) + break; + + /* + * Pass begin and end origin blocks to the worker and wake it. + */ + r = request_invalidation(cache, &range); + if (r) + break; + } + + return r; +} + +/* + * Supports + * "<key> <value>" + * and + * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* + * + * The key migration_threshold is supported by the cache target core. 
+ */ +static int cache_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct cache *cache = ti->private; + + if (!argc) + return -EINVAL; + + if (!strcasecmp(argv[0], "invalidate_cblocks")) + return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); + + if (argc != 2) + return -EINVAL; + + return set_config_value(cache, argv[0], argv[1]); +} + +static int cache_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + int r = 0; + struct cache *cache = ti->private; + + r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data); + if (!r) + r = fn(ti, cache->origin_dev, 0, ti->len, data); + + return r; +} + +/* + * We assume I/O is going to the origin (which is the volume + * more likely to have restrictions e.g. by being striped). + * (Looking up the exact location of the data would be expensive + * and could always be out of date by the time the bio is submitted.) + */ +static int cache_bvec_merge(struct dm_target *ti, + struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct cache *cache = ti->private; + struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = cache->origin_dev->bdev; + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static void set_discard_limits(struct cache *cache, struct queue_limits *limits) +{ + /* + * FIXME: these limits may be incompatible with the cache device + */ + limits->max_discard_sectors = cache->sectors_per_block; + limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT; +} + +static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct cache *cache = ti->private; + uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * If the system-determined stacked limits are compatible with the + * cache's blocksize (io_opt is a factor) do not override them. 
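+ *
+ * Editorial example (not part of the original patch): with a cache block
+ * size of 128 sectors, a stacked io_opt of 512 sectors is kept, whereas an
+ * io_opt of 192 sectors (not a multiple) or 64 sectors (smaller than the
+ * block size) is replaced by the cache's own block size below.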
+ */ + if (io_opt_sectors < cache->sectors_per_block || + do_div(io_opt_sectors, cache->sectors_per_block)) { + blk_limits_io_min(limits, 0); + blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); + } + set_discard_limits(cache, limits); +} + +/*----------------------------------------------------------------*/ + +static struct target_type cache_target = { + .name = "cache", + .version = {1, 4, 0}, + .module = THIS_MODULE, + .ctr = cache_ctr, + .dtr = cache_dtr, + .map = cache_map, + .end_io = cache_end_io, + .postsuspend = cache_postsuspend, + .preresume = cache_preresume, + .resume = cache_resume, + .status = cache_status, + .message = cache_message, + .iterate_devices = cache_iterate_devices, + .merge = cache_bvec_merge, + .io_hints = cache_io_hints, +}; + +static int __init dm_cache_init(void) +{ + int r; + + r = dm_register_target(&cache_target); + if (r) { + DMERR("cache target registration failed: %d", r); + return r; + } + + migration_cache = KMEM_CACHE(dm_cache_migration, 0); + if (!migration_cache) { + dm_unregister_target(&cache_target); + return -ENOMEM; + } + + return 0; +} + +static void __exit dm_cache_exit(void) +{ + dm_unregister_target(&cache_target); + kmem_cache_destroy(migration_cache); +} + +module_init(dm_cache_init); +module_exit(dm_cache_exit); + +MODULE_DESCRIPTION(DM_NAME " cache target"); +MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 13956437bc8..4cba2d808af 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,7 +1,8 @@ /* - * Copyright (C) 2003 Christophe Saout <christophe@saout.de> + * Copyright (C) 2003 Jana Saout <jana@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> - * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. + * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com> * * This file is released under the GPL. 
*/ @@ -18,15 +19,17 @@ #include <linux/crypto.h> #include <linux/workqueue.h> #include <linux/backing-dev.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/scatterlist.h> #include <asm/page.h> #include <asm/unaligned.h> +#include <crypto/hash.h> +#include <crypto/md5.h> +#include <crypto/algapi.h> -#include "dm.h" +#include <linux/device-mapper.h> #define DM_MSG_PREFIX "crypt" -#define MESG_STR(x) x, sizeof(x) /* * context holding the current state of a multi-part conversion @@ -35,32 +38,34 @@ struct convert_context { struct completion restart; struct bio *bio_in; struct bio *bio_out; - unsigned int offset_in; - unsigned int offset_out; - unsigned int idx_in; - unsigned int idx_out; - sector_t sector; - atomic_t pending; + struct bvec_iter iter_in; + struct bvec_iter iter_out; + sector_t cc_sector; + atomic_t cc_pending; + struct ablkcipher_request *req; }; /* * per bio private data */ struct dm_crypt_io { - struct dm_target *target; + struct crypt_config *cc; struct bio *base_bio; struct work_struct work; struct convert_context ctx; - atomic_t pending; + atomic_t io_pending; int error; sector_t sector; + struct dm_crypt_io *base_io; }; struct dm_crypt_request { + struct convert_context *ctx; struct scatterlist sg_in; struct scatterlist sg_out; + sector_t iv_sector; }; struct crypt_config; @@ -69,8 +74,34 @@ struct crypt_iv_operations { int (*ctr)(struct crypt_config *cc, struct dm_target *ti, const char *opts); void (*dtr)(struct crypt_config *cc); - const char *(*status)(struct crypt_config *cc); - int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector); + int (*init)(struct crypt_config *cc); + int (*wipe)(struct crypt_config *cc); + int (*generator)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); + int (*post)(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq); +}; + +struct iv_essiv_private { + struct crypto_hash *hash_tfm; + u8 *salt; +}; + +struct iv_benbi_private { + int shift; +}; + +#define LMK_SEED_SIZE 64 /* hash + 0 */ +struct iv_lmk_private { + struct crypto_shash *hash_tfm; + u8 *seed; +}; + +#define TCW_WHITENING_SIZE 16 +struct iv_tcw_private { + struct crypto_shash *crc32_tfm; + u8 *iv_seed; + u8 *whitening; }; /* @@ -78,6 +109,10 @@ struct crypt_iv_operations { * and encrypts / decrypts at the same time. */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID }; + +/* + * The fields in here must be read only after initialization. + */ struct crypt_config { struct dm_dev *dev; sector_t start; @@ -93,20 +128,25 @@ struct crypt_config { struct workqueue_struct *io_queue; struct workqueue_struct *crypt_queue; - wait_queue_head_t writeq; - /* - * crypto related data - */ + char *cipher; + char *cipher_string; + struct crypt_iv_operations *iv_gen_ops; - char *iv_mode; union { - struct crypto_cipher *essiv_tfm; - int benbi_shift; + struct iv_essiv_private essiv; + struct iv_benbi_private benbi; + struct iv_lmk_private lmk; + struct iv_tcw_private tcw; } iv_gen_private; sector_t iv_offset; unsigned int iv_size; + /* ESSIV: struct crypto_cipher *essiv_tfm */ + void *iv_private; + struct crypto_ablkcipher **tfms; + unsigned tfms_count; + /* * Layout of each crypto request: * @@ -121,24 +161,30 @@ struct crypt_config { * correctly aligned. 
*/ unsigned int dmreq_start; - struct ablkcipher_request *req; - char cipher[CRYPTO_MAX_ALG_NAME]; - char chainmode[CRYPTO_MAX_ALG_NAME]; - struct crypto_ablkcipher *tfm; unsigned long flags; unsigned int key_size; + unsigned int key_parts; /* independent parts in key buffer */ + unsigned int key_extra_size; /* additional keys length */ u8 key[0]; }; #define MIN_IOS 16 #define MIN_POOL_PAGES 32 -#define MIN_BIO_PAGES 8 static struct kmem_cache *_crypt_io_pool; static void clone_init(struct dm_crypt_io *, struct bio *); static void kcryptd_queue_crypt(struct dm_crypt_io *io); +static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq); + +/* + * Use this to access cipher attributes that are the same for each CPU. + */ +static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) +{ + return cc->tfms[0]; +} /* * Different IV generation algorithms: @@ -146,6 +192,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); * plain: the initial vector is the 32-bit little-endian version of the sector * number, padded with zeros if necessary. * + * plain64: the initial vector is the 64-bit little-endian version of the sector + * number, padded with zeros if necessary. + * * essiv: "encrypted sector|salt initial vector", the sector number is * encrypted with the bulk cipher using a salt as key. The salt * should be derived from the bulk cipher's key via hashing. @@ -156,107 +205,213 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io); * null: the initial vector is always zero. Provides compatibility with * obsolete loop_fish2 devices. Do not use for new devices. * + * lmk: Compatible implementation of the block chaining mode used + * by the Loop-AES block device encryption system + * designed by Jari Ruusu. See http://loop-aes.sourceforge.net/ + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from the sector number, the data and + * optionally extra IV seed. + * This means that after decryption the first block + * of sector must be tweaked according to decrypted data. + * Loop-AES can use three encryption schemes: + * version 1: is plain aes-cbc mode + * version 2: uses 64 multikey scheme with lmk IV generator + * version 3: the same as version 2 with additional IV seed + * (it uses 65 keys, last key is used as IV seed) + * + * tcw: Compatible implementation of the block chaining mode used + * by the TrueCrypt device encryption system (prior to version 4.1). + * For more info see: http://www.truecrypt.org + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from initial key and the sector number. + * In addition, whitening value is applied on every sector, whitening + * is calculated from initial key, sector number and mixed using CRC32. + * Note that this encryption scheme is vulnerable to watermarking attacks + * and should be used for old compatible containers access only. 
+ * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ -static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 0, cc->iv_size); - *(u32 *)iv = cpu_to_le32(sector & 0xffffffff); + *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff); return 0; } -static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, - const char *opts) +static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { - struct crypto_cipher *essiv_tfm; - struct crypto_hash *hash_tfm; + memset(iv, 0, cc->iv_size); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); + + return 0; +} + +/* Initialise ESSIV - compute salt but no local memory allocations */ +static int crypt_iv_essiv_init(struct crypt_config *cc) +{ + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; struct hash_desc desc; struct scatterlist sg; - unsigned int saltsize; - u8 *salt; + struct crypto_cipher *essiv_tfm; int err; - if (opts == NULL) { - ti->error = "Digest algorithm missing for ESSIV mode"; - return -EINVAL; - } - - /* Hash the cipher key with the given hash algorithm */ - hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(hash_tfm)) { - ti->error = "Error initializing ESSIV hash"; - return PTR_ERR(hash_tfm); - } - - saltsize = crypto_hash_digestsize(hash_tfm); - salt = kmalloc(saltsize, GFP_KERNEL); - if (salt == NULL) { - ti->error = "Error kmallocing salt storage in ESSIV"; - crypto_free_hash(hash_tfm); - return -ENOMEM; - } - sg_init_one(&sg, cc->key, cc->key_size); - desc.tfm = hash_tfm; + desc.tfm = essiv->hash_tfm; desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - err = crypto_hash_digest(&desc, &sg, cc->key_size, salt); - crypto_free_hash(hash_tfm); - if (err) { - ti->error = "Error calculating hash in ESSIV"; - kfree(salt); + err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt); + if (err) return err; - } + + essiv_tfm = cc->iv_private; + + err = crypto_cipher_setkey(essiv_tfm, essiv->salt, + crypto_hash_digestsize(essiv->hash_tfm)); + if (err) + return err; + + return 0; +} + +/* Wipe salt and reset key derived from volume key */ +static int crypt_iv_essiv_wipe(struct crypt_config *cc) +{ + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; + unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm); + struct crypto_cipher *essiv_tfm; + int r, err = 0; + + memset(essiv->salt, 0, salt_size); + + essiv_tfm = cc->iv_private; + r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size); + if (r) + err = r; + + return err; +} + +/* Set up per cpu cipher state */ +static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc, + struct dm_target *ti, + u8 *salt, unsigned saltsize) +{ + struct crypto_cipher *essiv_tfm; + int err; /* Setup the essiv_tfm with the given salt */ essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(essiv_tfm)) { ti->error = "Error allocating crypto tfm for ESSIV"; - kfree(salt); - return PTR_ERR(essiv_tfm); + return essiv_tfm; } + if (crypto_cipher_blocksize(essiv_tfm) != - crypto_ablkcipher_ivsize(cc->tfm)) { + crypto_ablkcipher_ivsize(any_tfm(cc))) { ti->error = "Block size of ESSIV cipher does " "not match IV size of block cipher"; crypto_free_cipher(essiv_tfm); - kfree(salt); - return -EINVAL; + return ERR_PTR(-EINVAL); } + err = crypto_cipher_setkey(essiv_tfm, salt, saltsize); if (err) { 
ti->error = "Failed to set key for ESSIV cipher"; crypto_free_cipher(essiv_tfm); - kfree(salt); - return err; + return ERR_PTR(err); } - kfree(salt); - cc->iv_gen_private.essiv_tfm = essiv_tfm; - return 0; + return essiv_tfm; } static void crypt_iv_essiv_dtr(struct crypt_config *cc) { - crypto_free_cipher(cc->iv_gen_private.essiv_tfm); - cc->iv_gen_private.essiv_tfm = NULL; + struct crypto_cipher *essiv_tfm; + struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv; + + crypto_free_hash(essiv->hash_tfm); + essiv->hash_tfm = NULL; + + kzfree(essiv->salt); + essiv->salt = NULL; + + essiv_tfm = cc->iv_private; + + if (essiv_tfm) + crypto_free_cipher(essiv_tfm); + + cc->iv_private = NULL; } -static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) { + struct crypto_cipher *essiv_tfm = NULL; + struct crypto_hash *hash_tfm = NULL; + u8 *salt = NULL; + int err; + + if (!opts) { + ti->error = "Digest algorithm missing for ESSIV mode"; + return -EINVAL; + } + + /* Allocate hash algorithm */ + hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(hash_tfm)) { + ti->error = "Error initializing ESSIV hash"; + err = PTR_ERR(hash_tfm); + goto bad; + } + + salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL); + if (!salt) { + ti->error = "Error kmallocing salt storage in ESSIV"; + err = -ENOMEM; + goto bad; + } + + cc->iv_gen_private.essiv.salt = salt; + cc->iv_gen_private.essiv.hash_tfm = hash_tfm; + + essiv_tfm = setup_essiv_cpu(cc, ti, salt, + crypto_hash_digestsize(hash_tfm)); + if (IS_ERR(essiv_tfm)) { + crypt_iv_essiv_dtr(cc); + return PTR_ERR(essiv_tfm); + } + cc->iv_private = essiv_tfm; + + return 0; + +bad: + if (hash_tfm && !IS_ERR(hash_tfm)) + crypto_free_hash(hash_tfm); + kfree(salt); + return err; +} + +static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + struct crypto_cipher *essiv_tfm = cc->iv_private; + memset(iv, 0, cc->iv_size); - *(u64 *)iv = cpu_to_le64(sector); - crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector); + crypto_cipher_encrypt_one(essiv_tfm, iv, iv); + return 0; } static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_ablkcipher_blocksize(cc->tfm); + unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc)); int log = ilog2(bs); /* we need to calculate how far we must shift the sector count @@ -272,7 +427,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - cc->iv_gen_private.benbi_shift = 9 - log; + cc->iv_gen_private.benbi.shift = 9 - log; return 0; } @@ -281,32 +436,337 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc) { } -static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { __be64 val; memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */ - val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1); + val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1); put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64))); return 0; } -static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector) +static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) { memset(iv, 
0, cc->iv_size); return 0; } +static void crypt_iv_lmk_dtr(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) + crypto_free_shash(lmk->hash_tfm); + lmk->hash_tfm = NULL; + + kzfree(lmk->seed); + lmk->seed = NULL; +} + +static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + lmk->hash_tfm = crypto_alloc_shash("md5", 0, 0); + if (IS_ERR(lmk->hash_tfm)) { + ti->error = "Error initializing LMK hash"; + return PTR_ERR(lmk->hash_tfm); + } + + /* No seed in LMK version 2 */ + if (cc->key_parts == cc->tfms_count) { + lmk->seed = NULL; + return 0; + } + + lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); + if (!lmk->seed) { + crypt_iv_lmk_dtr(cc); + ti->error = "Error kmallocing seed storage in LMK"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_lmk_init(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + int subkey_size = cc->key_size / cc->key_parts; + + /* LMK seed is on the position of LMK_KEYS + 1 key */ + if (lmk->seed) + memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), + crypto_shash_digestsize(lmk->hash_tfm)); + + return 0; +} + +static int crypt_iv_lmk_wipe(struct crypt_config *cc) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + + if (lmk->seed) + memset(lmk->seed, 0, LMK_SEED_SIZE); + + return 0; +} + +static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; + struct { + struct shash_desc desc; + char ctx[crypto_shash_descsize(lmk->hash_tfm)]; + } sdesc; + struct md5_state md5state; + __le32 buf[4]; + int i, r; + + sdesc.desc.tfm = lmk->hash_tfm; + sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(&sdesc.desc); + if (r) + return r; + + if (lmk->seed) { + r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); + if (r) + return r; + } + + /* Sector is always 512B, block size 16, add data of blocks 1-31 */ + r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); + if (r) + return r; + + /* Sector is cropped to 56 bits here */ + buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); + buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); + buf[2] = cpu_to_le32(4024); + buf[3] = 0; + r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); + if (r) + return r; + + /* No MD5 padding here */ + r = crypto_shash_export(&sdesc.desc, &md5state); + if (r) + return r; + + for (i = 0; i < MD5_HASH_WORDS; i++) + __cpu_to_le32s(&md5state.hash[i]); + memcpy(iv, &md5state.hash, cc->iv_size); + + return 0; +} + +static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *src; + int r = 0; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + src = kmap_atomic(sg_page(&dmreq->sg_in)); + r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); + kunmap_atomic(src); + } else + memset(iv, 0, cc->iv_size); + + return r; +} + +static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) + return 0; + + dst = kmap_atomic(sg_page(&dmreq->sg_out)); + r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); + + /* Tweak the first block of plaintext sector */ + if (!r) + crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); + + 
kunmap_atomic(dst); + return r; +} + +static void crypt_iv_tcw_dtr(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + kzfree(tcw->iv_seed); + tcw->iv_seed = NULL; + kzfree(tcw->whitening); + tcw->whitening = NULL; + + if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) + crypto_free_shash(tcw->crc32_tfm); + tcw->crc32_tfm = NULL; +} + +static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { + ti->error = "Wrong key size for TCW"; + return -EINVAL; + } + + tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(tcw->crc32_tfm)) { + ti->error = "Error initializing CRC32 in TCW"; + return PTR_ERR(tcw->crc32_tfm); + } + + tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); + tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL); + if (!tcw->iv_seed || !tcw->whitening) { + crypt_iv_tcw_dtr(cc); + ti->error = "Error allocating seed storage in TCW"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_tcw_init(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE; + + memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size); + memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size], + TCW_WHITENING_SIZE); + + return 0; +} + +static int crypt_iv_tcw_wipe(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + memset(tcw->iv_seed, 0, cc->iv_size); + memset(tcw->whitening, 0, TCW_WHITENING_SIZE); + + return 0; +} + +static int crypt_iv_tcw_whitening(struct crypt_config *cc, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + u64 sector = cpu_to_le64((u64)dmreq->iv_sector); + u8 buf[TCW_WHITENING_SIZE]; + struct { + struct shash_desc desc; + char ctx[crypto_shash_descsize(tcw->crc32_tfm)]; + } sdesc; + int i, r; + + /* xor whitening with sector number */ + memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE); + crypto_xor(buf, (u8 *)&sector, 8); + crypto_xor(&buf[8], (u8 *)&sector, 8); + + /* calculate crc32 for every 32bit part and xor it */ + sdesc.desc.tfm = tcw->crc32_tfm; + sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + for (i = 0; i < 4; i++) { + r = crypto_shash_init(&sdesc.desc); + if (r) + goto out; + r = crypto_shash_update(&sdesc.desc, &buf[i * 4], 4); + if (r) + goto out; + r = crypto_shash_final(&sdesc.desc, &buf[i * 4]); + if (r) + goto out; + } + crypto_xor(&buf[0], &buf[12], 4); + crypto_xor(&buf[4], &buf[8], 4); + + /* apply whitening (8 bytes) to whole sector */ + for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) + crypto_xor(data + i * 8, buf, 8); +out: + memset(buf, 0, sizeof(buf)); + return r; +} + +static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + u64 sector = cpu_to_le64((u64)dmreq->iv_sector); + u8 *src; + int r = 0; + + /* Remove whitening from ciphertext */ + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { + src = kmap_atomic(sg_page(&dmreq->sg_in)); + r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset); + kunmap_atomic(src); + } + + /* Calculate IV */ + memcpy(iv, tcw->iv_seed, cc->iv_size); + crypto_xor(iv, (u8 *)&sector, 8); + if (cc->iv_size > 8) + crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8); + + return r; +} + +static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, + struct
dm_crypt_request *dmreq) +{ + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) + return 0; + + /* Apply whitening on ciphertext */ + dst = kmap_atomic(sg_page(&dmreq->sg_out)); + r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset); + kunmap_atomic(dst); + + return r; +} + static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; +static struct crypt_iv_operations crypt_iv_plain64_ops = { + .generator = crypt_iv_plain64_gen +}; + static struct crypt_iv_operations crypt_iv_essiv_ops = { .ctr = crypt_iv_essiv_ctr, .dtr = crypt_iv_essiv_dtr, + .init = crypt_iv_essiv_init, + .wipe = crypt_iv_essiv_wipe, .generator = crypt_iv_essiv_gen }; @@ -320,6 +780,24 @@ static struct crypt_iv_operations crypt_iv_null_ops = { .generator = crypt_iv_null_gen }; +static struct crypt_iv_operations crypt_iv_lmk_ops = { + .ctr = crypt_iv_lmk_ctr, + .dtr = crypt_iv_lmk_dtr, + .init = crypt_iv_lmk_init, + .wipe = crypt_iv_lmk_wipe, + .generator = crypt_iv_lmk_gen, + .post = crypt_iv_lmk_post +}; + +static struct crypt_iv_operations crypt_iv_tcw_ops = { + .ctr = crypt_iv_tcw_ctr, + .dtr = crypt_iv_tcw_dtr, + .init = crypt_iv_tcw_init, + .wipe = crypt_iv_tcw_wipe, + .generator = crypt_iv_tcw_gen, + .post = crypt_iv_tcw_post +}; + static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, @@ -327,51 +805,61 @@ static void crypt_convert_init(struct crypt_config *cc, { ctx->bio_in = bio_in; ctx->bio_out = bio_out; - ctx->offset_in = 0; - ctx->offset_out = 0; - ctx->idx_in = bio_in ? bio_in->bi_idx : 0; - ctx->idx_out = bio_out ? bio_out->bi_idx : 0; - ctx->sector = sector + cc->iv_offset; + if (bio_in) + ctx->iter_in = bio_in->bi_iter; + if (bio_out) + ctx->iter_out = bio_out->bi_iter; + ctx->cc_sector = sector + cc->iv_offset; init_completion(&ctx->restart); - atomic_set(&ctx->pending, 1); +} + +static struct dm_crypt_request *dmreq_of_req(struct crypt_config *cc, + struct ablkcipher_request *req) +{ + return (struct dm_crypt_request *)((char *)req + cc->dmreq_start); +} + +static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start); +} + +static u8 *iv_of_dmreq(struct crypt_config *cc, + struct dm_crypt_request *dmreq) +{ + return (u8 *)ALIGN((unsigned long)(dmreq + 1), + crypto_ablkcipher_alignmask(any_tfm(cc)) + 1); } static int crypt_convert_block(struct crypt_config *cc, struct convert_context *ctx, struct ablkcipher_request *req) { - struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in); - struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out); + struct bio_vec bv_in = bio_iter_iovec(ctx->bio_in, ctx->iter_in); + struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); struct dm_crypt_request *dmreq; u8 *iv; - int r = 0; + int r; - dmreq = (struct dm_crypt_request *)((char *)req + cc->dmreq_start); - iv = (u8 *)ALIGN((unsigned long)(dmreq + 1), - crypto_ablkcipher_alignmask(cc->tfm) + 1); + dmreq = dmreq_of_req(cc, req); + iv = iv_of_dmreq(cc, dmreq); + dmreq->iv_sector = ctx->cc_sector; + dmreq->ctx = ctx; sg_init_table(&dmreq->sg_in, 1); - sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT, - bv_in->bv_offset + ctx->offset_in); + sg_set_page(&dmreq->sg_in, bv_in.bv_page, 1 << SECTOR_SHIFT, + bv_in.bv_offset); sg_init_table(&dmreq->sg_out, 1); - sg_set_page(&dmreq->sg_out, bv_out->bv_page, 1 << SECTOR_SHIFT, - 
bv_out->bv_offset + ctx->offset_out); - - ctx->offset_in += 1 << SECTOR_SHIFT; - if (ctx->offset_in >= bv_in->bv_len) { - ctx->offset_in = 0; - ctx->idx_in++; - } + sg_set_page(&dmreq->sg_out, bv_out.bv_page, 1 << SECTOR_SHIFT, + bv_out.bv_offset); - ctx->offset_out += 1 << SECTOR_SHIFT; - if (ctx->offset_out >= bv_out->bv_len) { - ctx->offset_out = 0; - ctx->idx_out++; - } + bio_advance_iter(ctx->bio_in, &ctx->iter_in, 1 << SECTOR_SHIFT); + bio_advance_iter(ctx->bio_out, &ctx->iter_out, 1 << SECTOR_SHIFT); if (cc->iv_gen_ops) { - r = cc->iv_gen_ops->generator(cc, iv, ctx->sector); + r = cc->iv_gen_ops->generator(cc, iv, dmreq); if (r < 0) return r; } @@ -384,20 +872,27 @@ static int crypt_convert_block(struct crypt_config *cc, else r = crypto_ablkcipher_decrypt(req); + if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) + r = cc->iv_gen_ops->post(cc, iv, dmreq); + return r; } static void kcryptd_async_done(struct crypto_async_request *async_req, int error); + static void crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { - if (!cc->req) - cc->req = mempool_alloc(cc->req_pool, GFP_NOIO); - ablkcipher_request_set_tfm(cc->req, cc->tfm); - ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG | - CRYPTO_TFM_REQ_MAY_SLEEP, - kcryptd_async_done, ctx); + unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); + + if (!ctx->req) + ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO); + + ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]); + ablkcipher_request_set_callback(ctx->req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + kcryptd_async_done, dmreq_of_req(cc, ctx->req)); } /* @@ -408,36 +903,37 @@ static int crypt_convert(struct crypt_config *cc, { int r; - while(ctx->idx_in < ctx->bio_in->bi_vcnt && - ctx->idx_out < ctx->bio_out->bi_vcnt) { + atomic_set(&ctx->cc_pending, 1); + + while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { crypt_alloc_req(cc, ctx); - atomic_inc(&ctx->pending); + atomic_inc(&ctx->cc_pending); - r = crypt_convert_block(cc, ctx, cc->req); + r = crypt_convert_block(cc, ctx, ctx->req); switch (r) { /* async */ case -EBUSY: wait_for_completion(&ctx->restart); - INIT_COMPLETION(ctx->restart); + reinit_completion(&ctx->restart); /* fall through*/ case -EINPROGRESS: - cc->req = NULL; - ctx->sector++; + ctx->req = NULL; + ctx->cc_sector++; continue; /* sync */ case 0: - atomic_dec(&ctx->pending); - ctx->sector++; + atomic_dec(&ctx->cc_pending); + ctx->cc_sector++; cond_resched(); continue; /* error */ default: - atomic_dec(&ctx->pending); + atomic_dec(&ctx->cc_pending); return r; } } @@ -445,22 +941,16 @@ static int crypt_convert(struct crypt_config *cc, return 0; } -static void dm_crypt_bio_destructor(struct bio *bio) -{ - struct dm_crypt_io *io = bio->bi_private; - struct crypt_config *cc = io->target->private; - - bio_free(bio, cc->bs); -} - /* * Generate a new unfragmented bio with the given size * This should never violate the device limitations - * May return a smaller bio when running out of pages + * May return a smaller bio when running out of pages, indicated by + * *out_of_pages set to 1. 
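+ * (Editorial note, not part of the original patch: when a partial bio is
+ * returned, the write path in kcryptd_crypt_write_convert() below loops
+ * and allocates further clones for the remaining data.)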
*/ -static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) +static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, + unsigned *out_of_pages) { - struct crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; struct bio *clone; unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; @@ -472,19 +962,21 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) return NULL; clone_init(io, clone); + *out_of_pages = 0; for (i = 0; i < nr_iovecs; i++) { page = mempool_alloc(cc->page_pool, gfp_mask); - if (!page) + if (!page) { + *out_of_pages = 1; break; + } /* - * if additional pages cannot be allocated without waiting, - * return a partially allocated bio, the caller will then try - * to allocate additional bios while submitting this partial bio + * If additional pages cannot be allocated without waiting, + * return a partially-allocated bio. The caller will then try + * to allocate more bios while submitting this partial bio. */ - if (i == (MIN_BIO_PAGES - 1)) - gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; + gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; len = (size > PAGE_SIZE) ? PAGE_SIZE : size; @@ -496,7 +988,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) size -= len; } - if (!clone->bi_size) { + if (!clone->bi_iter.bi_size) { bio_put(clone); return NULL; } @@ -509,27 +1001,61 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) unsigned int i; struct bio_vec *bv; - for (i = 0; i < clone->bi_vcnt; i++) { - bv = bio_iovec_idx(clone, i); + bio_for_each_segment_all(bv, clone, i) { BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, cc->page_pool); bv->bv_page = NULL; } } +static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, + struct bio *bio, sector_t sector) +{ + struct dm_crypt_io *io; + + io = mempool_alloc(cc->io_pool, GFP_NOIO); + io->cc = cc; + io->base_bio = bio; + io->sector = sector; + io->error = 0; + io->base_io = NULL; + io->ctx.req = NULL; + atomic_set(&io->io_pending, 0); + + return io; +} + +static void crypt_inc_pending(struct dm_crypt_io *io) +{ + atomic_inc(&io->io_pending); +} + /* * One of the bios was finished. Check for completion of * the whole request and correctly clean up the buffer. + * If base_io is set, wait for the last fragment to complete. */ static void crypt_dec_pending(struct dm_crypt_io *io) { - struct crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; + struct bio *base_bio = io->base_bio; + struct dm_crypt_io *base_io = io->base_io; + int error = io->error; - if (!atomic_dec_and_test(&io->pending)) + if (!atomic_dec_and_test(&io->io_pending)) return; - bio_endio(io->base_bio, io->error); + if (io->ctx.req) + mempool_free(io->ctx.req, cc->req_pool); mempool_free(io, cc->io_pool); + + if (likely(!base_io)) + bio_endio(base_bio, error); + else { + if (error && !base_io->error) + base_io->error = error; + crypt_dec_pending(base_io); + } } /* @@ -545,11 +1071,14 @@ static void crypt_dec_pending(struct dm_crypt_io *io) * They must be separated as otherwise the final stages could be * starved by new requests which can block in the first stages due * to memory allocation. + * + * The work is done per CPU global for all dm-crypt instances. + * They should not depend on each other and do not block. 
*/ static void crypt_endio(struct bio *clone, int error) { struct dm_crypt_io *io = clone->bi_private; - struct crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; unsigned rw = bio_data_dir(clone); if (unlikely(!bio_flagged(clone, BIO_UPTODATE) && !error)) @@ -576,180 +1105,203 @@ static void crypt_endio(struct bio *clone, int error) static void clone_init(struct dm_crypt_io *io, struct bio *clone) { - struct crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; clone->bi_rw = io->base_bio->bi_rw; - clone->bi_destructor = dm_crypt_bio_destructor; } -static void kcryptd_io_read(struct dm_crypt_io *io) +static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) { - struct crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; struct bio *base_bio = io->base_bio; struct bio *clone; - atomic_inc(&io->pending); - /* * The block layer might modify the bvec array, so always * copy the required bvecs because we need the original * one in order to decrypt the whole bio data *afterwards*. */ - clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs); - if (unlikely(!clone)) { - io->error = -ENOMEM; - crypt_dec_pending(io); - return; - } + clone = bio_clone_bioset(base_bio, gfp, cc->bs); + if (!clone) + return 1; + + crypt_inc_pending(io); clone_init(io, clone); - clone->bi_idx = 0; - clone->bi_vcnt = bio_segments(base_bio); - clone->bi_size = base_bio->bi_size; - clone->bi_sector = cc->start + io->sector; - memcpy(clone->bi_io_vec, bio_iovec(base_bio), - sizeof(struct bio_vec) * clone->bi_vcnt); + clone->bi_iter.bi_sector = cc->start + io->sector; generic_make_request(clone); + return 0; } static void kcryptd_io_write(struct dm_crypt_io *io) { struct bio *clone = io->ctx.bio_out; - struct crypt_config *cc = io->target->private; - generic_make_request(clone); - wake_up(&cc->writeq); } static void kcryptd_io(struct work_struct *work) { struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); - if (bio_data_dir(io->base_bio) == READ) - kcryptd_io_read(io); - else + if (bio_data_dir(io->base_bio) == READ) { + crypt_inc_pending(io); + if (kcryptd_io_read(io, GFP_NOIO)) + io->error = -ENOMEM; + crypt_dec_pending(io); + } else kcryptd_io_write(io); } static void kcryptd_queue_io(struct dm_crypt_io *io) { - struct crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; INIT_WORK(&io->work, kcryptd_io); queue_work(cc->io_queue, &io->work); } -static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, - int error, int async) +static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) { struct bio *clone = io->ctx.bio_out; - struct crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; - if (unlikely(error < 0)) { + if (unlikely(io->error < 0)) { crypt_free_buffer_pages(cc, clone); bio_put(clone); - io->error = -EIO; + crypt_dec_pending(io); return; } /* crypt_convert should have filled the clone bio */ - BUG_ON(io->ctx.idx_out < clone->bi_vcnt); + BUG_ON(io->ctx.iter_out.bi_size); - clone->bi_sector = cc->start + io->sector; - io->sector += bio_sectors(clone); + clone->bi_iter.bi_sector = cc->start + io->sector; if (async) kcryptd_queue_io(io); - else { - atomic_inc(&io->pending); + else generic_make_request(clone); - } } -static void kcryptd_crypt_write_convert_loop(struct dm_crypt_io *io) +static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) { - struct 
crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; struct bio *clone; - unsigned remaining = io->base_bio->bi_size; + struct dm_crypt_io *new_io; + int crypt_finished; + unsigned out_of_pages = 0; + unsigned remaining = io->base_bio->bi_iter.bi_size; + sector_t sector = io->sector; int r; /* + * Prevent io from disappearing until this function completes. + */ + crypt_inc_pending(io); + crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector); + + /* * The allocated buffers can be smaller than the whole bio, * so repeat the whole process until all the data can be handled. */ while (remaining) { - clone = crypt_alloc_buffer(io, remaining); + clone = crypt_alloc_buffer(io, remaining, &out_of_pages); if (unlikely(!clone)) { io->error = -ENOMEM; - return; + break; } io->ctx.bio_out = clone; - io->ctx.idx_out = 0; + io->ctx.iter_out = clone->bi_iter; - remaining -= clone->bi_size; + remaining -= clone->bi_iter.bi_size; + sector += bio_sectors(clone); + + crypt_inc_pending(io); r = crypt_convert(cc, &io->ctx); + if (r < 0) + io->error = -EIO; + + crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); + + /* Encryption was already finished, submit io now */ + if (crypt_finished) { + kcryptd_crypt_write_io_submit(io, 0); - if (atomic_dec_and_test(&io->ctx.pending)) { - /* processed, no running async crypto */ - kcryptd_crypt_write_io_submit(io, r, 0); + /* + * If there was an error, do not try next fragments. + * For async, error is processed in async handler. + */ if (unlikely(r < 0)) - return; - } else - atomic_inc(&io->pending); - - /* out of memory -> run queues */ - if (unlikely(remaining)) { - /* wait for async crypto then reinitialize pending */ - wait_event(cc->writeq, !atomic_read(&io->ctx.pending)); - atomic_set(&io->ctx.pending, 1); - congestion_wait(WRITE, HZ/100); - } - } -} + break; -static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) -{ - struct crypt_config *cc = io->target->private; + io->sector = sector; + } - /* - * Prevent io from disappearing until this function completes. - */ - atomic_inc(&io->pending); + /* + * Out of memory -> run queues + * But don't wait if split was due to the io size restriction + */ + if (unlikely(out_of_pages)) + congestion_wait(BLK_RW_ASYNC, HZ/100); - crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, io->sector); - kcryptd_crypt_write_convert_loop(io); + /* + * With async crypto it is unsafe to share the crypto context + * between fragments, so switch to a new dm_crypt_io structure. + */ + if (unlikely(!crypt_finished && remaining)) { + new_io = crypt_io_alloc(io->cc, io->base_bio, + sector); + crypt_inc_pending(new_io); + crypt_convert_init(cc, &new_io->ctx, NULL, + io->base_bio, sector); + new_io->ctx.iter_in = io->ctx.iter_in; + + /* + * Fragments after the first use the base_io + * pending count. 
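+ * (Editorial note, not part of the original patch: this defers completion
+ * of the original bio in crypt_dec_pending() until every fragment's
+ * dm_crypt_io has finished.)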
+ */ + if (!io->base_io) + new_io->base_io = io; + else { + new_io->base_io = io->base_io; + crypt_inc_pending(io->base_io); + crypt_dec_pending(io); + } + + io = new_io; + } + } crypt_dec_pending(io); } -static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error) +static void kcryptd_crypt_read_done(struct dm_crypt_io *io) { - if (unlikely(error < 0)) - io->error = -EIO; - crypt_dec_pending(io); } static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) { - struct crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; int r = 0; - atomic_inc(&io->pending); + crypt_inc_pending(io); crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, io->sector); r = crypt_convert(cc, &io->ctx); + if (r < 0) + io->error = -EIO; - if (atomic_dec_and_test(&io->ctx.pending)) - kcryptd_crypt_read_done(io, r); + if (atomic_dec_and_test(&io->ctx.cc_pending)) + kcryptd_crypt_read_done(io); crypt_dec_pending(io); } @@ -757,24 +1309,31 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) static void kcryptd_async_done(struct crypto_async_request *async_req, int error) { - struct convert_context *ctx = async_req->data; + struct dm_crypt_request *dmreq = async_req->data; + struct convert_context *ctx = dmreq->ctx; struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); - struct crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; if (error == -EINPROGRESS) { complete(&ctx->restart); return; } - mempool_free(ablkcipher_request_cast(async_req), cc->req_pool); + if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) + error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); + + if (error < 0) + io->error = -EIO; + + mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); - if (!atomic_dec_and_test(&ctx->pending)) + if (!atomic_dec_and_test(&ctx->cc_pending)) return; if (bio_data_dir(io->base_bio) == READ) - kcryptd_crypt_read_done(io, error); + kcryptd_crypt_read_done(io); else - kcryptd_crypt_write_io_submit(io, error, 1); + kcryptd_crypt_write_io_submit(io, 1); } static void kcryptd_crypt(struct work_struct *work) @@ -789,7 +1348,7 @@ static void kcryptd_crypt(struct work_struct *work) static void kcryptd_queue_crypt(struct dm_crypt_io *io) { - struct crypt_config *cc = io->target->private; + struct crypt_config *cc = io->cc; INIT_WORK(&io->work, kcryptd_crypt); queue_work(cc->crypt_queue, &io->work); @@ -801,7 +1360,6 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io) static int crypt_decode_key(u8 *key, char *hex, unsigned int size) { char buffer[3]; - char *endp; unsigned int i; buffer[2] = '\0'; @@ -810,9 +1368,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size) buffer[0] = *hex++; buffer[1] = *hex++; - key[i] = (u8)simple_strtoul(buffer, &endp, 16); - - if (endp != &buffer[2]) + if (kstrtou8(buffer, 16, &key[i])) return -EINVAL; } @@ -822,315 +1378,468 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size) return 0; } -/* - * Encode key into its hex representation - */ -static void crypt_encode_key(char *hex, u8 *key, unsigned int size) +static void crypt_free_tfms(struct crypt_config *cc) { - unsigned int i; + unsigned i; - for (i = 0; i < size; i++) { - sprintf(hex, "%02x", *key); - hex += 2; - key++; + if (!cc->tfms) + return; + + for (i = 0; i < cc->tfms_count; i++) + if (cc->tfms[i] && !IS_ERR(cc->tfms[i])) { + crypto_free_ablkcipher(cc->tfms[i]); + cc->tfms[i] = NULL; + } + + kfree(cc->tfms); + cc->tfms = NULL; +} + +static int crypt_alloc_tfms(struct crypt_config 
*cc, char *ciphermode) +{ + unsigned i; + int err; + + cc->tfms = kmalloc(cc->tfms_count * sizeof(struct crypto_ablkcipher *), + GFP_KERNEL); + if (!cc->tfms) + return -ENOMEM; + + for (i = 0; i < cc->tfms_count; i++) { + cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0); + if (IS_ERR(cc->tfms[i])) { + err = PTR_ERR(cc->tfms[i]); + crypt_free_tfms(cc); + return err; + } + } + + return 0; +} + +static int crypt_setkey_allcpus(struct crypt_config *cc) +{ + unsigned subkey_size; + int err = 0, i, r; + + /* Ignore extra keys (which are used for IV etc) */ + subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); + + for (i = 0; i < cc->tfms_count; i++) { + r = crypto_ablkcipher_setkey(cc->tfms[i], + cc->key + (i * subkey_size), + subkey_size); + if (r) + err = r; } + + return err; } static int crypt_set_key(struct crypt_config *cc, char *key) { - unsigned key_size = strlen(key) >> 1; + int r = -EINVAL; + int key_string_len = strlen(key); - if (cc->key_size && cc->key_size != key_size) - return -EINVAL; + /* The key size may not be changed. */ + if (cc->key_size != (key_string_len >> 1)) + goto out; - cc->key_size = key_size; /* initial settings */ + /* Hyphen (which gives a key_size of zero) means there is no key. */ + if (!cc->key_size && strcmp(key, "-")) + goto out; - if ((!key_size && strcmp(key, "-")) || - (key_size && crypt_decode_key(cc->key, key, key_size) < 0)) - return -EINVAL; + if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) + goto out; set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - return 0; + r = crypt_setkey_allcpus(cc); + +out: + /* Hex key string not needed after here, so wipe it. */ + memset(key, '0', key_string_len); + + return r; } static int crypt_wipe_key(struct crypt_config *cc) { clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return 0; + + return crypt_setkey_allcpus(cc); } -/* - * Construct an encryption mapping: - * <cipher> <key> <iv_offset> <dev_path> <start> - */ -static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +static void crypt_dtr(struct dm_target *ti) { - struct crypt_config *cc; - struct crypto_ablkcipher *tfm; - char *tmp; - char *cipher; - char *chainmode; - char *ivmode; - char *ivopts; - unsigned int key_size; - unsigned long long tmpll; + struct crypt_config *cc = ti->private; - if (argc != 5) { - ti->error = "Not enough arguments"; + ti->private = NULL; + + if (!cc) + return; + + if (cc->io_queue) + destroy_workqueue(cc->io_queue); + if (cc->crypt_queue) + destroy_workqueue(cc->crypt_queue); + + crypt_free_tfms(cc); + + if (cc->bs) + bioset_free(cc->bs); + + if (cc->page_pool) + mempool_destroy(cc->page_pool); + if (cc->req_pool) + mempool_destroy(cc->req_pool); + if (cc->io_pool) + mempool_destroy(cc->io_pool); + + if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) + cc->iv_gen_ops->dtr(cc); + + if (cc->dev) + dm_put_device(ti, cc->dev); + + kzfree(cc->cipher); + kzfree(cc->cipher_string); + + /* Must zero key material before freeing */ + kzfree(cc); +} + +static int crypt_ctr_cipher(struct dm_target *ti, + char *cipher_in, char *key) +{ + struct crypt_config *cc = ti->private; + char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; + char *cipher_api = NULL; + int ret = -EINVAL; + char dummy; + + /* Convert to crypto api definition? 
*/ + if (strchr(cipher_in, '(')) { + ti->error = "Bad cipher specification"; + return -EINVAL; + } + + cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL); + if (!cc->cipher_string) + goto bad_mem; + + /* + * Legacy dm-crypt cipher specification + * cipher[:keycount]-mode-iv:ivopts + */ + tmp = cipher_in; + keycount = strsep(&tmp, "-"); + cipher = strsep(&keycount, ":"); + + if (!keycount) + cc->tfms_count = 1; + else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 || + !is_power_of_2(cc->tfms_count)) { + ti->error = "Bad cipher key count specification"; return -EINVAL; } + cc->key_parts = cc->tfms_count; + cc->key_extra_size = 0; + + cc->cipher = kstrdup(cipher, GFP_KERNEL); + if (!cc->cipher) + goto bad_mem; - tmp = argv[0]; - cipher = strsep(&tmp, "-"); chainmode = strsep(&tmp, "-"); ivopts = strsep(&tmp, "-"); ivmode = strsep(&ivopts, ":"); if (tmp) - DMWARN("Unexpected additional cipher options"); - - key_size = strlen(argv[1]) >> 1; - - cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); - if (cc == NULL) { - ti->error = - "Cannot allocate transparent encryption context"; - return -ENOMEM; - } + DMWARN("Ignoring unexpected additional cipher options"); - if (crypt_set_key(cc, argv[1])) { - ti->error = "Error decoding key"; - goto bad_cipher; - } - - /* Compatiblity mode for old dm-crypt cipher strings */ - if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) { + /* + * For compatibility with the original dm-crypt mapping format, if + * only the cipher name is supplied, use cbc-plain. + */ + if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) { chainmode = "cbc"; ivmode = "plain"; } if (strcmp(chainmode, "ecb") && !ivmode) { - ti->error = "This chaining mode requires an IV mechanism"; - goto bad_cipher; + ti->error = "IV mechanism required"; + return -EINVAL; } - if (snprintf(cc->cipher, CRYPTO_MAX_ALG_NAME, "%s(%s)", - chainmode, cipher) >= CRYPTO_MAX_ALG_NAME) { - ti->error = "Chain mode + cipher name is too long"; - goto bad_cipher; + cipher_api = kmalloc(CRYPTO_MAX_ALG_NAME, GFP_KERNEL); + if (!cipher_api) + goto bad_mem; + + ret = snprintf(cipher_api, CRYPTO_MAX_ALG_NAME, + "%s(%s)", chainmode, cipher); + if (ret < 0) { + kfree(cipher_api); + goto bad_mem; } - tfm = crypto_alloc_ablkcipher(cc->cipher, 0, 0); - if (IS_ERR(tfm)) { + /* Allocate cipher */ + ret = crypt_alloc_tfms(cc, cipher_api); + if (ret < 0) { ti->error = "Error allocating crypto tfm"; - goto bad_cipher; + goto bad; } - strcpy(cc->cipher, cipher); - strcpy(cc->chainmode, chainmode); - cc->tfm = tfm; - - /* - * Choose ivmode. Valid modes: "plain", "essiv:<esshash>", "benbi". - * See comments at iv code - */ + /* Initialize IV */ + cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); + if (cc->iv_size) + /* at least a 64 bit sector number should fit in our buffer */ + cc->iv_size = max(cc->iv_size, + (unsigned int)(sizeof(u64) / sizeof(u8))); + else if (ivmode) { + DMWARN("Selected cipher does not support IVs"); + ivmode = NULL; + } + /* Choose ivmode, see comments at iv code. 
*/ if (ivmode == NULL) cc->iv_gen_ops = NULL; else if (strcmp(ivmode, "plain") == 0) cc->iv_gen_ops = &crypt_iv_plain_ops; + else if (strcmp(ivmode, "plain64") == 0) + cc->iv_gen_ops = &crypt_iv_plain64_ops; else if (strcmp(ivmode, "essiv") == 0) cc->iv_gen_ops = &crypt_iv_essiv_ops; else if (strcmp(ivmode, "benbi") == 0) cc->iv_gen_ops = &crypt_iv_benbi_ops; else if (strcmp(ivmode, "null") == 0) cc->iv_gen_ops = &crypt_iv_null_ops; - else { + else if (strcmp(ivmode, "lmk") == 0) { + cc->iv_gen_ops = &crypt_iv_lmk_ops; + /* + * Version 2 and 3 is recognised according + * to length of provided multi-key string. + * If present (version 3), last key is used as IV seed. + * All keys (including IV seed) are always the same size. + */ + if (cc->key_size % cc->key_parts) { + cc->key_parts++; + cc->key_extra_size = cc->key_size / cc->key_parts; + } + } else if (strcmp(ivmode, "tcw") == 0) { + cc->iv_gen_ops = &crypt_iv_tcw_ops; + cc->key_parts += 2; /* IV + whitening */ + cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; + } else { + ret = -EINVAL; ti->error = "Invalid IV mode"; - goto bad_ivmode; + goto bad; } - if (cc->iv_gen_ops && cc->iv_gen_ops->ctr && - cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0) - goto bad_ivmode; + /* Initialize and set key */ + ret = crypt_set_key(cc, key); + if (ret < 0) { + ti->error = "Error decoding and setting key"; + goto bad; + } - cc->iv_size = crypto_ablkcipher_ivsize(tfm); - if (cc->iv_size) - /* at least a 64 bit sector number should fit in our buffer */ - cc->iv_size = max(cc->iv_size, - (unsigned int)(sizeof(u64) / sizeof(u8))); - else { - if (cc->iv_gen_ops) { - DMWARN("Selected cipher does not support IVs"); - if (cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); - cc->iv_gen_ops = NULL; + /* Allocate IV */ + if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { + ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); + if (ret < 0) { + ti->error = "Error creating IV"; + goto bad; } } + /* Initialize IV (set keys for ESSIV etc) */ + if (cc->iv_gen_ops && cc->iv_gen_ops->init) { + ret = cc->iv_gen_ops->init(cc); + if (ret < 0) { + ti->error = "Error initialising IV"; + goto bad; + } + } + + ret = 0; +bad: + kfree(cipher_api); + return ret; + +bad_mem: + ti->error = "Cannot allocate cipher strings"; + return -ENOMEM; +} + +/* + * Construct an encryption mapping: + * <cipher> <key> <iv_offset> <dev_path> <start> + */ +static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct crypt_config *cc; + unsigned int key_size, opt_params; + unsigned long long tmpll; + int ret; + struct dm_arg_set as; + const char *opt_string; + char dummy; + + static struct dm_arg _args[] = { + {0, 1, "Invalid number of feature args"}, + }; + + if (argc < 5) { + ti->error = "Not enough arguments"; + return -EINVAL; + } + + key_size = strlen(argv[1]) >> 1; + + cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); + if (!cc) { + ti->error = "Cannot allocate encryption context"; + return -ENOMEM; + } + cc->key_size = key_size; + + ti->private = cc; + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; + + ret = -ENOMEM; cc->io_pool = mempool_create_slab_pool(MIN_IOS, _crypt_io_pool); if (!cc->io_pool) { ti->error = "Cannot allocate crypt io mempool"; - goto bad_slab_pool; + goto bad; } cc->dmreq_start = sizeof(struct ablkcipher_request); - cc->dmreq_start += crypto_ablkcipher_reqsize(tfm); + cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += 
crypto_ablkcipher_alignmask(tfm) & + cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & ~(crypto_tfm_ctx_alignment() - 1); cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + sizeof(struct dm_crypt_request) + cc->iv_size); if (!cc->req_pool) { ti->error = "Cannot allocate crypt request mempool"; - goto bad_req_pool; + goto bad; } - cc->req = NULL; cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; - goto bad_page_pool; + goto bad; } - cc->bs = bioset_create(MIN_IOS, MIN_IOS); + cc->bs = bioset_create(MIN_IOS, 0); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; - goto bad_bs; + goto bad; } - if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) { - ti->error = "Error setting key"; - goto bad_device; - } - - if (sscanf(argv[2], "%llu", &tmpll) != 1) { + ret = -EINVAL; + if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid iv_offset sector"; - goto bad_device; + goto bad; } cc->iv_offset = tmpll; - if (sscanf(argv[4], "%llu", &tmpll) != 1) { + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev)) { + ti->error = "Device lookup failed"; + goto bad; + } + + if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid device sector"; - goto bad_device; + goto bad; } cc->start = tmpll; - if (dm_get_device(ti, argv[3], cc->start, ti->len, - dm_table_get_mode(ti->table), &cc->dev)) { - ti->error = "Device lookup failed"; - goto bad_device; - } + argv += 5; + argc -= 5; + + /* Optional parameters */ + if (argc) { + as.argc = argc; + as.argv = argv; - if (ivmode && cc->iv_gen_ops) { - if (ivopts) - *(ivopts - 1) = ':'; - cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL); - if (!cc->iv_mode) { - ti->error = "Error kmallocing iv_mode string"; - goto bad_ivmode_string; + ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (ret) + goto bad; + + opt_string = dm_shift_arg(&as); + + if (opt_params == 1 && opt_string && + !strcasecmp(opt_string, "allow_discards")) + ti->num_discard_bios = 1; + else if (opt_params) { + ret = -EINVAL; + ti->error = "Invalid feature arguments"; + goto bad; } - strcpy(cc->iv_mode, ivmode); - } else - cc->iv_mode = NULL; + } - cc->io_queue = create_singlethread_workqueue("kcryptd_io"); + ret = -ENOMEM; + cc->io_queue = alloc_workqueue("kcryptd_io", WQ_MEM_RECLAIM, 1); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; - goto bad_io_queue; + goto bad; } - cc->crypt_queue = create_singlethread_workqueue("kcryptd"); + cc->crypt_queue = alloc_workqueue("kcryptd", + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; - goto bad_crypt_queue; + goto bad; } - init_waitqueue_head(&cc->writeq); - ti->private = cc; - return 0; - -bad_crypt_queue: - destroy_workqueue(cc->io_queue); -bad_io_queue: - kfree(cc->iv_mode); -bad_ivmode_string: - dm_put_device(ti, cc->dev); -bad_device: - bioset_free(cc->bs); -bad_bs: - mempool_destroy(cc->page_pool); -bad_page_pool: - mempool_destroy(cc->req_pool); -bad_req_pool: - mempool_destroy(cc->io_pool); -bad_slab_pool: - if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); -bad_ivmode: - crypto_free_ablkcipher(tfm); -bad_cipher: - /* Must zero key material before freeing */ - memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); - kfree(cc); - return -EINVAL; -} + ti->num_flush_bios = 1; + ti->discard_zeroes_data_unsupported = true; -static void crypt_dtr(struct dm_target 
*ti) -{ - struct crypt_config *cc = (struct crypt_config *) ti->private; - - destroy_workqueue(cc->io_queue); - destroy_workqueue(cc->crypt_queue); - - if (cc->req) - mempool_free(cc->req, cc->req_pool); - - bioset_free(cc->bs); - mempool_destroy(cc->page_pool); - mempool_destroy(cc->req_pool); - mempool_destroy(cc->io_pool); - - kfree(cc->iv_mode); - if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) - cc->iv_gen_ops->dtr(cc); - crypto_free_ablkcipher(cc->tfm); - dm_put_device(ti, cc->dev); + return 0; - /* Must zero key material before freeing */ - memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8)); - kfree(cc); +bad: + crypt_dtr(ti); + return ret; } -static int crypt_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static int crypt_map(struct dm_target *ti, struct bio *bio) { - struct crypt_config *cc = ti->private; struct dm_crypt_io *io; + struct crypt_config *cc = ti->private; - io = mempool_alloc(cc->io_pool, GFP_NOIO); - io->target = ti; - io->base_bio = bio; - io->sector = bio->bi_sector - ti->begin; - io->error = 0; - atomic_set(&io->pending, 0); + /* + * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues. + * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight + * - for REQ_DISCARD caller must use flush if IO ordering matters + */ + if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) { + bio->bi_bdev = cc->dev->bdev; + if (bio_sectors(bio)) + bio->bi_iter.bi_sector = cc->start + + dm_target_offset(ti, bio->bi_iter.bi_sector); + return DM_MAPIO_REMAPPED; + } - if (bio_data_dir(io->base_bio) == READ) - kcryptd_queue_io(io); - else + io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); + + if (bio_data_dir(io->base_bio) == READ) { + if (kcryptd_io_read(io, GFP_NOWAIT)) + kcryptd_queue_io(io); + } else kcryptd_queue_crypt(io); return DM_MAPIO_SUBMITTED; } -static int crypt_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) +static void crypt_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) { - struct crypt_config *cc = (struct crypt_config *) ti->private; - unsigned int sz = 0; + struct crypt_config *cc = ti->private; + unsigned i, sz = 0; switch (type) { case STATUSTYPE_INFO: @@ -1138,29 +1847,22 @@ static int crypt_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - if (cc->iv_mode) - DMEMIT("%s-%s-%s ", cc->cipher, cc->chainmode, - cc->iv_mode); + DMEMIT("%s ", cc->cipher_string); + + if (cc->key_size > 0) + for (i = 0; i < cc->key_size; i++) + DMEMIT("%02x", cc->key[i]); else - DMEMIT("%s-%s ", cc->cipher, cc->chainmode); - - if (cc->key_size > 0) { - if ((maxlen - sz) < ((cc->key_size << 1) + 1)) - return -ENOMEM; - - crypt_encode_key(result + sz, cc->key, cc->key_size); - sz += cc->key_size << 1; - } else { - if (sz >= maxlen) - return -ENOMEM; - result[sz++] = '-'; - } + DMEMIT("-"); DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, cc->dev->name, (unsigned long long)cc->start); + + if (ti->num_discard_bios) + DMEMIT(" 1 allow_discards"); + break; } - return 0; } static void crypt_postsuspend(struct dm_target *ti) @@ -1196,19 +1898,32 @@ static void crypt_resume(struct dm_target *ti) static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) { struct crypt_config *cc = ti->private; + int ret = -EINVAL; if (argc < 2) goto error; - if (!strnicmp(argv[0], MESG_STR("key"))) { + if (!strcasecmp(argv[0], "key")) { if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) { 
DMWARN("not suspended during key manipulation."); return -EINVAL; } - if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) - return crypt_set_key(cc, argv[2]); - if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) + if (argc == 3 && !strcasecmp(argv[1], "set")) { + ret = crypt_set_key(cc, argv[2]); + if (ret) + return ret; + if (cc->iv_gen_ops && cc->iv_gen_ops->init) + ret = cc->iv_gen_ops->init(cc); + return ret; + } + if (argc == 2 && !strcasecmp(argv[1], "wipe")) { + if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { + ret = cc->iv_gen_ops->wipe(cc); + if (ret) + return ret; + } return crypt_wipe_key(cc); + } } error: @@ -1226,14 +1941,22 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return max_size; bvm->bi_bdev = cc->dev->bdev; - bvm->bi_sector = cc->start + bvm->bi_sector - ti->begin; + bvm->bi_sector = cc->start + dm_target_offset(ti, bvm->bi_sector); return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static int crypt_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct crypt_config *cc = ti->private; + + return fn(ti, cc->dev, cc->start, ti->len, data); +} + static struct target_type crypt_target = { .name = "crypt", - .version= {1, 6, 0}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, @@ -1244,6 +1967,7 @@ static struct target_type crypt_target = { .resume = crypt_resume, .message = crypt_message, .merge = crypt_merge, + .iterate_devices = crypt_iterate_devices, }; static int __init dm_crypt_init(void) @@ -1265,17 +1989,13 @@ static int __init dm_crypt_init(void) static void __exit dm_crypt_exit(void) { - int r = dm_unregister_target(&crypt_target); - - if (r < 0) - DMERR("unregister failed %d", r); - + dm_unregister_target(&crypt_target); kmem_cache_destroy(_crypt_io_pool); } module_init(dm_crypt_init); module_exit(dm_crypt_exit); -MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); +MODULE_AUTHOR("Jana Saout <jana@saout.de>"); MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index bdd37f881c4..42c3a27a14c 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -13,18 +13,17 @@ #include <linux/bio.h> #include <linux/slab.h> -#include "dm.h" -#include "dm-bio-list.h" +#include <linux/device-mapper.h> #define DM_MSG_PREFIX "delay" struct delay_c { struct timer_list delay_timer; struct mutex timer_lock; + struct workqueue_struct *kdelayd_wq; struct work_struct flush_expired_bios; struct list_head delayed_bios; atomic_t may_delay; - mempool_t *delayed_pool; struct dm_dev *dev_read; sector_t start_read; @@ -40,20 +39,16 @@ struct delay_c { struct dm_delay_info { struct delay_c *context; struct list_head list; - struct bio *bio; unsigned long expires; }; static DEFINE_MUTEX(delayed_bios_lock); -static struct workqueue_struct *kdelayd_wq; -static struct kmem_cache *delayed_cache; - static void handle_delayed_timer(unsigned long data) { struct delay_c *dc = (struct delay_c *)data; - queue_work(kdelayd_wq, &dc->flush_expired_bios); + queue_work(dc->kdelayd_wq, &dc->flush_expired_bios); } static void queue_timeout(struct delay_c *dc, unsigned long expires) @@ -88,13 +83,14 @@ static struct bio *flush_delayed_bios(struct delay_c *dc, int flush_all) mutex_lock(&delayed_bios_lock); list_for_each_entry_safe(delayed, next, &dc->delayed_bios, list) { if (flush_all || time_after_eq(jiffies, delayed->expires)) { + struct bio *bio = 
dm_bio_from_per_bio_data(delayed, + sizeof(struct dm_delay_info)); list_del(&delayed->list); - bio_list_add(&flush_bios, delayed->bio); - if ((bio_data_dir(delayed->bio) == WRITE)) + bio_list_add(&flush_bios, bio); + if ((bio_data_dir(bio) == WRITE)) delayed->context->writes--; else delayed->context->reads--; - mempool_free(delayed, dc->delayed_pool); continue; } @@ -132,6 +128,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct delay_c *dc; unsigned long long tmpll; + char dummy; if (argc != 3 && argc != 6) { ti->error = "requires exactly 3 or 6 arguments"; @@ -146,19 +143,19 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) dc->reads = dc->writes = 0; - if (sscanf(argv[1], "%llu", &tmpll) != 1) { + if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid device sector"; goto bad; } dc->start_read = tmpll; - if (sscanf(argv[2], "%u", &dc->read_delay) != 1) { + if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) { ti->error = "Invalid delay"; goto bad; } - if (dm_get_device(ti, argv[0], dc->start_read, ti->len, - dm_table_get_mode(ti->table), &dc->dev_read)) { + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &dc->dev_read)) { ti->error = "Device lookup failed"; goto bad; } @@ -167,28 +164,28 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (argc == 3) goto out; - if (sscanf(argv[4], "%llu", &tmpll) != 1) { + if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { ti->error = "Invalid write device sector"; goto bad_dev_read; } dc->start_write = tmpll; - if (sscanf(argv[5], "%u", &dc->write_delay) != 1) { + if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) { ti->error = "Invalid write delay"; goto bad_dev_read; } - if (dm_get_device(ti, argv[3], dc->start_write, ti->len, - dm_table_get_mode(ti->table), &dc->dev_write)) { + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), + &dc->dev_write)) { ti->error = "Write device lookup failed"; goto bad_dev_read; } out: - dc->delayed_pool = mempool_create_slab_pool(128, delayed_cache); - if (!dc->delayed_pool) { - DMERR("Couldn't create delayed bio pool."); - goto bad_dev_write; + dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0); + if (!dc->kdelayd_wq) { + DMERR("Couldn't start kdelayd"); + goto bad_queue; } setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc); @@ -198,10 +195,13 @@ out: mutex_init(&dc->timer_lock); atomic_set(&dc->may_delay, 1); + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->per_bio_data_size = sizeof(struct dm_delay_info); ti->private = dc; return 0; -bad_dev_write: +bad_queue: if (dc->dev_write) dm_put_device(ti, dc->dev_write); bad_dev_read: @@ -215,14 +215,13 @@ static void delay_dtr(struct dm_target *ti) { struct delay_c *dc = ti->private; - flush_workqueue(kdelayd_wq); + destroy_workqueue(dc->kdelayd_wq); dm_put_device(ti, dc->dev_read); if (dc->dev_write) dm_put_device(ti, dc->dev_write); - mempool_destroy(dc->delayed_pool); kfree(dc); } @@ -234,10 +233,9 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) if (!delay || !atomic_read(&dc->may_delay)) return 1; - delayed = mempool_alloc(dc->delayed_pool, GFP_NOIO); + delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); delayed->context = dc; - delayed->bio = bio; delayed->expires = expires = jiffies + (delay * HZ / 1000); mutex_lock(&delayed_bios_lock); @@ -272,28 +270,28 @@ static void delay_resume(struct dm_target *ti) atomic_set(&dc->may_delay, 1); } 
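
A note on the conversion above: dm-delay drops its private mempool/kmem_cache and instead asks device-mapper to reserve per-bio context (ti->per_bio_data_size), fetching it with dm_per_bio_data() and recovering the bio later with dm_bio_from_per_bio_data(). The following is a minimal sketch of that pattern, not code from the patch; "example_info", "example_ctr" and "example_map" are hypothetical names, and real constructor argument parsing / device lookup is omitted.

#include <linux/device-mapper.h>
#include <linux/bio.h>
#include <linux/jiffies.h>
#include <linux/list.h>

/* Hypothetical per-bio context, embedded by dm core in its per-bio allocation. */
struct example_info {
	unsigned long expires;
	struct list_head list;
};

static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	/* Tell dm core how much per-bio data this target needs. */
	ti->per_bio_data_size = sizeof(struct example_info);
	return 0;
}

static int example_map(struct dm_target *ti, struct bio *bio)
{
	/* No allocation: the context lives alongside the cloned bio. */
	struct example_info *info = dm_per_bio_data(bio, sizeof(struct example_info));

	info->expires = jiffies + HZ;
	INIT_LIST_HEAD(&info->list);

	/*
	 * Later (e.g. from a timer or worker) the bio is recovered from the
	 * stored context, as flush_delayed_bios() does above:
	 *
	 *	struct bio *b = dm_bio_from_per_bio_data(info,
	 *				sizeof(struct example_info));
	 */
	return DM_MAPIO_REMAPPED;
}
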
-static int delay_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static int delay_map(struct dm_target *ti, struct bio *bio) { struct delay_c *dc = ti->private; if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) { bio->bi_bdev = dc->dev_write->bdev; - bio->bi_sector = dc->start_write + - (bio->bi_sector - ti->begin); + if (bio_sectors(bio)) + bio->bi_iter.bi_sector = dc->start_write + + dm_target_offset(ti, bio->bi_iter.bi_sector); return delay_bio(dc, dc->write_delay, bio); } bio->bi_bdev = dc->dev_read->bdev; - bio->bi_sector = dc->start_read + - (bio->bi_sector - ti->begin); + bio->bi_iter.bi_sector = dc->start_read + + dm_target_offset(ti, bio->bi_iter.bi_sector); return delay_bio(dc, dc->read_delay, bio); } -static int delay_status(struct dm_target *ti, status_type_t type, - char *result, unsigned maxlen) +static void delay_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) { struct delay_c *dc = ti->private; int sz = 0; @@ -313,13 +311,28 @@ static int delay_status(struct dm_target *ti, status_type_t type, dc->write_delay); break; } +} - return 0; +static int delay_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct delay_c *dc = ti->private; + int ret = 0; + + ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data); + if (ret) + goto out; + + if (dc->dev_write) + ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data); + +out: + return ret; } static struct target_type delay_target = { .name = "delay", - .version = {1, 0, 2}, + .version = {1, 2, 1}, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, @@ -327,23 +340,12 @@ static struct target_type delay_target = { .presuspend = delay_presuspend, .resume = delay_resume, .status = delay_status, + .iterate_devices = delay_iterate_devices, }; static int __init dm_delay_init(void) { - int r = -ENOMEM; - - kdelayd_wq = create_workqueue("kdelayd"); - if (!kdelayd_wq) { - DMERR("Couldn't start kdelayd"); - goto bad_queue; - } - - delayed_cache = KMEM_CACHE(dm_delay_info, 0); - if (!delayed_cache) { - DMERR("Couldn't create delayed bio cache."); - goto bad_memcache; - } + int r; r = dm_register_target(&delay_target); if (r < 0) { @@ -354,22 +356,12 @@ static int __init dm_delay_init(void) return 0; bad_register: - kmem_cache_destroy(delayed_cache); -bad_memcache: - destroy_workqueue(kdelayd_wq); -bad_queue: return r; } static void __exit dm_delay_exit(void) { - int r = dm_unregister_target(&delay_target); - - if (r < 0) - DMERR("unregister failed %d", r); - - kmem_cache_destroy(delayed_cache); - destroy_workqueue(kdelayd_wq); + dm_unregister_target(&delay_target); } /* Module hooks */ diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c new file mode 100644 index 00000000000..ad913cd4ade --- /dev/null +++ b/drivers/md/dm-era-target.c @@ -0,0 +1,1747 @@ +#include "dm.h" +#include "persistent-data/dm-transaction-manager.h" +#include "persistent-data/dm-bitset.h" +#include "persistent-data/dm-space-map.h" + +#include <linux/dm-io.h> +#include <linux/dm-kcopyd.h> +#include <linux/init.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#define DM_MSG_PREFIX "era" + +#define SUPERBLOCK_LOCATION 0 +#define SUPERBLOCK_MAGIC 2126579579 +#define SUPERBLOCK_CSUM_XOR 146538381 +#define MIN_ERA_VERSION 1 +#define MAX_ERA_VERSION 1 +#define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION +#define MIN_BLOCK_SIZE 8 + 
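
Both crypt_map() and delay_map() above remap bios with the same idiom: point the bio at the backing device and rebase its sector with dm_target_offset(), which is simply the bio's sector minus ti->begin (the old code spelled this out as "bio->bi_sector - ti->begin"). A minimal pass-through sketch of that idiom follows; "passthrough_ctx" and "passthrough_map" are hypothetical names and the context setup is assumed to happen in a constructor not shown here.

#include <linux/device-mapper.h>
#include <linux/bio.h>

/* Hypothetical single-device target context. */
struct passthrough_ctx {
	struct dm_dev *dev;
	sector_t start;		/* offset into the underlying device */
};

static int passthrough_map(struct dm_target *ti, struct bio *bio)
{
	struct passthrough_ctx *pc = ti->private;

	bio->bi_bdev = pc->dev->bdev;
	if (bio_sectors(bio))
		/* dm_target_offset(ti, s) == s - ti->begin */
		bio->bi_iter.bi_sector = pc->start +
			dm_target_offset(ti, bio->bi_iter.bi_sector);

	/* Let dm core reissue the remapped bio to the backing device. */
	return DM_MAPIO_REMAPPED;
}
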
+/*---------------------------------------------------------------- + * Writeset + *--------------------------------------------------------------*/ +struct writeset_metadata { + uint32_t nr_bits; + dm_block_t root; +}; + +struct writeset { + struct writeset_metadata md; + + /* + * An in core copy of the bits to save constantly doing look ups on + * disk. + */ + unsigned long *bits; +}; + +/* + * This does not free off the on disk bitset as this will normally be done + * after digesting into the era array. + */ +static void writeset_free(struct writeset *ws) +{ + vfree(ws->bits); +} + +static int setup_on_disk_bitset(struct dm_disk_bitset *info, + unsigned nr_bits, dm_block_t *root) +{ + int r; + + r = dm_bitset_empty(info, root); + if (r) + return r; + + return dm_bitset_resize(info, *root, 0, nr_bits, false, root); +} + +static size_t bitset_size(unsigned nr_bits) +{ + return sizeof(unsigned long) * dm_div_up(nr_bits, BITS_PER_LONG); +} + +/* + * Allocates memory for the in core bitset. + */ +static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) +{ + ws->md.nr_bits = nr_blocks; + ws->md.root = INVALID_WRITESET_ROOT; + ws->bits = vzalloc(bitset_size(nr_blocks)); + if (!ws->bits) { + DMERR("%s: couldn't allocate in memory bitset", __func__); + return -ENOMEM; + } + + return 0; +} + +/* + * Wipes the in-core bitset, and creates a new on disk bitset. + */ +static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws) +{ + int r; + + memset(ws->bits, 0, bitset_size(ws->md.nr_bits)); + + r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root); + if (r) { + DMERR("%s: setup_on_disk_bitset failed", __func__); + return r; + } + + return 0; +} + +static bool writeset_marked(struct writeset *ws, dm_block_t block) +{ + return test_bit(block, ws->bits); +} + +static int writeset_marked_on_disk(struct dm_disk_bitset *info, + struct writeset_metadata *m, dm_block_t block, + bool *result) +{ + dm_block_t old = m->root; + + /* + * The bitset was flushed when it was archived, so we know there'll + * be no change to the root. + */ + int r = dm_bitset_test_bit(info, m->root, block, &m->root, result); + if (r) { + DMERR("%s: dm_bitset_test_bit failed", __func__); + return r; + } + + BUG_ON(m->root != old); + + return r; +} + +/* + * Returns < 0 on error, 0 if the bit wasn't previously set, 1 if it was. + */ +static int writeset_test_and_set(struct dm_disk_bitset *info, + struct writeset *ws, uint32_t block) +{ + int r; + + if (!test_and_set_bit(block, ws->bits)) { + r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root); + if (r) { + /* FIXME: fail mode */ + return r; + } + + return 0; + } + + return 1; +} + +/*---------------------------------------------------------------- + * On disk metadata layout + *--------------------------------------------------------------*/ +#define SPACE_MAP_ROOT_SIZE 128 +#define UUID_LEN 16 + +struct writeset_disk { + __le32 nr_bits; + __le64 root; +} __packed; + +struct superblock_disk { + __le32 csum; + __le32 flags; + __le64 blocknr; + + __u8 uuid[UUID_LEN]; + __le64 magic; + __le32 version; + + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + + __le32 data_block_size; + __le32 metadata_block_size; + __le32 nr_blocks; + + __le32 current_era; + struct writeset_disk current_writeset; + + /* + * Only these two fields are valid within the metadata snapshot. 
+ */ + __le64 writeset_tree_root; + __le64 era_array_root; + + __le64 metadata_snap; +} __packed; + +/*---------------------------------------------------------------- + * Superblock validation + *--------------------------------------------------------------*/ +static void sb_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t sb_block_size) +{ + struct superblock_disk *disk = dm_block_data(b); + + disk->blocknr = cpu_to_le64(dm_block_location(b)); + disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags, + sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); +} + +static int check_metadata_version(struct superblock_disk *disk) +{ + uint32_t metadata_version = le32_to_cpu(disk->version); + if (metadata_version < MIN_ERA_VERSION || metadata_version > MAX_ERA_VERSION) { + DMERR("Era metadata version %u found, but only versions between %u and %u supported.", + metadata_version, MIN_ERA_VERSION, MAX_ERA_VERSION); + return -EINVAL; + } + + return 0; +} + +static int sb_check(struct dm_block_validator *v, + struct dm_block *b, + size_t sb_block_size) +{ + struct superblock_disk *disk = dm_block_data(b); + __le32 csum_le; + + if (dm_block_location(b) != le64_to_cpu(disk->blocknr)) { + DMERR("sb_check failed: blocknr %llu: wanted %llu", + le64_to_cpu(disk->blocknr), + (unsigned long long)dm_block_location(b)); + return -ENOTBLK; + } + + if (le64_to_cpu(disk->magic) != SUPERBLOCK_MAGIC) { + DMERR("sb_check failed: magic %llu: wanted %llu", + le64_to_cpu(disk->magic), + (unsigned long long) SUPERBLOCK_MAGIC); + return -EILSEQ; + } + + csum_le = cpu_to_le32(dm_bm_checksum(&disk->flags, + sb_block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); + if (csum_le != disk->csum) { + DMERR("sb_check failed: csum %u: wanted %u", + le32_to_cpu(csum_le), le32_to_cpu(disk->csum)); + return -EILSEQ; + } + + return check_metadata_version(disk); +} + +static struct dm_block_validator sb_validator = { + .name = "superblock", + .prepare_for_write = sb_prepare_for_write, + .check = sb_check +}; + +/*---------------------------------------------------------------- + * Low level metadata handling + *--------------------------------------------------------------*/ +#define DM_ERA_METADATA_BLOCK_SIZE 4096 +#define DM_ERA_METADATA_CACHE_SIZE 64 +#define ERA_MAX_CONCURRENT_LOCKS 5 + +struct era_metadata { + struct block_device *bdev; + struct dm_block_manager *bm; + struct dm_space_map *sm; + struct dm_transaction_manager *tm; + + dm_block_t block_size; + uint32_t nr_blocks; + + uint32_t current_era; + + /* + * We preallocate 2 writesets. When an era rolls over we + * switch between them. This means the allocation is done at + * preresume time, rather than on the io path. + */ + struct writeset writesets[2]; + struct writeset *current_writeset; + + dm_block_t writeset_tree_root; + dm_block_t era_array_root; + + struct dm_disk_bitset bitset_info; + struct dm_btree_info writeset_tree_info; + struct dm_array_info era_array_info; + + dm_block_t metadata_snap; + + /* + * A flag that is set whenever a writeset has been archived. + */ + bool archived_writesets; + + /* + * Reading the space map root can fail, so we read it into this + * buffer before the superblock is locked and updated. 
+ */ + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; +}; + +static int superblock_read_lock(struct era_metadata *md, + struct dm_block **sblock) +{ + return dm_bm_read_lock(md->bm, SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock_zero(struct era_metadata *md, + struct dm_block **sblock) +{ + return dm_bm_write_lock_zero(md->bm, SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock(struct era_metadata *md, + struct dm_block **sblock) +{ + return dm_bm_write_lock(md->bm, SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +/* FIXME: duplication with cache and thin */ +static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result) +{ + int r; + unsigned i; + struct dm_block *b; + __le64 *data_le, zero = cpu_to_le64(0); + unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64); + + /* + * We can't use a validator here - it may be all zeroes. + */ + r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &b); + if (r) + return r; + + data_le = dm_block_data(b); + *result = true; + for (i = 0; i < sb_block_size; i++) { + if (data_le[i] != zero) { + *result = false; + break; + } + } + + return dm_bm_unlock(b); +} + +/*----------------------------------------------------------------*/ + +static void ws_pack(const struct writeset_metadata *core, struct writeset_disk *disk) +{ + disk->nr_bits = cpu_to_le32(core->nr_bits); + disk->root = cpu_to_le64(core->root); +} + +static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata *core) +{ + core->nr_bits = le32_to_cpu(disk->nr_bits); + core->root = le64_to_cpu(disk->root); +} + +static void ws_inc(void *context, const void *value) +{ + struct era_metadata *md = context; + struct writeset_disk ws_d; + dm_block_t b; + + memcpy(&ws_d, value, sizeof(ws_d)); + b = le64_to_cpu(ws_d.root); + + dm_tm_inc(md->tm, b); +} + +static void ws_dec(void *context, const void *value) +{ + struct era_metadata *md = context; + struct writeset_disk ws_d; + dm_block_t b; + + memcpy(&ws_d, value, sizeof(ws_d)); + b = le64_to_cpu(ws_d.root); + + dm_bitset_del(&md->bitset_info, b); +} + +static int ws_eq(void *context, const void *value1, const void *value2) +{ + return !memcmp(value1, value2, sizeof(struct writeset_metadata)); +} + +/*----------------------------------------------------------------*/ + +static void setup_writeset_tree_info(struct era_metadata *md) +{ + struct dm_btree_value_type *vt = &md->writeset_tree_info.value_type; + md->writeset_tree_info.tm = md->tm; + md->writeset_tree_info.levels = 1; + vt->context = md; + vt->size = sizeof(struct writeset_disk); + vt->inc = ws_inc; + vt->dec = ws_dec; + vt->equal = ws_eq; +} + +static void setup_era_array_info(struct era_metadata *md) + +{ + struct dm_btree_value_type vt; + vt.context = NULL; + vt.size = sizeof(__le32); + vt.inc = NULL; + vt.dec = NULL; + vt.equal = NULL; + + dm_array_info_init(&md->era_array_info, md->tm, &vt); +} + +static void setup_infos(struct era_metadata *md) +{ + dm_disk_bitset_init(md->tm, &md->bitset_info); + setup_writeset_tree_info(md); + setup_era_array_info(md); +} + +/*----------------------------------------------------------------*/ + +static int create_fresh_metadata(struct era_metadata *md) +{ + int r; + + r = dm_tm_create_with_sm(md->bm, SUPERBLOCK_LOCATION, + &md->tm, &md->sm); + if (r < 0) { + DMERR("dm_tm_create_with_sm failed"); + return r; + } + + setup_infos(md); + + r = dm_btree_empty(&md->writeset_tree_info, &md->writeset_tree_root); + if (r) { + DMERR("couldn't create new 
writeset tree"); + goto bad; + } + + r = dm_array_empty(&md->era_array_info, &md->era_array_root); + if (r) { + DMERR("couldn't create era array"); + goto bad; + } + + return 0; + +bad: + dm_sm_destroy(md->sm); + dm_tm_destroy(md->tm); + + return r; +} + +static int save_sm_root(struct era_metadata *md) +{ + int r; + size_t metadata_len; + + r = dm_sm_root_size(md->sm, &metadata_len); + if (r < 0) + return r; + + return dm_sm_copy_root(md->sm, &md->metadata_space_map_root, + metadata_len); +} + +static void copy_sm_root(struct era_metadata *md, struct superblock_disk *disk) +{ + memcpy(&disk->metadata_space_map_root, + &md->metadata_space_map_root, + sizeof(md->metadata_space_map_root)); +} + +/* + * Writes a superblock, including the static fields that don't get updated + * with every commit (possible optimisation here). 'md' should be fully + * constructed when this is called. + */ +static void prepare_superblock(struct era_metadata *md, struct superblock_disk *disk) +{ + disk->magic = cpu_to_le64(SUPERBLOCK_MAGIC); + disk->flags = cpu_to_le32(0ul); + + /* FIXME: can't keep blanking the uuid (uuid is currently unused though) */ + memset(disk->uuid, 0, sizeof(disk->uuid)); + disk->version = cpu_to_le32(MAX_ERA_VERSION); + + copy_sm_root(md, disk); + + disk->data_block_size = cpu_to_le32(md->block_size); + disk->metadata_block_size = cpu_to_le32(DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + disk->nr_blocks = cpu_to_le32(md->nr_blocks); + disk->current_era = cpu_to_le32(md->current_era); + + ws_pack(&md->current_writeset->md, &disk->current_writeset); + disk->writeset_tree_root = cpu_to_le64(md->writeset_tree_root); + disk->era_array_root = cpu_to_le64(md->era_array_root); + disk->metadata_snap = cpu_to_le64(md->metadata_snap); +} + +static int write_superblock(struct era_metadata *md) +{ + int r; + struct dm_block *sblock; + struct superblock_disk *disk; + + r = save_sm_root(md); + if (r) { + DMERR("%s: save_sm_root failed", __func__); + return r; + } + + r = superblock_lock_zero(md, &sblock); + if (r) + return r; + + disk = dm_block_data(sblock); + prepare_superblock(md, disk); + + return dm_tm_commit(md->tm, sblock); +} + +/* + * Assumes block_size and the infos are set. 
+ */ +static int format_metadata(struct era_metadata *md) +{ + int r; + + r = create_fresh_metadata(md); + if (r) + return r; + + r = write_superblock(md); + if (r) { + dm_sm_destroy(md->sm); + dm_tm_destroy(md->tm); + return r; + } + + return 0; +} + +static int open_metadata(struct era_metadata *md) +{ + int r; + struct dm_block *sblock; + struct superblock_disk *disk; + + r = superblock_read_lock(md, &sblock); + if (r) { + DMERR("couldn't read_lock superblock"); + return r; + } + + disk = dm_block_data(sblock); + r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION, + disk->metadata_space_map_root, + sizeof(disk->metadata_space_map_root), + &md->tm, &md->sm); + if (r) { + DMERR("dm_tm_open_with_sm failed"); + goto bad; + } + + setup_infos(md); + + md->block_size = le32_to_cpu(disk->data_block_size); + md->nr_blocks = le32_to_cpu(disk->nr_blocks); + md->current_era = le32_to_cpu(disk->current_era); + + md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root); + md->era_array_root = le64_to_cpu(disk->era_array_root); + md->metadata_snap = le64_to_cpu(disk->metadata_snap); + md->archived_writesets = true; + + return dm_bm_unlock(sblock); + +bad: + dm_bm_unlock(sblock); + return r; +} + +static int open_or_format_metadata(struct era_metadata *md, + bool may_format) +{ + int r; + bool unformatted = false; + + r = superblock_all_zeroes(md->bm, &unformatted); + if (r) + return r; + + if (unformatted) + return may_format ? format_metadata(md) : -EPERM; + + return open_metadata(md); +} + +static int create_persistent_data_objects(struct era_metadata *md, + bool may_format) +{ + int r; + + md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE, + DM_ERA_METADATA_CACHE_SIZE, + ERA_MAX_CONCURRENT_LOCKS); + if (IS_ERR(md->bm)) { + DMERR("could not create block manager"); + return PTR_ERR(md->bm); + } + + r = open_or_format_metadata(md, may_format); + if (r) + dm_block_manager_destroy(md->bm); + + return r; +} + +static void destroy_persistent_data_objects(struct era_metadata *md) +{ + dm_sm_destroy(md->sm); + dm_tm_destroy(md->tm); + dm_block_manager_destroy(md->bm); +} + +/* + * This waits until all era_map threads have picked up the new filter. + */ +static void swap_writeset(struct era_metadata *md, struct writeset *new_writeset) +{ + rcu_assign_pointer(md->current_writeset, new_writeset); + synchronize_rcu(); +} + +/*---------------------------------------------------------------- + * Writesets get 'digested' into the main era array. + * + * We're using a coroutine here so the worker thread can do the digestion, + * thus avoiding synchronisation of the metadata. Digesting a whole + * writeset in one go would cause too much latency. 
+ *--------------------------------------------------------------*/ +struct digest { + uint32_t era; + unsigned nr_bits, current_bit; + struct writeset_metadata writeset; + __le32 value; + struct dm_disk_bitset info; + + int (*step)(struct era_metadata *, struct digest *); +}; + +static int metadata_digest_lookup_writeset(struct era_metadata *md, + struct digest *d); + +static int metadata_digest_remove_writeset(struct era_metadata *md, + struct digest *d) +{ + int r; + uint64_t key = d->era; + + r = dm_btree_remove(&md->writeset_tree_info, md->writeset_tree_root, + &key, &md->writeset_tree_root); + if (r) { + DMERR("%s: dm_btree_remove failed", __func__); + return r; + } + + d->step = metadata_digest_lookup_writeset; + return 0; +} + +#define INSERTS_PER_STEP 100 + +static int metadata_digest_transcribe_writeset(struct era_metadata *md, + struct digest *d) +{ + int r; + bool marked; + unsigned b, e = min(d->current_bit + INSERTS_PER_STEP, d->nr_bits); + + for (b = d->current_bit; b < e; b++) { + r = writeset_marked_on_disk(&d->info, &d->writeset, b, &marked); + if (r) { + DMERR("%s: writeset_marked_on_disk failed", __func__); + return r; + } + + if (!marked) + continue; + + __dm_bless_for_disk(&d->value); + r = dm_array_set_value(&md->era_array_info, md->era_array_root, + b, &d->value, &md->era_array_root); + if (r) { + DMERR("%s: dm_array_set_value failed", __func__); + return r; + } + } + + if (b == d->nr_bits) + d->step = metadata_digest_remove_writeset; + else + d->current_bit = b; + + return 0; +} + +static int metadata_digest_lookup_writeset(struct era_metadata *md, + struct digest *d) +{ + int r; + uint64_t key; + struct writeset_disk disk; + + r = dm_btree_find_lowest_key(&md->writeset_tree_info, + md->writeset_tree_root, &key); + if (r < 0) + return r; + + d->era = key; + + r = dm_btree_lookup(&md->writeset_tree_info, + md->writeset_tree_root, &key, &disk); + if (r) { + if (r == -ENODATA) { + d->step = NULL; + return 0; + } + + DMERR("%s: dm_btree_lookup failed", __func__); + return r; + } + + ws_unpack(&disk, &d->writeset); + d->value = cpu_to_le32(key); + + d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks); + d->current_bit = 0; + d->step = metadata_digest_transcribe_writeset; + + return 0; +} + +static int metadata_digest_start(struct era_metadata *md, struct digest *d) +{ + if (d->step) + return 0; + + memset(d, 0, sizeof(*d)); + + /* + * We initialise another bitset info to avoid any caching side + * effects with the previous one. + */ + dm_disk_bitset_init(md->tm, &d->info); + d->step = metadata_digest_lookup_writeset; + + return 0; +} + +/*---------------------------------------------------------------- + * High level metadata interface. Target methods should use these, and not + * the lower level ones. 
+ *--------------------------------------------------------------*/ +static struct era_metadata *metadata_open(struct block_device *bdev, + sector_t block_size, + bool may_format) +{ + int r; + struct era_metadata *md = kzalloc(sizeof(*md), GFP_KERNEL); + + if (!md) + return NULL; + + md->bdev = bdev; + md->block_size = block_size; + + md->writesets[0].md.root = INVALID_WRITESET_ROOT; + md->writesets[1].md.root = INVALID_WRITESET_ROOT; + md->current_writeset = &md->writesets[0]; + + r = create_persistent_data_objects(md, may_format); + if (r) { + kfree(md); + return ERR_PTR(r); + } + + return md; +} + +static void metadata_close(struct era_metadata *md) +{ + destroy_persistent_data_objects(md); + kfree(md); +} + +static bool valid_nr_blocks(dm_block_t n) +{ + /* + * dm_bitset restricts us to 2^32. test_bit & co. restrict us + * further to 2^31 - 1 + */ + return n < (1ull << 31); +} + +static int metadata_resize(struct era_metadata *md, void *arg) +{ + int r; + dm_block_t *new_size = arg; + __le32 value; + + if (!valid_nr_blocks(*new_size)) { + DMERR("Invalid number of origin blocks %llu", + (unsigned long long) *new_size); + return -EINVAL; + } + + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); + + r = writeset_alloc(&md->writesets[0], *new_size); + if (r) { + DMERR("%s: writeset_alloc failed for writeset 0", __func__); + return r; + } + + r = writeset_alloc(&md->writesets[1], *new_size); + if (r) { + DMERR("%s: writeset_alloc failed for writeset 1", __func__); + return r; + } + + value = cpu_to_le32(0u); + __dm_bless_for_disk(&value); + r = dm_array_resize(&md->era_array_info, md->era_array_root, + md->nr_blocks, *new_size, + &value, &md->era_array_root); + if (r) { + DMERR("%s: dm_array_resize failed", __func__); + return r; + } + + md->nr_blocks = *new_size; + return 0; +} + +static int metadata_era_archive(struct era_metadata *md) +{ + int r; + uint64_t keys[1]; + struct writeset_disk value; + + r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, + &md->current_writeset->md.root); + if (r) { + DMERR("%s: dm_bitset_flush failed", __func__); + return r; + } + + ws_pack(&md->current_writeset->md, &value); + md->current_writeset->md.root = INVALID_WRITESET_ROOT; + + keys[0] = md->current_era; + __dm_bless_for_disk(&value); + r = dm_btree_insert(&md->writeset_tree_info, md->writeset_tree_root, + keys, &value, &md->writeset_tree_root); + if (r) { + DMERR("%s: couldn't insert writeset into btree", __func__); + /* FIXME: fail mode */ + return r; + } + + md->archived_writesets = true; + + return 0; +} + +static struct writeset *next_writeset(struct era_metadata *md) +{ + return (md->current_writeset == &md->writesets[0]) ? + &md->writesets[1] : &md->writesets[0]; +} + +static int metadata_new_era(struct era_metadata *md) +{ + int r; + struct writeset *new_writeset = next_writeset(md); + + r = writeset_init(&md->bitset_info, new_writeset); + if (r) { + DMERR("%s: writeset_init failed", __func__); + return r; + } + + swap_writeset(md, new_writeset); + md->current_era++; + + return 0; +} + +static int metadata_era_rollover(struct era_metadata *md) +{ + int r; + + if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) { + r = metadata_era_archive(md); + if (r) { + DMERR("%s: metadata_archive_era failed", __func__); + /* FIXME: fail mode? 
*/ + return r; + } + } + + r = metadata_new_era(md); + if (r) { + DMERR("%s: new era failed", __func__); + /* FIXME: fail mode */ + return r; + } + + return 0; +} + +static bool metadata_current_marked(struct era_metadata *md, dm_block_t block) +{ + bool r; + struct writeset *ws; + + rcu_read_lock(); + ws = rcu_dereference(md->current_writeset); + r = writeset_marked(ws, block); + rcu_read_unlock(); + + return r; +} + +static int metadata_commit(struct era_metadata *md) +{ + int r; + struct dm_block *sblock; + + if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) { + r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, + &md->current_writeset->md.root); + if (r) { + DMERR("%s: bitset flush failed", __func__); + return r; + } + } + + r = save_sm_root(md); + if (r) { + DMERR("%s: save_sm_root failed", __func__); + return r; + } + + r = dm_tm_pre_commit(md->tm); + if (r) { + DMERR("%s: pre commit failed", __func__); + return r; + } + + r = superblock_lock(md, &sblock); + if (r) { + DMERR("%s: superblock lock failed", __func__); + return r; + } + + prepare_superblock(md, dm_block_data(sblock)); + + return dm_tm_commit(md->tm, sblock); +} + +static int metadata_checkpoint(struct era_metadata *md) +{ + /* + * For now we just rollover, but later I want to put a check in to + * avoid this if the filter is still pretty fresh. + */ + return metadata_era_rollover(md); +} + +/* + * Metadata snapshots allow userland to access era data. + */ +static int metadata_take_snap(struct era_metadata *md) +{ + int r, inc; + struct dm_block *clone; + + if (md->metadata_snap != SUPERBLOCK_LOCATION) { + DMERR("%s: metadata snapshot already exists", __func__); + return -EINVAL; + } + + r = metadata_era_rollover(md); + if (r) { + DMERR("%s: era rollover failed", __func__); + return r; + } + + r = metadata_commit(md); + if (r) { + DMERR("%s: pre commit failed", __func__); + return r; + } + + r = dm_sm_inc_block(md->sm, SUPERBLOCK_LOCATION); + if (r) { + DMERR("%s: couldn't increment superblock", __func__); + return r; + } + + r = dm_tm_shadow_block(md->tm, SUPERBLOCK_LOCATION, + &sb_validator, &clone, &inc); + if (r) { + DMERR("%s: couldn't shadow superblock", __func__); + dm_sm_dec_block(md->sm, SUPERBLOCK_LOCATION); + return r; + } + BUG_ON(!inc); + + r = dm_sm_inc_block(md->sm, md->writeset_tree_root); + if (r) { + DMERR("%s: couldn't inc writeset tree root", __func__); + dm_tm_unlock(md->tm, clone); + return r; + } + + r = dm_sm_inc_block(md->sm, md->era_array_root); + if (r) { + DMERR("%s: couldn't inc era tree root", __func__); + dm_sm_dec_block(md->sm, md->writeset_tree_root); + dm_tm_unlock(md->tm, clone); + return r; + } + + md->metadata_snap = dm_block_location(clone); + + r = dm_tm_unlock(md->tm, clone); + if (r) { + DMERR("%s: couldn't unlock clone", __func__); + md->metadata_snap = SUPERBLOCK_LOCATION; + return r; + } + + return 0; +} + +static int metadata_drop_snap(struct era_metadata *md) +{ + int r; + dm_block_t location; + struct dm_block *clone; + struct superblock_disk *disk; + + if (md->metadata_snap == SUPERBLOCK_LOCATION) { + DMERR("%s: no snap to drop", __func__); + return -EINVAL; + } + + r = dm_tm_read_lock(md->tm, md->metadata_snap, &sb_validator, &clone); + if (r) { + DMERR("%s: couldn't read lock superblock clone", __func__); + return r; + } + + /* + * Whatever happens now we'll commit with no record of the metadata + * snap. 
+ */ + md->metadata_snap = SUPERBLOCK_LOCATION; + + disk = dm_block_data(clone); + r = dm_btree_del(&md->writeset_tree_info, + le64_to_cpu(disk->writeset_tree_root)); + if (r) { + DMERR("%s: error deleting writeset tree clone", __func__); + dm_tm_unlock(md->tm, clone); + return r; + } + + r = dm_array_del(&md->era_array_info, le64_to_cpu(disk->era_array_root)); + if (r) { + DMERR("%s: error deleting era array clone", __func__); + dm_tm_unlock(md->tm, clone); + return r; + } + + location = dm_block_location(clone); + dm_tm_unlock(md->tm, clone); + + return dm_sm_dec_block(md->sm, location); +} + +struct metadata_stats { + dm_block_t used; + dm_block_t total; + dm_block_t snap; + uint32_t era; +}; + +static int metadata_get_stats(struct era_metadata *md, void *ptr) +{ + int r; + struct metadata_stats *s = ptr; + dm_block_t nr_free, nr_total; + + r = dm_sm_get_nr_free(md->sm, &nr_free); + if (r) { + DMERR("dm_sm_get_nr_free returned %d", r); + return r; + } + + r = dm_sm_get_nr_blocks(md->sm, &nr_total); + if (r) { + DMERR("dm_pool_get_metadata_dev_size returned %d", r); + return r; + } + + s->used = nr_total - nr_free; + s->total = nr_total; + s->snap = md->metadata_snap; + s->era = md->current_era; + + return 0; +} + +/*----------------------------------------------------------------*/ + +struct era { + struct dm_target *ti; + struct dm_target_callbacks callbacks; + + struct dm_dev *metadata_dev; + struct dm_dev *origin_dev; + + dm_block_t nr_blocks; + uint32_t sectors_per_block; + int sectors_per_block_shift; + struct era_metadata *md; + + struct workqueue_struct *wq; + struct work_struct worker; + + spinlock_t deferred_lock; + struct bio_list deferred_bios; + + spinlock_t rpc_lock; + struct list_head rpc_calls; + + struct digest digest; + atomic_t suspended; +}; + +struct rpc { + struct list_head list; + + int (*fn0)(struct era_metadata *); + int (*fn1)(struct era_metadata *, void *); + void *arg; + int result; + + struct completion complete; +}; + +/*---------------------------------------------------------------- + * Remapping. 
+ *---------------------------------------------------------------*/ +static bool block_size_is_power_of_two(struct era *era) +{ + return era->sectors_per_block_shift >= 0; +} + +static dm_block_t get_block(struct era *era, struct bio *bio) +{ + sector_t block_nr = bio->bi_iter.bi_sector; + + if (!block_size_is_power_of_two(era)) + (void) sector_div(block_nr, era->sectors_per_block); + else + block_nr >>= era->sectors_per_block_shift; + + return block_nr; +} + +static void remap_to_origin(struct era *era, struct bio *bio) +{ + bio->bi_bdev = era->origin_dev->bdev; +} + +/*---------------------------------------------------------------- + * Worker thread + *--------------------------------------------------------------*/ +static void wake_worker(struct era *era) +{ + if (!atomic_read(&era->suspended)) + queue_work(era->wq, &era->worker); +} + +static void process_old_eras(struct era *era) +{ + int r; + + if (!era->digest.step) + return; + + r = era->digest.step(era->md, &era->digest); + if (r < 0) { + DMERR("%s: digest step failed, stopping digestion", __func__); + era->digest.step = NULL; + + } else if (era->digest.step) + wake_worker(era); +} + +static void process_deferred_bios(struct era *era) +{ + int r; + struct bio_list deferred_bios, marked_bios; + struct bio *bio; + bool commit_needed = false; + bool failed = false; + + bio_list_init(&deferred_bios); + bio_list_init(&marked_bios); + + spin_lock(&era->deferred_lock); + bio_list_merge(&deferred_bios, &era->deferred_bios); + bio_list_init(&era->deferred_bios); + spin_unlock(&era->deferred_lock); + + while ((bio = bio_list_pop(&deferred_bios))) { + r = writeset_test_and_set(&era->md->bitset_info, + era->md->current_writeset, + get_block(era, bio)); + if (r < 0) { + /* + * This is bad news, we need to rollback. + * FIXME: finish. + */ + failed = true; + + } else if (r == 0) + commit_needed = true; + + bio_list_add(&marked_bios, bio); + } + + if (commit_needed) { + r = metadata_commit(era->md); + if (r) + failed = true; + } + + if (failed) + while ((bio = bio_list_pop(&marked_bios))) + bio_io_error(bio); + else + while ((bio = bio_list_pop(&marked_bios))) + generic_make_request(bio); +} + +static void process_rpc_calls(struct era *era) +{ + int r; + bool need_commit = false; + struct list_head calls; + struct rpc *rpc, *tmp; + + INIT_LIST_HEAD(&calls); + spin_lock(&era->rpc_lock); + list_splice_init(&era->rpc_calls, &calls); + spin_unlock(&era->rpc_lock); + + list_for_each_entry_safe(rpc, tmp, &calls, list) { + rpc->result = rpc->fn0 ? rpc->fn0(era->md) : rpc->fn1(era->md, rpc->arg); + need_commit = true; + } + + if (need_commit) { + r = metadata_commit(era->md); + if (r) + list_for_each_entry_safe(rpc, tmp, &calls, list) + rpc->result = r; + } + + list_for_each_entry_safe(rpc, tmp, &calls, list) + complete(&rpc->complete); +} + +static void kick_off_digest(struct era *era) +{ + if (era->md->archived_writesets) { + era->md->archived_writesets = false; + metadata_digest_start(era->md, &era->digest); + } +} + +static void do_work(struct work_struct *ws) +{ + struct era *era = container_of(ws, struct era, worker); + + kick_off_digest(era); + process_old_eras(era); + process_deferred_bios(era); + process_rpc_calls(era); +} + +static void defer_bio(struct era *era, struct bio *bio) +{ + spin_lock(&era->deferred_lock); + bio_list_add(&era->deferred_bios, bio); + spin_unlock(&era->deferred_lock); + + wake_worker(era); +} + +/* + * Make an rpc call to the worker to change the metadata. 
+ */ +static int perform_rpc(struct era *era, struct rpc *rpc) +{ + rpc->result = 0; + init_completion(&rpc->complete); + + spin_lock(&era->rpc_lock); + list_add(&rpc->list, &era->rpc_calls); + spin_unlock(&era->rpc_lock); + + wake_worker(era); + wait_for_completion(&rpc->complete); + + return rpc->result; +} + +static int in_worker0(struct era *era, int (*fn)(struct era_metadata *)) +{ + struct rpc rpc; + rpc.fn0 = fn; + rpc.fn1 = NULL; + + return perform_rpc(era, &rpc); +} + +static int in_worker1(struct era *era, + int (*fn)(struct era_metadata *, void *), void *arg) +{ + struct rpc rpc; + rpc.fn0 = NULL; + rpc.fn1 = fn; + rpc.arg = arg; + + return perform_rpc(era, &rpc); +} + +static void start_worker(struct era *era) +{ + atomic_set(&era->suspended, 0); +} + +static void stop_worker(struct era *era) +{ + atomic_set(&era->suspended, 1); + flush_workqueue(era->wq); +} + +/*---------------------------------------------------------------- + * Target methods + *--------------------------------------------------------------*/ +static int dev_is_congested(struct dm_dev *dev, int bdi_bits) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + return bdi_congested(&q->backing_dev_info, bdi_bits); +} + +static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits) +{ + struct era *era = container_of(cb, struct era, callbacks); + return dev_is_congested(era->origin_dev, bdi_bits); +} + +static void era_destroy(struct era *era) +{ + if (era->md) + metadata_close(era->md); + + if (era->wq) + destroy_workqueue(era->wq); + + if (era->origin_dev) + dm_put_device(era->ti, era->origin_dev); + + if (era->metadata_dev) + dm_put_device(era->ti, era->metadata_dev); + + kfree(era); +} + +static dm_block_t calc_nr_blocks(struct era *era) +{ + return dm_sector_div_up(era->ti->len, era->sectors_per_block); +} + +static bool valid_block_size(dm_block_t block_size) +{ + bool greater_than_zero = block_size > 0; + bool multiple_of_min_block_size = (block_size & (MIN_BLOCK_SIZE - 1)) == 0; + + return greater_than_zero && multiple_of_min_block_size; +} + +/* + * <metadata dev> <data dev> <data block size (sectors)> + */ +static int era_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int r; + char dummy; + struct era *era; + struct era_metadata *md; + + if (argc != 3) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + era = kzalloc(sizeof(*era), GFP_KERNEL); + if (!era) { + ti->error = "Error allocating era structure"; + return -ENOMEM; + } + + era->ti = ti; + + r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev); + if (r) { + ti->error = "Error opening metadata device"; + era_destroy(era); + return -EINVAL; + } + + r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev); + if (r) { + ti->error = "Error opening data device"; + era_destroy(era); + return -EINVAL; + } + + r = sscanf(argv[2], "%u%c", &era->sectors_per_block, &dummy); + if (r != 1) { + ti->error = "Error parsing block size"; + era_destroy(era); + return -EINVAL; + } + + r = dm_set_target_max_io_len(ti, era->sectors_per_block); + if (r) { + ti->error = "could not set max io len"; + era_destroy(era); + return -EINVAL; + } + + if (!valid_block_size(era->sectors_per_block)) { + ti->error = "Invalid block size"; + era_destroy(era); + return -EINVAL; + } + if (era->sectors_per_block & (era->sectors_per_block - 1)) + era->sectors_per_block_shift = -1; + else + era->sectors_per_block_shift = __ffs(era->sectors_per_block); + + md = metadata_open(era->metadata_dev->bdev, 
era->sectors_per_block, true); + if (IS_ERR(md)) { + ti->error = "Error reading metadata"; + era_destroy(era); + return PTR_ERR(md); + } + era->md = md; + + era->nr_blocks = calc_nr_blocks(era); + + r = metadata_resize(era->md, &era->nr_blocks); + if (r) { + ti->error = "couldn't resize metadata"; + era_destroy(era); + return -ENOMEM; + } + + era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + if (!era->wq) { + ti->error = "could not create workqueue for metadata object"; + era_destroy(era); + return -ENOMEM; + } + INIT_WORK(&era->worker, do_work); + + spin_lock_init(&era->deferred_lock); + bio_list_init(&era->deferred_bios); + + spin_lock_init(&era->rpc_lock); + INIT_LIST_HEAD(&era->rpc_calls); + + ti->private = era; + ti->num_flush_bios = 1; + ti->flush_supported = true; + + ti->num_discard_bios = 1; + ti->discards_supported = true; + era->callbacks.congested_fn = era_is_congested; + dm_table_add_target_callbacks(ti->table, &era->callbacks); + + return 0; +} + +static void era_dtr(struct dm_target *ti) +{ + era_destroy(ti->private); +} + +static int era_map(struct dm_target *ti, struct bio *bio) +{ + struct era *era = ti->private; + dm_block_t block = get_block(era, bio); + + /* + * All bios get remapped to the origin device. We do this now, but + * it may not get issued until later. Depending on whether the + * block is marked in this era. + */ + remap_to_origin(era, bio); + + /* + * REQ_FLUSH bios carry no data, so we're not interested in them. + */ + if (!(bio->bi_rw & REQ_FLUSH) && + (bio_data_dir(bio) == WRITE) && + !metadata_current_marked(era->md, block)) { + defer_bio(era, bio); + return DM_MAPIO_SUBMITTED; + } + + return DM_MAPIO_REMAPPED; +} + +static void era_postsuspend(struct dm_target *ti) +{ + int r; + struct era *era = ti->private; + + r = in_worker0(era, metadata_era_archive); + if (r) { + DMERR("%s: couldn't archive current era", __func__); + /* FIXME: fail mode */ + } + + stop_worker(era); +} + +static int era_preresume(struct dm_target *ti) +{ + int r; + struct era *era = ti->private; + dm_block_t new_size = calc_nr_blocks(era); + + if (era->nr_blocks != new_size) { + r = in_worker1(era, metadata_resize, &new_size); + if (r) + return r; + + era->nr_blocks = new_size; + } + + start_worker(era); + + r = in_worker0(era, metadata_new_era); + if (r) { + DMERR("%s: metadata_era_rollover failed", __func__); + return r; + } + + return 0; +} + +/* + * Status format: + * + * <metadata block size> <#used metadata blocks>/<#total metadata blocks> + * <current era> <held metadata root | '-'> + */ +static void era_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + int r; + struct era *era = ti->private; + ssize_t sz = 0; + struct metadata_stats stats; + char buf[BDEVNAME_SIZE]; + + switch (type) { + case STATUSTYPE_INFO: + r = in_worker1(era, metadata_get_stats, &stats); + if (r) + goto err; + + DMEMIT("%u %llu/%llu %u", + (unsigned) (DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), + (unsigned long long) stats.used, + (unsigned long long) stats.total, + (unsigned) stats.era); + + if (stats.snap != SUPERBLOCK_LOCATION) + DMEMIT(" %llu", stats.snap); + else + DMEMIT(" -"); + break; + + case STATUSTYPE_TABLE: + format_dev_t(buf, era->metadata_dev->bdev->bd_dev); + DMEMIT("%s ", buf); + format_dev_t(buf, era->origin_dev->bdev->bd_dev); + DMEMIT("%s %u", buf, era->sectors_per_block); + break; + } + + return; + +err: + DMEMIT("Error"); +} + +static int era_message(struct dm_target *ti, unsigned argc, char 
**argv) +{ + struct era *era = ti->private; + + if (argc != 1) { + DMERR("incorrect number of message arguments"); + return -EINVAL; + } + + if (!strcasecmp(argv[0], "checkpoint")) + return in_worker0(era, metadata_checkpoint); + + if (!strcasecmp(argv[0], "take_metadata_snap")) + return in_worker0(era, metadata_take_snap); + + if (!strcasecmp(argv[0], "drop_metadata_snap")) + return in_worker0(era, metadata_drop_snap); + + DMERR("unsupported message '%s'", argv[0]); + return -EINVAL; +} + +static sector_t get_dev_size(struct dm_dev *dev) +{ + return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; +} + +static int era_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct era *era = ti->private; + return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data); +} + +static int era_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct era *era = ti->private; + struct request_queue *q = bdev_get_queue(era->origin_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = era->origin_dev->bdev; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static void era_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct era *era = ti->private; + uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * If the system-determined stacked limits are compatible with the + * era device's blocksize (io_opt is a factor) do not override them. + */ + if (io_opt_sectors < era->sectors_per_block || + do_div(io_opt_sectors, era->sectors_per_block)) { + blk_limits_io_min(limits, 0); + blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT); + } +} + +/*----------------------------------------------------------------*/ + +static struct target_type era_target = { + .name = "era", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = era_ctr, + .dtr = era_dtr, + .map = era_map, + .postsuspend = era_postsuspend, + .preresume = era_preresume, + .status = era_status, + .message = era_message, + .iterate_devices = era_iterate_devices, + .merge = era_merge, + .io_hints = era_io_hints +}; + +static int __init dm_era_init(void) +{ + int r; + + r = dm_register_target(&era_target); + if (r) { + DMERR("era target registration failed: %d", r); + return r; + } + + return 0; +} + +static void __exit dm_era_exit(void) +{ + dm_unregister_target(&era_target); +} + +module_init(dm_era_init); +module_exit(dm_era_exit); + +MODULE_DESCRIPTION(DM_NAME " era target"); +MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 41f408068a7..ebaa4f803ee 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -1,709 +1,290 @@ /* - * dm-exception-store.c - * * Copyright (C) 2001-2002 Sistina Software (UK) Limited. - * Copyright (C) 2006 Red Hat GmbH + * Copyright (C) 2006-2008 Red Hat GmbH * * This file is released under the GPL. 
*/ -#include "dm.h" -#include "dm-snap.h" +#include "dm-exception-store.h" +#include <linux/ctype.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> +#include <linux/module.h> #include <linux/slab.h> -#include <linux/dm-io.h> -#include <linux/dm-kcopyd.h> - -#define DM_MSG_PREFIX "snapshots" -#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ - -/*----------------------------------------------------------------- - * Persistent snapshots, by persistent we mean that the snapshot - * will survive a reboot. - *---------------------------------------------------------------*/ - -/* - * We need to store a record of which parts of the origin have - * been copied to the snapshot device. The snapshot code - * requires that we copy exception chunks to chunk aligned areas - * of the COW store. It makes sense therefore, to store the - * metadata in chunk size blocks. - * - * There is no backward or forward compatibility implemented, - * snapshots with different disk versions than the kernel will - * not be usable. It is expected that "lvcreate" will blank out - * the start of a fresh COW device before calling the snapshot - * constructor. - * - * The first chunk of the COW device just contains the header. - * After this there is a chunk filled with exception metadata, - * followed by as many exception chunks as can fit in the - * metadata areas. - * - * All on disk structures are in little-endian format. The end - * of the exceptions info is indicated by an exception with a - * new_chunk of 0, which is invalid since it would point to the - * header chunk. - */ - -/* - * Magic for persistent snapshots: "SnAp" - Feeble isn't it. - */ -#define SNAP_MAGIC 0x70416e53 - -/* - * The on-disk version of the metadata. - */ -#define SNAPSHOT_DISK_VERSION 1 - -struct disk_header { - uint32_t magic; - - /* - * Is this snapshot valid. There is no way of recovering - * an invalid snapshot. - */ - uint32_t valid; - - /* - * Simple, incrementing version. no backward - * compatibility. - */ - uint32_t version; - - /* In sectors */ - uint32_t chunk_size; -}; -struct disk_exception { - uint64_t old_chunk; - uint64_t new_chunk; -}; +#define DM_MSG_PREFIX "snapshot exception stores" -struct commit_callback { - void (*callback)(void *, int success); - void *context; -}; +static LIST_HEAD(_exception_store_types); +static DEFINE_SPINLOCK(_lock); -/* - * The top level structure for a persistent exception store. - */ -struct pstore { - struct dm_snapshot *snap; /* up pointer to my snapshot */ - int version; - int valid; - uint32_t exceptions_per_area; - - /* - * Now that we have an asynchronous kcopyd there is no - * need for large chunk sizes, so it wont hurt to have a - * whole chunks worth of metadata in memory at once. - */ - void *area; - - /* - * Used to keep track of which metadata area the data in - * 'chunk' refers to. - */ - uint32_t current_area; - - /* - * The next free chunk for an exception. - */ - uint32_t next_free; - - /* - * The index of next free exception in the current - * metadata area. 
- */ - uint32_t current_committed; - - atomic_t pending_count; - uint32_t callback_count; - struct commit_callback *callbacks; - struct dm_io_client *io_client; - - struct workqueue_struct *metadata_wq; -}; - -static unsigned sectors_to_pages(unsigned sectors) -{ - return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9); -} - -static int alloc_area(struct pstore *ps) +static struct dm_exception_store_type *__find_exception_store_type(const char *name) { - int r = -ENOMEM; - size_t len; + struct dm_exception_store_type *type; - len = ps->snap->chunk_size << SECTOR_SHIFT; + list_for_each_entry(type, &_exception_store_types, list) + if (!strcmp(name, type->name)) + return type; - /* - * Allocate the chunk_size block of memory that will hold - * a single metadata area. - */ - ps->area = vmalloc(len); - if (!ps->area) - return r; - - return 0; + return NULL; } -static void free_area(struct pstore *ps) +static struct dm_exception_store_type *_get_exception_store_type(const char *name) { - vfree(ps->area); - ps->area = NULL; -} + struct dm_exception_store_type *type; -struct mdata_req { - struct dm_io_region *where; - struct dm_io_request *io_req; - struct work_struct work; - int result; -}; + spin_lock(&_lock); -static void do_metadata(struct work_struct *work) -{ - struct mdata_req *req = container_of(work, struct mdata_req, work); + type = __find_exception_store_type(name); - req->result = dm_io(req->io_req, 1, req->where, NULL); -} + if (type && !try_module_get(type->module)) + type = NULL; -/* - * Read or write a chunk aligned and sized block of data from a device. - */ -static int chunk_io(struct pstore *ps, uint32_t chunk, int rw, int metadata) -{ - struct dm_io_region where = { - .bdev = ps->snap->cow->bdev, - .sector = ps->snap->chunk_size * chunk, - .count = ps->snap->chunk_size, - }; - struct dm_io_request io_req = { - .bi_rw = rw, - .mem.type = DM_IO_VMA, - .mem.ptr.vma = ps->area, - .client = ps->io_client, - .notify.fn = NULL, - }; - struct mdata_req req; - - if (!metadata) - return dm_io(&io_req, 1, &where, NULL); - - req.where = &where; - req.io_req = &io_req; - - /* - * Issue the synchronous I/O from a different thread - * to avoid generic_make_request recursion. - */ - INIT_WORK(&req.work, do_metadata); - queue_work(ps->metadata_wq, &req.work); - flush_workqueue(ps->metadata_wq); - - return req.result; + spin_unlock(&_lock); + + return type; } /* - * Read or write a metadata area. Remembering to skip the first - * chunk which holds the header. + * get_type + * @type_name + * + * Attempt to retrieve the dm_exception_store_type by name. If not already + * available, attempt to load the appropriate module. + * + * Exstore modules are named "dm-exstore-" followed by the 'type_name'. + * Modules may contain multiple types. + * This function will first try the module "dm-exstore-<type_name>", + * then truncate 'type_name' on the last '-' and try again. + * + * For example, if type_name was "clustered-shared", it would search + * 'dm-exstore-clustered-shared' then 'dm-exstore-clustered'. + * + * 'dm-exception-store-<type_name>' is too long of a name in my + * opinion, which is why I've chosen to have the files + * containing exception store implementations be 'dm-exstore-<type_name>'. + * If you want your module to be autoloaded, you will follow this + * naming convention. 
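
As a quick illustration of the lookup order just described, the plain user-space sketch below prints the module names get_type() would hand to request_module() for the example type name from this comment; it only prints candidates and loads nothing.

/* Print the "dm-exstore-..." candidates in the order get_type() tries them. */
#include <stdio.h>
#include <string.h>

static void print_candidates(const char *type_name)
{
	char buf[64], *p;

	snprintf(buf, sizeof(buf), "%s", type_name);
	for (;;) {
		printf("request_module(\"dm-exstore-%s\")\n", buf);
		p = strrchr(buf, '-');
		if (!p)
			break;
		*p = '\0';
	}
}

int main(void)
{
	print_candidates("clustered-shared");
	/* dm-exstore-clustered-shared, then dm-exstore-clustered */
	return 0;
}
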
+ * + * Returns: dm_exception_store_type* on success, NULL on failure */ -static int area_io(struct pstore *ps, uint32_t area, int rw) -{ - int r; - uint32_t chunk; - - /* convert a metadata area index to a chunk index */ - chunk = 1 + ((ps->exceptions_per_area + 1) * area); - - r = chunk_io(ps, chunk, rw, 0); - if (r) - return r; - - ps->current_area = area; - return 0; -} - -static int zero_area(struct pstore *ps, uint32_t area) +static struct dm_exception_store_type *get_type(const char *type_name) { - memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT); - return area_io(ps, area, WRITE); -} - -static int read_header(struct pstore *ps, int *new_snapshot) -{ - int r; - struct disk_header *dh; - chunk_t chunk_size; - int chunk_size_supplied = 1; - - /* - * Use default chunk size (or hardsect_size, if larger) if none supplied - */ - if (!ps->snap->chunk_size) { - ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, - bdev_hardsect_size(ps->snap->cow->bdev) >> 9); - ps->snap->chunk_mask = ps->snap->chunk_size - 1; - ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1; - chunk_size_supplied = 0; - } - - ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap-> - chunk_size)); - if (IS_ERR(ps->io_client)) - return PTR_ERR(ps->io_client); - - r = alloc_area(ps); - if (r) - return r; + char *p, *type_name_dup; + struct dm_exception_store_type *type; - r = chunk_io(ps, 0, READ, 1); - if (r) - goto bad; - - dh = (struct disk_header *) ps->area; + type = _get_exception_store_type(type_name); + if (type) + return type; - if (le32_to_cpu(dh->magic) == 0) { - *new_snapshot = 1; - return 0; + type_name_dup = kstrdup(type_name, GFP_KERNEL); + if (!type_name_dup) { + DMERR("No memory left to attempt load for \"%s\"", type_name); + return NULL; } - if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { - DMWARN("Invalid or corrupt snapshot"); - r = -ENXIO; - goto bad; + while (request_module("dm-exstore-%s", type_name_dup) || + !(type = _get_exception_store_type(type_name))) { + p = strrchr(type_name_dup, '-'); + if (!p) + break; + p[0] = '\0'; } - *new_snapshot = 0; - ps->valid = le32_to_cpu(dh->valid); - ps->version = le32_to_cpu(dh->version); - chunk_size = le32_to_cpu(dh->chunk_size); - - if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size) - return 0; - - DMWARN("chunk size %llu in device metadata overrides " - "table chunk size of %llu.", - (unsigned long long)chunk_size, - (unsigned long long)ps->snap->chunk_size); + if (!type) + DMWARN("Module for exstore type \"%s\" not found.", type_name); - /* We had a bogus chunk_size. Fix stuff up. 
*/ - free_area(ps); + kfree(type_name_dup); - ps->snap->chunk_size = chunk_size; - ps->snap->chunk_mask = chunk_size - 1; - ps->snap->chunk_shift = ffs(chunk_size) - 1; - - r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size), - ps->io_client); - if (r) - return r; - - r = alloc_area(ps); - return r; - -bad: - free_area(ps); - return r; + return type; } -static int write_header(struct pstore *ps) +static void put_type(struct dm_exception_store_type *type) { - struct disk_header *dh; - - memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT); - - dh = (struct disk_header *) ps->area; - dh->magic = cpu_to_le32(SNAP_MAGIC); - dh->valid = cpu_to_le32(ps->valid); - dh->version = cpu_to_le32(ps->version); - dh->chunk_size = cpu_to_le32(ps->snap->chunk_size); - - return chunk_io(ps, 0, WRITE, 1); + spin_lock(&_lock); + module_put(type->module); + spin_unlock(&_lock); } -/* - * Access functions for the disk exceptions, these do the endian conversions. - */ -static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) +int dm_exception_store_type_register(struct dm_exception_store_type *type) { - BUG_ON(index >= ps->exceptions_per_area); - - return ((struct disk_exception *) ps->area) + index; -} + int r = 0; -static void read_exception(struct pstore *ps, - uint32_t index, struct disk_exception *result) -{ - struct disk_exception *e = get_exception(ps, index); + spin_lock(&_lock); + if (!__find_exception_store_type(type->name)) + list_add(&type->list, &_exception_store_types); + else + r = -EEXIST; + spin_unlock(&_lock); - /* copy it */ - result->old_chunk = le64_to_cpu(e->old_chunk); - result->new_chunk = le64_to_cpu(e->new_chunk); + return r; } +EXPORT_SYMBOL(dm_exception_store_type_register); -static void write_exception(struct pstore *ps, - uint32_t index, struct disk_exception *de) +int dm_exception_store_type_unregister(struct dm_exception_store_type *type) { - struct disk_exception *e = get_exception(ps, index); + spin_lock(&_lock); - /* copy it */ - e->old_chunk = cpu_to_le64(de->old_chunk); - e->new_chunk = cpu_to_le64(de->new_chunk); -} - -/* - * Registers the exceptions that are present in the current area. - * 'full' is filled in to indicate if the area has been - * filled. - */ -static int insert_exceptions(struct pstore *ps, int *full) -{ - int r; - unsigned int i; - struct disk_exception de; - - /* presume the area is full */ - *full = 1; - - for (i = 0; i < ps->exceptions_per_area; i++) { - read_exception(ps, i, &de); - - /* - * If the new_chunk is pointing at the start of - * the COW device, where the first metadata area - * is we know that we've hit the end of the - * exceptions. Therefore the area is not full. - */ - if (de.new_chunk == 0LL) { - ps->current_committed = i; - *full = 0; - break; - } - - /* - * Keep track of the start of the free chunks. - */ - if (ps->next_free <= de.new_chunk) - ps->next_free = de.new_chunk + 1; - - /* - * Otherwise we add the exception to the snapshot. - */ - r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); - if (r) - return r; + if (!__find_exception_store_type(type->name)) { + spin_unlock(&_lock); + return -EINVAL; } - return 0; -} + list_del(&type->list); -static int read_exceptions(struct pstore *ps) -{ - uint32_t area; - int r, full = 1; - - /* - * Keeping reading chunks and inserting exceptions until - * we find a partially full area. 
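
For reference, a store implementation would hook into the registry above roughly as sketched below. This is a hedged, kernel-style outline only: the "example" name and the empty constructor and destructor are placeholders, and a real type must also provide read_metadata, prepare_exception, commit_exception and the other callbacks declared in dm-exception-store.h. Packaged as dm-exstore-example.ko it would also match the autoload naming convention described in get_type() above.

#include "dm-exception-store.h"
#include <linux/module.h>

static int example_ctr(struct dm_exception_store *store,
		       unsigned argc, char **argv)
{
	return 0;	/* parse store-specific arguments here */
}

static void example_dtr(struct dm_exception_store *store)
{
}

static struct dm_exception_store_type example_store_type = {
	.name	= "example",
	.module	= THIS_MODULE,
	.ctr	= example_ctr,
	.dtr	= example_dtr,
};

static int __init example_store_init(void)
{
	return dm_exception_store_type_register(&example_store_type);
}

static void __exit example_store_exit(void)
{
	dm_exception_store_type_unregister(&example_store_type);
}

module_init(example_store_init);
module_exit(example_store_exit);
MODULE_LICENSE("GPL");
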
- */ - for (area = 0; full; area++) { - r = area_io(ps, area, READ); - if (r) - return r; - - r = insert_exceptions(ps, &full); - if (r) - return r; - } + spin_unlock(&_lock); return 0; } +EXPORT_SYMBOL(dm_exception_store_type_unregister); -static struct pstore *get_info(struct exception_store *store) +static int set_chunk_size(struct dm_exception_store *store, + const char *chunk_size_arg, char **error) { - return (struct pstore *) store->context; -} + unsigned chunk_size; -static void persistent_fraction_full(struct exception_store *store, - sector_t *numerator, sector_t *denominator) -{ - *numerator = get_info(store)->next_free * store->snap->chunk_size; - *denominator = get_dev_size(store->snap->cow->bdev); -} - -static void persistent_destroy(struct exception_store *store) -{ - struct pstore *ps = get_info(store); - - destroy_workqueue(ps->metadata_wq); - dm_io_client_destroy(ps->io_client); - vfree(ps->callbacks); - free_area(ps); - kfree(ps); -} - -static int persistent_read_metadata(struct exception_store *store) -{ - int r, uninitialized_var(new_snapshot); - struct pstore *ps = get_info(store); - - /* - * Read the snapshot header. - */ - r = read_header(ps, &new_snapshot); - if (r) - return r; - - /* - * Now we know correct chunk_size, complete the initialisation. - */ - ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) / - sizeof(struct disk_exception); - ps->callbacks = dm_vcalloc(ps->exceptions_per_area, - sizeof(*ps->callbacks)); - if (!ps->callbacks) - return -ENOMEM; + if (kstrtouint(chunk_size_arg, 10, &chunk_size)) { + *error = "Invalid chunk size"; + return -EINVAL; + } - /* - * Do we need to setup a new snapshot ? - */ - if (new_snapshot) { - r = write_header(ps); - if (r) { - DMWARN("write_header failed"); - return r; - } - - r = zero_area(ps, 0); - if (r) { - DMWARN("zero_area(0) failed"); - return r; - } - - } else { - /* - * Sanity checks. - */ - if (ps->version != SNAPSHOT_DISK_VERSION) { - DMWARN("unable to handle snapshot disk version %d", - ps->version); - return -EINVAL; - } - - /* - * Metadata are valid, but snapshot is invalidated - */ - if (!ps->valid) - return 1; - - /* - * Read the metadata. - */ - r = read_exceptions(ps); - if (r) - return r; + if (!chunk_size) { + store->chunk_size = store->chunk_mask = store->chunk_shift = 0; + return 0; } - return 0; + return dm_exception_store_set_chunk_size(store, chunk_size, error); } -static int persistent_prepare(struct exception_store *store, - struct dm_snap_exception *e) +int dm_exception_store_set_chunk_size(struct dm_exception_store *store, + unsigned chunk_size, + char **error) { - struct pstore *ps = get_info(store); - uint32_t stride; - sector_t size = get_dev_size(store->snap->cow->bdev); + /* Check chunk_size is a power of 2 */ + if (!is_power_of_2(chunk_size)) { + *error = "Chunk size is not a power of 2"; + return -EINVAL; + } - /* Is there enough room ? 
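
The derived fields assigned at the end of dm_exception_store_set_chunk_size() follow directly from the power-of-two requirement checked above: the mask extracts the sector offset within a chunk and the shift converts a sector number into a chunk number (see sector_to_chunk() in dm-exception-store.h). A small user-space sketch with a 32-sector (16 KiB) chunk, the default the removed persistent-store code used:

/* chunk_size must be a power of two, expressed in 512-byte sectors. */
#include <stdint.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	unsigned chunk_size  = 32;			/* 16 KiB */
	unsigned chunk_mask  = chunk_size - 1;
	unsigned chunk_shift = ffs(chunk_size) - 1;	/* 5 */
	uint64_t sector      = 1000;

	printf("sector %llu -> chunk %llu, offset %llu sectors\n",
	       (unsigned long long)sector,
	       (unsigned long long)(sector >> chunk_shift),
	       (unsigned long long)(sector & chunk_mask));	/* chunk 31, offset 8 */
	return 0;
}
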
*/ - if (size < ((ps->next_free + 1) * store->snap->chunk_size)) - return -ENOSPC; + /* Validate the chunk size against the device block size */ + if (chunk_size % + (bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9) || + chunk_size % + (bdev_logical_block_size(dm_snap_origin(store->snap)->bdev) >> 9)) { + *error = "Chunk size is not a multiple of device blocksize"; + return -EINVAL; + } - e->new_chunk = ps->next_free; + if (chunk_size > INT_MAX >> SECTOR_SHIFT) { + *error = "Chunk size is too high"; + return -EINVAL; + } - /* - * Move onto the next free pending, making sure to take - * into account the location of the metadata chunks. - */ - stride = (ps->exceptions_per_area + 1); - if ((++ps->next_free % stride) == 1) - ps->next_free++; + store->chunk_size = chunk_size; + store->chunk_mask = chunk_size - 1; + store->chunk_shift = ffs(chunk_size) - 1; - atomic_inc(&ps->pending_count); return 0; } -static void persistent_commit(struct exception_store *store, - struct dm_snap_exception *e, - void (*callback) (void *, int success), - void *callback_context) +int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + struct dm_snapshot *snap, + unsigned *args_used, + struct dm_exception_store **store) { - int r; - unsigned int i; - struct pstore *ps = get_info(store); - struct disk_exception de; - struct commit_callback *cb; - - de.old_chunk = e->old_chunk; - de.new_chunk = e->new_chunk; - write_exception(ps, ps->current_committed++, &de); - - /* - * Add the callback to the back of the array. This code - * is the only place where the callback array is - * manipulated, and we know that it will never be called - * multiple times concurrently. - */ - cb = ps->callbacks + ps->callback_count++; - cb->callback = callback; - cb->context = callback_context; - - /* - * If there are no more exceptions in flight, or we have - * filled this metadata area we commit the exceptions to - * disk. - */ - if (atomic_dec_and_test(&ps->pending_count) || - (ps->current_committed == ps->exceptions_per_area)) { - r = area_io(ps, ps->current_area, WRITE); - if (r) - ps->valid = 0; - - /* - * Have we completely filled the current area ? - */ - if (ps->current_committed == ps->exceptions_per_area) { - ps->current_committed = 0; - r = zero_area(ps, ps->current_area + 1); - if (r) - ps->valid = 0; - } - - for (i = 0; i < ps->callback_count; i++) { - cb = ps->callbacks + i; - cb->callback(cb->context, r == 0 ? 
1 : 0); - } - - ps->callback_count = 0; + int r = 0; + struct dm_exception_store_type *type = NULL; + struct dm_exception_store *tmp_store; + char persistent; + + if (argc < 2) { + ti->error = "Insufficient exception store arguments"; + return -EINVAL; } -} -static void persistent_drop(struct exception_store *store) -{ - struct pstore *ps = get_info(store); + tmp_store = kmalloc(sizeof(*tmp_store), GFP_KERNEL); + if (!tmp_store) { + ti->error = "Exception store allocation failed"; + return -ENOMEM; + } - ps->valid = 0; - if (write_header(ps)) - DMWARN("write header failed"); -} + persistent = toupper(*argv[0]); + if (persistent == 'P') + type = get_type("P"); + else if (persistent == 'N') + type = get_type("N"); + else { + ti->error = "Persistent flag is not P or N"; + r = -EINVAL; + goto bad_type; + } -int dm_create_persistent(struct exception_store *store) -{ - struct pstore *ps; + if (!type) { + ti->error = "Exception store type not recognised"; + r = -EINVAL; + goto bad_type; + } - /* allocate the pstore */ - ps = kmalloc(sizeof(*ps), GFP_KERNEL); - if (!ps) - return -ENOMEM; + tmp_store->type = type; + tmp_store->snap = snap; - ps->snap = store->snap; - ps->valid = 1; - ps->version = SNAPSHOT_DISK_VERSION; - ps->area = NULL; - ps->next_free = 2; /* skipping the header and first area */ - ps->current_committed = 0; - - ps->callback_count = 0; - atomic_set(&ps->pending_count, 0); - ps->callbacks = NULL; - - ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); - if (!ps->metadata_wq) { - kfree(ps); - DMERR("couldn't start header metadata update thread"); - return -ENOMEM; - } + r = set_chunk_size(tmp_store, argv[1], &ti->error); + if (r) + goto bad; - store->destroy = persistent_destroy; - store->read_metadata = persistent_read_metadata; - store->prepare_exception = persistent_prepare; - store->commit_exception = persistent_commit; - store->drop_snapshot = persistent_drop; - store->fraction_full = persistent_fraction_full; - store->context = ps; + r = type->ctr(tmp_store, 0, NULL); + if (r) { + ti->error = "Exception store type constructor failed"; + goto bad; + } + *args_used = 2; + *store = tmp_store; return 0; -} -/*----------------------------------------------------------------- - * Implementation of the store for non-persistent snapshots. 
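
dm_exception_store_create() above consumes exactly two arguments from the snapshot target's table line: a persistence flag, P or N, and a chunk size in 512-byte sectors, where 0 means use the store's default. A persistent store with 4 KiB chunks would therefore be described by something like "P 8". The user-space sketch below mirrors that parsing; the helper name and the printed summary are illustrative only.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_store_args(const char *flag, const char *chunk_arg)
{
	char persistent = toupper((unsigned char)*flag);
	unsigned long chunk_size;

	if (persistent != 'P' && persistent != 'N')
		return -1;			/* persistent flag is not P or N */

	chunk_size = strtoul(chunk_arg, NULL, 10);
	if (chunk_size && (chunk_size & (chunk_size - 1)))
		return -1;			/* non-zero and not a power of 2 */

	printf("%s store, chunk %lu sectors (%lu KiB)\n",
	       persistent == 'P' ? "persistent" : "transient",
	       chunk_size, chunk_size / 2);
	return 0;
}

int main(void)
{
	return parse_store_args("P", "8");	/* persistent, 4 KiB chunks */
}
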
- *---------------------------------------------------------------*/ -struct transient_c { - sector_t next_free; -}; - -static void transient_destroy(struct exception_store *store) -{ - kfree(store->context); +bad: + put_type(type); +bad_type: + kfree(tmp_store); + return r; } +EXPORT_SYMBOL(dm_exception_store_create); -static int transient_read_metadata(struct exception_store *store) +void dm_exception_store_destroy(struct dm_exception_store *store) { - return 0; + store->type->dtr(store); + put_type(store->type); + kfree(store); } +EXPORT_SYMBOL(dm_exception_store_destroy); -static int transient_prepare(struct exception_store *store, - struct dm_snap_exception *e) +int dm_exception_store_init(void) { - struct transient_c *tc = (struct transient_c *) store->context; - sector_t size = get_dev_size(store->snap->cow->bdev); + int r; - if (size < (tc->next_free + store->snap->chunk_size)) - return -1; + r = dm_transient_snapshot_init(); + if (r) { + DMERR("Unable to register transient exception store type."); + goto transient_fail; + } - e->new_chunk = sector_to_chunk(store->snap, tc->next_free); - tc->next_free += store->snap->chunk_size; + r = dm_persistent_snapshot_init(); + if (r) { + DMERR("Unable to register persistent exception store type"); + goto persistent_fail; + } return 0; -} - -static void transient_commit(struct exception_store *store, - struct dm_snap_exception *e, - void (*callback) (void *, int success), - void *callback_context) -{ - /* Just succeed */ - callback(callback_context, 1); -} -static void transient_fraction_full(struct exception_store *store, - sector_t *numerator, sector_t *denominator) -{ - *numerator = ((struct transient_c *) store->context)->next_free; - *denominator = get_dev_size(store->snap->cow->bdev); +persistent_fail: + dm_transient_snapshot_exit(); +transient_fail: + return r; } -int dm_create_transient(struct exception_store *store) +void dm_exception_store_exit(void) { - struct transient_c *tc; - - store->destroy = transient_destroy; - store->read_metadata = transient_read_metadata; - store->prepare_exception = transient_prepare; - store->commit_exception = transient_commit; - store->drop_snapshot = NULL; - store->fraction_full = transient_fraction_full; - - tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); - if (!tc) - return -ENOMEM; - - tc->next_free = 0; - store->context = tc; - - return 0; + dm_persistent_snapshot_exit(); + dm_transient_snapshot_exit(); } diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h new file mode 100644 index 00000000000..0b2536247cf --- /dev/null +++ b/drivers/md/dm-exception-store.h @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2008 Red Hat, Inc. All rights reserved. + * + * Device-mapper snapshot exception store. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_EXCEPTION_STORE +#define _LINUX_DM_EXCEPTION_STORE + +#include <linux/blkdev.h> +#include <linux/device-mapper.h> + +/* + * The snapshot code deals with largish chunks of the disk at a + * time. Typically 32k - 512k. + */ +typedef sector_t chunk_t; + +/* + * An exception is used where an old chunk of data has been + * replaced by a new one. + * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number + * of chunks that follow contiguously. Remaining bits hold the number of the + * chunk within the device. 
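
On configurations where chunk_t is 64 bits wide, the encoding described above packs a run length into the top 8 bits of new_chunk and leaves the low 56 bits for the chunk number; that is exactly what dm_chunk_number() and the dm_consecutive_chunk_count helpers further down manipulate. A standalone sketch of the arithmetic, with arbitrary values:

#include <stdint.h>
#include <stdio.h>

#define CHUNK_NUMBER_BITS 56	/* mirrors DM_CHUNK_NUMBER_BITS */

int main(void)
{
	uint64_t new_chunk = 1000;			/* chunk number */

	new_chunk += 1ULL << CHUNK_NUMBER_BITS;		/* one more contiguous chunk */
	new_chunk += 1ULL << CHUNK_NUMBER_BITS;		/* and another */

	printf("chunk number: %llu\n", (unsigned long long)
	       (new_chunk & ((1ULL << CHUNK_NUMBER_BITS) - 1)));	/* 1000 */
	printf("consecutive chunks following: %llu\n",
	       (unsigned long long)(new_chunk >> CHUNK_NUMBER_BITS));	/* 2 */
	return 0;
}
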
+ */ +struct dm_exception { + struct list_head hash_list; + + chunk_t old_chunk; + chunk_t new_chunk; +}; + +/* + * Abstraction to handle the meta/layout of exception stores (the + * COW device). + */ +struct dm_exception_store; +struct dm_exception_store_type { + const char *name; + struct module *module; + + int (*ctr) (struct dm_exception_store *store, + unsigned argc, char **argv); + + /* + * Destroys this object when you've finished with it. + */ + void (*dtr) (struct dm_exception_store *store); + + /* + * The target shouldn't read the COW device until this is + * called. As exceptions are read from the COW, they are + * reported back via the callback. + */ + int (*read_metadata) (struct dm_exception_store *store, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context); + + /* + * Find somewhere to store the next exception. + */ + int (*prepare_exception) (struct dm_exception_store *store, + struct dm_exception *e); + + /* + * Update the metadata with this exception. + */ + void (*commit_exception) (struct dm_exception_store *store, + struct dm_exception *e, + void (*callback) (void *, int success), + void *callback_context); + + /* + * Returns 0 if the exception store is empty. + * + * If there are exceptions still to be merged, sets + * *last_old_chunk and *last_new_chunk to the most recent + * still-to-be-merged chunk and returns the number of + * consecutive previous ones. + */ + int (*prepare_merge) (struct dm_exception_store *store, + chunk_t *last_old_chunk, chunk_t *last_new_chunk); + + /* + * Clear the last n exceptions. + * nr_merged must be <= the value returned by prepare_merge. + */ + int (*commit_merge) (struct dm_exception_store *store, int nr_merged); + + /* + * The snapshot is invalid, note this in the metadata. + */ + void (*drop_snapshot) (struct dm_exception_store *store); + + unsigned (*status) (struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen); + + /* + * Return how full the snapshot is. + */ + void (*usage) (struct dm_exception_store *store, + sector_t *total_sectors, sector_t *sectors_allocated, + sector_t *metadata_sectors); + + /* For internal device-mapper use only. */ + struct list_head list; +}; + +struct dm_snapshot; + +struct dm_exception_store { + struct dm_exception_store_type *type; + struct dm_snapshot *snap; + + /* Size of data blocks saved - must be a power of 2 */ + unsigned chunk_size; + unsigned chunk_mask; + unsigned chunk_shift; + + void *context; +}; + +/* + * Obtain the origin or cow device used by a given snapshot. 
+ */ +struct dm_dev *dm_snap_origin(struct dm_snapshot *snap); +struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); + +/* + * Funtions to manipulate consecutive chunks + */ +# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) +# define DM_CHUNK_CONSECUTIVE_BITS 8 +# define DM_CHUNK_NUMBER_BITS 56 + +static inline chunk_t dm_chunk_number(chunk_t chunk) +{ + return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); +} + +static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) +{ + return e->new_chunk >> DM_CHUNK_NUMBER_BITS; +} + +static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) +{ + e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); + + BUG_ON(!dm_consecutive_chunk_count(e)); +} + +static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) +{ + BUG_ON(!dm_consecutive_chunk_count(e)); + + e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); +} + +# else +# define DM_CHUNK_CONSECUTIVE_BITS 0 + +static inline chunk_t dm_chunk_number(chunk_t chunk) +{ + return chunk; +} + +static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) +{ + return 0; +} + +static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) +{ +} + +static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) +{ +} + +# endif + +/* + * Return the number of sectors in the device. + */ +static inline sector_t get_dev_size(struct block_device *bdev) +{ + return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; +} + +static inline chunk_t sector_to_chunk(struct dm_exception_store *store, + sector_t sector) +{ + return sector >> store->chunk_shift; +} + +int dm_exception_store_type_register(struct dm_exception_store_type *type); +int dm_exception_store_type_unregister(struct dm_exception_store_type *type); + +int dm_exception_store_set_chunk_size(struct dm_exception_store *store, + unsigned chunk_size, + char **error); + +int dm_exception_store_create(struct dm_target *ti, int argc, char **argv, + struct dm_snapshot *snap, + unsigned *args_used, + struct dm_exception_store **store); +void dm_exception_store_destroy(struct dm_exception_store *store); + +int dm_exception_store_init(void); +void dm_exception_store_exit(void); + +/* + * Two exception store implementations. + */ +int dm_persistent_snapshot_init(void); +void dm_persistent_snapshot_exit(void); + +int dm_transient_snapshot_init(void); +void dm_transient_snapshot_exit(void); + +#endif /* _LINUX_DM_EXCEPTION_STORE */ diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c new file mode 100644 index 00000000000..b257e46876d --- /dev/null +++ b/drivers/md/dm-flakey.c @@ -0,0 +1,447 @@ +/* + * Copyright (C) 2003 Sistina Software (UK) Limited. + * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include <linux/device-mapper.h> + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> + +#define DM_MSG_PREFIX "flakey" + +#define all_corrupt_bio_flags_match(bio, fc) \ + (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags) + +/* + * Flakey: Used for testing only, simulates intermittent, + * catastrophic device failure. 
+ */ +struct flakey_c { + struct dm_dev *dev; + unsigned long start_time; + sector_t start; + unsigned up_interval; + unsigned down_interval; + unsigned long flags; + unsigned corrupt_bio_byte; + unsigned corrupt_bio_rw; + unsigned corrupt_bio_value; + unsigned corrupt_bio_flags; +}; + +enum feature_flag_bits { + DROP_WRITES +}; + +struct per_bio_data { + bool bio_submitted; +}; + +static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, + struct dm_target *ti) +{ + int r; + unsigned argc; + const char *arg_name; + + static struct dm_arg _args[] = { + {0, 6, "Invalid number of feature args"}, + {1, UINT_MAX, "Invalid corrupt bio byte"}, + {0, 255, "Invalid corrupt value to write into bio byte (0-255)"}, + {0, UINT_MAX, "Invalid corrupt bio flags mask"}, + }; + + /* No feature arguments supplied. */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return r; + + while (argc) { + arg_name = dm_shift_arg(as); + argc--; + + /* + * drop_writes + */ + if (!strcasecmp(arg_name, "drop_writes")) { + if (test_and_set_bit(DROP_WRITES, &fc->flags)) { + ti->error = "Feature drop_writes duplicated"; + return -EINVAL; + } + + continue; + } + + /* + * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> + */ + if (!strcasecmp(arg_name, "corrupt_bio_byte")) { + if (!argc) { + ti->error = "Feature corrupt_bio_byte requires parameters"; + return -EINVAL; + } + + r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error); + if (r) + return r; + argc--; + + /* + * Direction r or w? + */ + arg_name = dm_shift_arg(as); + if (!strcasecmp(arg_name, "w")) + fc->corrupt_bio_rw = WRITE; + else if (!strcasecmp(arg_name, "r")) + fc->corrupt_bio_rw = READ; + else { + ti->error = "Invalid corrupt bio direction (r or w)"; + return -EINVAL; + } + argc--; + + /* + * Value of byte (0-255) to write in place of correct one. + */ + r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error); + if (r) + return r; + argc--; + + /* + * Only corrupt bios with these flags set. + */ + r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error); + if (r) + return r; + argc--; + + continue; + } + + ti->error = "Unrecognised flakey feature requested"; + return -EINVAL; + } + + if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { + ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; + } + + return 0; +} + +/* + * Construct a flakey mapping: + * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*] + * + * Feature args: + * [drop_writes] + * [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>] + * + * Nth_byte starts from 1 for the first byte. + * Direction is r for READ or w for WRITE. + * bio_flags is ignored if 0. 
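
Putting the arguments above together, a table line such as "0 409600 flakey /dev/sdX 0 59 1" (device name and sizes purely illustrative) passes I/O through for 59 seconds and misbehaves for 1 second out of every minute. The user-space sketch below shows the window arithmetic flakey_map() applies to decide whether the device is currently up or down.

#include <stdio.h>

int main(void)
{
	unsigned up = 59, down = 1;		/* up/down intervals in seconds */
	unsigned elapsed;

	for (elapsed = 57; elapsed < 62; elapsed++)
		printf("t=%us: %s\n", elapsed,
		       (elapsed % (up + down)) >= up ? "down (drop/corrupt/error I/O)"
						     : "up (pass through)");
	return 0;	/* only t=59s is "down" in the first minute */
}
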
+ */ +static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + static struct dm_arg _args[] = { + {0, UINT_MAX, "Invalid up interval"}, + {0, UINT_MAX, "Invalid down interval"}, + }; + + int r; + struct flakey_c *fc; + unsigned long long tmpll; + struct dm_arg_set as; + const char *devname; + char dummy; + + as.argc = argc; + as.argv = argv; + + if (argc < 4) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + fc = kzalloc(sizeof(*fc), GFP_KERNEL); + if (!fc) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + fc->start_time = jiffies; + + devname = dm_shift_arg(&as); + + if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { + ti->error = "Invalid device sector"; + goto bad; + } + fc->start = tmpll; + + r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error); + if (r) + goto bad; + + r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); + if (r) + goto bad; + + if (!(fc->up_interval + fc->down_interval)) { + ti->error = "Total (up + down) interval is zero"; + goto bad; + } + + if (fc->up_interval + fc->down_interval < fc->up_interval) { + ti->error = "Interval overflow"; + goto bad; + } + + r = parse_features(&as, fc, ti); + if (r) + goto bad; + + if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) { + ti->error = "Device lookup failed"; + goto bad; + } + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->per_bio_data_size = sizeof(struct per_bio_data); + ti->private = fc; + return 0; + +bad: + kfree(fc); + return -EINVAL; +} + +static void flakey_dtr(struct dm_target *ti) +{ + struct flakey_c *fc = ti->private; + + dm_put_device(ti, fc->dev); + kfree(fc); +} + +static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector) +{ + struct flakey_c *fc = ti->private; + + return fc->start + dm_target_offset(ti, bi_sector); +} + +static void flakey_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct flakey_c *fc = ti->private; + + bio->bi_bdev = fc->dev->bdev; + if (bio_sectors(bio)) + bio->bi_iter.bi_sector = + flakey_map_sector(ti, bio->bi_iter.bi_sector); +} + +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +{ + unsigned bio_bytes = bio_cur_bytes(bio); + char *data = bio_data(bio); + + /* + * Overwrite the Nth byte of the data returned. + */ + if (data && bio_bytes >= fc->corrupt_bio_byte) { + data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value; + + DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " + "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n", + bio, fc->corrupt_bio_value, fc->corrupt_bio_byte, + (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_rw, + (unsigned long long)bio->bi_iter.bi_sector, bio_bytes); + } +} + +static int flakey_map(struct dm_target *ti, struct bio *bio) +{ + struct flakey_c *fc = ti->private; + unsigned elapsed; + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + pb->bio_submitted = false; + + /* Are we alive ? */ + elapsed = (jiffies - fc->start_time) / HZ; + if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { + /* + * Flag this bio as submitted while down. + */ + pb->bio_submitted = true; + + /* + * Map reads as normal. + */ + if (bio_data_dir(bio) == READ) + goto map_bio; + + /* + * Drop writes? + */ + if (test_bit(DROP_WRITES, &fc->flags)) { + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } + + /* + * Corrupt matching writes. 
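
corrupt_bio_data(), defined above, treats corrupt_bio_byte as a 1-based position within the data the bio currently points at and does nothing when the bio carries fewer bytes than that, so only the Nth byte of sufficiently large I/Os is overwritten. A user-space sketch of that indexing; the buffer contents and chosen position are arbitrary.

#include <stdio.h>
#include <string.h>

static void corrupt_nth_byte(unsigned char *data, unsigned len,
			     unsigned nth_byte, unsigned char value)
{
	if (len >= nth_byte)
		data[nth_byte - 1] = value;	/* 1-based, like corrupt_bio_byte */
}

int main(void)
{
	unsigned char buf[8];

	memset(buf, 0xaa, sizeof(buf));
	corrupt_nth_byte(buf, sizeof(buf), 3, 0xff);
	printf("%02x %02x %02x\n", buf[0], buf[1], buf[2]);	/* aa aa ff */
	return 0;
}
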
+ */ + if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) { + if (all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + goto map_bio; + } + + /* + * By default, error all I/O. + */ + return -EIO; + } + +map_bio: + flakey_map_bio(ti, bio); + + return DM_MAPIO_REMAPPED; +} + +static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error) +{ + struct flakey_c *fc = ti->private; + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + + /* + * Corrupt successful READs while in down state. + * If flags were specified, only corrupt those that match. + */ + if (fc->corrupt_bio_byte && !error && pb->bio_submitted && + (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && + all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + + return error; +} + +static void flakey_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct flakey_c *fc = ti->private; + unsigned drop_writes; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu %u %u ", fc->dev->name, + (unsigned long long)fc->start, fc->up_interval, + fc->down_interval); + + drop_writes = test_bit(DROP_WRITES, &fc->flags); + DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5); + + if (drop_writes) + DMEMIT("drop_writes "); + + if (fc->corrupt_bio_byte) + DMEMIT("corrupt_bio_byte %u %c %u %u ", + fc->corrupt_bio_byte, + (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r', + fc->corrupt_bio_value, fc->corrupt_bio_flags); + + break; + } +} + +static int flakey_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) +{ + struct flakey_c *fc = ti->private; + struct dm_dev *dev = fc->dev; + int r = 0; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (fc->start || + ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); + + return r ? 
: __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); +} + +static int flakey_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct flakey_c *fc = ti->private; + struct request_queue *q = bdev_get_queue(fc->dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = fc->dev->bdev; + bvm->bi_sector = flakey_map_sector(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +{ + struct flakey_c *fc = ti->private; + + return fn(ti, fc->dev, fc->start, ti->len, data); +} + +static struct target_type flakey_target = { + .name = "flakey", + .version = {1, 3, 1}, + .module = THIS_MODULE, + .ctr = flakey_ctr, + .dtr = flakey_dtr, + .map = flakey_map, + .end_io = flakey_end_io, + .status = flakey_status, + .ioctl = flakey_ioctl, + .merge = flakey_merge, + .iterate_devices = flakey_iterate_devices, +}; + +static int __init dm_flakey_init(void) +{ + int r = dm_register_target(&flakey_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_flakey_exit(void) +{ + dm_unregister_target(&flakey_target); +} + +/* Module hooks */ +module_init(dm_flakey_init); +module_exit(dm_flakey_exit); + +MODULE_DESCRIPTION(DM_NAME " flakey target"); +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 4789c42d9a3..db404a0f7e2 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -7,56 +7,59 @@ #include "dm.h" +#include <linux/device-mapper.h> + #include <linux/bio.h> +#include <linux/completion.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/dm-io.h> +#define DM_MSG_PREFIX "io" + +#define DM_IO_MAX_REGIONS BITS_PER_LONG + struct dm_io_client { mempool_t *pool; struct bio_set *bios; }; -/* FIXME: can we shrink this ? */ +/* + * Aligning 'struct io' reduces the number of bits required to store + * its address. Refer to store_io_and_region_in_bio() below. + */ struct io { unsigned long error_bits; atomic_t count; - struct task_struct *sleeper; + struct completion *wait; struct dm_io_client *client; io_notify_fn callback; void *context; -}; - -/* - * io contexts are only dynamically allocated for asynchronous - * io. Since async io is likely to be the majority of io we'll - * have the same number of io contexts as bios! (FIXME: must reduce this). - */ + void *vma_invalidate_address; + unsigned long vma_invalidate_size; +} __attribute__((aligned(DM_IO_MAX_REGIONS))); -static unsigned int pages_to_ios(unsigned int pages) -{ - return 4 * pages; /* too many ? */ -} +static struct kmem_cache *_dm_io_cache; /* * Create a client with mempool and bioset. 
*/ -struct dm_io_client *dm_io_client_create(unsigned num_pages) +struct dm_io_client *dm_io_client_create(void) { - unsigned ios = pages_to_ios(num_pages); struct dm_io_client *client; + unsigned min_ios = dm_get_reserved_bio_based_ios(); client = kmalloc(sizeof(*client), GFP_KERNEL); if (!client) return ERR_PTR(-ENOMEM); - client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io)); + client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache); if (!client->pool) goto bad; - client->bios = bioset_create(16, 16); + client->bios = bioset_create(min_ios, 0); if (!client->bios) goto bad; @@ -70,13 +73,6 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages) } EXPORT_SYMBOL(dm_io_client_create); -int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client) -{ - return mempool_resize(client->pool, pages_to_ios(num_pages), - GFP_KERNEL); -} -EXPORT_SYMBOL(dm_io_client_resize); - void dm_io_client_destroy(struct dm_io_client *client) { mempool_destroy(client->pool); @@ -87,18 +83,29 @@ EXPORT_SYMBOL(dm_io_client_destroy); /*----------------------------------------------------------------- * We need to keep track of which region a bio is doing io for. - * In order to save a memory allocation we store this the last - * bvec which we know is unused (blech). - * XXX This is ugly and can OOPS with some configs... find another way. + * To avoid a memory allocation to store just 5 or 6 bits, we + * ensure the 'struct io' pointer is aligned so enough low bits are + * always zero and then combine it with the region number directly in + * bi_private. *---------------------------------------------------------------*/ -static inline void bio_set_region(struct bio *bio, unsigned region) +static void store_io_and_region_in_bio(struct bio *bio, struct io *io, + unsigned region) { - bio->bi_io_vec[bio->bi_max_vecs].bv_len = region; + if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) { + DMCRIT("Unaligned struct io pointer %p", io); + BUG(); + } + + bio->bi_private = (void *)((unsigned long)io | region); } -static inline unsigned bio_get_region(struct bio *bio) +static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, + unsigned *region) { - return bio->bi_io_vec[bio->bi_max_vecs].bv_len; + unsigned long val = (unsigned long)bio->bi_private; + + *io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS); + *region = val & (DM_IO_MAX_REGIONS - 1); } /*----------------------------------------------------------------- @@ -111,8 +118,12 @@ static void dec_count(struct io *io, unsigned int region, int error) set_bit(region, &io->error_bits); if (atomic_dec_and_test(&io->count)) { - if (io->sleeper) - wake_up_process(io->sleeper); + if (io->vma_invalidate_size) + invalidate_kernel_vmap_range(io->vma_invalidate_address, + io->vma_invalidate_size); + + if (io->wait) + complete(io->wait); else { unsigned long r = io->error_bits; @@ -136,10 +147,8 @@ static void endio(struct bio *bio, int error) /* * The bio destructor in bio_put() may use the io object. */ - io = bio->bi_private; - region = bio_get_region(bio); + retrieve_io_and_region_from_bio(bio, &io, ®ion); - bio->bi_max_vecs++; bio_put(bio); dec_count(io, region, error); @@ -156,6 +165,9 @@ struct dpages { unsigned context_u; void *context_ptr; + + void *vma_invalidate_address; + unsigned long vma_invalidate_size; }; /* @@ -190,26 +202,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse /* * Functions for getting the pages from a bvec. 
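
store_io_and_region_in_bio() above depends on the alignment noted at the top of this file: because struct io is aligned to DM_IO_MAX_REGIONS (BITS_PER_LONG), the low bits of its address are guaranteed to be zero, so the region number can be OR-ed directly into bi_private and masked back out in the endio path. The user-space sketch below reproduces the trick with a 64-byte alignment standing in for the kernel constant.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_REGIONS 64UL	/* stand-in for DM_IO_MAX_REGIONS */

int main(void)
{
	void *io = aligned_alloc(MAX_REGIONS, 128);	/* "struct io" stand-in */
	unsigned region = 5;
	uintptr_t tagged;

	if (!io)
		return 1;

	tagged = (uintptr_t)io | region;			/* store */

	void *io_back = (void *)(tagged & ~(MAX_REGIONS - 1));	/* retrieve */
	unsigned region_back = tagged & (MAX_REGIONS - 1);

	printf("pointer recovered: %d, region: %u\n", io_back == io, region_back);
	free(io);
	return 0;
}
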
*/ -static void bvec_get_page(struct dpages *dp, - struct page **p, unsigned long *len, unsigned *offset) +static void bio_get_page(struct dpages *dp, struct page **p, + unsigned long *len, unsigned *offset) { - struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; + struct bio_vec *bvec = dp->context_ptr; *p = bvec->bv_page; - *len = bvec->bv_len; - *offset = bvec->bv_offset; + *len = bvec->bv_len - dp->context_u; + *offset = bvec->bv_offset + dp->context_u; } -static void bvec_next_page(struct dpages *dp) +static void bio_next_page(struct dpages *dp) { - struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr; + struct bio_vec *bvec = dp->context_ptr; dp->context_ptr = bvec + 1; + dp->context_u = 0; } -static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec) +static void bio_dp_init(struct dpages *dp, struct bio *bio) { - dp->get_page = bvec_get_page; - dp->next_page = bvec_next_page; - dp->context_ptr = bvec; + dp->get_page = bio_get_page; + dp->next_page = bio_next_page; + dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + dp->context_u = bio->bi_iter.bi_bvec_done; } /* @@ -237,13 +251,6 @@ static void vm_dp_init(struct dpages *dp, void *data) dp->context_ptr = data; } -static void dm_bio_destructor(struct bio *bio) -{ - struct io *io = bio->bi_private; - - bio_free(bio, io->client->bios); -} - /* * Functions for getting the pages from kernel memory. */ @@ -281,30 +288,50 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, unsigned offset; unsigned num_bvecs; sector_t remaining = where->count; + struct request_queue *q = bdev_get_queue(where->bdev); + unsigned short logical_block_size = queue_logical_block_size(q); + sector_t num_sectors; - while (remaining) { + /* + * where->count may be zero if rw holds a flush and we need to + * send a zero-sized flush. + */ + do { /* - * Allocate a suitably sized-bio: we add an extra - * bvec for bio_get/set_region() and decrement bi_max_vecs - * to hide it from bio_add_page(). + * Allocate a suitably sized-bio. */ - num_bvecs = dm_sector_div_up(remaining, - (PAGE_SIZE >> SECTOR_SHIFT)); - num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev), - num_bvecs); + if ((rw & REQ_DISCARD) || (rw & REQ_WRITE_SAME)) + num_bvecs = 1; + else + num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), + dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); + bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); - bio->bi_sector = where->sector + (where->count - remaining); + bio->bi_iter.bi_sector = where->sector + (where->count - remaining); bio->bi_bdev = where->bdev; bio->bi_end_io = endio; - bio->bi_private = io; - bio->bi_destructor = dm_bio_destructor; - bio->bi_max_vecs--; - bio_set_region(bio, region); + store_io_and_region_in_bio(bio, io, region); + + if (rw & REQ_DISCARD) { + num_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); + bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; + remaining -= num_sectors; + } else if (rw & REQ_WRITE_SAME) { + /* + * WRITE SAME only uses a single page. + */ + dp->get_page(dp, &page, &len, &offset); + bio_add_page(bio, page, logical_block_size, offset); + num_sectors = min_t(sector_t, q->limits.max_write_same_sectors, remaining); + bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; - /* - * Try and add as many pages as possible. - */ - while (remaining) { + offset = 0; + remaining -= num_sectors; + dp->next_page(dp); + } else while (remaining) { + /* + * Try and add as many pages as possible. 
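
For REQ_DISCARD (and similarly REQ_WRITE_SAME) the do_region() code above does not add data pages at all: it sets the bio size directly and lets the relevant queue limit cap how much each bio covers, looping until the region is consumed. A sketch of that splitting with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long remaining = 10000;	/* sectors left in the region */
	unsigned long long max_discard = 4096;	/* q->limits.max_discard_sectors */
	unsigned long long n;

	while (remaining) {
		n = remaining < max_discard ? remaining : max_discard;
		printf("discard bio covering %llu sectors\n", n);
		remaining -= n;
	}
	return 0;	/* 4096 + 4096 + 1808 */
}
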
+ */ dp->get_page(dp, &page, &len, &offset); len = min(len, to_bytes(remaining)); if (!bio_add_page(bio, page, len, offset)) @@ -317,7 +344,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, atomic_inc(&io->count); submit_bio(rw, bio); - } + } while (remaining); } static void dispatch_io(int rw, unsigned int num_regions, @@ -327,8 +354,10 @@ static void dispatch_io(int rw, unsigned int num_regions, int i; struct dpages old_pages = *dp; + BUG_ON(num_regions > DM_IO_MAX_REGIONS); + if (sync) - rw |= (1 << BIO_RW_SYNC); + rw |= REQ_SYNC; /* * For multiple regions we need to be careful to rewind @@ -336,7 +365,7 @@ static void dispatch_io(int rw, unsigned int num_regions, */ for (i = 0; i < num_regions; i++) { *dp = old_pages; - if (where[i].count) + if (where[i].count || (rw & REQ_FLUSH)) do_region(rw, i, where + i, dp, io); } @@ -351,37 +380,37 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, struct dm_io_region *where, int rw, struct dpages *dp, unsigned long *error_bits) { - struct io io; + /* + * gcc <= 4.3 can't do the alignment for stack variables, so we must + * align it on our own. + * volatile prevents the optimizer from removing or reusing + * "io_" field from the stack frame (allowed in ANSI C). + */ + volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; + struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); + DECLARE_COMPLETION_ONSTACK(wait); if (num_regions > 1 && (rw & RW_MASK) != WRITE) { WARN_ON(1); return -EIO; } - io.error_bits = 0; - atomic_set(&io.count, 1); /* see dispatch_io() */ - io.sleeper = current; - io.client = client; - - dispatch_io(rw, num_regions, where, dp, &io, 1); + io->error_bits = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->wait = &wait; + io->client = client; - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; - if (!atomic_read(&io.count) || signal_pending(current)) - break; + dispatch_io(rw, num_regions, where, dp, io, 1); - io_schedule(); - } - set_current_state(TASK_RUNNING); - - if (atomic_read(&io.count)) - return -EINTR; + wait_for_completion_io(&wait); if (error_bits) - *error_bits = io.error_bits; + *error_bits = io->error_bits; - return io.error_bits ? -EIO : 0; + return io->error_bits ? 
-EIO : 0; } static int async_io(struct dm_io_client *client, unsigned int num_regions, @@ -399,28 +428,41 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->sleeper = NULL; + io->wait = NULL; io->client = client; io->callback = fn; io->context = context; + io->vma_invalidate_address = dp->vma_invalidate_address; + io->vma_invalidate_size = dp->vma_invalidate_size; + dispatch_io(rw, num_regions, where, dp, io, 0); return 0; } -static int dp_init(struct dm_io_request *io_req, struct dpages *dp) +static int dp_init(struct dm_io_request *io_req, struct dpages *dp, + unsigned long size) { /* Set up dpages based on memory type */ + + dp->vma_invalidate_address = NULL; + dp->vma_invalidate_size = 0; + switch (io_req->mem.type) { case DM_IO_PAGE_LIST: list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset); break; - case DM_IO_BVEC: - bvec_dp_init(dp, io_req->mem.ptr.bvec); + case DM_IO_BIO: + bio_dp_init(dp, io_req->mem.ptr.bio); break; case DM_IO_VMA: + flush_kernel_vmap_range(io_req->mem.ptr.vma, size); + if ((io_req->bi_rw & RW_MASK) == READ) { + dp->vma_invalidate_address = io_req->mem.ptr.vma; + dp->vma_invalidate_size = size; + } vm_dp_init(dp, io_req->mem.ptr.vma); break; @@ -439,8 +481,8 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp) * New collapsed (a)synchronous interface. * * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug - * the queue with blk_unplug() some time later or set the BIO_RW_SYNC bit in - * io_req->bi_rw. If you fail to do one of these, the IO will be submitted to + * the queue with blk_unplug() some time later or set REQ_SYNC in +io_req->bi_rw. If you fail to do one of these, the IO will be submitted to * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. */ int dm_io(struct dm_io_request *io_req, unsigned num_regions, @@ -449,7 +491,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions, int r; struct dpages dp; - r = dp_init(io_req, &dp); + r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT); if (r) return r; @@ -461,3 +503,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions, &dp, io_req->notify.fn, io_req->notify.context); } EXPORT_SYMBOL(dm_io); + +int __init dm_io_init(void) +{ + _dm_io_cache = KMEM_CACHE(io, 0); + if (!_dm_io_cache) + return -ENOMEM; + + return 0; +} + +void dm_io_exit(void) +{ + kmem_cache_destroy(_dm_io_cache); + _dm_io_cache = NULL; +} diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index b262c0042de..51521429fb5 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -36,6 +36,14 @@ struct hash_cell { struct dm_table *new_map; }; +/* + * A dummy definition to make RCU happy. + * struct dm_table should never be dereferenced in this file. + */ +struct dm_table { + int undefined__; +}; + struct vers_iter { size_t param_size; struct dm_target_versions *vers, *old_vers; @@ -49,13 +57,18 @@ struct vers_iter { static struct list_head _name_buckets[NUM_BUCKETS]; static struct list_head _uuid_buckets[NUM_BUCKETS]; -static void dm_hash_remove_all(int keep_open_devices); +static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred); /* * Guards access to both hash tables. */ static DECLARE_RWSEM(_hash_lock); +/* + * Protects use of mdptr to obtain hash cell name and uuid from mapped device. 
+ */ +static DEFINE_MUTEX(dm_hash_cells_mutex); + static void init_buckets(struct list_head *buckets) { unsigned int i; @@ -73,7 +86,7 @@ static int dm_hash_init(void) static void dm_hash_exit(void) { - dm_hash_remove_all(0); + dm_hash_remove_all(false, false, false); } /*----------------------------------------------------------------- @@ -123,6 +136,24 @@ static struct hash_cell *__get_uuid_cell(const char *str) return NULL; } +static struct hash_cell *__get_dev_cell(uint64_t dev) +{ + struct mapped_device *md; + struct hash_cell *hc; + + md = dm_get_md(huge_decode_dev(dev)); + if (!md) + return NULL; + + hc = dm_get_mdptr(md); + if (!hc) { + dm_put(md); + return NULL; + } + + return hc; +} + /*----------------------------------------------------------------- * Inserting, removing and renaming a device. *---------------------------------------------------------------*/ @@ -206,7 +237,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); } dm_get(md); + mutex_lock(&dm_hash_cells_mutex); dm_set_mdptr(md, cell); + mutex_unlock(&dm_hash_cells_mutex); up_write(&_hash_lock); return 0; @@ -217,128 +250,210 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi return -EBUSY; } -static void __hash_remove(struct hash_cell *hc) +static struct dm_table *__hash_remove(struct hash_cell *hc) { struct dm_table *table; + int srcu_idx; /* remove from the dev hash */ list_del(&hc->uuid_list); list_del(&hc->name_list); + mutex_lock(&dm_hash_cells_mutex); dm_set_mdptr(hc->md, NULL); + mutex_unlock(&dm_hash_cells_mutex); - table = dm_get_table(hc->md); - if (table) { + table = dm_get_live_table(hc->md, &srcu_idx); + if (table) dm_table_event(table); - dm_table_put(table); - } + dm_put_live_table(hc->md, srcu_idx); + table = NULL; if (hc->new_map) - dm_table_put(hc->new_map); + table = hc->new_map; dm_put(hc->md); free_cell(hc); + + return table; } -static void dm_hash_remove_all(int keep_open_devices) +static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred) { - int i, dev_skipped, dev_removed; + int i, dev_skipped; struct hash_cell *hc; - struct list_head *tmp, *n; + struct mapped_device *md; + struct dm_table *t; + +retry: + dev_skipped = 0; down_write(&_hash_lock); -retry: - dev_skipped = dev_removed = 0; for (i = 0; i < NUM_BUCKETS; i++) { - list_for_each_safe (tmp, n, _name_buckets + i) { - hc = list_entry(tmp, struct hash_cell, name_list); + list_for_each_entry(hc, _name_buckets + i, name_list) { + md = hc->md; + dm_get(md); if (keep_open_devices && - dm_lock_for_deletion(hc->md)) { + dm_lock_for_deletion(md, mark_deferred, only_deferred)) { + dm_put(md); dev_skipped++; continue; } - __hash_remove(hc); - dev_removed = 1; + + t = __hash_remove(hc); + + up_write(&_hash_lock); + + if (t) { + dm_sync_table(md); + dm_table_destroy(t); + } + dm_put(md); + if (likely(keep_open_devices)) + dm_destroy(md); + else + dm_destroy_immediate(md); + + /* + * Some mapped devices may be using other mapped + * devices, so repeat until we make no further + * progress. If a new mapped device is created + * here it will also get removed. + */ + goto retry; } } + up_write(&_hash_lock); + + if (dev_skipped) + DMWARN("remove_all left %d open device(s)", dev_skipped); +} + +/* + * Set the uuid of a hash_cell that isn't already set. 
+ */ +static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid) +{ + mutex_lock(&dm_hash_cells_mutex); + hc->uuid = new_uuid; + mutex_unlock(&dm_hash_cells_mutex); + + list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid)); +} + +/* + * Changes the name of a hash_cell and returns the old name for + * the caller to free. + */ +static char *__change_cell_name(struct hash_cell *hc, char *new_name) +{ + char *old_name; + /* - * Some mapped devices may be using other mapped devices, so if any - * still exist, repeat until we make no further progress. + * Rename and move the name cell. */ - if (dev_skipped) { - if (dev_removed) - goto retry; + list_del(&hc->name_list); + old_name = hc->name; - DMWARN("remove_all left %d open device(s)", dev_skipped); - } + mutex_lock(&dm_hash_cells_mutex); + hc->name = new_name; + mutex_unlock(&dm_hash_cells_mutex); - up_write(&_hash_lock); + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + return old_name; } -static int dm_hash_rename(const char *old, const char *new) +static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, + const char *new) { - char *new_name, *old_name; + char *new_data, *old_name = NULL; struct hash_cell *hc; struct dm_table *table; + struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; + int srcu_idx; /* * duplicate new. */ - new_name = kstrdup(new, GFP_KERNEL); - if (!new_name) - return -ENOMEM; + new_data = kstrdup(new, GFP_KERNEL); + if (!new_data) + return ERR_PTR(-ENOMEM); down_write(&_hash_lock); /* * Is new free ? */ - hc = __get_name_cell(new); + if (change_uuid) + hc = __get_uuid_cell(new); + else + hc = __get_name_cell(new); + if (hc) { - DMWARN("asked to rename to an already existing name %s -> %s", - old, new); + DMWARN("Unable to change %s on mapped device %s to one that " + "already exists: %s", + change_uuid ? "uuid" : "name", + param->name, new); dm_put(hc->md); up_write(&_hash_lock); - kfree(new_name); - return -EBUSY; + kfree(new_data); + return ERR_PTR(-EBUSY); } /* * Is there such a device as 'old' ? */ - hc = __get_name_cell(old); + hc = __get_name_cell(param->name); if (!hc) { - DMWARN("asked to rename a non existent device %s -> %s", - old, new); + DMWARN("Unable to rename non-existent device, %s to %s%s", + param->name, change_uuid ? "uuid " : "", new); up_write(&_hash_lock); - kfree(new_name); - return -ENXIO; + kfree(new_data); + return ERR_PTR(-ENXIO); } /* - * rename and move the name cell. + * Does this device already have a uuid? */ - list_del(&hc->name_list); - old_name = hc->name; - hc->name = new_name; - list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + if (change_uuid && hc->uuid) { + DMWARN("Unable to change uuid of mapped device %s to %s " + "because uuid is already set to %s", + param->name, new, hc->uuid); + dm_put(hc->md); + up_write(&_hash_lock); + kfree(new_data); + return ERR_PTR(-EINVAL); + } + + if (change_uuid) + __set_cell_uuid(hc, new_data); + else + old_name = __change_cell_name(hc, new_data); /* * Wake up any dm event waiters. 
*/ - table = dm_get_table(hc->md); - if (table) { + table = dm_get_live_table(hc->md, &srcu_idx); + if (table) dm_table_event(table); - dm_table_put(table); - } + dm_put_live_table(hc->md, srcu_idx); - dm_kobject_uevent(hc->md); + if (!dm_kobject_uevent(hc->md, KOBJ_CHANGE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; - dm_put(hc->md); + md = hc->md; up_write(&_hash_lock); kfree(old_name); - return 0; + + return md; +} + +void dm_deferred_remove(void) +{ + dm_hash_remove_all(true, false, true); } /*----------------------------------------------------------------- @@ -352,7 +467,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); static int remove_all(struct dm_ioctl *param, size_t param_size) { - dm_hash_remove_all(1); + dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false); param->data_size = 0; return 0; } @@ -426,7 +541,7 @@ static int list_devices(struct dm_ioctl *param, size_t param_size) old_nl->next = (uint32_t) ((void *) nl - (void *) old_nl); disk = dm_disk(hc->md); - nl->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); + nl->dev = huge_encode_dev(disk_devt(disk)); nl->next = 0; strcpy(nl->name, hc->name); @@ -512,8 +627,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size) return 0; } - - static int check_name(const char *name) { if (strchr(name, '/')) { @@ -525,21 +638,61 @@ static int check_name(const char *name) } /* + * On successful return, the caller must not attempt to acquire + * _hash_lock without first calling dm_table_put, because dm_table_destroy + * waits for this dm_table_put and could be called under this lock. + */ +static struct dm_table *dm_get_inactive_table(struct mapped_device *md, int *srcu_idx) +{ + struct hash_cell *hc; + struct dm_table *table = NULL; + + /* increment rcu count, we don't care about the table pointer */ + dm_get_live_table(md, srcu_idx); + + down_read(&_hash_lock); + hc = dm_get_mdptr(md); + if (!hc || hc->md != md) { + DMWARN("device has been removed from the dev hash table."); + goto out; + } + + table = hc->new_map; + +out: + up_read(&_hash_lock); + + return table; +} + +static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md, + struct dm_ioctl *param, + int *srcu_idx) +{ + return (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) ? + dm_get_inactive_table(md, srcu_idx) : dm_get_live_table(md, srcu_idx); +} + +/* * Fills in a dm_ioctl structure, ready for sending back to * userland. 
*/ -static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) +static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) { struct gendisk *disk = dm_disk(md); struct dm_table *table; + int srcu_idx; param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | DM_ACTIVE_PRESENT_FLAG); - if (dm_suspended(md)) + if (dm_suspended_md(md)) param->flags |= DM_SUSPEND_FLAG; - param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor)); + if (dm_test_deferred_remove_flag(md)) + param->flags |= DM_DEFERRED_REMOVE; + + param->dev = huge_encode_dev(disk_devt(disk)); /* * Yes, this will be out of date by the time it gets back @@ -548,20 +701,31 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) */ param->open_count = dm_open_count(md); - if (disk->policy) - param->flags |= DM_READONLY_FLAG; - param->event_nr = dm_get_event_nr(md); + param->target_count = 0; - table = dm_get_table(md); + table = dm_get_live_table(md, &srcu_idx); if (table) { - param->flags |= DM_ACTIVE_PRESENT_FLAG; - param->target_count = dm_table_get_num_targets(table); - dm_table_put(table); - } else - param->target_count = 0; + if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) { + if (get_disk_ro(disk)) + param->flags |= DM_READONLY_FLAG; + param->target_count = dm_table_get_num_targets(table); + } - return 0; + param->flags |= DM_ACTIVE_PRESENT_FLAG; + } + dm_put_live_table(md, srcu_idx); + + if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) { + int srcu_idx; + table = dm_get_inactive_table(md, &srcu_idx); + if (table) { + if (!(dm_table_get_mode(table) & FMODE_WRITE)) + param->flags |= DM_READONLY_FLAG; + param->target_count = dm_table_get_num_targets(table); + } + dm_put_live_table(md, srcu_idx); + } } static int dev_create(struct dm_ioctl *param, size_t param_size) @@ -583,15 +747,17 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); if (r) { dm_put(md); + dm_destroy(md); return r; } param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(md, param); + __dev_status(md, param); + dm_put(md); - return r; + return 0; } /* @@ -599,25 +765,45 @@ static int dev_create(struct dm_ioctl *param, size_t param_size) */ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) { - struct mapped_device *md; - void *mdptr = NULL; + struct hash_cell *hc = NULL; - if (*param->uuid) - return __get_uuid_cell(param->uuid); + if (*param->uuid) { + if (*param->name || param->dev) + return NULL; - if (*param->name) - return __get_name_cell(param->name); + hc = __get_uuid_cell(param->uuid); + if (!hc) + return NULL; + } else if (*param->name) { + if (param->dev) + return NULL; - md = dm_get_md(huge_decode_dev(param->dev)); - if (!md) - goto out; + hc = __get_name_cell(param->name); + if (!hc) + return NULL; + } else if (param->dev) { + hc = __get_dev_cell(param->dev); + if (!hc) + return NULL; + } else + return NULL; - mdptr = dm_get_mdptr(md); - if (!mdptr) - dm_put(md); + /* + * Sneakily write in both the name and the uuid + * while we have the cell. 
+ */ + strlcpy(param->name, hc->name, sizeof(param->name)); + if (hc->uuid) + strlcpy(param->uuid, hc->uuid, sizeof(param->uuid)); + else + param->uuid[0] = '\0'; -out: - return mdptr; + if (hc->new_map) + param->flags |= DM_INACTIVE_PRESENT_FLAG; + else + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + return hc; } static struct mapped_device *find_device(struct dm_ioctl *param) @@ -627,24 +813,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param) down_read(&_hash_lock); hc = __find_device_hash_cell(param); - if (hc) { + if (hc) md = hc->md; - - /* - * Sneakily write in both the name and the uuid - * while we have the cell. - */ - strncpy(param->name, hc->name, sizeof(param->name)); - if (hc->uuid) - strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1); - else - param->uuid[0] = '\0'; - - if (hc->new_map) - param->flags |= DM_INACTIVE_PRESENT_FLAG; - else - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - } up_read(&_hash_lock); return md; @@ -655,12 +825,13 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) struct hash_cell *hc; struct mapped_device *md; int r; + struct dm_table *t; down_write(&_hash_lock); hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -670,18 +841,34 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) /* * Ensure the device is not open and nothing further can open it. */ - r = dm_lock_for_deletion(md); + r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false); if (r) { - DMWARN("unable to remove open device %s", hc->name); + if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) { + up_write(&_hash_lock); + dm_put(md); + return 0; + } + DMDEBUG_LIMIT("unable to remove open device %s", hc->name); up_write(&_hash_lock); dm_put(md); return r; } - __hash_remove(hc); + t = __hash_remove(hc); up_write(&_hash_lock); + + if (t) { + dm_sync_table(md); + dm_table_destroy(t); + } + + param->flags &= ~DM_DEFERRED_REMOVE; + + if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; + dm_put(md); - param->data_size = 0; + dm_destroy(md); return 0; } @@ -701,20 +888,31 @@ static int invalid_str(char *str, void *end) static int dev_rename(struct dm_ioctl *param, size_t param_size) { int r; - char *new_name = (char *) param + param->data_start; + char *new_data = (char *) param + param->data_start; + struct mapped_device *md; + unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0; - if (new_name < param->data || - invalid_str(new_name, (void *) param + param_size)) { - DMWARN("Invalid new logical volume name supplied."); + if (new_data < param->data || + invalid_str(new_data, (void *) param + param_size) || !*new_data || + strlen(new_data) > (change_uuid ? 
DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) { + DMWARN("Invalid new mapped device name or uuid string supplied."); return -EINVAL; } - r = check_name(new_name); - if (r) - return r; + if (!change_uuid) { + r = check_name(new_data); + if (r) + return r; + } - param->data_size = 0; - return dm_hash_rename(param->name, new_name); + md = dm_hash_rename(param, new_data); + if (IS_ERR(md)) + return PTR_ERR(md); + + __dev_status(md, param); + dm_put(md); + + return 0; } static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) @@ -724,6 +922,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) struct hd_geometry geometry; unsigned long indata[4]; char *geostr = (char *) param + param->data_start; + char dummy; md = find_device(param); if (!md) @@ -735,8 +934,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) goto out; } - x = sscanf(geostr, "%lu %lu %lu %lu", indata, - indata + 1, indata + 2, indata + 3); + x = sscanf(geostr, "%lu %lu %lu %lu%c", indata, + indata + 1, indata + 2, indata + 3, &dummy); if (x != 4) { DMWARN("Unable to interpret geometry settings."); @@ -755,8 +954,6 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) geometry.start = indata[3]; r = dm_set_geometry(md, &geometry); - if (!r) - r = __dev_status(md, param); param->data_size = 0; @@ -780,13 +977,17 @@ static int do_suspend(struct dm_ioctl *param) if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - if (!dm_suspended(md)) + if (!dm_suspended_md(md)) { r = dm_suspend(md, suspend_flags); + if (r) + goto out; + } - if (!r) - r = __dev_status(md, param); + __dev_status(md, param); +out: dm_put(md); + return r; } @@ -796,13 +997,13 @@ static int do_resume(struct dm_ioctl *param) unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG; struct hash_cell *hc; struct mapped_device *md; - struct dm_table *new_map; + struct dm_table *new_map, *old_map = NULL; down_write(&_hash_lock); hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } @@ -822,29 +1023,38 @@ static int do_resume(struct dm_ioctl *param) suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; if (param->flags & DM_NOFLUSH_FLAG) suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; - if (!dm_suspended(md)) + if (!dm_suspended_md(md)) dm_suspend(md, suspend_flags); - r = dm_swap_table(md, new_map); - if (r) { + old_map = dm_swap_table(md, new_map); + if (IS_ERR(old_map)) { + dm_sync_table(md); + dm_table_destroy(new_map); dm_put(md); - dm_table_put(new_map); - return r; + return PTR_ERR(old_map); } if (dm_table_get_mode(new_map) & FMODE_WRITE) set_disk_ro(dm_disk(md), 0); else set_disk_ro(dm_disk(md), 1); - - dm_table_put(new_map); } - if (dm_suspended(md)) + if (dm_suspended_md(md)) { r = dm_resume(md); + if (!r && !dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr)) + param->flags |= DM_UEVENT_GENERATED_FLAG; + } + + /* + * Since dm_swap_table synchronizes RCU, nobody should be in + * read-side critical section already. 
+ */ + if (old_map) + dm_table_destroy(old_map); if (!r) - r = __dev_status(md, param); + __dev_status(md, param); dm_put(md); return r; @@ -868,16 +1078,16 @@ static int dev_suspend(struct dm_ioctl *param, size_t param_size) */ static int dev_status(struct dm_ioctl *param, size_t param_size) { - int r; struct mapped_device *md; md = find_device(param); if (!md) return -ENXIO; - r = __dev_status(md, param); + __dev_status(md, param); dm_put(md); - return r; + + return 0; } /* @@ -891,6 +1101,7 @@ static void retrieve_status(struct dm_table *table, char *outbuf, *outptr; status_type_t type; size_t remaining, len, used = 0; + unsigned status_flags = 0; outptr = outbuf = get_result_buffer(param, param_size, &len); @@ -903,6 +1114,7 @@ static void retrieve_status(struct dm_table *table, num_targets = dm_table_get_num_targets(table); for (i = 0; i < num_targets; i++) { struct dm_target *ti = dm_table_get_target(table, i); + size_t l; remaining = len - (outptr - outbuf); if (remaining <= sizeof(struct dm_target_spec)) { @@ -927,14 +1139,19 @@ static void retrieve_status(struct dm_table *table, /* Get the status/table string from the target driver */ if (ti->type->status) { - if (ti->type->status(ti, type, outptr, remaining)) { - param->flags |= DM_BUFFER_FULL_FLAG; - break; - } + if (param->flags & DM_NOFLUSH_FLAG) + status_flags |= DM_STATUS_NOFLUSH_FLAG; + ti->type->status(ti, type, status_flags, outptr, remaining); } else outptr[0] = '\0'; - outptr += strlen(outptr) + 1; + l = strlen(outptr) + 1; + if (l == remaining) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + + outptr += l; used = param->data_start + (outptr - outbuf); outptr = align_ptr(outptr); @@ -952,9 +1169,10 @@ static void retrieve_status(struct dm_table *table, */ static int dev_wait(struct dm_ioctl *param, size_t param_size) { - int r; + int r = 0; struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) @@ -973,24 +1191,22 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size) * changed to trigger the event, so we may as well tell * him and save an ioctl. */ - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); - table = dm_get_table(md); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_status(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); - out: +out: dm_put(md); + return r; } -static inline int get_mode(struct dm_ioctl *param) +static inline fmode_t get_mode(struct dm_ioctl *param) { - int mode = FMODE_READ | FMODE_WRITE; + fmode_t mode = FMODE_READ | FMODE_WRITE; if (param->flags & DM_READONLY_FLAG) mode = FMODE_READ; @@ -1052,8 +1268,9 @@ static int table_load(struct dm_ioctl *param, size_t param_size) { int r; struct hash_cell *hc; - struct dm_table *t; + struct dm_table *t, *old_map = NULL; struct mapped_device *md; + struct target_type *immutable_target_type; md = find_device(param); if (!md) @@ -1061,33 +1278,72 @@ static int table_load(struct dm_ioctl *param, size_t param_size) r = dm_table_create(&t, get_mode(param), param->target_count, md); if (r) - goto out; + goto err; + /* Protect md->type and md->queue against concurrent table loads. 
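A small standalone sketch of the buffer-packing rule used by the retrieve_status() changes above: each status string is appended NUL-terminated, and a string that exactly fills the remaining space is treated as possibly truncated, so the caller is told to retry with a larger buffer (the DM_BUFFER_FULL_FLAG case). Names are illustrative only.

#include <stdio.h>
#include <string.h>

#define BUFFER_FULL 1

static int pack_status(char *buf, size_t len, const char **lines, int n,
		       size_t *used)
{
	char *out = buf;

	for (int i = 0; i < n; i++) {
		size_t remaining = len - (out - buf);
		size_t l;

		snprintf(out, remaining, "%s", lines[i]);
		l = strlen(out) + 1;
		if (l == remaining)
			return BUFFER_FULL;	/* possibly truncated: retry bigger */
		out += l;
	}
	*used = out - buf;
	return 0;
}

int main(void)
{
	const char *lines[] = { "0 8192 linear 8:16 0", "8192 8192 linear 8:32 0" };
	char buf[64];
	size_t used;

	if (pack_status(buf, sizeof(buf), lines, 2, &used))
		puts("buffer full, retry with a larger buffer");
	else
		printf("packed %zu bytes\n", used);
	return 0;
}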
*/ + dm_lock_md_type(md); r = populate_table(t, param, param_size); + if (r) + goto err_unlock_md_type; + + immutable_target_type = dm_get_immutable_target_type(md); + if (immutable_target_type && + (immutable_target_type != dm_table_get_immutable_target_type(t))) { + DMWARN("can't replace immutable target type %s", + immutable_target_type->name); + r = -EINVAL; + goto err_unlock_md_type; + } + + if (dm_get_md_type(md) == DM_TYPE_NONE) + /* Initial table load: acquire type of table. */ + dm_set_md_type(md, dm_table_get_type(t)); + else if (dm_get_md_type(md) != dm_table_get_type(t)) { + DMWARN("can't change device type after initial table load."); + r = -EINVAL; + goto err_unlock_md_type; + } + + /* setup md->queue to reflect md's type (may block) */ + r = dm_setup_md_queue(md); if (r) { - dm_table_put(t); - goto out; + DMWARN("unable to set up device queue for new table."); + goto err_unlock_md_type; } + dm_unlock_md_type(md); + /* stage inactive table */ down_write(&_hash_lock); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { DMWARN("device has been removed from the dev hash table."); - dm_table_put(t); up_write(&_hash_lock); r = -ENXIO; - goto out; + goto err_destroy_table; } if (hc->new_map) - dm_table_put(hc->new_map); + old_map = hc->new_map; hc->new_map = t; up_write(&_hash_lock); param->flags |= DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(md, param); + __dev_status(md, param); + + if (old_map) { + dm_sync_table(md); + dm_table_destroy(old_map); + } -out: + dm_put(md); + + return 0; + +err_unlock_md_type: + dm_unlock_md_type(md); +err_destroy_table: + dm_table_destroy(t); +err: dm_put(md); return r; @@ -1095,31 +1351,36 @@ out: static int table_clear(struct dm_ioctl *param, size_t param_size) { - int r; struct hash_cell *hc; struct mapped_device *md; + struct dm_table *old_map = NULL; down_write(&_hash_lock); hc = __find_device_hash_cell(param); if (!hc) { - DMWARN("device doesn't appear to be in the dev hash table."); + DMDEBUG_LIMIT("device doesn't appear to be in the dev hash table."); up_write(&_hash_lock); return -ENXIO; } if (hc->new_map) { - dm_table_put(hc->new_map); + old_map = hc->new_map; hc->new_map = NULL; } param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - r = __dev_status(hc->md, param); + __dev_status(hc->md, param); md = hc->md; up_write(&_hash_lock); + if (old_map) { + dm_sync_table(md); + dm_table_destroy(old_map); + } dm_put(md); - return r; + + return 0; } /* @@ -1131,7 +1392,7 @@ static void retrieve_deps(struct dm_table *table, unsigned int count = 0; struct list_head *tmp; size_t len, needed; - struct dm_dev *dd; + struct dm_dev_internal *dd; struct dm_target_deps *deps; deps = get_result_buffer(param, param_size, &len); @@ -1157,34 +1418,31 @@ static void retrieve_deps(struct dm_table *table, deps->count = count; count = 0; list_for_each_entry (dd, dm_table_get_devices(table), list) - deps->dev[count++] = huge_encode_dev(dd->bdev->bd_dev); + deps->dev[count++] = huge_encode_dev(dd->dm_dev.bdev->bd_dev); param->data_size = param->data_start + needed; } static int table_deps(struct dm_ioctl *param, size_t param_size) { - int r = 0; struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); - table = dm_get_table(md); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_deps(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); - out: dm_put(md); - 
return r; + + return 0; } /* @@ -1193,27 +1451,54 @@ static int table_deps(struct dm_ioctl *param, size_t param_size) */ static int table_status(struct dm_ioctl *param, size_t param_size) { - int r; struct mapped_device *md; struct dm_table *table; + int srcu_idx; md = find_device(param); if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; + __dev_status(md, param); - table = dm_get_table(md); - if (table) { + table = dm_get_live_or_inactive_table(md, param, &srcu_idx); + if (table) retrieve_status(table, param, param_size); - dm_table_put(table); - } + dm_put_live_table(md, srcu_idx); - out: dm_put(md); - return r; + + return 0; +} + +/* + * Process device-mapper dependent messages. Messages prefixed with '@' + * are processed by the DM core. All others are delivered to the target. + * Returns a number <= 1 if message was processed by device mapper. + * Returns 2 if message should be delivered to the target. + */ +static int message_for_md(struct mapped_device *md, unsigned argc, char **argv, + char *result, unsigned maxlen) +{ + int r; + + if (**argv != '@') + return 2; /* no '@' prefix, deliver to target */ + + if (!strcasecmp(argv[0], "@cancel_deferred_remove")) { + if (argc != 1) { + DMERR("Invalid arguments for @cancel_deferred_remove"); + return -EINVAL; + } + return dm_cancel_deferred_remove(md); + } + + r = dm_stats_message(md, argc, argv, result, maxlen); + if (r < 2) + return r; + + DMERR("Unsupported message sent to DM core: %s", argv[0]); + return -EINVAL; } /* @@ -1227,15 +1512,14 @@ static int target_message(struct dm_ioctl *param, size_t param_size) struct dm_table *table; struct dm_target *ti; struct dm_target_msg *tmsg = (void *) param + param->data_start; + size_t maxlen; + char *result = get_result_buffer(param, param_size, &maxlen); + int srcu_idx; md = find_device(param); if (!md) return -ENXIO; - r = __dev_status(md, param); - if (r) - goto out; - if (tmsg < (struct dm_target_msg *) param->data || invalid_str(tmsg->message, (void *) param + param_size)) { DMWARN("Invalid target message parameters."); @@ -1249,10 +1533,24 @@ static int target_message(struct dm_ioctl *param, size_t param_size) goto out; } - table = dm_get_table(md); - if (!table) + if (!argc) { + DMWARN("Empty message received."); + goto out_argv; + } + + r = message_for_md(md, argc, argv, result, maxlen); + if (r <= 1) goto out_argv; + table = dm_get_live_table(md, &srcu_idx); + if (!table) + goto out_table; + + if (dm_deleting_md(md)) { + r = -ENXIO; + goto out_table; + } + ti = dm_table_find_target(table, tmsg->sector); if (!dm_target_is_valid(ti)) { DMWARN("Target message sector outside device."); @@ -1264,48 +1562,73 @@ static int target_message(struct dm_ioctl *param, size_t param_size) r = -EINVAL; } - dm_table_put(table); + out_table: + dm_put_live_table(md, srcu_idx); out_argv: kfree(argv); out: - param->data_size = 0; + if (r >= 0) + __dev_status(md, param); + + if (r == 1) { + param->flags |= DM_DATA_OUT_FLAG; + if (dm_message_test_buffer_overflow(result, maxlen)) + param->flags |= DM_BUFFER_FULL_FLAG; + else + param->data_size = param->data_start + strlen(result) + 1; + r = 0; + } + dm_put(md); return r; } +/* + * The ioctl parameter block consists of two parts, a dm_ioctl struct + * followed by a data buffer. This flag is set if the second part, + * which has a variable size, is not used by the function processing + * the ioctl. 
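A sketch of the dispatch convention documented for message_for_md() above: a return value of 2 means "not for the DM core, deliver the message to the target", while anything <= 1 means the core consumed it. The handler below is a userspace stand-in, not the kernel function.

#include <stdio.h>
#include <string.h>
#include <strings.h>

#define DELIVER_TO_TARGET 2

static int core_message(int argc, char **argv)
{
	if (argv[0][0] != '@')
		return DELIVER_TO_TARGET;

	if (!strcasecmp(argv[0], "@cancel_deferred_remove"))
		return argc == 1 ? 0 : -22;	/* -EINVAL on extra arguments */

	/* Unknown '@' message: an error, never forwarded to the target. */
	return -22;
}

int main(void)
{
	char *msg1[] = { "@cancel_deferred_remove" };
	char *msg2[] = { "trim" };

	printf("%d\n", core_message(1, msg1));	/* 0: handled by the core */
	printf("%d\n", core_message(1, msg2));	/* 2: deliver to the target */
	return 0;
}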
+ */ +#define IOCTL_FLAGS_NO_PARAMS 1 + /*----------------------------------------------------------------- * Implementation of open/close/ioctl on the special char * device. *---------------------------------------------------------------*/ -static ioctl_fn lookup_ioctl(unsigned int cmd) +static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) { static struct { int cmd; + int flags; ioctl_fn fn; } _ioctls[] = { - {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ - {DM_REMOVE_ALL_CMD, remove_all}, - {DM_LIST_DEVICES_CMD, list_devices}, - - {DM_DEV_CREATE_CMD, dev_create}, - {DM_DEV_REMOVE_CMD, dev_remove}, - {DM_DEV_RENAME_CMD, dev_rename}, - {DM_DEV_SUSPEND_CMD, dev_suspend}, - {DM_DEV_STATUS_CMD, dev_status}, - {DM_DEV_WAIT_CMD, dev_wait}, - - {DM_TABLE_LOAD_CMD, table_load}, - {DM_TABLE_CLEAR_CMD, table_clear}, - {DM_TABLE_DEPS_CMD, table_deps}, - {DM_TABLE_STATUS_CMD, table_status}, - - {DM_LIST_VERSIONS_CMD, list_versions}, - - {DM_TARGET_MSG_CMD, target_message}, - {DM_DEV_SET_GEOMETRY_CMD, dev_set_geometry} + {DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */ + {DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all}, + {DM_LIST_DEVICES_CMD, 0, list_devices}, + + {DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create}, + {DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove}, + {DM_DEV_RENAME_CMD, 0, dev_rename}, + {DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend}, + {DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status}, + {DM_DEV_WAIT_CMD, 0, dev_wait}, + + {DM_TABLE_LOAD_CMD, 0, table_load}, + {DM_TABLE_CLEAR_CMD, IOCTL_FLAGS_NO_PARAMS, table_clear}, + {DM_TABLE_DEPS_CMD, 0, table_deps}, + {DM_TABLE_STATUS_CMD, 0, table_status}, + + {DM_LIST_VERSIONS_CMD, 0, list_versions}, + + {DM_TARGET_MSG_CMD, 0, target_message}, + {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry} }; - return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; + if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) + return NULL; + + *ioctl_flags = _ioctls[cmd].flags; + return _ioctls[cmd].fn; } /* @@ -1342,38 +1665,103 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user) return r; } -static void free_params(struct dm_ioctl *param) +#define DM_PARAMS_KMALLOC 0x0001 /* Params alloced with kmalloc */ +#define DM_PARAMS_VMALLOC 0x0002 /* Params alloced with vmalloc */ +#define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */ + +static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags) { - vfree(param); + if (param_flags & DM_WIPE_BUFFER) + memset(param, 0, param_size); + + if (param_flags & DM_PARAMS_KMALLOC) + kfree(param); + if (param_flags & DM_PARAMS_VMALLOC) + vfree(param); } -static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) +static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel, + int ioctl_flags, + struct dm_ioctl **param, int *param_flags) { - struct dm_ioctl tmp, *dmi; + struct dm_ioctl *dmi; + int secure_data; + const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data); - if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) + if (copy_from_user(param_kernel, user, minimum_data_size)) return -EFAULT; - if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) + if (param_kernel->data_size < minimum_data_size) return -EINVAL; - dmi = vmalloc(tmp.data_size); - if (!dmi) + secure_data = param_kernel->flags & DM_SECURE_DATA_FLAG; + + *param_flags = secure_data ? 
DM_WIPE_BUFFER : 0; + + if (ioctl_flags & IOCTL_FLAGS_NO_PARAMS) { + dmi = param_kernel; + dmi->data_size = minimum_data_size; + goto data_copied; + } + + /* + * Try to avoid low memory issues when a device is suspended. + * Use kmalloc() rather than vmalloc() when we can. + */ + dmi = NULL; + if (param_kernel->data_size <= KMALLOC_MAX_SIZE) { + dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (dmi) + *param_flags |= DM_PARAMS_KMALLOC; + } + + if (!dmi) { + unsigned noio_flag; + noio_flag = memalloc_noio_save(); + dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_REPEAT | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL); + memalloc_noio_restore(noio_flag); + if (dmi) + *param_flags |= DM_PARAMS_VMALLOC; + } + + if (!dmi) { + if (secure_data && clear_user(user, param_kernel->data_size)) + return -EFAULT; return -ENOMEM; + } - if (copy_from_user(dmi, user, tmp.data_size)) { - vfree(dmi); - return -EFAULT; + if (copy_from_user(dmi, user, param_kernel->data_size)) + goto bad; + +data_copied: + /* + * Abort if something changed the ioctl data while it was being copied. + */ + if (dmi->data_size != param_kernel->data_size) { + DMERR("rejecting ioctl: data size modified while processing parameters"); + goto bad; } + /* Wipe the user buffer so we do not return it to userspace */ + if (secure_data && clear_user(user, param_kernel->data_size)) + goto bad; + *param = dmi; return 0; + +bad: + free_params(dmi, param_kernel->data_size, *param_flags); + + return -EFAULT; } static int validate_params(uint cmd, struct dm_ioctl *param) { /* Always clear this flag */ param->flags &= ~DM_BUFFER_FULL_FLAG; + param->flags &= ~DM_UEVENT_GENERATED_FLAG; + param->flags &= ~DM_SECURE_DATA_FLAG; + param->flags &= ~DM_DATA_OUT_FLAG; /* Ignores parameters */ if (cmd == DM_REMOVE_ALL_CMD || @@ -1401,10 +1789,13 @@ static int validate_params(uint cmd, struct dm_ioctl *param) static int ctl_ioctl(uint command, struct dm_ioctl __user *user) { int r = 0; + int ioctl_flags; + int param_flags; unsigned int cmd; struct dm_ioctl *uninitialized_var(param); ioctl_fn fn = NULL; - size_t param_size; + size_t input_param_size; + struct dm_ioctl param_kernel; /* only root can play with this */ if (!capable(CAP_SYS_ADMIN)) @@ -1429,35 +1820,31 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) if (cmd == DM_VERSION_CMD) return 0; - fn = lookup_ioctl(cmd); + fn = lookup_ioctl(cmd, &ioctl_flags); if (!fn) { DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); return -ENOTTY; } /* - * Trying to avoid low memory issues when a device is - * suspended. - */ - current->flags |= PF_MEMALLOC; - - /* * Copy the parameters into kernel space. */ - r = copy_params(user, ¶m); - - current->flags &= ~PF_MEMALLOC; + r = copy_params(user, ¶m_kernel, ioctl_flags, ¶m, ¶m_flags); if (r) return r; + input_param_size = param->data_size; r = validate_params(cmd, param); if (r) goto out; - param_size = param->data_size; param->data_size = sizeof(*param); - r = fn(param, param_size); + r = fn(param, input_param_size); + + if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) && + unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS)) + DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd); /* * Copy the results back to userland. 
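The copy_params() rework above prefers a cheap allocation for small parameter blocks, falls back to a larger allocator otherwise, records which one was used so free_params() can release it correctly, and wipes "secure" buffers before freeing. A minimal userspace sketch of that bookkeeping, with both allocators standing in as malloc():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PARAMS_SMALL  0x0001	/* analogue of DM_PARAMS_KMALLOC */
#define PARAMS_LARGE  0x0002	/* analogue of DM_PARAMS_VMALLOC */
#define WIPE_BUFFER   0x0010	/* analogue of DM_WIPE_BUFFER */

#define SMALL_LIMIT   (128 * 1024)

static void *alloc_params(size_t size, int secure, int *flags)
{
	void *p = NULL;

	*flags = secure ? WIPE_BUFFER : 0;

	if (size <= SMALL_LIMIT) {
		p = malloc(size);	/* the "kmalloc" attempt */
		if (p)
			*flags |= PARAMS_SMALL;
	}
	if (!p) {
		p = malloc(size);	/* the "vmalloc" fallback */
		if (p)
			*flags |= PARAMS_LARGE;
	}
	return p;
}

static void free_params(void *p, size_t size, int flags)
{
	if (flags & WIPE_BUFFER)
		memset(p, 0, size);	/* don't leak key material back */
	free(p);			/* kfree() or vfree() in the kernel */
}

int main(void)
{
	int flags;
	void *p = alloc_params(4096, 1, &flags);

	printf("flags=%#x\n", flags);	/* 0x11: small allocation + wipe */
	free_params(p, 4096, flags);
	return 0;
}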
@@ -1465,8 +1852,8 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) if (!r && copy_to_user(user, param, param->data_size)) r = -EFAULT; - out: - free_params(param); +out: + free_params(param, input_param_size, param_flags); return r; } @@ -1485,17 +1872,23 @@ static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u) #endif static const struct file_operations _ctl_fops = { + .open = nonseekable_open, .unlocked_ioctl = dm_ctl_ioctl, .compat_ioctl = dm_compat_ctl_ioctl, .owner = THIS_MODULE, + .llseek = noop_llseek, }; static struct miscdevice _dm_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = MAPPER_CTRL_MINOR, .name = DM_NAME, + .nodename = DM_DIR "/" DM_CONTROL_NODE, .fops = &_ctl_fops }; +MODULE_ALIAS_MISCDEV(MAPPER_CTRL_MINOR); +MODULE_ALIAS("devname:" DM_DIR "/" DM_CONTROL_NODE); + /* * Create misc character device and link to DM_DIR/control. */ @@ -1542,20 +1935,20 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid) if (!md) return -ENXIO; - dm_get(md); - down_read(&_hash_lock); + mutex_lock(&dm_hash_cells_mutex); hc = dm_get_mdptr(md); if (!hc || hc->md != md) { r = -ENXIO; goto out; } - strcpy(name, hc->name); - strcpy(uuid, hc->uuid ? : ""); + if (name) + strcpy(name, hc->name); + if (uuid) + strcpy(uuid, hc->uuid ? : ""); out: - up_read(&_hash_lock); - dm_put(md); + mutex_unlock(&dm_hash_cells_mutex); return r; } diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 996802b8a45..3a7cade5e27 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -10,7 +10,7 @@ */ #include <linux/types.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/init.h> @@ -22,19 +22,25 @@ #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/mutex.h> +#include <linux/delay.h> +#include <linux/device-mapper.h> #include <linux/dm-kcopyd.h> #include "dm.h" +#define SUB_JOB_SIZE 128 +#define SPLIT_COUNT 8 +#define MIN_JOBS 8 +#define RESERVE_PAGES (DIV_ROUND_UP(SUB_JOB_SIZE << SECTOR_SHIFT, PAGE_SIZE)) + /*----------------------------------------------------------------- * Each kcopyd client has its own little pool of preallocated * pages for kcopyd io. *---------------------------------------------------------------*/ struct dm_kcopyd_client { - spinlock_t lock; struct page_list *pages; - unsigned int nr_pages; - unsigned int nr_free_pages; + unsigned nr_reserved_pages; + unsigned nr_free_pages; struct dm_io_client *io_client; @@ -46,6 +52,8 @@ struct dm_kcopyd_client { struct workqueue_struct *kcopyd_wq; struct work_struct kcopyd_work; + struct dm_kcopyd_throttle *throttle; + /* * We maintain three lists of jobs: * @@ -61,20 +69,136 @@ struct dm_kcopyd_client { struct list_head pages_jobs; }; +static struct page_list zero_page_list; + +static DEFINE_SPINLOCK(throttle_spinlock); + +/* + * IO/IDLE accounting slowly decays after (1 << ACCOUNT_INTERVAL_SHIFT) period. + * When total_period >= (1 << ACCOUNT_INTERVAL_SHIFT) the counters are divided + * by 2. + */ +#define ACCOUNT_INTERVAL_SHIFT SHIFT_HZ + +/* + * Sleep this number of milliseconds. + * + * The value was decided experimentally. + * Smaller values seem to cause an increased copy rate above the limit. + * The reason for this is unknown but possibly due to jiffies rounding errors + * or read/write cache inside the disk. + */ +#define SLEEP_MSEC 100 + +/* + * Maximum number of sleep events. 
There is a theoretical livelock if more + * kcopyd clients do work simultaneously which this limit avoids. + */ +#define MAX_SLEEPS 10 + +static void io_job_start(struct dm_kcopyd_throttle *t) +{ + unsigned throttle, now, difference; + int slept = 0, skew; + + if (unlikely(!t)) + return; + +try_again: + spin_lock_irq(&throttle_spinlock); + + throttle = ACCESS_ONCE(t->throttle); + + if (likely(throttle >= 100)) + goto skip_limit; + + now = jiffies; + difference = now - t->last_jiffies; + t->last_jiffies = now; + if (t->num_io_jobs) + t->io_period += difference; + t->total_period += difference; + + /* + * Maintain sane values if we got a temporary overflow. + */ + if (unlikely(t->io_period > t->total_period)) + t->io_period = t->total_period; + + if (unlikely(t->total_period >= (1 << ACCOUNT_INTERVAL_SHIFT))) { + int shift = fls(t->total_period >> ACCOUNT_INTERVAL_SHIFT); + t->total_period >>= shift; + t->io_period >>= shift; + } + + skew = t->io_period - throttle * t->total_period / 100; + + if (unlikely(skew > 0) && slept < MAX_SLEEPS) { + slept++; + spin_unlock_irq(&throttle_spinlock); + msleep(SLEEP_MSEC); + goto try_again; + } + +skip_limit: + t->num_io_jobs++; + + spin_unlock_irq(&throttle_spinlock); +} + +static void io_job_finish(struct dm_kcopyd_throttle *t) +{ + unsigned long flags; + + if (unlikely(!t)) + return; + + spin_lock_irqsave(&throttle_spinlock, flags); + + t->num_io_jobs--; + + if (likely(ACCESS_ONCE(t->throttle) >= 100)) + goto skip_limit; + + if (!t->num_io_jobs) { + unsigned now, difference; + + now = jiffies; + difference = now - t->last_jiffies; + t->last_jiffies = now; + + t->io_period += difference; + t->total_period += difference; + + /* + * Maintain sane values if we got a temporary overflow. + */ + if (unlikely(t->io_period > t->total_period)) + t->io_period = t->total_period; + } + +skip_limit: + spin_unlock_irqrestore(&throttle_spinlock, flags); +} + + static void wake(struct dm_kcopyd_client *kc) { queue_work(kc->kcopyd_wq, &kc->kcopyd_work); } -static struct page_list *alloc_pl(void) +/* + * Obtain one page for the use of kcopyd. + */ +static struct page_list *alloc_pl(gfp_t gfp) { struct page_list *pl; - pl = kmalloc(sizeof(*pl), GFP_KERNEL); + pl = kmalloc(sizeof(*pl), gfp); if (!pl) return NULL; - pl->page = alloc_page(GFP_KERNEL); + pl->page = alloc_page(gfp); if (!pl->page) { kfree(pl); return NULL; @@ -89,41 +213,56 @@ static void free_pl(struct page_list *pl) kfree(pl); } -static int kcopyd_get_pages(struct dm_kcopyd_client *kc, - unsigned int nr, struct page_list **pages) +/* + * Add the provided pages to a client's free page list, releasing + * back to the system any beyond the reserved_pages limit. 
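A standalone sketch of the throttle arithmetic described above: the copy sleeps while the share of time spent doing I/O exceeds the configured percentage, and both counters are scaled down together once the accounting interval is exceeded (the real code shifts by an fls()-derived amount; halving here is a simplification).

#include <stdio.h>

#define ACCOUNT_INTERVAL 1024	/* stand-in for 1 << ACCOUNT_INTERVAL_SHIFT */

struct throttle {
	unsigned throttle;	/* allowed I/O percentage; >= 100 disables */
	unsigned io_period;
	unsigned total_period;
};

/* Returns non-zero when the caller should sleep before issuing more I/O. */
static int over_limit(struct throttle *t)
{
	if (t->throttle >= 100)
		return 0;

	if (t->total_period >= ACCOUNT_INTERVAL) {
		t->total_period /= 2;	/* decay both counters together */
		t->io_period /= 2;
	}

	return (int)t->io_period - (int)(t->throttle * t->total_period / 100) > 0;
}

int main(void)
{
	struct throttle t = { .throttle = 25, .io_period = 300, .total_period = 1000 };

	/* 300 of 1000 ticks were I/O but only 25% is allowed: sleep. */
	printf("sleep: %d\n", over_limit(&t));

	t.io_period = 100;	/* 10%% <= 25%%: keep copying */
	printf("sleep: %d\n", over_limit(&t));
	return 0;
}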
+ */ +static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) { - struct page_list *pl; - - spin_lock(&kc->lock); - if (kc->nr_free_pages < nr) { - spin_unlock(&kc->lock); - return -ENOMEM; - } - - kc->nr_free_pages -= nr; - for (*pages = pl = kc->pages; --nr; pl = pl->next) - ; + struct page_list *next; - kc->pages = pl->next; - pl->next = NULL; + do { + next = pl->next; - spin_unlock(&kc->lock); + if (kc->nr_free_pages >= kc->nr_reserved_pages) + free_pl(pl); + else { + pl->next = kc->pages; + kc->pages = pl; + kc->nr_free_pages++; + } - return 0; + pl = next; + } while (pl); } -static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl) +static int kcopyd_get_pages(struct dm_kcopyd_client *kc, + unsigned int nr, struct page_list **pages) { - struct page_list *cursor; + struct page_list *pl; - spin_lock(&kc->lock); - for (cursor = pl; cursor->next; cursor = cursor->next) - kc->nr_free_pages++; + *pages = NULL; + + do { + pl = alloc_pl(__GFP_NOWARN | __GFP_NORETRY); + if (unlikely(!pl)) { + /* Use reserved pages */ + pl = kc->pages; + if (unlikely(!pl)) + goto out_of_memory; + kc->pages = pl->next; + kc->nr_free_pages--; + } + pl->next = *pages; + *pages = pl; + } while (--nr); - kc->nr_free_pages++; - cursor->next = kc->pages; - kc->pages = pl; - spin_unlock(&kc->lock); + return 0; + +out_of_memory: + if (*pages) + kcopyd_put_pages(kc, *pages); + return -ENOMEM; } /* @@ -140,13 +279,16 @@ static void drop_pages(struct page_list *pl) } } -static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr) +/* + * Allocate and reserve nr_pages for the use of a specific client. + */ +static int client_reserve_pages(struct dm_kcopyd_client *kc, unsigned nr_pages) { - unsigned int i; + unsigned i; struct page_list *pl = NULL, *next; - for (i = 0; i < nr; i++) { - next = alloc_pl(); + for (i = 0; i < nr_pages; i++) { + next = alloc_pl(GFP_KERNEL); if (!next) { if (pl) drop_pages(pl); @@ -156,17 +298,18 @@ static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr) pl = next; } + kc->nr_reserved_pages += nr_pages; kcopyd_put_pages(kc, pl); - kc->nr_pages += nr; + return 0; } static void client_free_pages(struct dm_kcopyd_client *kc) { - BUG_ON(kc->nr_free_pages != kc->nr_pages); + BUG_ON(kc->nr_free_pages != kc->nr_reserved_pages); drop_pages(kc->pages); kc->pages = NULL; - kc->nr_free_pages = kc->nr_pages = 0; + kc->nr_free_pages = kc->nr_reserved_pages = 0; } /*----------------------------------------------------------------- @@ -197,8 +340,6 @@ struct kcopyd_job { unsigned int num_dests; struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS]; - sector_t offset; - unsigned int nr_pages; struct page_list *pages; /* @@ -215,19 +356,23 @@ struct kcopyd_job { struct mutex lock; atomic_t sub_jobs; sector_t progress; -}; -/* FIXME: this should scale with the number of pages */ -#define MIN_JOBS 512 + struct kcopyd_job *master_job; +}; static struct kmem_cache *_job_cache; int __init dm_kcopyd_init(void) { - _job_cache = KMEM_CACHE(kcopyd_job, 0); + _job_cache = kmem_cache_create("kcopyd_job", + sizeof(struct kcopyd_job) * (SPLIT_COUNT + 1), + __alignof__(struct kcopyd_job), 0, NULL); if (!_job_cache) return -ENOMEM; + zero_page_list.next = &zero_page_list; + zero_page_list.page = ZERO_PAGE(0); + return 0; } @@ -268,6 +413,17 @@ static void push(struct list_head *jobs, struct kcopyd_job *job) spin_unlock_irqrestore(&kc->job_lock, flags); } + +static void push_head(struct list_head *jobs, struct kcopyd_job *job) +{ + unsigned long flags; + 
struct dm_kcopyd_client *kc = job->kc; + + spin_lock_irqsave(&kc->job_lock, flags); + list_add(&job->list, jobs); + spin_unlock_irqrestore(&kc->job_lock, flags); +} + /* * These three functions process 1 item from the corresponding * job list. @@ -285,8 +441,14 @@ static int run_complete_job(struct kcopyd_job *job) dm_kcopyd_notify_fn fn = job->fn; struct dm_kcopyd_client *kc = job->kc; - kcopyd_put_pages(kc, job->pages); - mempool_free(job, kc->job_pool); + if (job->pages && job->pages != &zero_page_list) + kcopyd_put_pages(kc, job->pages); + /* + * If this is the master job, the sub jobs have already + * completed so we can free everything. + */ + if (job->master_job == job) + mempool_free(job, kc->job_pool); fn(read_err, write_err, context); if (atomic_dec_and_test(&kc->nr_jobs)) @@ -300,8 +462,10 @@ static void complete_io(unsigned long error, void *context) struct kcopyd_job *job = (struct kcopyd_job *) context; struct dm_kcopyd_client *kc = job->kc; + io_job_finish(kc->throttle); + if (error) { - if (job->rw == WRITE) + if (job->rw & WRITE) job->write_err |= error; else job->read_err = 1; @@ -313,7 +477,7 @@ static void complete_io(unsigned long error, void *context) } } - if (job->rw == WRITE) + if (job->rw & WRITE) push(&kc->complete_jobs, job); else { @@ -332,15 +496,17 @@ static int run_io_job(struct kcopyd_job *job) { int r; struct dm_io_request io_req = { - .bi_rw = job->rw | (1 << BIO_RW_SYNC), + .bi_rw = job->rw, .mem.type = DM_IO_PAGE_LIST, .mem.ptr.pl = job->pages, - .mem.offset = job->offset, + .mem.offset = 0, .notify.fn = complete_io, .notify.context = job, .client = job->kc->io_client, }; + io_job_start(job->kc->throttle); + if (job->rw == READ) r = dm_io(&io_req, 1, &job->source, NULL); else @@ -352,10 +518,9 @@ static int run_io_job(struct kcopyd_job *job) static int run_pages_job(struct kcopyd_job *job) { int r; + unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9); - job->nr_pages = dm_div_up(job->dests[0].count + job->offset, - PAGE_SIZE >> 9); - r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); + r = kcopyd_get_pages(job->kc, nr_pages, &job->pages); if (!r) { /* this job is ready for io */ push(&job->kc->io_jobs, job); @@ -385,7 +550,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, if (r < 0) { /* error this rogue job */ - if (job->rw == WRITE) + if (job->rw & WRITE) job->write_err = (unsigned long) -1L; else job->read_err = 1; @@ -398,7 +563,7 @@ static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc, * We couldn't service this job ATM, so * push this job back onto the list. */ - push(jobs, job); + push_head(jobs, job); break; } @@ -415,6 +580,7 @@ static void do_work(struct work_struct *work) { struct dm_kcopyd_client *kc = container_of(work, struct dm_kcopyd_client, kcopyd_work); + struct blk_plug plug; /* * The order that these are called is *very* important. @@ -423,9 +589,11 @@ static void do_work(struct work_struct *work) * list. io jobs call wake when they complete and it all * starts again. 
*/ + blk_start_plug(&plug); process_jobs(&kc->complete_jobs, kc, run_complete_job); process_jobs(&kc->pages_jobs, kc, run_pages_job); process_jobs(&kc->io_jobs, kc, run_io_job); + blk_finish_plug(&plug); } /* @@ -437,18 +605,24 @@ static void dispatch_job(struct kcopyd_job *job) { struct dm_kcopyd_client *kc = job->kc; atomic_inc(&kc->nr_jobs); - push(&kc->pages_jobs, job); + if (unlikely(!job->source.count)) + push(&kc->complete_jobs, job); + else if (job->pages == &zero_page_list) + push(&kc->io_jobs, job); + else + push(&kc->pages_jobs, job); wake(kc); } -#define SUB_JOB_SIZE 128 static void segment_complete(int read_err, unsigned long write_err, void *context) { /* FIXME: tidy this function */ sector_t progress = 0; sector_t count = 0; - struct kcopyd_job *job = (struct kcopyd_job *) context; + struct kcopyd_job *sub_job = (struct kcopyd_job *) context; + struct kcopyd_job *job = sub_job->master_job; + struct dm_kcopyd_client *kc = job->kc; mutex_lock(&job->lock); @@ -478,8 +652,6 @@ static void segment_complete(int read_err, unsigned long write_err, if (count) { int i; - struct kcopyd_job *sub_job = mempool_alloc(job->kc->job_pool, - GFP_NOIO); *sub_job = *job; sub_job->source.sector += progress; @@ -491,34 +663,39 @@ static void segment_complete(int read_err, unsigned long write_err, } sub_job->fn = segment_complete; - sub_job->context = job; + sub_job->context = sub_job; dispatch_job(sub_job); } else if (atomic_dec_and_test(&job->sub_jobs)) { /* - * To avoid a race we must keep the job around - * until after the notify function has completed. - * Otherwise the client may try and stop the job - * after we've completed. + * Queue the completion callback to the kcopyd thread. + * + * Some callers assume that all the completions are called + * from a single thread and don't race with each other. + * + * We must not call the callback directly here because this + * code may not be executing in the thread. */ - job->fn(read_err, write_err, job->context); - mempool_free(job, job->kc->job_pool); + push(&kc->complete_jobs, job); + wake(kc); } } /* - * Create some little jobs that will do the move between - * them. + * Create some sub jobs to share the work between them. */ -#define SPLIT_COUNT 8 -static void split_job(struct kcopyd_job *job) +static void split_job(struct kcopyd_job *master_job) { int i; - atomic_set(&job->sub_jobs, SPLIT_COUNT); - for (i = 0; i < SPLIT_COUNT; i++) - segment_complete(0, 0u, job); + atomic_inc(&master_job->kc->nr_jobs); + + atomic_set(&master_job->sub_jobs, SPLIT_COUNT); + for (i = 0; i < SPLIT_COUNT; i++) { + master_job[i + 1].master_job = master_job; + segment_complete(0, 0u, &master_job[i + 1]); + } } int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, @@ -526,9 +703,11 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, unsigned int flags, dm_kcopyd_notify_fn fn, void *context) { struct kcopyd_job *job; + int i; /* - * Allocate a new job. + * Allocate an array of jobs consisting of one master job + * followed by SPLIT_COUNT sub jobs. 
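A sketch of the splitting scheme used by segment_complete()/split_job() above: the source range is handed out in SUB_JOB_SIZE-sector chunks to SPLIT_COUNT sub-jobs that draw from a shared progress counter. The loop below is single-threaded and only illustrates the chunking arithmetic.

#include <stdio.h>

#define SUB_JOB_SIZE 128	/* sectors per sub job */
#define SPLIT_COUNT  8

int main(void)
{
	unsigned long long count = 1000;	/* sectors left to copy */
	unsigned long long progress = 0;	/* sectors already handed out */
	int sub_jobs = 0;

	while (count) {
		unsigned long long chunk =
			count < SUB_JOB_SIZE ? count : SUB_JOB_SIZE;

		printf("sub job %d: sectors %llu..%llu\n",
		       sub_jobs % SPLIT_COUNT, progress, progress + chunk - 1);

		progress += chunk;
		count -= chunk;
		sub_jobs++;
	}
	printf("%d sub jobs dispatched\n", sub_jobs);
	return 0;
}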
*/ job = mempool_alloc(kc->job_pool, GFP_NOIO); @@ -539,23 +718,36 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, job->flags = flags; job->read_err = 0; job->write_err = 0; - job->rw = READ; - - job->source = *from; job->num_dests = num_dests; memcpy(&job->dests, dests, sizeof(*dests) * num_dests); - job->offset = 0; - job->nr_pages = 0; - job->pages = NULL; + if (from) { + job->source = *from; + job->pages = NULL; + job->rw = READ; + } else { + memset(&job->source, 0, sizeof job->source); + job->source.count = job->dests[0].count; + job->pages = &zero_page_list; + + /* + * Use WRITE SAME to optimize zeroing if all dests support it. + */ + job->rw = WRITE | REQ_WRITE_SAME; + for (i = 0; i < job->num_dests; i++) + if (!bdev_write_same(job->dests[i].bdev)) { + job->rw = WRITE; + break; + } + } job->fn = fn; job->context = context; + job->master_job = job; - if (job->source.count < SUB_JOB_SIZE) + if (job->source.count <= SUB_JOB_SIZE) dispatch_job(job); - else { mutex_init(&job->lock); job->progress = 0; @@ -566,6 +758,46 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, } EXPORT_SYMBOL(dm_kcopyd_copy); +int dm_kcopyd_zero(struct dm_kcopyd_client *kc, + unsigned num_dests, struct dm_io_region *dests, + unsigned flags, dm_kcopyd_notify_fn fn, void *context) +{ + return dm_kcopyd_copy(kc, NULL, num_dests, dests, flags, fn, context); +} +EXPORT_SYMBOL(dm_kcopyd_zero); + +void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc, + dm_kcopyd_notify_fn fn, void *context) +{ + struct kcopyd_job *job; + + job = mempool_alloc(kc->job_pool, GFP_NOIO); + + memset(job, 0, sizeof(struct kcopyd_job)); + job->kc = kc; + job->fn = fn; + job->context = context; + job->master_job = job; + + atomic_inc(&kc->nr_jobs); + + return job; +} +EXPORT_SYMBOL(dm_kcopyd_prepare_callback); + +void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) +{ + struct kcopyd_job *job = j; + struct dm_kcopyd_client *kc = job->kc; + + job->read_err = read_err; + job->write_err = write_err; + + push(&kc->complete_jobs, job); + wake(kc); +} +EXPORT_SYMBOL(dm_kcopyd_do_callback); + /* * Cancels a kcopyd job, eg. someone might be deactivating a * mirror. 
@@ -581,38 +813,37 @@ int kcopyd_cancel(struct kcopyd_job *job, int block) /*----------------------------------------------------------------- * Client setup *---------------------------------------------------------------*/ -int dm_kcopyd_client_create(unsigned int nr_pages, - struct dm_kcopyd_client **result) +struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle) { int r = -ENOMEM; struct dm_kcopyd_client *kc; kc = kmalloc(sizeof(*kc), GFP_KERNEL); if (!kc) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - spin_lock_init(&kc->lock); spin_lock_init(&kc->job_lock); INIT_LIST_HEAD(&kc->complete_jobs); INIT_LIST_HEAD(&kc->io_jobs); INIT_LIST_HEAD(&kc->pages_jobs); + kc->throttle = throttle; kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache); if (!kc->job_pool) goto bad_slab; INIT_WORK(&kc->kcopyd_work, do_work); - kc->kcopyd_wq = create_singlethread_workqueue("kcopyd"); + kc->kcopyd_wq = alloc_workqueue("kcopyd", WQ_MEM_RECLAIM, 0); if (!kc->kcopyd_wq) goto bad_workqueue; kc->pages = NULL; - kc->nr_pages = kc->nr_free_pages = 0; - r = client_alloc_pages(kc, nr_pages); + kc->nr_reserved_pages = kc->nr_free_pages = 0; + r = client_reserve_pages(kc, RESERVE_PAGES); if (r) goto bad_client_pages; - kc->io_client = dm_io_client_create(nr_pages); + kc->io_client = dm_io_client_create(); if (IS_ERR(kc->io_client)) { r = PTR_ERR(kc->io_client); goto bad_io_client; @@ -621,8 +852,7 @@ int dm_kcopyd_client_create(unsigned int nr_pages, init_waitqueue_head(&kc->destroyq); atomic_set(&kc->nr_jobs, 0); - *result = kc; - return 0; + return kc; bad_io_client: client_free_pages(kc); @@ -633,7 +863,7 @@ bad_workqueue: bad_slab: kfree(kc); - return r; + return ERR_PTR(r); } EXPORT_SYMBOL(dm_kcopyd_client_create); diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 6449bcdf84c..53e848c1093 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -5,12 +5,12 @@ */ #include "dm.h" - #include <linux/module.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/slab.h> +#include <linux/device-mapper.h> #define DM_MSG_PREFIX "linear" @@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct linear_c *lc; unsigned long long tmp; + char dummy; if (argc != 2) { ti->error = "Invalid argument count"; @@ -41,18 +42,20 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - if (sscanf(argv[1], "%llu", &tmp) != 1) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { ti->error = "dm-linear: Invalid device sector"; goto bad; } lc->start = tmp; - if (dm_get_device(ti, argv[0], lc->start, ti->len, - dm_table_get_mode(ti->table), &lc->dev)) { + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &lc->dev)) { ti->error = "dm-linear: Device lookup failed"; goto bad; } + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_write_same_bios = 1; ti->private = lc; return 0; @@ -73,7 +76,7 @@ static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) { struct linear_c *lc = ti->private; - return lc->start + (bi_sector - ti->begin); + return lc->start + dm_target_offset(ti, bi_sector); } static void linear_map_bio(struct dm_target *ti, struct bio *bio) @@ -81,19 +84,20 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) struct linear_c *lc = ti->private; bio->bi_bdev = lc->dev->bdev; - bio->bi_sector = linear_map_sector(ti, bio->bi_sector); + if (bio_sectors(bio)) + bio->bi_iter.bi_sector = + 
linear_map_sector(ti, bio->bi_iter.bi_sector); } -static int linear_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static int linear_map(struct dm_target *ti, struct bio *bio) { linear_map_bio(ti, bio); return DM_MAPIO_REMAPPED; } -static int linear_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) +static void linear_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) { struct linear_c *lc = (struct linear_c *) ti->private; @@ -107,23 +111,23 @@ static int linear_status(struct dm_target *ti, status_type_t type, (unsigned long long)lc->start); break; } - return 0; } -static int linear_ioctl(struct dm_target *ti, struct inode *inode, - struct file *filp, unsigned int cmd, +static int linear_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) { struct linear_c *lc = (struct linear_c *) ti->private; - struct block_device *bdev = lc->dev->bdev; - struct file fake_file = {}; - struct dentry fake_dentry = {}; + struct dm_dev *dev = lc->dev; + int r = 0; - fake_file.f_mode = lc->dev->mode; - fake_file.f_path.dentry = &fake_dentry; - fake_dentry.d_inode = bdev->bd_inode; + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (lc->start || + ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); - return blkdev_driver_ioctl(bdev->bd_inode, &fake_file, bdev->bd_disk, cmd, arg); + return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); } static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, @@ -141,9 +145,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } +static int linear_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct linear_c *lc = ti->private; + + return fn(ti, lc->dev, lc->start, ti->len, data); +} + static struct target_type linear_target = { .name = "linear", - .version= {1, 0, 3}, + .version = {1, 2, 1}, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, @@ -151,6 +163,7 @@ static struct target_type linear_target = { .status = linear_status, .ioctl = linear_ioctl, .merge = linear_merge, + .iterate_devices = linear_iterate_devices, }; int __init dm_linear_init(void) @@ -165,8 +178,5 @@ int __init dm_linear_init(void) void dm_linear_exit(void) { - int r = dm_unregister_target(&linear_target); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_target(&linear_target); } diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c new file mode 100644 index 00000000000..b953db6cc22 --- /dev/null +++ b/drivers/md/dm-log-userspace-base.c @@ -0,0 +1,930 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/dm-dirty-log.h> +#include <linux/device-mapper.h> +#include <linux/dm-log-userspace.h> +#include <linux/module.h> +#include <linux/workqueue.h> + +#include "dm-log-userspace-transfer.h" + +#define DM_LOG_USERSPACE_VSN "1.3.0" + +struct flush_entry { + int type; + region_t region; + struct list_head list; +}; + +/* + * This limit on the number of mark and clear request is, to a degree, + * arbitrary. However, there is some basis for the choice in the limits + * imposed on the size of data payload by dm-log-userspace-transfer.c: + * dm_consult_userspace(). 
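The dm-linear change above switches the remap to dm_target_offset(), which is simply (sector - ti->begin), so the mapping stays start + (sector - begin). A tiny sketch of that arithmetic with illustrative types:

#include <stdio.h>

typedef unsigned long long sector_t;

struct target {
	sector_t begin;	/* first sector of this target in the mapped device */
	sector_t len;
	sector_t start;	/* offset on the underlying device (lc->start) */
};

static sector_t linear_map_sector(const struct target *t, sector_t bi_sector)
{
	return t->start + (bi_sector - t->begin);	/* start + dm_target_offset() */
}

int main(void)
{
	/* second segment of a mapped device: sectors 8192.. map to offset 4096 */
	struct target t = { .begin = 8192, .len = 8192, .start = 4096 };

	printf("%llu -> %llu\n", 8200ULL, linear_map_sector(&t, 8200));	/* 4104 */
	return 0;
}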
+ */ +#define MAX_FLUSH_GROUP_COUNT 32 + +struct log_c { + struct dm_target *ti; + struct dm_dev *log_dev; + uint32_t region_size; + region_t region_count; + uint64_t luid; + char uuid[DM_UUID_LEN]; + + char *usr_argv_str; + uint32_t usr_argc; + + /* + * in_sync_hint gets set when doing is_remote_recovering. It + * represents the first region that needs recovery. IOW, the + * first zero bit of sync_bits. This can be useful for to limit + * traffic for calls like is_remote_recovering and get_resync_work, + * but be take care in its use for anything else. + */ + uint64_t in_sync_hint; + + /* + * Mark and clear requests are held until a flush is issued + * so that we can group, and thereby limit, the amount of + * network traffic between kernel and userspace. The 'flush_lock' + * is used to protect these lists. + */ + spinlock_t flush_lock; + struct list_head mark_list; + struct list_head clear_list; + + /* + * Workqueue for flush of clear region requests. + */ + struct workqueue_struct *dmlog_wq; + struct delayed_work flush_log_work; + atomic_t sched_flush; + + /* + * Combine userspace flush and mark requests for efficiency. + */ + uint32_t integrated_flush; +}; + +static mempool_t *flush_entry_pool; + +static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) +{ + return kmalloc(sizeof(struct flush_entry), gfp_mask); +} + +static void flush_entry_free(void *element, void *pool_data) +{ + kfree(element); +} + +static int userspace_do_request(struct log_c *lc, const char *uuid, + int request_type, char *data, size_t data_size, + char *rdata, size_t *rdata_size) +{ + int r; + + /* + * If the server isn't there, -ESRCH is returned, + * and we must keep trying until the server is + * restored. + */ +retry: + r = dm_consult_userspace(uuid, lc->luid, request_type, data, + data_size, rdata, rdata_size); + + if (r != -ESRCH) + return r; + + DMERR(" Userspace log server not found."); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(2*HZ); + DMWARN("Attempting to contact userspace log server..."); + r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR, + lc->usr_argv_str, + strlen(lc->usr_argv_str) + 1, + NULL, NULL); + if (!r) + break; + } + DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); + r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL, + 0, NULL, NULL); + if (!r) + goto retry; + + DMERR("Error trying to resume userspace log: %d", r); + + return -ESRCH; +} + +static int build_constructor_string(struct dm_target *ti, + unsigned argc, char **argv, + char **ctr_str) +{ + int i, str_size; + char *str = NULL; + + *ctr_str = NULL; + + /* + * Determine overall size of the string. 
+ */ + for (i = 0, str_size = 0; i < argc; i++) + str_size += strlen(argv[i]) + 1; /* +1 for space between args */ + + str_size += 20; /* Max number of chars in a printed u64 number */ + + str = kzalloc(str_size, GFP_KERNEL); + if (!str) { + DMWARN("Unable to allocate memory for constructor string"); + return -ENOMEM; + } + + str_size = sprintf(str, "%llu", (unsigned long long)ti->len); + for (i = 0; i < argc; i++) + str_size += sprintf(str + str_size, " %s", argv[i]); + + *ctr_str = str; + return str_size; +} + +static void do_flush(struct work_struct *work) +{ + int r; + struct log_c *lc = container_of(work, struct log_c, flush_log_work.work); + + atomic_set(&lc->sched_flush, 0); + + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, NULL, 0, NULL, NULL); + + if (r) + dm_table_event(lc->ti->table); +} + +/* + * userspace_ctr + * + * argv contains: + * <UUID> [integrated_flush] <other args> + * Where 'other args' are the userspace implementation-specific log + * arguments. + * + * Example: + * <UUID> [integrated_flush] clustered-disk <arg count> <log dev> + * <region_size> [[no]sync] + * + * This module strips off the <UUID> and uses it for identification + * purposes when communicating with userspace about a log. + * + * If integrated_flush is defined, the kernel combines flush + * and mark requests. + * + * The rest of the line, beginning with 'clustered-disk', is passed + * to the userspace ctr function. + */ +static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, + unsigned argc, char **argv) +{ + int r = 0; + int str_size; + char *ctr_str = NULL; + struct log_c *lc = NULL; + uint64_t rdata; + size_t rdata_size = sizeof(rdata); + char *devices_rdata = NULL; + size_t devices_rdata_size = DM_NAME_LEN; + + if (argc < 3) { + DMWARN("Too few arguments to userspace dirty log"); + return -EINVAL; + } + + lc = kzalloc(sizeof(*lc), GFP_KERNEL); + if (!lc) { + DMWARN("Unable to allocate userspace log context."); + return -ENOMEM; + } + + /* The ptr value is sufficient for local unique id */ + lc->luid = (unsigned long)lc; + + lc->ti = ti; + + if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { + DMWARN("UUID argument too long."); + kfree(lc); + return -EINVAL; + } + + lc->usr_argc = argc; + + strncpy(lc->uuid, argv[0], DM_UUID_LEN); + argc--; + argv++; + spin_lock_init(&lc->flush_lock); + INIT_LIST_HEAD(&lc->mark_list); + INIT_LIST_HEAD(&lc->clear_list); + + if (!strcasecmp(argv[0], "integrated_flush")) { + lc->integrated_flush = 1; + argc--; + argv++; + } + + str_size = build_constructor_string(ti, argc, argv, &ctr_str); + if (str_size < 0) { + kfree(lc); + return str_size; + } + + devices_rdata = kzalloc(devices_rdata_size, GFP_KERNEL); + if (!devices_rdata) { + DMERR("Failed to allocate memory for device information"); + r = -ENOMEM; + goto out; + } + + /* + * Send table string and get back any opened device. 
+ */ + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, + ctr_str, str_size, + devices_rdata, &devices_rdata_size); + + if (r < 0) { + if (r == -ESRCH) + DMERR("Userspace log server not found"); + else + DMERR("Userspace log server failed to create log"); + goto out; + } + + /* Since the region size does not change, get it now */ + rdata_size = sizeof(rdata); + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE, + NULL, 0, (char *)&rdata, &rdata_size); + + if (r) { + DMERR("Failed to get region size of dirty log"); + goto out; + } + + lc->region_size = (uint32_t)rdata; + lc->region_count = dm_sector_div_up(ti->len, lc->region_size); + + if (devices_rdata_size) { + if (devices_rdata[devices_rdata_size - 1] != '\0') { + DMERR("DM_ULOG_CTR device return string not properly terminated"); + r = -EINVAL; + goto out; + } + r = dm_get_device(ti, devices_rdata, + dm_table_get_mode(ti->table), &lc->log_dev); + if (r) + DMERR("Failed to register %s with device-mapper", + devices_rdata); + } + + if (lc->integrated_flush) { + lc->dmlog_wq = alloc_workqueue("dmlogd", WQ_MEM_RECLAIM, 0); + if (!lc->dmlog_wq) { + DMERR("couldn't start dmlogd"); + r = -ENOMEM; + goto out; + } + + INIT_DELAYED_WORK(&lc->flush_log_work, do_flush); + atomic_set(&lc->sched_flush, 0); + } + +out: + kfree(devices_rdata); + if (r) { + kfree(lc); + kfree(ctr_str); + } else { + lc->usr_argv_str = ctr_str; + log->context = lc; + } + + return r; +} + +static void userspace_dtr(struct dm_dirty_log *log) +{ + struct log_c *lc = log->context; + + if (lc->integrated_flush) { + /* flush workqueue */ + if (atomic_read(&lc->sched_flush)) + flush_delayed_work(&lc->flush_log_work); + + destroy_workqueue(lc->dmlog_wq); + } + + (void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, + NULL, 0, NULL, NULL); + + if (lc->log_dev) + dm_put_device(lc->ti, lc->log_dev); + + kfree(lc->usr_argv_str); + kfree(lc); + + return; +} + +static int userspace_presuspend(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, + NULL, 0, NULL, NULL); + + return r; +} + +static int userspace_postsuspend(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + /* + * Run planned flush earlier. + */ + if (lc->integrated_flush && atomic_read(&lc->sched_flush)) + flush_delayed_work(&lc->flush_log_work); + + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, + NULL, 0, NULL, NULL); + + return r; +} + +static int userspace_resume(struct dm_dirty_log *log) +{ + int r; + struct log_c *lc = log->context; + + lc->in_sync_hint = 0; + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, + NULL, 0, NULL, NULL); + + return r; +} + +static uint32_t userspace_get_region_size(struct dm_dirty_log *log) +{ + struct log_c *lc = log->context; + + return lc->region_size; +} + +/* + * userspace_is_clean + * + * Check whether a region is clean. If there is any sort of + * failure when consulting the server, we return not clean. + * + * Returns: 1 if clean, 0 otherwise + */ +static int userspace_is_clean(struct dm_dirty_log *log, region_t region) +{ + int r; + uint64_t region64 = (uint64_t)region; + int64_t is_clean; + size_t rdata_size; + struct log_c *lc = log->context; + + rdata_size = sizeof(is_clean); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN, + (char *)®ion64, sizeof(region64), + (char *)&is_clean, &rdata_size); + + return (r) ? 
0 : (int)is_clean; +} + +/* + * userspace_in_sync + * + * Check if the region is in-sync. If there is any sort + * of failure when consulting the server, we assume that + * the region is not in sync. + * + * If 'can_block' is set, return immediately + * + * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK + */ +static int userspace_in_sync(struct dm_dirty_log *log, region_t region, + int can_block) +{ + int r; + uint64_t region64 = region; + int64_t in_sync; + size_t rdata_size; + struct log_c *lc = log->context; + + /* + * We can never respond directly - even if in_sync_hint is + * set. This is because another machine could see a device + * failure and mark the region out-of-sync. If we don't go + * to userspace to ask, we might think the region is in-sync + * and allow a read to pick up data that is stale. (This is + * very unlikely if a device actually fails; but it is very + * likely if a connection to one device from one machine fails.) + * + * There still might be a problem if the mirror caches the region + * state as in-sync... but then this call would not be made. So, + * that is a mirror problem. + */ + if (!can_block) + return -EWOULDBLOCK; + + rdata_size = sizeof(in_sync); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC, + (char *)®ion64, sizeof(region64), + (char *)&in_sync, &rdata_size); + return (r) ? 0 : (int)in_sync; +} + +static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) +{ + int r = 0; + struct flush_entry *fe; + + list_for_each_entry(fe, flush_list, list) { + r = userspace_do_request(lc, lc->uuid, fe->type, + (char *)&fe->region, + sizeof(fe->region), + NULL, NULL); + if (r) + break; + } + + return r; +} + +static int flush_by_group(struct log_c *lc, struct list_head *flush_list, + int flush_with_payload) +{ + int r = 0; + int count; + uint32_t type = 0; + struct flush_entry *fe, *tmp_fe; + LIST_HEAD(tmp_list); + uint64_t group[MAX_FLUSH_GROUP_COUNT]; + + /* + * Group process the requests + */ + while (!list_empty(flush_list)) { + count = 0; + + list_for_each_entry_safe(fe, tmp_fe, flush_list, list) { + group[count] = fe->region; + count++; + + list_move(&fe->list, &tmp_list); + + type = fe->type; + if (count >= MAX_FLUSH_GROUP_COUNT) + break; + } + + if (flush_with_payload) { + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, + (char *)(group), + count * sizeof(uint64_t), + NULL, NULL); + /* + * Integrated flush failed. + */ + if (r) + break; + } else { + r = userspace_do_request(lc, lc->uuid, type, + (char *)(group), + count * sizeof(uint64_t), + NULL, NULL); + if (r) { + /* + * Group send failed. Attempt one-by-one. + */ + list_splice_init(&tmp_list, flush_list); + r = flush_one_by_one(lc, flush_list); + break; + } + } + } + + /* + * Must collect flush_entrys that were successfully processed + * as a group so that they will be free'd by the caller. + */ + list_splice_init(&tmp_list, flush_list); + + return r; +} + +/* + * userspace_flush + * + * This function is ok to block. + * The flush happens in two stages. First, it sends all + * clear/mark requests that are on the list. Then it + * tells the server to commit them. This gives the + * server a chance to optimise the commit, instead of + * doing it for every request. + * + * Additionally, we could implement another thread that + * sends the requests up to the server - reducing the + * load on flush. Then the flush would have less in + * the list and be responsible for the finishing commit. 
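 * As a rough illustration (region numbers invented, and assuming the group
 * limit MAX_FLUSH_GROUP_COUNT defined earlier in this file is not exceeded):
 * a flush with clear_list = {5, 9} and mark_list = {17, 18, 19} normally
 * results in three requests to the server:
 *
 *     DM_ULOG_CLEAR_REGION  payload [5, 9]
 *     DM_ULOG_MARK_REGION   payload [17, 18, 19]
 *     DM_ULOG_FLUSH         no payload
 *
 * whereas with integrated_flush the marks travel as the payload of a single
 * DM_ULOG_FLUSH request, as flush_by_group() above shows.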
+ * + * Returns: 0 on success, < 0 on failure + */ +static int userspace_flush(struct dm_dirty_log *log) +{ + int r = 0; + unsigned long flags; + struct log_c *lc = log->context; + LIST_HEAD(mark_list); + LIST_HEAD(clear_list); + int mark_list_is_empty; + int clear_list_is_empty; + struct flush_entry *fe, *tmp_fe; + + spin_lock_irqsave(&lc->flush_lock, flags); + list_splice_init(&lc->mark_list, &mark_list); + list_splice_init(&lc->clear_list, &clear_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + mark_list_is_empty = list_empty(&mark_list); + clear_list_is_empty = list_empty(&clear_list); + + if (mark_list_is_empty && clear_list_is_empty) + return 0; + + r = flush_by_group(lc, &clear_list, 0); + if (r) + goto out; + + if (!lc->integrated_flush) { + r = flush_by_group(lc, &mark_list, 0); + if (r) + goto out; + r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH, + NULL, 0, NULL, NULL); + goto out; + } + + /* + * Send integrated flush request with mark_list as payload. + */ + r = flush_by_group(lc, &mark_list, 1); + if (r) + goto out; + + if (mark_list_is_empty && !atomic_read(&lc->sched_flush)) { + /* + * When there are only clear region requests, + * we schedule a flush in the future. + */ + queue_delayed_work(lc->dmlog_wq, &lc->flush_log_work, 3 * HZ); + atomic_set(&lc->sched_flush, 1); + } else { + /* + * Cancel pending flush because we + * have already flushed in mark_region. + */ + cancel_delayed_work(&lc->flush_log_work); + atomic_set(&lc->sched_flush, 0); + } + +out: + /* + * We can safely remove these entries, even after failure. + * Calling code will receive an error and will know that + * the log facility has failed. + */ + list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) { + list_del(&fe->list); + mempool_free(fe, flush_entry_pool); + } + + if (r) + dm_table_event(lc->ti->table); + + return r; +} + +/* + * userspace_mark_region + * + * This function should avoid blocking unless absolutely required. + * (Memory allocation is valid for blocking.) + */ +static void userspace_mark_region(struct dm_dirty_log *log, region_t region) +{ + unsigned long flags; + struct log_c *lc = log->context; + struct flush_entry *fe; + + /* Wait for an allocation, but _never_ fail */ + fe = mempool_alloc(flush_entry_pool, GFP_NOIO); + BUG_ON(!fe); + + spin_lock_irqsave(&lc->flush_lock, flags); + fe->type = DM_ULOG_MARK_REGION; + fe->region = region; + list_add(&fe->list, &lc->mark_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + return; +} + +/* + * userspace_clear_region + * + * This function must not block. + * So, the alloc can't block. In the worst case, it is ok to + * fail. It would simply mean we can't clear the region. + * Does nothing to current sync context, but does mean + * the region will be re-sync'ed on a reload of the mirror + * even though it is in-sync. + */ +static void userspace_clear_region(struct dm_dirty_log *log, region_t region) +{ + unsigned long flags; + struct log_c *lc = log->context; + struct flush_entry *fe; + + /* + * If we fail to allocate, we skip the clearing of + * the region. This doesn't hurt us in any way, except + * to cause the region to be resync'ed when the + * device is activated next time. 
+ */ + fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); + if (!fe) { + DMERR("Failed to allocate memory to clear region."); + return; + } + + spin_lock_irqsave(&lc->flush_lock, flags); + fe->type = DM_ULOG_CLEAR_REGION; + fe->region = region; + list_add(&fe->list, &lc->clear_list); + spin_unlock_irqrestore(&lc->flush_lock, flags); + + return; +} + +/* + * userspace_get_resync_work + * + * Get a region that needs recovery. It is valid to return + * an error for this function. + * + * Returns: 1 if region filled, 0 if no work, <0 on error + */ +static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) +{ + int r; + size_t rdata_size; + struct log_c *lc = log->context; + struct { + int64_t i; /* 64-bit for mixed-arch compatibility */ + region_t r; + } pkg; + + if (lc->in_sync_hint >= lc->region_count) + return 0; + + rdata_size = sizeof(pkg); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK, + NULL, 0, (char *)&pkg, &rdata_size); + + *region = pkg.r; + return (r) ? r : (int)pkg.i; +} + +/* + * userspace_set_region_sync + * + * Set the sync status of a given region. This function + * must not fail. + */ +static void userspace_set_region_sync(struct dm_dirty_log *log, + region_t region, int in_sync) +{ + int r; + struct log_c *lc = log->context; + struct { + region_t r; + int64_t i; + } pkg; + + pkg.r = region; + pkg.i = (int64_t)in_sync; + + r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, + (char *)&pkg, sizeof(pkg), NULL, NULL); + + /* + * It would be nice to be able to report failures. + * However, it is easy enough to detect and resolve. + */ + return; +} + +/* + * userspace_get_sync_count + * + * If there is any sort of failure when consulting the server, + * we assume that the sync count is zero. + * + * Returns: sync count on success, 0 on failure + */ +static region_t userspace_get_sync_count(struct dm_dirty_log *log) +{ + int r; + size_t rdata_size; + uint64_t sync_count; + struct log_c *lc = log->context; + + rdata_size = sizeof(sync_count); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT, + NULL, 0, (char *)&sync_count, &rdata_size); + + if (r) + return 0; + + if (sync_count >= lc->region_count) + lc->in_sync_hint = lc->region_count; + + return (region_t)sync_count; +} + +/* + * userspace_status + * + * Returns: amount of space consumed + */ +static int userspace_status(struct dm_dirty_log *log, status_type_t status_type, + char *result, unsigned maxlen) +{ + int r = 0; + char *table_args; + size_t sz = (size_t)maxlen; + struct log_c *lc = log->context; + + switch (status_type) { + case STATUSTYPE_INFO: + r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO, + NULL, 0, result, &sz); + + if (r) { + sz = 0; + DMEMIT("%s 1 COM_FAILURE", log->type->name); + } + break; + case STATUSTYPE_TABLE: + sz = 0; + table_args = strchr(lc->usr_argv_str, ' '); + BUG_ON(!table_args); /* There will always be a ' ' */ + table_args++; + + DMEMIT("%s %u %s ", log->type->name, lc->usr_argc, lc->uuid); + if (lc->integrated_flush) + DMEMIT("integrated_flush "); + DMEMIT("%s ", table_args); + break; + } + return (r) ?
0 : (int)sz; +} + +/* + * userspace_is_remote_recovering + * + * Returns: 1 if region recovering, 0 otherwise + */ +static int userspace_is_remote_recovering(struct dm_dirty_log *log, + region_t region) +{ + int r; + uint64_t region64 = region; + struct log_c *lc = log->context; + static unsigned long long limit; + struct { + int64_t is_recovering; + uint64_t in_sync_hint; + } pkg; + size_t rdata_size = sizeof(pkg); + + /* + * Once the mirror has been reported to be in-sync, + * it will never again ask for recovery work. So, + * we can safely say there is not a remote machine + * recovering if the device is in-sync. (in_sync_hint + * must be reset at resume time.) + */ + if (region < lc->in_sync_hint) + return 0; + else if (jiffies < limit) + return 1; + + limit = jiffies + (HZ / 4); + r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING, + (char *)®ion64, sizeof(region64), + (char *)&pkg, &rdata_size); + if (r) + return 1; + + lc->in_sync_hint = pkg.in_sync_hint; + + return (int)pkg.is_recovering; +} + +static struct dm_dirty_log_type _userspace_type = { + .name = "userspace", + .module = THIS_MODULE, + .ctr = userspace_ctr, + .dtr = userspace_dtr, + .presuspend = userspace_presuspend, + .postsuspend = userspace_postsuspend, + .resume = userspace_resume, + .get_region_size = userspace_get_region_size, + .is_clean = userspace_is_clean, + .in_sync = userspace_in_sync, + .flush = userspace_flush, + .mark_region = userspace_mark_region, + .clear_region = userspace_clear_region, + .get_resync_work = userspace_get_resync_work, + .set_region_sync = userspace_set_region_sync, + .get_sync_count = userspace_get_sync_count, + .status = userspace_status, + .is_remote_recovering = userspace_is_remote_recovering, +}; + +static int __init userspace_dirty_log_init(void) +{ + int r = 0; + + flush_entry_pool = mempool_create(100, flush_entry_alloc, + flush_entry_free, NULL); + + if (!flush_entry_pool) { + DMWARN("Unable to create flush_entry_pool: No memory."); + return -ENOMEM; + } + + r = dm_ulog_tfr_init(); + if (r) { + DMWARN("Unable to initialize userspace log communications"); + mempool_destroy(flush_entry_pool); + return r; + } + + r = dm_dirty_log_type_register(&_userspace_type); + if (r) { + DMWARN("Couldn't register userspace dirty log type"); + dm_ulog_tfr_exit(); + mempool_destroy(flush_entry_pool); + return r; + } + + DMINFO("version " DM_LOG_USERSPACE_VSN " loaded"); + return 0; +} + +static void __exit userspace_dirty_log_exit(void) +{ + dm_dirty_log_type_unregister(&_userspace_type); + dm_ulog_tfr_exit(); + mempool_destroy(flush_entry_pool); + + DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); + return; +} + +module_init(userspace_dirty_log_init); +module_exit(userspace_dirty_log_exit); + +MODULE_DESCRIPTION(DM_NAME " userspace dirty log link"); +MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c new file mode 100644 index 00000000000..b428c0ae63d --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.c @@ -0,0 +1,286 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. 
+ */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <net/sock.h> +#include <linux/workqueue.h> +#include <linux/connector.h> +#include <linux/device-mapper.h> +#include <linux/dm-log-userspace.h> + +#include "dm-log-userspace-transfer.h" + +static uint32_t dm_ulog_seq; + +/* + * Netlink/Connector is an unreliable protocol. How long should + * we wait for a response before assuming it was lost and retrying? + * (If we do receive a response after this time, it will be discarded + * and the response to the resent request will be waited for. + */ +#define DM_ULOG_RETRY_TIMEOUT (15 * HZ) + +/* + * Pre-allocated space for speed + */ +#define DM_ULOG_PREALLOCED_SIZE 512 +static struct cn_msg *prealloced_cn_msg; +static struct dm_ulog_request *prealloced_ulog_tfr; + +static struct cb_id ulog_cn_id = { + .idx = CN_IDX_DM, + .val = CN_VAL_DM_USERSPACE_LOG +}; + +static DEFINE_MUTEX(dm_ulog_lock); + +struct receiving_pkg { + struct list_head list; + struct completion complete; + + uint32_t seq; + + int error; + size_t *data_size; + char *data; +}; + +static DEFINE_SPINLOCK(receiving_list_lock); +static struct list_head receiving_list; + +static int dm_ulog_sendto_server(struct dm_ulog_request *tfr) +{ + int r; + struct cn_msg *msg = prealloced_cn_msg; + + memset(msg, 0, sizeof(struct cn_msg)); + + msg->id.idx = ulog_cn_id.idx; + msg->id.val = ulog_cn_id.val; + msg->ack = 0; + msg->seq = tfr->seq; + msg->len = sizeof(struct dm_ulog_request) + tfr->data_size; + + r = cn_netlink_send(msg, 0, 0, gfp_any()); + + return r; +} + +/* + * Parameters for this function can be either msg or tfr, but not + * both. This function fills in the reply for a waiting request. + * If just msg is given, then the reply is simply an ACK from userspace + * that the request was received. + * + * Returns: 0 on success, -ENOENT on failure + */ +static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr) +{ + uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0; + struct receiving_pkg *pkg; + + /* + * The 'receiving_pkg' entries in this list are statically + * allocated on the stack in 'dm_consult_userspace'. + * Each process that is waiting for a reply from the user + * space server will have an entry in this list. + * + * We are safe to do it this way because the stack space + * is unique to each process, but still addressable by + * other processes. + */ + list_for_each_entry(pkg, &receiving_list, list) { + if (rtn_seq != pkg->seq) + continue; + + if (msg) { + pkg->error = -msg->ack; + /* + * If we are trying again, we will need to know our + * storage capacity. Otherwise, along with the + * error code, we make explicit that we have no data. + */ + if (pkg->error != -EAGAIN) + *(pkg->data_size) = 0; + } else if (tfr->data_size > *(pkg->data_size)) { + DMERR("Insufficient space to receive package [%u] " + "(%u vs %zu)", tfr->request_type, + tfr->data_size, *(pkg->data_size)); + + *(pkg->data_size) = 0; + pkg->error = -ENOSPC; + } else { + pkg->error = tfr->error; + memcpy(pkg->data, tfr->data, tfr->data_size); + *(pkg->data_size) = tfr->data_size; + } + complete(&pkg->complete); + return 0; + } + + return -ENOENT; +} + +/* + * This is the connector callback that delivers data + * that was sent from userspace. 
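 * To sketch the round trip (the sequence number is invented): dm_consult_userspace()
 * below sends a request with seq == 42 and sleeps on its on-stack receiving_pkg;
 * when userspace answers, this callback runs and fill_pkg() matches seq 42 against
 * receiving_list, copies any returned data into the waiter's buffer (or records
 * -msg->ack as the error for a bare ACK), and completes the waiter. A reply whose
 * seq matches no waiter - for example one that arrives after the timeout and
 * resend - makes fill_pkg() return -ENOENT and is simply dropped.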
+ */ +static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +{ + struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1); + + if (!capable(CAP_SYS_ADMIN)) + return; + + spin_lock(&receiving_list_lock); + if (msg->len == 0) + fill_pkg(msg, NULL); + else if (msg->len < sizeof(*tfr)) + DMERR("Incomplete message received (expected %u, got %u): [%u]", + (unsigned)sizeof(*tfr), msg->len, msg->seq); + else + fill_pkg(NULL, tfr); + spin_unlock(&receiving_list_lock); +} + +/** + * dm_consult_userspace + * @uuid: log's universal unique identifier (must be DM_UUID_LEN in size) + * @luid: log's local unique identifier + * @request_type: found in include/linux/dm-log-userspace.h + * @data: data to tx to the server + * @data_size: size of data in bytes + * @rdata: place to put return data from server + * @rdata_size: value-result (amount of space given/amount of space used) + * + * rdata_size is undefined on failure. + * + * Memory used to communicate with userspace is zero'ed + * before populating to ensure that no unwanted bits leak + * from kernel space to user-space. All userspace log communications + * between kernel and user space go through this function. + * + * Returns: 0 on success, -EXXX on failure + **/ +int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, + char *data, size_t data_size, + char *rdata, size_t *rdata_size) +{ + int r = 0; + size_t dummy = 0; + int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); + struct dm_ulog_request *tfr = prealloced_ulog_tfr; + struct receiving_pkg pkg; + + /* + * Given the space needed to hold the 'struct cn_msg' and + * 'struct dm_ulog_request' - do we have enough payload + * space remaining? + */ + if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) { + DMINFO("Size of tfr exceeds preallocated size"); + return -EINVAL; + } + + if (!rdata_size) + rdata_size = &dummy; +resend: + /* + * We serialize the sending of requests so we can + * use the preallocated space. + */ + mutex_lock(&dm_ulog_lock); + + memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg)); + memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->version = DM_ULOG_REQUEST_VERSION; + tfr->luid = luid; + tfr->seq = dm_ulog_seq++; + + /* + * Must be valid request type (all other bits set to + * zero). This reserves other bits for possible future + * use. + */ + tfr->request_type = request_type & DM_ULOG_REQUEST_MASK; + + tfr->data_size = data_size; + if (data && data_size) + memcpy(tfr->data, data, data_size); + + memset(&pkg, 0, sizeof(pkg)); + init_completion(&pkg.complete); + pkg.seq = tfr->seq; + pkg.data_size = rdata_size; + pkg.data = rdata; + spin_lock(&receiving_list_lock); + list_add(&(pkg.list), &receiving_list); + spin_unlock(&receiving_list_lock); + + r = dm_ulog_sendto_server(tfr); + + mutex_unlock(&dm_ulog_lock); + + if (r) { + DMERR("Unable to send log request [%u] to userspace: %d", + request_type, r); + spin_lock(&receiving_list_lock); + list_del_init(&(pkg.list)); + spin_unlock(&receiving_list_lock); + + goto out; + } + + r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); + spin_lock(&receiving_list_lock); + list_del_init(&(pkg.list)); + spin_unlock(&receiving_list_lock); + if (!r) { + DMWARN("[%s] Request timed out: [%u/%u] - retrying", + (strlen(uuid) > 8) ? 
+ (uuid + (strlen(uuid) - 8)) : (uuid), + request_type, pkg.seq); + goto resend; + } + + r = pkg.error; + if (r == -EAGAIN) + goto resend; + +out: + return r; +} + +int dm_ulog_tfr_init(void) +{ + int r; + void *prealloced; + + INIT_LIST_HEAD(&receiving_list); + + prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL); + if (!prealloced) + return -ENOMEM; + + prealloced_cn_msg = prealloced; + prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg); + + r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); + if (r) { + cn_del_callback(&ulog_cn_id); + return r; + } + + return 0; +} + +void dm_ulog_tfr_exit(void) +{ + cn_del_callback(&ulog_cn_id); + kfree(prealloced_cn_msg); +} diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h new file mode 100644 index 00000000000..04ee874f915 --- /dev/null +++ b/drivers/md/dm-log-userspace-transfer.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2006-2009 Red Hat, Inc. + * + * This file is released under the LGPL. + */ + +#ifndef __DM_LOG_USERSPACE_TRANSFER_H__ +#define __DM_LOG_USERSPACE_TRANSFER_H__ + +#define DM_MSG_PREFIX "dm-log-userspace" + +int dm_ulog_tfr_init(void); +void dm_ulog_tfr_exit(void); +int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, + char *data, size_t data_size, + char *rdata, size_t *rdata_size); + +#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */ diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 5b48478c79f..627d19186d5 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -12,44 +12,33 @@ #include <linux/dm-io.h> #include <linux/dm-dirty-log.h> -#include "dm.h" +#include <linux/device-mapper.h> #define DM_MSG_PREFIX "dirty region log" -struct dm_dirty_log_internal { - struct dm_dirty_log_type *type; - - struct list_head list; - long use; -}; - static LIST_HEAD(_log_types); static DEFINE_SPINLOCK(_lock); -static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name) +static struct dm_dirty_log_type *__find_dirty_log_type(const char *name) { - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; list_for_each_entry(log_type, &_log_types, list) - if (!strcmp(name, log_type->type->name)) + if (!strcmp(name, log_type->name)) return log_type; return NULL; } -static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) +static struct dm_dirty_log_type *_get_dirty_log_type(const char *name) { - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; spin_lock(&_lock); log_type = __find_dirty_log_type(name); - if (log_type) { - if (!log_type->use && !try_module_get(log_type->type->module)) - log_type = NULL; - else - log_type->use++; - } + if (log_type && !try_module_get(log_type->module)) + log_type = NULL; spin_unlock(&_lock); @@ -76,14 +65,14 @@ static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name) static struct dm_dirty_log_type *get_type(const char *type_name) { char *p, *type_name_dup; - struct dm_dirty_log_internal *log_type; + struct dm_dirty_log_type *log_type; if (!type_name) return NULL; log_type = _get_dirty_log_type(type_name); if (log_type) - return log_type->type; + return log_type; type_name_dup = kstrdup(type_name, GFP_KERNEL); if (!type_name_dup) { @@ -105,56 +94,33 @@ static struct dm_dirty_log_type *get_type(const char *type_name) kfree(type_name_dup); - return log_type ? 
log_type->type : NULL; + return log_type; } static void put_type(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type; - if (!type) return; spin_lock(&_lock); - log_type = __find_dirty_log_type(type->name); - if (!log_type) + if (!__find_dirty_log_type(type->name)) goto out; - if (!--log_type->use) - module_put(type->module); - - BUG_ON(log_type->use < 0); + module_put(type->module); out: spin_unlock(&_lock); } -static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type) -{ - struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type), - GFP_KERNEL); - - if (log_type) - log_type->type = type; - - return log_type; -} - int dm_dirty_log_type_register(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type); int r = 0; - if (!log_type) - return -ENOMEM; - spin_lock(&_lock); if (!__find_dirty_log_type(type->name)) - list_add(&log_type->list, &_log_types); - else { - kfree(log_type); + list_add(&type->list, &_log_types); + else r = -EEXIST; - } spin_unlock(&_lock); return r; @@ -163,33 +129,25 @@ EXPORT_SYMBOL(dm_dirty_log_type_register); int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type) { - struct dm_dirty_log_internal *log_type; - spin_lock(&_lock); - log_type = __find_dirty_log_type(type->name); - if (!log_type) { + if (!__find_dirty_log_type(type->name)) { spin_unlock(&_lock); return -EINVAL; } - if (log_type->use) { - spin_unlock(&_lock); - return -ETXTBSY; - } - - list_del(&log_type->list); + list_del(&type->list); spin_unlock(&_lock); - kfree(log_type); return 0; } EXPORT_SYMBOL(dm_dirty_log_type_unregister); struct dm_dirty_log *dm_dirty_log_create(const char *type_name, - struct dm_target *ti, - unsigned int argc, char **argv) + struct dm_target *ti, + int (*flush_callback_fn)(struct dm_target *ti), + unsigned int argc, char **argv) { struct dm_dirty_log_type *type; struct dm_dirty_log *log; @@ -204,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name, return NULL; } + log->flush_callback_fn = flush_callback_fn; log->type = type; if (type->ctr(log, ti, argc, argv)) { kfree(log); @@ -238,20 +197,28 @@ EXPORT_SYMBOL(dm_dirty_log_destroy); #define MIRROR_DISK_VERSION 2 #define LOG_OFFSET 2 -struct log_header { - uint32_t magic; +struct log_header_disk { + __le32 magic; /* * Simple, incrementing version. no backward * compatibility. */ + __le32 version; + __le64 nr_regions; +} __packed; + +struct log_header_core { + uint32_t magic; uint32_t version; - sector_t nr_regions; + uint64_t nr_regions; }; struct log_c { struct dm_target *ti; - int touched; + int touched_dirtied; + int touched_cleaned; + int flush_failed; uint32_t region_size; unsigned int region_count; region_t sync_count; @@ -276,11 +243,12 @@ struct log_c { * Disk log fields */ int log_dev_failed; + int log_dev_flush_failed; struct dm_dev *log_dev; - struct log_header header; + struct log_header_core header; struct dm_io_region header_location; - struct log_header *disk_header; + struct log_header_disk *disk_header; }; /* @@ -289,34 +257,34 @@ struct log_c { */ static inline int log_test_bit(uint32_t *bs, unsigned bit) { - return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; + return test_bit_le(bit, bs) ? 
1 : 0; } static inline void log_set_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - ext2_set_bit(bit, (unsigned long *) bs); - l->touched = 1; + __set_bit_le(bit, bs); + l->touched_cleaned = 1; } static inline void log_clear_bit(struct log_c *l, uint32_t *bs, unsigned bit) { - ext2_clear_bit(bit, (unsigned long *) bs); - l->touched = 1; + __clear_bit_le(bit, bs); + l->touched_dirtied = 1; } /*---------------------------------------------------------------- * Header IO *--------------------------------------------------------------*/ -static void header_to_disk(struct log_header *core, struct log_header *disk) +static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk) { disk->magic = cpu_to_le32(core->magic); disk->version = cpu_to_le32(core->version); disk->nr_regions = cpu_to_le64(core->nr_regions); } -static void header_from_disk(struct log_header *core, struct log_header *disk) +static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk) { core->magic = le32_to_cpu(disk->magic); core->version = le32_to_cpu(disk->version); @@ -326,12 +294,23 @@ static void header_from_disk(struct log_header *core, struct log_header *disk) static int rw_header(struct log_c *lc, int rw) { lc->io_req.bi_rw = rw; - lc->io_req.mem.ptr.vma = lc->disk_header; - lc->io_req.notify.fn = NULL; return dm_io(&lc->io_req, 1, &lc->header_location, NULL); } +static int flush_header(struct log_c *lc) +{ + struct dm_io_region null_location = { + .bdev = lc->header_location.bdev, + .sector = 0, + .count = 0, + }; + + lc->io_req.bi_rw = WRITE_FLUSH; + + return dm_io(&lc->io_req, 1, &null_location, NULL); +} + static int read_header(struct log_c *log) { int r; @@ -362,10 +341,15 @@ static int read_header(struct log_c *log) return 0; } -static inline int write_header(struct log_c *log) +static int _check_region_size(struct dm_target *ti, uint32_t region_size) { - header_to_disk(&log->header, log->disk_header); - return rw_header(log, WRITE); + if (region_size < 2 || region_size > ti->len) + return 0; + + if (!is_power_of_2(region_size)) + return 0; + + return 1; } /*---------------------------------------------------------------- @@ -385,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, unsigned int region_count; size_t bitset_size, buf_size; int r; + char dummy; if (argc < 1 || argc > 2) { DMWARN("wrong number of arguments to dirty region log"); @@ -403,8 +388,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, } } - if (sscanf(argv[0], "%u", ®ion_size) != 1) { - DMWARN("invalid region size string"); + if (sscanf(argv[0], "%u%c", ®ion_size, &dummy) != 1 || + !_check_region_size(ti, region_size)) { + DMWARN("invalid region size %s", argv[0]); return -EINVAL; } @@ -417,7 +403,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, } lc->ti = ti; - lc->touched = 0; + lc->touched_dirtied = 0; + lc->touched_cleaned = 0; + lc->flush_failed = 0; lc->region_size = region_size; lc->region_count = region_count; lc->sync = sync; @@ -445,32 +433,46 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, } else { lc->log_dev = dev; lc->log_dev_failed = 0; + lc->log_dev_flush_failed = 0; lc->header_location.bdev = lc->log_dev->bdev; lc->header_location.sector = 0; /* * Buffer holds both header and bitset. 
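 * Concretely: sectors 0 and 1 of the log device are reserved for
 * struct log_header_disk (magic, version, nr_regions, all little-endian),
 * and the clean-region bitset starts at sector LOG_OFFSET (2), one bit per
 * region; the whole buffer is then rounded up to the device's logical
 * block size by the calculation below.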
*/ - buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + - bitset_size, ti->limits.hardsect_size); + buf_size = + dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size, + bdev_logical_block_size(lc->header_location. + bdev)); + + if (buf_size > i_size_read(dev->bdev->bd_inode)) { + DMWARN("log device %s too small: need %llu bytes", + dev->name, (unsigned long long)buf_size); + kfree(lc); + return -EINVAL; + } + lc->header_location.count = buf_size >> SECTOR_SHIFT; + lc->io_req.mem.type = DM_IO_VMA; - lc->io_req.client = dm_io_client_create(dm_div_up(buf_size, - PAGE_SIZE)); + lc->io_req.notify.fn = NULL; + lc->io_req.client = dm_io_client_create(); if (IS_ERR(lc->io_req.client)) { r = PTR_ERR(lc->io_req.client); DMWARN("couldn't allocate disk io client"); kfree(lc); - return -ENOMEM; + return r; } lc->disk_header = vmalloc(buf_size); if (!lc->disk_header) { DMWARN("couldn't allocate disk log buffer"); + dm_io_client_destroy(lc->io_req.client); kfree(lc); return -ENOMEM; } + lc->io_req.mem.ptr.vma = lc->disk_header; lc->clean_bits = (void *)lc->disk_header + (LOG_OFFSET << SECTOR_SHIFT); } @@ -482,6 +484,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, DMWARN("couldn't allocate sync bitset"); if (!dev) vfree(lc->clean_bits); + else + dm_io_client_destroy(lc->io_req.client); vfree(lc->disk_header); kfree(lc); return -ENOMEM; @@ -489,17 +493,18 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size); lc->sync_count = (sync == NOSYNC) ? region_count : 0; - lc->recovering_bits = vmalloc(bitset_size); + lc->recovering_bits = vzalloc(bitset_size); if (!lc->recovering_bits) { DMWARN("couldn't allocate sync bitset"); vfree(lc->sync_bits); if (!dev) vfree(lc->clean_bits); + else + dm_io_client_destroy(lc->io_req.client); vfree(lc->disk_header); kfree(lc); return -ENOMEM; } - memset(lc->recovering_bits, 0, bitset_size); lc->sync_search = 0; log->context = lc; @@ -543,8 +548,7 @@ static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti, return -EINVAL; } - r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */, - FMODE_READ | FMODE_WRITE, &dev); + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); if (r) return r; @@ -567,16 +571,6 @@ static void disk_dtr(struct dm_dirty_log *log) destroy_log_context(lc); } -static int count_bits32(uint32_t *addr, unsigned size) -{ - int count = 0, i; - - for (i = 0; i < size; i++) { - count += hweight32(*(addr+i)); - } - return count; -} - static void fail_log_device(struct log_c *lc) { if (lc->log_dev_failed) @@ -625,14 +619,22 @@ static int disk_resume(struct dm_dirty_log *log) /* copy clean across to sync */ memcpy(lc->sync_bits, lc->clean_bits, size); - lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count); + lc->sync_count = memweight(lc->clean_bits, + lc->bitset_uint32_count * sizeof(uint32_t)); lc->sync_search = 0; /* set the correct number of regions in the header */ lc->header.nr_regions = lc->region_count; + header_to_disk(&lc->header, lc->disk_header); + /* write the new header */ - r = write_header(lc); + r = rw_header(lc, WRITE); + if (!r) { + r = flush_header(lc); + if (r) + lc->log_dev_flush_failed = 1; + } if (r) { DMWARN("%s: Failed to write header on dirty region log device", lc->log_dev->name); @@ -675,18 +677,40 @@ static int core_flush(struct dm_dirty_log *log) static int disk_flush(struct dm_dirty_log *log) { - int r; - struct log_c *lc = (struct log_c *) log->context; + int r, i; + 
struct log_c *lc = log->context; /* only write if the log has changed */ - if (!lc->touched) + if (!lc->touched_cleaned && !lc->touched_dirtied) return 0; - r = write_header(lc); + if (lc->touched_cleaned && log->flush_callback_fn && + log->flush_callback_fn(lc->ti)) { + /* + * At this point it is impossible to determine which + * regions are clean and which are dirty (without + * re-reading the log off disk). So mark all of them + * dirty. + */ + lc->flush_failed = 1; + for (i = 0; i < lc->region_count; i++) + log_clear_bit(lc, lc->clean_bits, i); + } + + r = rw_header(lc, WRITE); if (r) fail_log_device(lc); - else - lc->touched = 0; + else { + if (lc->touched_dirtied) { + r = flush_header(lc); + if (r) { + lc->log_dev_flush_failed = 1; + fail_log_device(lc); + } else + lc->touched_dirtied = 0; + } + lc->touched_cleaned = 0; + } return r; } @@ -700,7 +724,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region) static void core_clear_region(struct dm_dirty_log *log, region_t region) { struct log_c *lc = (struct log_c *) log->context; - log_set_bit(lc, lc->clean_bits, region); + if (likely(!lc->flush_failed)) + log_set_bit(lc, lc->clean_bits, region); } static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) @@ -711,8 +736,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region) return 0; do { - *region = ext2_find_next_zero_bit( - (unsigned long *) lc->sync_bits, + *region = find_next_zero_bit_le(lc->sync_bits, lc->region_count, lc->sync_search); lc->sync_search = *region + 1; @@ -781,7 +805,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status, switch(status) { case STATUSTYPE_INFO: DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name, - lc->log_dev_failed ? 'D' : 'A'); + lc->log_dev_flush_failed ? 'F' : + lc->log_dev_failed ? 'D' : + 'A'); break; case STATUSTYPE_TABLE: diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 71dd65aa31b..f4167b013d9 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -5,10 +5,10 @@ * This file is released under the GPL. */ +#include <linux/device-mapper.h> + #include "dm.h" #include "dm-path-selector.h" -#include "dm-bio-list.h" -#include "dm-bio-record.h" #include "dm-uevent.h" #include <linux/ctype.h> @@ -19,20 +19,24 @@ #include <linux/slab.h> #include <linux/time.h> #include <linux/workqueue.h> +#include <linux/delay.h> #include <scsi/scsi_dh.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #define DM_MSG_PREFIX "multipath" -#define MESG_STR(x) x, sizeof(x) +#define DM_PG_INIT_DELAY_MSECS 2000 +#define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) /* Path properties */ struct pgpath { struct list_head list; struct priority_group *pg; /* Owning PG */ + unsigned is_active; /* Path status */ unsigned fail_count; /* Cumulative failure count */ struct dm_path path; + struct delayed_work activate_path; }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -59,14 +63,19 @@ struct multipath { struct list_head list; struct dm_target *ti; + const char *hw_handler_name; + char *hw_handler_params; + spinlock_t lock; - const char *hw_handler_name; - struct work_struct activate_path; unsigned nr_priority_groups; struct list_head priority_groups; + + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + unsigned pg_init_required; /* pg_init needs calling? */ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ + unsigned pg_init_delay_retry; /* Delay pg_init retry? 
*/ unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; @@ -74,15 +83,15 @@ struct multipath { struct priority_group *next_pg; /* Switch to this PG if set */ unsigned repeat_count; /* I/Os left before calling PS again */ - unsigned queue_io; /* Must we queue all I/O? */ - unsigned queue_if_no_path; /* Queue I/O if last path fails? */ - unsigned saved_queue_if_no_path;/* Saved state during suspension */ + unsigned queue_io:1; /* Must we queue all I/O? */ + unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ + unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ + unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ + unsigned pg_init_disabled:1; /* pg_init is not currently allowed */ + unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ - - struct work_struct process_queued_ios; - struct bio_list queued_ios; - unsigned queue_size; + unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ struct work_struct trigger_event; @@ -91,6 +100,8 @@ struct multipath { * can resubmit bios on error. */ mempool_t *mpio_pool; + + struct mutex work_mutex; }; /* @@ -98,19 +109,17 @@ struct multipath { */ struct dm_mpath_io { struct pgpath *pgpath; - struct dm_bio_details details; + size_t nr_bytes; }; typedef int (*action_fn) (struct pgpath *pgpath); -#define MIN_IOS 256 /* Mempool size */ - static struct kmem_cache *_mpio_cache; static struct workqueue_struct *kmultipathd, *kmpath_handlerd; -static void process_queued_ios(struct work_struct *work); static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); +static int __pgpath_busy(struct pgpath *pgpath); /*----------------------------------------------- @@ -121,8 +130,10 @@ static struct pgpath *alloc_pgpath(void) { struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); - if (pgpath) - pgpath->path.is_active = 1; + if (pgpath) { + pgpath->is_active = 1; + INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); + } return pgpath; } @@ -175,16 +186,18 @@ static void free_priority_group(struct priority_group *pg, static struct multipath *alloc_multipath(struct dm_target *ti) { struct multipath *m; + unsigned min_ios = dm_get_reserved_rq_based_ios(); m = kzalloc(sizeof(*m), GFP_KERNEL); if (m) { INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); m->queue_io = 1; - INIT_WORK(&m->process_queued_ios, process_queued_ios); + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->trigger_event, trigger_event); - INIT_WORK(&m->activate_path, activate_path); - m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache); + init_waitqueue_head(&m->pg_init_wait); + mutex_init(&m->work_mutex); + m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); if (!m->mpio_pool) { kfree(m); return NULL; @@ -206,15 +219,66 @@ static void free_multipath(struct multipath *m) } kfree(m->hw_handler_name); + kfree(m->hw_handler_params); mempool_destroy(m->mpio_pool); kfree(m); } +static int set_mapinfo(struct multipath *m, union map_info *info) +{ + struct dm_mpath_io *mpio; + + mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); + if (!mpio) + return -ENOMEM; + + memset(mpio, 0, sizeof(*mpio)); + info->ptr = mpio; + + return 0; +} + +static void clear_mapinfo(struct multipath *m, union map_info *info) +{ + struct dm_mpath_io *mpio = info->ptr; + + info->ptr = NULL; + mempool_free(mpio, 
m->mpio_pool); +} /*----------------------------------------------- * Path selection *-----------------------------------------------*/ +static int __pg_init_all_paths(struct multipath *m) +{ + struct pgpath *pgpath; + unsigned long pg_init_delay = 0; + + if (m->pg_init_in_progress || m->pg_init_disabled) + return 0; + + m->pg_init_count++; + m->pg_init_required = 0; + + /* Check here to reset pg_init_required */ + if (!m->current_pg) + return 0; + + if (m->pg_init_delay_retry) + pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ? + m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS); + list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) { + /* Skip failed paths */ + if (!pgpath->is_active) + continue; + if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path, + pg_init_delay)) + m->pg_init_in_progress++; + } + return m->pg_init_in_progress; +} + static void __switch_pg(struct multipath *m, struct pgpath *pgpath) { m->current_pg = pgpath->pg; @@ -231,11 +295,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) m->pg_init_count = 0; } -static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) +static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, + size_t nr_bytes) { struct dm_path *path; - path = pg->ps.type->select_path(&pg->ps, &m->repeat_count); + path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); if (!path) return -ENXIO; @@ -247,7 +312,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg) return 0; } -static void __choose_pgpath(struct multipath *m) +static void __choose_pgpath(struct multipath *m, size_t nr_bytes) { struct priority_group *pg; unsigned bypassed = 1; @@ -259,25 +324,29 @@ static void __choose_pgpath(struct multipath *m) if (m->next_pg) { pg = m->next_pg; m->next_pg = NULL; - if (!__choose_path_in_pg(m, pg)) + if (!__choose_path_in_pg(m, pg, nr_bytes)) return; } /* Don't change PG until it has no remaining paths */ - if (m->current_pg && !__choose_path_in_pg(m, m->current_pg)) + if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes)) return; /* * Loop through priority groups until we find a valid path. * First time we skip PGs marked 'bypassed'. - * Second time we only try the ones we skipped. + * Second time we only try the ones we skipped, but set + * pg_init_delay_retry so we do not hammer controllers. 
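 * For example, with the defaults shown above, such a delayed retry queues
 * the per-path activate_path work with a DM_PG_INIT_DELAY_MSECS (2000 ms)
 * delay in __pg_init_all_paths(); a table that sets the pg_init_delay_msecs
 * feature overrides that interval, and a value of 0 removes the delay.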
*/ do { list_for_each_entry(pg, &m->priority_groups, list) { if (pg->bypassed == bypassed) continue; - if (!__choose_path_in_pg(m, pg)) + if (!__choose_path_in_pg(m, pg, nr_bytes)) { + if (!bypassed) + m->pg_init_delay_retry = 1; return; + } } } while (bypassed--); @@ -299,48 +368,63 @@ failed: */ static int __must_push_back(struct multipath *m) { - return (m->queue_if_no_path != m->saved_queue_if_no_path && - dm_noflush_suspending(m->ti)); + return (m->queue_if_no_path || + (m->queue_if_no_path != m->saved_queue_if_no_path && + dm_noflush_suspending(m->ti))); } -static int map_io(struct multipath *m, struct bio *bio, - struct dm_mpath_io *mpio, unsigned was_queued) +#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required) + +/* + * Map cloned requests + */ +static int multipath_map(struct dm_target *ti, struct request *clone, + union map_info *map_context) { - int r = DM_MAPIO_REMAPPED; + struct multipath *m = (struct multipath *) ti->private; + int r = DM_MAPIO_REQUEUE; + size_t nr_bytes = blk_rq_bytes(clone); unsigned long flags; struct pgpath *pgpath; + struct block_device *bdev; + struct dm_mpath_io *mpio; spin_lock_irqsave(&m->lock, flags); /* Do we need to select a new pgpath? */ if (!m->current_pgpath || (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) - __choose_pgpath(m); + __choose_pgpath(m, nr_bytes); pgpath = m->current_pgpath; - if (was_queued) - m->queue_size--; - - if ((pgpath && m->queue_io) || - (!pgpath && m->queue_if_no_path)) { - /* Queue for the daemon to resubmit */ - bio_list_add(&m->queued_ios, bio); - m->queue_size++; - if ((m->pg_init_required && !m->pg_init_in_progress) || - !m->queue_io) - queue_work(kmultipathd, &m->process_queued_ios); - pgpath = NULL; - r = DM_MAPIO_SUBMITTED; - } else if (pgpath) - bio->bi_bdev = pgpath->path.dev->bdev; - else if (__must_push_back(m)) - r = DM_MAPIO_REQUEUE; - else - r = -EIO; /* Failed */ - + if (!pgpath) { + if (!__must_push_back(m)) + r = -EIO; /* Failed */ + goto out_unlock; + } + if (!pg_ready(m)) { + __pg_init_all_paths(m); + goto out_unlock; + } + if (set_mapinfo(m, map_context) < 0) + /* ENOMEM, requeue */ + goto out_unlock; + + bdev = pgpath->path.dev->bdev; + clone->q = bdev_get_queue(bdev); + clone->rq_disk = bdev->bd_disk; + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + mpio = map_context->ptr; mpio->pgpath = pgpath; - + mpio->nr_bytes = nr_bytes; + if (pgpath->pg->ps.type->start_io) + pgpath->pg->ps.type->start_io(&pgpath->pg->ps, + &pgpath->path, + nr_bytes); + r = DM_MAPIO_REMAPPED; + +out_unlock: spin_unlock_irqrestore(&m->lock, flags); return r; @@ -361,86 +445,12 @@ static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, else m->saved_queue_if_no_path = queue_if_no_path; m->queue_if_no_path = queue_if_no_path; - if (!m->queue_if_no_path && m->queue_size) - queue_work(kmultipathd, &m->process_queued_ios); - spin_unlock_irqrestore(&m->lock, flags); - return 0; -} - -/*----------------------------------------------------------------- - * The multipath daemon is responsible for resubmitting queued ios. 
- *---------------------------------------------------------------*/ - -static void dispatch_queued_ios(struct multipath *m) -{ - int r; - unsigned long flags; - struct bio *bio = NULL, *next; - struct dm_mpath_io *mpio; - union map_info *info; - - spin_lock_irqsave(&m->lock, flags); - bio = bio_list_get(&m->queued_ios); - spin_unlock_irqrestore(&m->lock, flags); - - while (bio) { - next = bio->bi_next; - bio->bi_next = NULL; - - info = dm_get_mapinfo(bio); - mpio = info->ptr; - - r = map_io(m, bio, mpio, 1); - if (r < 0) - bio_endio(bio, r); - else if (r == DM_MAPIO_REMAPPED) - generic_make_request(bio); - else if (r == DM_MAPIO_REQUEUE) - bio_endio(bio, -EIO); - - bio = next; - } -} - -static void process_queued_ios(struct work_struct *work) -{ - struct multipath *m = - container_of(work, struct multipath, process_queued_ios); - struct pgpath *pgpath = NULL; - unsigned init_required = 0, must_queue = 1; - unsigned long flags; - - spin_lock_irqsave(&m->lock, flags); - - if (!m->queue_size) - goto out; - - if (!m->current_pgpath) - __choose_pgpath(m); + if (!queue_if_no_path) + dm_table_run_md_queue_async(m->ti->table); - pgpath = m->current_pgpath; - - if ((pgpath && !m->queue_io) || - (!pgpath && !m->queue_if_no_path)) - must_queue = 0; - - if (m->pg_init_required && !m->pg_init_in_progress) { - m->pg_init_count++; - m->pg_init_required = 0; - m->pg_init_in_progress = 1; - init_required = 1; - } - -out: - spin_unlock_irqrestore(&m->lock, flags); - - if (init_required) - queue_work(kmpath_handlerd, &m->activate_path); - - if (!must_queue) - dispatch_queued_ios(m); + return 0; } /* @@ -465,69 +475,24 @@ static void trigger_event(struct work_struct *work) * <#paths> <#per-path selector args> * [<path> [<arg>]* ]+ ]+ *---------------------------------------------------------------*/ -struct param { - unsigned min; - unsigned max; - char *error; -}; - -static int read_param(struct param *param, char *str, unsigned *v, char **error) -{ - if (!str || - (sscanf(str, "%u", v) != 1) || - (*v < param->min) || - (*v > param->max)) { - *error = param->error; - return -EINVAL; - } - - return 0; -} - -struct arg_set { - unsigned argc; - char **argv; -}; - -static char *shift(struct arg_set *as) -{ - char *r; - - if (as->argc) { - as->argc--; - r = *as->argv; - as->argv++; - return r; - } - - return NULL; -} - -static void consume(struct arg_set *as, unsigned n) -{ - BUG_ON (as->argc < n); - as->argc -= n; - as->argv += n; -} - -static int parse_path_selector(struct arg_set *as, struct priority_group *pg, +static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, struct dm_target *ti) { int r; struct path_selector_type *pst; unsigned ps_argc; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 1024, "invalid number of path selector args"}, }; - pst = dm_get_path_selector(shift(as)); + pst = dm_get_path_selector(dm_shift_arg(as)); if (!pst) { ti->error = "unknown path selector type"; return -EINVAL; } - r = read_param(_params, shift(as), &ps_argc, &ti->error); + r = dm_read_arg_group(_args, as, &ps_argc, &ti->error); if (r) { dm_put_path_selector(pst); return -EINVAL; @@ -541,42 +506,90 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg, } pg->ps.type = pst; - consume(as, ps_argc); + dm_consume_args(as, ps_argc); return 0; } -static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, +static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, struct dm_target *ti) { int r; struct pgpath 
*p; struct multipath *m = ti->private; + struct request_queue *q = NULL; + const char *attached_handler_name; /* we need at least a path arg */ if (as->argc < 1) { ti->error = "no device given"; - return NULL; + return ERR_PTR(-EINVAL); } p = alloc_pgpath(); if (!p) - return NULL; + return ERR_PTR(-ENOMEM); - r = dm_get_device(ti, shift(as), ti->begin, ti->len, - dm_table_get_mode(ti->table), &p->path.dev); + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &p->path.dev); if (r) { ti->error = "error getting device"; goto bad; } + if (m->retain_attached_hw_handler || m->hw_handler_name) + q = bdev_get_queue(p->path.dev->bdev); + + if (m->retain_attached_hw_handler) { + attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); + if (attached_handler_name) { + /* + * Reset hw_handler_name to match the attached handler + * and clear any hw_handler_params associated with the + * ignored handler. + * + * NB. This modifies the table line to show the actual + * handler instead of the original table passed in. + */ + kfree(m->hw_handler_name); + m->hw_handler_name = attached_handler_name; + + kfree(m->hw_handler_params); + m->hw_handler_params = NULL; + } + } + if (m->hw_handler_name) { - r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev), - m->hw_handler_name); + /* + * Increments scsi_dh reference, even when using an + * already-attached handler. + */ + r = scsi_dh_attach(q, m->hw_handler_name); + if (r == -EBUSY) { + /* + * Already attached to different hw_handler: + * try to reattach with correct one. + */ + scsi_dh_detach(q); + r = scsi_dh_attach(q, m->hw_handler_name); + } + if (r < 0) { + ti->error = "error attaching hardware handler"; dm_put_device(ti, p->path.dev); goto bad; } + + if (m->hw_handler_params) { + r = scsi_dh_set_params(q, m->hw_handler_params); + if (r < 0) { + ti->error = "unable to set hardware " + "handler parameters"; + scsi_dh_detach(q); + dm_put_device(ti, p->path.dev); + goto bad; + } + } } r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); @@ -589,32 +602,32 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps, bad: free_pgpath(p); - return NULL; + return ERR_PTR(r); } -static struct priority_group *parse_priority_group(struct arg_set *as, +static struct priority_group *parse_priority_group(struct dm_arg_set *as, struct multipath *m) { - static struct param _params[] = { + static struct dm_arg _args[] = { {1, 1024, "invalid number of paths"}, {0, 1024, "invalid number of selector args"} }; int r; - unsigned i, nr_selector_args, nr_params; + unsigned i, nr_selector_args, nr_args; struct priority_group *pg; struct dm_target *ti = m->ti; if (as->argc < 2) { as->argc = 0; - ti->error = "not enough priority group aruments"; - return NULL; + ti->error = "not enough priority group arguments"; + return ERR_PTR(-EINVAL); } pg = alloc_priority_group(); if (!pg) { ti->error = "couldn't allocate priority group"; - return NULL; + return ERR_PTR(-ENOMEM); } pg->m = m; @@ -625,84 +638,109 @@ static struct priority_group *parse_priority_group(struct arg_set *as, /* * read the paths */ - r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error); + r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error); if (r) goto bad; - r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error); + r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error); if (r) goto bad; - nr_params = 1 + nr_selector_args; + nr_args = 1 + nr_selector_args; for (i = 0; i < pg->nr_pgpaths; i++) { struct pgpath 
*pgpath; - struct arg_set path_args; + struct dm_arg_set path_args; - if (as->argc < nr_params) { + if (as->argc < nr_args) { ti->error = "not enough path parameters"; + r = -EINVAL; goto bad; } - path_args.argc = nr_params; + path_args.argc = nr_args; path_args.argv = as->argv; pgpath = parse_path(&path_args, &pg->ps, ti); - if (!pgpath) + if (IS_ERR(pgpath)) { + r = PTR_ERR(pgpath); goto bad; + } pgpath->pg = pg; list_add_tail(&pgpath->list, &pg->pgpaths); - consume(as, nr_params); + dm_consume_args(as, nr_args); } return pg; bad: free_priority_group(pg, ti); - return NULL; + return ERR_PTR(r); } -static int parse_hw_handler(struct arg_set *as, struct multipath *m) +static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) { unsigned hw_argc; + int ret; struct dm_target *ti = m->ti; - static struct param _params[] = { + static struct dm_arg _args[] = { {0, 1024, "invalid number of hardware handler args"}, }; - if (read_param(_params, shift(as), &hw_argc, &ti->error)) + if (dm_read_arg_group(_args, as, &hw_argc, &ti->error)) return -EINVAL; if (!hw_argc) return 0; - m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL); - request_module("scsi_dh_%s", m->hw_handler_name); - if (scsi_dh_handler_exist(m->hw_handler_name) == 0) { + m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); + if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name), + "scsi_dh_%s", m->hw_handler_name)) { ti->error = "unknown hardware handler type"; - kfree(m->hw_handler_name); - m->hw_handler_name = NULL; - return -EINVAL; + ret = -EINVAL; + goto fail; } - consume(as, hw_argc - 1); + + if (hw_argc > 1) { + char *p; + int i, j, len = 4; + + for (i = 0; i <= hw_argc - 2; i++) + len += strlen(as->argv[i]) + 1; + p = m->hw_handler_params = kzalloc(len, GFP_KERNEL); + if (!p) { + ti->error = "memory allocation failed"; + ret = -ENOMEM; + goto fail; + } + j = sprintf(p, "%d", hw_argc - 1); + for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1) + j = sprintf(p, "%s", as->argv[i]); + } + dm_consume_args(as, hw_argc - 1); return 0; +fail: + kfree(m->hw_handler_name); + m->hw_handler_name = NULL; + return ret; } -static int parse_features(struct arg_set *as, struct multipath *m) +static int parse_features(struct dm_arg_set *as, struct multipath *m) { int r; unsigned argc; struct dm_target *ti = m->ti; - const char *param_name; + const char *arg_name; - static struct param _params[] = { - {0, 3, "invalid number of feature args"}, + static struct dm_arg _args[] = { + {0, 6, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, + {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; - r = read_param(_params, shift(as), &argc, &ti->error); + r = dm_read_arg_group(_args, as, &argc, &ti->error); if (r) return -EINVAL; @@ -710,18 +748,29 @@ static int parse_features(struct arg_set *as, struct multipath *m) return 0; do { - param_name = shift(as); + arg_name = dm_shift_arg(as); argc--; - if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) { + if (!strcasecmp(arg_name, "queue_if_no_path")) { r = queue_if_no_path(m, 1, 0); continue; } - if (!strnicmp(param_name, MESG_STR("pg_init_retries")) && + if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { + m->retain_attached_hw_handler = 1; + continue; + } + + if (!strcasecmp(arg_name, "pg_init_retries") && + (argc >= 1)) { + r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); + argc--; + continue; + } + + if (!strcasecmp(arg_name, "pg_init_delay_msecs") && (argc >= 1)) { - r = 
read_param(_params + 1, shift(as), - &m->pg_init_retries, &ti->error); + r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error); argc--; continue; } @@ -736,15 +785,15 @@ static int parse_features(struct arg_set *as, struct multipath *m) static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) { - /* target parameters */ - static struct param _params[] = { - {1, 1024, "invalid number of priority groups"}, - {1, 1024, "invalid initial priority group number"}, + /* target arguments */ + static struct dm_arg _args[] = { + {0, 1024, "invalid number of priority groups"}, + {0, 1024, "invalid initial priority group number"}, }; int r; struct multipath *m; - struct arg_set as; + struct dm_arg_set as; unsigned pg_count = 0; unsigned next_pg_num; @@ -765,21 +814,28 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, if (r) goto bad; - r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error); + r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error); if (r) goto bad; - r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error); + r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error); if (r) goto bad; + if ((!m->nr_priority_groups && next_pg_num) || + (m->nr_priority_groups && !next_pg_num)) { + ti->error = "invalid initial priority group"; + r = -EINVAL; + goto bad; + } + /* parse the priority groups */ while (as.argc) { struct priority_group *pg; pg = parse_priority_group(&as, m); - if (!pg) { - r = -EINVAL; + if (IS_ERR(pg)) { + r = PTR_ERR(pg); goto bad; } @@ -797,6 +853,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, goto bad; } + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_write_same_bios = 1; + return 0; bad: @@ -804,35 +864,54 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, return r; } -static void multipath_dtr(struct dm_target *ti) +static void multipath_wait_for_pg_init_completion(struct multipath *m) { - struct multipath *m = (struct multipath *) ti->private; + DECLARE_WAITQUEUE(wait, current); + unsigned long flags; - flush_workqueue(kmpath_handlerd); - flush_workqueue(kmultipathd); - free_multipath(m); + add_wait_queue(&m->pg_init_wait, &wait); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + + spin_lock_irqsave(&m->lock, flags); + if (!m->pg_init_in_progress) { + spin_unlock_irqrestore(&m->lock, flags); + break; + } + spin_unlock_irqrestore(&m->lock, flags); + + io_schedule(); + } + set_current_state(TASK_RUNNING); + + remove_wait_queue(&m->pg_init_wait, &wait); } -/* - * Map bios, recording original fields for later in case we have to resubmit - */ -static int multipath_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static void flush_multipath_work(struct multipath *m) { - int r; - struct dm_mpath_io *mpio; - struct multipath *m = (struct multipath *) ti->private; + unsigned long flags; - mpio = mempool_alloc(m->mpio_pool, GFP_NOIO); - dm_bio_record(&mpio->details, bio); + spin_lock_irqsave(&m->lock, flags); + m->pg_init_disabled = 1; + spin_unlock_irqrestore(&m->lock, flags); - map_context->ptr = mpio; - bio->bi_rw |= (1 << BIO_RW_FAILFAST); - r = map_io(m, bio, mpio, 0); - if (r < 0 || r == DM_MAPIO_REQUEUE) - mempool_free(mpio, m->mpio_pool); + flush_workqueue(kmpath_handlerd); + multipath_wait_for_pg_init_completion(m); + flush_workqueue(kmultipathd); + flush_work(&m->trigger_event); - return r; + spin_lock_irqsave(&m->lock, flags); + m->pg_init_disabled = 0; + spin_unlock_irqrestore(&m->lock, 
flags); +} + +static void multipath_dtr(struct dm_target *ti) +{ + struct multipath *m = ti->private; + + flush_multipath_work(m); + free_multipath(m); } /* @@ -845,13 +924,13 @@ static int fail_path(struct pgpath *pgpath) spin_lock_irqsave(&m->lock, flags); - if (!pgpath->path.is_active) + if (!pgpath->is_active) goto out; DMWARN("Failing path %s.", pgpath->path.dev->name); pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); - pgpath->path.is_active = 0; + pgpath->is_active = 0; pgpath->fail_count++; m->nr_valid_paths--; @@ -862,7 +941,7 @@ static int fail_path(struct pgpath *pgpath) dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti, pgpath->path.dev->name, m->nr_valid_paths); - queue_work(kmultipathd, &m->trigger_event); + schedule_work(&m->trigger_event); out: spin_unlock_irqrestore(&m->lock, flags); @@ -875,13 +954,13 @@ out: */ static int reinstate_path(struct pgpath *pgpath) { - int r = 0; + int r = 0, run_queue = 0; unsigned long flags; struct multipath *m = pgpath->pg->m; spin_lock_irqsave(&m->lock, flags); - if (pgpath->path.is_active) + if (pgpath->is_active) goto out; if (!pgpath->pg->ps.type->reinstate_path) { @@ -895,19 +974,25 @@ static int reinstate_path(struct pgpath *pgpath) if (r) goto out; - pgpath->path.is_active = 1; + pgpath->is_active = 1; - m->current_pgpath = NULL; - if (!m->nr_valid_paths++ && m->queue_size) - queue_work(kmultipathd, &m->process_queued_ios); + if (!m->nr_valid_paths++) { + m->current_pgpath = NULL; + run_queue = 1; + } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) { + if (queue_work(kmpath_handlerd, &pgpath->activate_path.work)) + m->pg_init_in_progress++; + } dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti, pgpath->path.dev->name, m->nr_valid_paths); - queue_work(kmultipathd, &m->trigger_event); + schedule_work(&m->trigger_event); out: spin_unlock_irqrestore(&m->lock, flags); + if (run_queue) + dm_table_run_md_queue_async(m->ti->table); return r; } @@ -918,7 +1003,7 @@ out: static int action_dev(struct multipath *m, struct dm_dev *dev, action_fn action) { - int r = 0; + int r = -EINVAL; struct pgpath *pgpath; struct priority_group *pg; @@ -948,7 +1033,7 @@ static void bypass_pg(struct multipath *m, struct priority_group *pg, spin_unlock_irqrestore(&m->lock, flags); - queue_work(kmultipathd, &m->trigger_event); + schedule_work(&m->trigger_event); } /* @@ -959,8 +1044,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) struct priority_group *pg; unsigned pgnum; unsigned long flags; + char dummy; - if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || + if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || (pgnum > m->nr_priority_groups)) { DMWARN("invalid PG number supplied to switch_pg_num"); return -EINVAL; @@ -978,7 +1064,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) } spin_unlock_irqrestore(&m->lock, flags); - queue_work(kmultipathd, &m->trigger_event); + schedule_work(&m->trigger_event); return 0; } @@ -990,8 +1076,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) { struct priority_group *pg; unsigned pgnum; + char dummy; - if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || + if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || (pgnum > m->nr_priority_groups)) { DMWARN("invalid PG number supplied to bypass_pg"); return -EINVAL; @@ -1016,7 +1103,7 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) spin_lock_irqsave(&m->lock, flags); - if (m->pg_init_count <= 
m->pg_init_retries) + if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) m->pg_init_required = 1; else limit_reached = 1; @@ -1026,12 +1113,13 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) return limit_reached; } -static void pg_init_done(struct dm_path *path, int errors) +static void pg_init_done(void *data, int errors) { - struct pgpath *pgpath = path_to_pgpath(path); + struct pgpath *pgpath = data; struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; + unsigned delay_retry = 0; /* device or driver problems */ switch (errors) { @@ -1042,8 +1130,8 @@ static void pg_init_done(struct dm_path *path, int errors) errors = 0; break; } - DMERR("Cannot failover device because scsi_dh_%s was not " - "loaded.", m->hw_handler_name); + DMERR("Could not failover the device: Handler scsi_dh_%s " + "Error %d.", m->hw_handler_name, errors); /* * Fail path for now, so we do not ping pong */ @@ -1056,8 +1144,9 @@ static void pg_init_done(struct dm_path *path, int errors) */ bypass_pg(m, pg, 1); break; - /* TODO: For SCSI_DH_RETRY we should wait a couple seconds */ case SCSI_DH_RETRY: + /* Wait before retrying. */ + delay_retry = 1; case SCSI_DH_IMM_RETRY: case SCSI_DH_RES_TEMP_UNAVAIL: if (pg_init_limit_reached(m, pgpath)) @@ -1075,96 +1164,124 @@ static void pg_init_done(struct dm_path *path, int errors) spin_lock_irqsave(&m->lock, flags); if (errors) { - DMERR("Could not failover device. Error %d.", errors); - m->current_pgpath = NULL; - m->current_pg = NULL; - } else if (!m->pg_init_required) { - m->queue_io = 0; + if (pgpath == m->current_pgpath) { + DMERR("Could not failover device. Error %d.", errors); + m->current_pgpath = NULL; + m->current_pg = NULL; + } + } else if (!m->pg_init_required) pg->bypassed = 0; + + if (--m->pg_init_in_progress) + /* Activations of other paths are still on going */ + goto out; + + if (m->pg_init_required) { + m->pg_init_delay_retry = delay_retry; + if (__pg_init_all_paths(m)) + goto out; } + m->queue_io = 0; - m->pg_init_in_progress = 0; - queue_work(kmultipathd, &m->process_queued_ios); + /* + * Wake up any thread waiting to suspend. + */ + wake_up(&m->pg_init_wait); + +out: spin_unlock_irqrestore(&m->lock, flags); } static void activate_path(struct work_struct *work) { - int ret; - struct multipath *m = - container_of(work, struct multipath, activate_path); - struct dm_path *path = &m->current_pgpath->path; + struct pgpath *pgpath = + container_of(work, struct pgpath, activate_path.work); - ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev)); - pg_init_done(path, ret); + if (pgpath->is_active) + scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), + pg_init_done, pgpath); + else + pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); +} + +static int noretry_error(int error) +{ + switch (error) { + case -EOPNOTSUPP: + case -EREMOTEIO: + case -EILSEQ: + case -ENODATA: + case -ENOSPC: + return 1; + } + + /* Anything else could be a path failure, so should be retried */ + return 0; } /* * end_io handling */ -static int do_end_io(struct multipath *m, struct bio *bio, +static int do_end_io(struct multipath *m, struct request *clone, int error, struct dm_mpath_io *mpio) { + /* + * We don't queue any clone request inside the multipath target + * during end I/O handling, since those clone requests don't have + * bio clones. If we queue them inside the multipath target, + * we need to make bio clones, that requires memory allocation. 
+ * (See drivers/md/dm.c:end_clone_bio() about why the clone requests + * don't have bio clones.) + * Instead of queueing the clone request here, we queue the original + * request into dm core, which will remake a clone request and + * clone bios for it and resubmit it later. + */ + int r = DM_ENDIO_REQUEUE; unsigned long flags; - if (!error) + if (!error && !clone->errors) return 0; /* I/O complete */ - if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + if (noretry_error(error)) return error; - if (error == -EOPNOTSUPP) - return error; + if (mpio->pgpath) + fail_path(mpio->pgpath); spin_lock_irqsave(&m->lock, flags); if (!m->nr_valid_paths) { - if (__must_push_back(m)) { - spin_unlock_irqrestore(&m->lock, flags); - return DM_ENDIO_REQUEUE; - } else if (!m->queue_if_no_path) { - spin_unlock_irqrestore(&m->lock, flags); - return -EIO; + if (!m->queue_if_no_path) { + if (!__must_push_back(m)) + r = -EIO; } else { - spin_unlock_irqrestore(&m->lock, flags); - goto requeue; + if (error == -EBADE) + r = error; } } spin_unlock_irqrestore(&m->lock, flags); - if (mpio->pgpath) - fail_path(mpio->pgpath); - - requeue: - dm_bio_restore(&mpio->details, bio); - - /* queue for the daemon to resubmit or fail */ - spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_ios, bio); - m->queue_size++; - if (!m->queue_io) - queue_work(kmultipathd, &m->process_queued_ios); - spin_unlock_irqrestore(&m->lock, flags); - - return DM_ENDIO_INCOMPLETE; /* io not complete */ + return r; } -static int multipath_end_io(struct dm_target *ti, struct bio *bio, +static int multipath_end_io(struct dm_target *ti, struct request *clone, int error, union map_info *map_context) { struct multipath *m = ti->private; struct dm_mpath_io *mpio = map_context->ptr; - struct pgpath *pgpath = mpio->pgpath; + struct pgpath *pgpath; struct path_selector *ps; int r; - r = do_end_io(m, bio, error, mpio); + BUG_ON(!mpio); + + r = do_end_io(m, clone, error, mpio); + pgpath = mpio->pgpath; if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } - if (r != DM_ENDIO_INCOMPLETE) - mempool_free(mpio, m->mpio_pool); + clear_mapinfo(m, map_context); return r; } @@ -1182,6 +1299,15 @@ static void multipath_presuspend(struct dm_target *ti) queue_if_no_path(m, 0, 1); } +static void multipath_postsuspend(struct dm_target *ti) +{ + struct multipath *m = ti->private; + + mutex_lock(&m->work_mutex); + flush_multipath_work(m); + mutex_unlock(&m->work_mutex); +} + /* * Restore the queue_if_no_path setting. 
*/ @@ -1211,8 +1337,8 @@ static void multipath_resume(struct dm_target *ti) * [priority selector-name num_ps_args [ps_args]* * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+ */ -static int multipath_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) +static void multipath_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) { int sz = 0; unsigned long flags; @@ -1226,14 +1352,20 @@ static int multipath_status(struct dm_target *ti, status_type_t type, /* Features */ if (type == STATUSTYPE_INFO) - DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count); + DMEMIT("2 %u %u ", m->queue_io, m->pg_init_count); else { DMEMIT("%u ", m->queue_if_no_path + - (m->pg_init_retries > 0) * 2); + (m->pg_init_retries > 0) * 2 + + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + + m->retain_attached_hw_handler); if (m->queue_if_no_path) DMEMIT("queue_if_no_path "); if (m->pg_init_retries) DMEMIT("pg_init_retries %u ", m->pg_init_retries); + if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) + DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); + if (m->retain_attached_hw_handler) + DMEMIT("retain_attached_hw_handler "); } if (!m->hw_handler_name || type == STATUSTYPE_INFO) @@ -1248,7 +1380,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type, else if (m->current_pg) pg_num = m->current_pg->pg_num; else - pg_num = 1; + pg_num = (m->nr_priority_groups ? 1 : 0); DMEMIT("%u ", pg_num); @@ -1276,7 +1408,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type, list_for_each_entry(p, &pg->pgpaths, list) { DMEMIT("%s %s %u ", p->path.dev->name, - p->path.is_active ? "A" : "F", + p->is_active ? "A" : "F", p->fail_count); if (pg->ps.type->status) sz += pg->ps.type->status(&pg->ps, @@ -1312,92 +1444,221 @@ static int multipath_status(struct dm_target *ti, status_type_t type, } spin_unlock_irqrestore(&m->lock, flags); - - return 0; } static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) { - int r; + int r = -EINVAL; struct dm_dev *dev; struct multipath *m = (struct multipath *) ti->private; action_fn action; + mutex_lock(&m->work_mutex); + + if (dm_suspended(ti)) { + r = -EBUSY; + goto out; + } + if (argc == 1) { - if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) - return queue_if_no_path(m, 1, 0); - else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) - return queue_if_no_path(m, 0, 0); + if (!strcasecmp(argv[0], "queue_if_no_path")) { + r = queue_if_no_path(m, 1, 0); + goto out; + } else if (!strcasecmp(argv[0], "fail_if_no_path")) { + r = queue_if_no_path(m, 0, 0); + goto out; + } } - if (argc != 2) - goto error; + if (argc != 2) { + DMWARN("Invalid multipath message arguments. 
Expected 2 arguments, got %d.", argc); + goto out; + } - if (!strnicmp(argv[0], MESG_STR("disable_group"))) - return bypass_pg_num(m, argv[1], 1); - else if (!strnicmp(argv[0], MESG_STR("enable_group"))) - return bypass_pg_num(m, argv[1], 0); - else if (!strnicmp(argv[0], MESG_STR("switch_group"))) - return switch_pg_num(m, argv[1]); - else if (!strnicmp(argv[0], MESG_STR("reinstate_path"))) + if (!strcasecmp(argv[0], "disable_group")) { + r = bypass_pg_num(m, argv[1], 1); + goto out; + } else if (!strcasecmp(argv[0], "enable_group")) { + r = bypass_pg_num(m, argv[1], 0); + goto out; + } else if (!strcasecmp(argv[0], "switch_group")) { + r = switch_pg_num(m, argv[1]); + goto out; + } else if (!strcasecmp(argv[0], "reinstate_path")) action = reinstate_path; - else if (!strnicmp(argv[0], MESG_STR("fail_path"))) + else if (!strcasecmp(argv[0], "fail_path")) action = fail_path; - else - goto error; + else { + DMWARN("Unrecognised multipath message received: %s", argv[0]); + goto out; + } - r = dm_get_device(ti, argv[1], ti->begin, ti->len, - dm_table_get_mode(ti->table), &dev); + r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); if (r) { DMWARN("message: error getting device %s", argv[1]); - return -EINVAL; + goto out; } r = action_dev(m, dev, action); dm_put_device(ti, dev); +out: + mutex_unlock(&m->work_mutex); return r; - -error: - DMWARN("Unrecognised multipath message received."); - return -EINVAL; } -static int multipath_ioctl(struct dm_target *ti, struct inode *inode, - struct file *filp, unsigned int cmd, +static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) { - struct multipath *m = (struct multipath *) ti->private; - struct block_device *bdev = NULL; + struct multipath *m = ti->private; + struct pgpath *pgpath; + struct block_device *bdev; + fmode_t mode; unsigned long flags; - struct file fake_file = {}; - struct dentry fake_dentry = {}; - int r = 0; + int r; - fake_file.f_path.dentry = &fake_dentry; + bdev = NULL; + mode = 0; + r = 0; spin_lock_irqsave(&m->lock, flags); if (!m->current_pgpath) - __choose_pgpath(m); + __choose_pgpath(m, 0); + + pgpath = m->current_pgpath; - if (m->current_pgpath) { - bdev = m->current_pgpath->path.dev->bdev; - fake_dentry.d_inode = bdev->bd_inode; - fake_file.f_mode = m->current_pgpath->path.dev->mode; + if (pgpath) { + bdev = pgpath->path.dev->bdev; + mode = pgpath->path.dev->mode; } - if (m->queue_io) - r = -EAGAIN; + if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path)) + r = -ENOTCONN; else if (!bdev) r = -EIO; spin_unlock_irqrestore(&m->lock, flags); - return r ? : blkdev_driver_ioctl(bdev->bd_inode, &fake_file, - bdev->bd_disk, cmd, arg); + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) { + int err = scsi_verify_blk_ioctl(NULL, cmd); + if (err) + r = err; + } + + if (r == -ENOTCONN && !fatal_signal_pending(current)) { + spin_lock_irqsave(&m->lock, flags); + if (!m->current_pg) { + /* Path status changed, redo selection */ + __choose_pgpath(m, 0); + } + if (m->pg_init_required) + __pg_init_all_paths(m); + spin_unlock_irqrestore(&m->lock, flags); + dm_table_run_md_queue_async(m->ti->table); + } + + return r ? 
: __blkdev_driver_ioctl(bdev, mode, cmd, arg); +} + +static int multipath_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *p; + int ret = 0; + + list_for_each_entry(pg, &m->priority_groups, list) { + list_for_each_entry(p, &pg->pgpaths, list) { + ret = fn(ti, p->path.dev, ti->begin, ti->len, data); + if (ret) + goto out; + } + } + +out: + return ret; +} + +static int __pgpath_busy(struct pgpath *pgpath) +{ + struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); + + return dm_underlying_device_busy(q); +} + +/* + * We return "busy", only when we can map I/Os but underlying devices + * are busy (so even if we map I/Os now, the I/Os will wait on + * the underlying queue). + * In other words, if we want to kill I/Os or queue them inside us + * due to map unavailability, we don't return "busy". Otherwise, + * dm core won't give us the I/Os and we can't do what we want. + */ +static int multipath_busy(struct dm_target *ti) +{ + int busy = 0, has_active = 0; + struct multipath *m = ti->private; + struct priority_group *pg; + struct pgpath *pgpath; + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + + /* pg_init in progress or no paths available */ + if (m->pg_init_in_progress || + (!m->nr_valid_paths && m->queue_if_no_path)) { + busy = 1; + goto out; + } + /* Guess which priority_group will be used at next mapping time */ + if (unlikely(!m->current_pgpath && m->next_pg)) + pg = m->next_pg; + else if (likely(m->current_pg)) + pg = m->current_pg; + else + /* + * We don't know which pg will be used at next mapping time. + * We don't call __choose_pgpath() here to avoid to trigger + * pg_init just by busy checking. + * So we don't know whether underlying devices we will be using + * at next mapping time are busy or not. Just try mapping. + */ + goto out; + + /* + * If there is one non-busy active path at least, the path selector + * will be able to select it. So we consider such a pg as not busy. + */ + busy = 1; + list_for_each_entry(pgpath, &pg->pgpaths, list) + if (pgpath->is_active) { + has_active = 1; + + if (!__pgpath_busy(pgpath)) { + busy = 0; + break; + } + } + + if (!has_active) + /* + * No active path in this pg, so this pg won't be used and + * the current_pg will be changed at next mapping time. + * We need to try mapping to determine it. 
+ */ + busy = 0; + +out: + spin_unlock_irqrestore(&m->lock, flags); + + return busy; } /*----------------------------------------------------------------- @@ -1405,17 +1666,20 @@ static int multipath_ioctl(struct dm_target *ti, struct inode *inode, *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 0, 5}, + .version = {1, 7, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, - .map = multipath_map, - .end_io = multipath_end_io, + .map_rq = multipath_map, + .rq_end_io = multipath_end_io, .presuspend = multipath_presuspend, + .postsuspend = multipath_postsuspend, .resume = multipath_resume, .status = multipath_status, .message = multipath_message, .ioctl = multipath_ioctl, + .iterate_devices = multipath_iterate_devices, + .busy = multipath_busy, }; static int __init dm_multipath_init(void) @@ -1434,7 +1698,7 @@ static int __init dm_multipath_init(void) return -EINVAL; } - kmultipathd = create_workqueue("kmpathd"); + kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0); if (!kmultipathd) { DMERR("failed to create workqueue kmpathd"); dm_unregister_target(&multipath_target); @@ -1448,7 +1712,8 @@ static int __init dm_multipath_init(void) * old workqueue would also create a bottleneck in the * path of the storage hardware device activation. */ - kmpath_handlerd = create_singlethread_workqueue("kmpath_handlerd"); + kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd", + WQ_MEM_RECLAIM); if (!kmpath_handlerd) { DMERR("failed to create workqueue kmpath_handlerd"); destroy_workqueue(kmultipathd); @@ -1466,14 +1731,10 @@ static int __init dm_multipath_init(void) static void __exit dm_multipath_exit(void) { - int r; - destroy_workqueue(kmpath_handlerd); destroy_workqueue(kmultipathd); - r = dm_unregister_target(&multipath_target); - if (r < 0) - DMERR("target unregister failed %d", r); + dm_unregister_target(&multipath_target); kmem_cache_destroy(_mpio_cache); } diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h index c198b856a45..e230f719625 100644 --- a/drivers/md/dm-mpath.h +++ b/drivers/md/dm-mpath.h @@ -13,8 +13,6 @@ struct dm_dev; struct dm_path { struct dm_dev *dev; /* Read-only */ - unsigned is_active; /* Read-only */ - void *pscontext; /* For path-selector use */ }; diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c index ca1bb636a3e..fa0ccc585cb 100644 --- a/drivers/md/dm-path-selector.c +++ b/drivers/md/dm-path-selector.c @@ -9,16 +9,16 @@ * Path selector registration. 
*/ -#include "dm.h" +#include <linux/device-mapper.h> +#include <linux/module.h> + #include "dm-path-selector.h" #include <linux/slab.h> struct ps_internal { struct path_selector_type pst; - struct list_head list; - long use; }; #define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst) @@ -44,12 +44,8 @@ static struct ps_internal *get_path_selector(const char *name) down_read(&_ps_lock); psi = __find_path_selector_type(name); - if (psi) { - if ((psi->use == 0) && !try_module_get(psi->pst.module)) - psi = NULL; - else - psi->use++; - } + if (psi && !try_module_get(psi->pst.module)) + psi = NULL; up_read(&_ps_lock); return psi; @@ -83,11 +79,7 @@ void dm_put_path_selector(struct path_selector_type *pst) if (!psi) goto out; - if (--psi->use == 0) - module_put(psi->pst.module); - - BUG_ON(psi->use < 0); - + module_put(psi->pst.module); out: up_read(&_ps_lock); } @@ -135,11 +127,6 @@ int dm_unregister_path_selector(struct path_selector_type *pst) return -EINVAL; } - if (psi->use) { - up_write(&_ps_lock); - return -ETXTBSY; - } - list_del(&psi->list); up_write(&_ps_lock); diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 27357b85d73..e7d1fa8b045 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -56,7 +56,8 @@ struct path_selector_type { * the path fails. */ struct dm_path *(*select_path) (struct path_selector *ps, - unsigned *repeat_count); + unsigned *repeat_count, + size_t nr_bytes); /* * Notify the selector that a path has failed. @@ -75,7 +76,10 @@ struct path_selector_type { int (*status) (struct path_selector *ps, struct dm_path *path, status_type_t type, char *result, unsigned int maxlen); - int (*end_io) (struct path_selector *ps, struct dm_path *path); + int (*start_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); + int (*end_io) (struct path_selector *ps, struct dm_path *path, + size_t nr_bytes); }; /* Register a path selector */ diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c new file mode 100644 index 00000000000..3941fae0de9 --- /dev/null +++ b/drivers/md/dm-queue-length.c @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. + * Copyright (C) 2006-2009 NEC Corporation. + * + * dm-queue-length.c + * + * Module Author: Stefan Bader, IBM + * Modified by: Kiyoshi Ueda, NEC + * + * This file is released under the GPL. + * + * queue-length path selector - choose a path with the least number of + * in-flight I/Os. 
+ */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/atomic.h> + +#define DM_MSG_PREFIX "multipath queue-length" +#define QL_MIN_IO 128 +#define QL_VERSION "0.1.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; + atomic_t qlen; /* the number of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + } + + return s; +} + +static int ql_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void ql_free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void ql_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + ql_free_paths(&s->valid_paths); + ql_free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int ql_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct path_info *pi; + + /* When called with NULL path, return selector status/args. */ + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", atomic_read(&pi->qlen)); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u ", pi->repeat_count); + break; + } + } + + return sz; +} + +static int ql_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = QL_MIN_IO; + char dummy; + + /* + * Arguments: [<repeat_count>] + * <repeat_count>: The number of I/Os before switching path. + * If not given, default (QL_MIN_IO) is used. 
+ */ + if (argc > 1) { + *error = "queue-length ps: incorrect number of arguments"; + return -EINVAL; + } + + if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "queue-length ps: invalid repeat count"; + return -EINVAL; + } + + /* Allocate the path information structure */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "queue-length ps: Error allocating path information"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + atomic_set(&pi->qlen, 0); + + path->pscontext = pi; + + list_add_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void ql_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move(&pi->list, &s->failed_paths); +} + +static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move_tail(&pi->list, &s->valid_paths); + + return 0; +} + +/* + * Select a path having the minimum number of in-flight I/Os + */ +static struct dm_path *ql_select_path(struct path_selector *ps, + unsigned *repeat_count, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + + if (list_empty(&s->valid_paths)) + return NULL; + + /* Change preferred (first in list) path to evenly balance. */ + list_move_tail(s->valid_paths.next, &s->valid_paths); + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best || + (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) + best = pi; + + if (!atomic_read(&best->qlen)) + break; + } + + if (!best) + return NULL; + + *repeat_count = best->repeat_count; + + return best->path; +} + +static int ql_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_inc(&pi->qlen); + + return 0; +} + +static int ql_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_dec(&pi->qlen); + + return 0; +} + +static struct path_selector_type ql_ps = { + .name = "queue-length", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 1, + .create = ql_create, + .destroy = ql_destroy, + .status = ql_status, + .add_path = ql_add_path, + .fail_path = ql_fail_path, + .reinstate_path = ql_reinstate_path, + .select_path = ql_select_path, + .start_io = ql_start_io, + .end_io = ql_end_io, +}; + +static int __init dm_ql_init(void) +{ + int r = dm_register_path_selector(&ql_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " QL_VERSION " loaded"); + + return r; +} + +static void __exit dm_ql_exit(void) +{ + int r = dm_unregister_path_selector(&ql_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_ql_init); +module_exit(dm_ql_exit); + +MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>"); +MODULE_DESCRIPTION( + "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" + DM_NAME " path selector to balance the number of in-flight I/Os" +); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c new file mode 100644 index 00000000000..4880b69e2e9 --- /dev/null +++ b/drivers/md/dm-raid.c @@ -0,0 +1,1693 @@ +/* + * Copyright (C) 2010-2011 Neil Brown + * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. 
+ */ + +#include <linux/slab.h> +#include <linux/module.h> + +#include "md.h" +#include "raid1.h" +#include "raid5.h" +#include "raid10.h" +#include "bitmap.h" + +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "raid" + +/* + * The following flags are used by dm-raid.c to set up the array state. + * They must be cleared before md_run is called. + */ +#define FirstUse 10 /* rdev flag */ + +struct raid_dev { + /* + * Two DM devices, one to hold metadata and one to hold the + * actual data/parity. The reason for this is to not confuse + * ti->len and give more flexibility in altering size and + * characteristics. + * + * While it is possible for this device to be associated + * with a different physical device than the data_dev, it + * is intended for it to be the same. + * |--------- Physical Device ---------| + * |- meta_dev -|------ data_dev ------| + */ + struct dm_dev *meta_dev; + struct dm_dev *data_dev; + struct md_rdev rdev; +}; + +/* + * Flags for rs->print_flags field. + */ +#define DMPF_SYNC 0x1 +#define DMPF_NOSYNC 0x2 +#define DMPF_REBUILD 0x4 +#define DMPF_DAEMON_SLEEP 0x8 +#define DMPF_MIN_RECOVERY_RATE 0x10 +#define DMPF_MAX_RECOVERY_RATE 0x20 +#define DMPF_MAX_WRITE_BEHIND 0x40 +#define DMPF_STRIPE_CACHE 0x80 +#define DMPF_REGION_SIZE 0x100 +#define DMPF_RAID10_COPIES 0x200 +#define DMPF_RAID10_FORMAT 0x400 + +struct raid_set { + struct dm_target *ti; + + uint32_t bitmap_loaded; + uint32_t print_flags; + + struct mddev md; + struct raid_type *raid_type; + struct dm_target_callbacks callbacks; + + struct raid_dev dev[0]; +}; + +/* Supported raid types and properties. */ +static struct raid_type { + const char *name; /* RAID algorithm. */ + const char *descr; /* Descriptor text for logging. */ + const unsigned parity_devs; /* # of parity devices. */ + const unsigned minimal_devs; /* minimal # of devices in set. */ + const unsigned level; /* RAID level. */ + const unsigned algorithm; /* RAID algorithm. 
*/ +} raid_types[] = { + {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, + {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */}, + {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, + {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, + {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, + {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, + {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, + {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, + {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, + {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} +}; + +static char *raid10_md_layout_to_format(int layout) +{ + /* + * Bit 16 and 17 stand for "offset" and "use_far_sets" + * Refer to MD's raid10.c for details + */ + if ((layout & 0x10000) && (layout & 0x20000)) + return "offset"; + + if ((layout & 0xFF) > 1) + return "near"; + + return "far"; +} + +static unsigned raid10_md_layout_to_copies(int layout) +{ + if ((layout & 0xFF) > 1) + return layout & 0xFF; + return (layout >> 8) & 0xFF; +} + +static int raid10_format_to_md_layout(char *format, unsigned copies) +{ + unsigned n = 1, f = 1; + + if (!strcmp("near", format)) + n = copies; + else + f = copies; + + if (!strcmp("offset", format)) + return 0x30000 | (f << 8) | n; + + if (!strcmp("far", format)) + return 0x20000 | (f << 8) | n; + + return (f << 8) | n; +} + +static struct raid_type *get_raid_type(char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(raid_types); i++) + if (!strcmp(raid_types[i].name, name)) + return &raid_types[i]; + + return NULL; +} + +static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) +{ + unsigned i; + struct raid_set *rs; + + if (raid_devs <= raid_type->parity_devs) { + ti->error = "Insufficient number of devices"; + return ERR_PTR(-EINVAL); + } + + rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); + if (!rs) { + ti->error = "Cannot allocate raid context"; + return ERR_PTR(-ENOMEM); + } + + mddev_init(&rs->md); + + rs->ti = ti; + rs->raid_type = raid_type; + rs->md.raid_disks = raid_devs; + rs->md.level = raid_type->level; + rs->md.new_level = rs->md.level; + rs->md.layout = raid_type->algorithm; + rs->md.new_layout = rs->md.layout; + rs->md.delta_disks = 0; + rs->md.recovery_cp = 0; + + for (i = 0; i < raid_devs; i++) + md_rdev_init(&rs->dev[i].rdev); + + /* + * Remaining items to be initialized by further RAID params: + * rs->md.persistent + * rs->md.external + * rs->md.chunk_sectors + * rs->md.new_chunk_sectors + * rs->md.dev_sectors + */ + + return rs; +} + +static void context_free(struct raid_set *rs) +{ + int i; + + for (i = 0; i < rs->md.raid_disks; i++) { + if (rs->dev[i].meta_dev) + dm_put_device(rs->ti, rs->dev[i].meta_dev); + md_rdev_clear(&rs->dev[i].rdev); + if (rs->dev[i].data_dev) + dm_put_device(rs->ti, rs->dev[i].data_dev); + } + + kfree(rs); +} + +/* + * For every device we have two words + * <meta_dev>: meta device name or '-' if missing + * <data_dev>: data device name or '-' if missing + * + * The following are permitted: + * - - + * - <data_dev> + * <meta_dev> <data_dev> + * + * The following is not allowed: + * <meta_dev> - + * + * This code parses those words. If there is a failure, + * the caller must use context_free to unwind the operations. 
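The raid10 helpers above pack the copy count and format into md's layout word. As a worked illustration (values derived from the code above, not taken from the patch):

    raid10_format_to_md_layout("near",   2) == 0x00102   /* near copies in bits 0-7 */
    raid10_format_to_md_layout("far",    2) == 0x20201   /* far copies in bits 8-15, bit 17 (use_far_sets) set */
    raid10_format_to_md_layout("offset", 2) == 0x30201   /* bits 16 and 17 set */

raid10_md_layout_to_copies() and raid10_md_layout_to_format() invert these, yielding 2 copies and the matching format name in each case.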
+ */ +static int dev_parms(struct raid_set *rs, char **argv) +{ + int i; + int rebuild = 0; + int metadata_available = 0; + int ret = 0; + + for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { + rs->dev[i].rdev.raid_disk = i; + + rs->dev[i].meta_dev = NULL; + rs->dev[i].data_dev = NULL; + + /* + * There are no offsets, since there is a separate device + * for data and metadata. + */ + rs->dev[i].rdev.data_offset = 0; + rs->dev[i].rdev.mddev = &rs->md; + + if (strcmp(argv[0], "-")) { + ret = dm_get_device(rs->ti, argv[0], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].meta_dev); + rs->ti->error = "RAID metadata device lookup failure"; + if (ret) + return ret; + + rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); + if (!rs->dev[i].rdev.sb_page) + return -ENOMEM; + } + + if (!strcmp(argv[1], "-")) { + if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && + (!rs->dev[i].rdev.recovery_offset)) { + rs->ti->error = "Drive designated for rebuild not specified"; + return -EINVAL; + } + + rs->ti->error = "No data device supplied with metadata device"; + if (rs->dev[i].meta_dev) + return -EINVAL; + + continue; + } + + ret = dm_get_device(rs->ti, argv[1], + dm_table_get_mode(rs->ti->table), + &rs->dev[i].data_dev); + if (ret) { + rs->ti->error = "RAID device lookup failure"; + return ret; + } + + if (rs->dev[i].meta_dev) { + metadata_available = 1; + rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; + } + rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; + list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild++; + } + + if (metadata_available) { + rs->md.external = 0; + rs->md.persistent = 1; + rs->md.major_version = 2; + } else if (rebuild && !rs->md.recovery_cp) { + /* + * Without metadata, we will not be able to tell if the array + * is in-sync or not - we must assume it is not. Therefore, + * it is impossible to rebuild a drive. + * + * Even if there is metadata, the on-disk information may + * indicate that the array is not in-sync and it will then + * fail at that time. + * + * User could specify 'nosync' option if desperate. + */ + DMERR("Unable to rebuild drive while array is not in-sync"); + rs->ti->error = "RAID device lookup failure"; + return -EINVAL; + } + + return 0; +} + +/* + * validate_region_size + * @rs + * @region_size: region size in sectors. If 0, pick a size (4MiB default). + * + * Set rs->md.bitmap_info.chunksize (which really refers to 'region size'). + * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_region_size(struct raid_set *rs, unsigned long region_size) +{ + unsigned long min_region_size = rs->ti->len / (1 << 21); + + if (!region_size) { + /* + * Choose a reasonable default. All figures in sectors. + */ + if (min_region_size > (1 << 13)) { + /* If not a power of 2, make it the next power of 2 */ + if (min_region_size & (min_region_size - 1)) + region_size = 1 << fls(region_size); + DMINFO("Choosing default region size of %lu sectors", + region_size); + } else { + DMINFO("Choosing default region size of 4MiB"); + region_size = 1 << 13; /* sectors */ + } + } else { + /* + * Validate user-supplied value. 
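As a worked example of the default sizing above (illustrative figures only): for a 1 TiB target, ti->len is 2147483648 sectors, so min_region_size = 2147483648 >> 21 = 1024 sectors; that is below 2^13, so the 4 MiB default of 8192 sectors is chosen and rs->md.bitmap_info.chunksize becomes 8192 << 9 bytes = 4 MiB. The 2^21 divisor enforces the MD bitmap limit mentioned in the function header, i.e. fewer than 2^21 regions per array.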
+ */ + if (region_size > rs->ti->len) { + rs->ti->error = "Supplied region size is too large"; + return -EINVAL; + } + + if (region_size < min_region_size) { + DMERR("Supplied region_size (%lu sectors) below minimum (%lu)", + region_size, min_region_size); + rs->ti->error = "Supplied region size is too small"; + return -EINVAL; + } + + if (!is_power_of_2(region_size)) { + rs->ti->error = "Region size is not a power of 2"; + return -EINVAL; + } + + if (region_size < rs->md.chunk_sectors) { + rs->ti->error = "Region size is smaller than the chunk size"; + return -EINVAL; + } + } + + /* + * Convert sectors to bytes. + */ + rs->md.bitmap_info.chunksize = (region_size << 9); + + return 0; +} + +/* + * validate_raid_redundancy + * @rs + * + * Determine if there are enough devices in the array that haven't + * failed (or are being rebuilt) to form a usable array. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_raid_redundancy(struct raid_set *rs) +{ + unsigned i, rebuild_cnt = 0; + unsigned rebuilds_per_group = 0, copies, d; + unsigned group_size, last_group_start; + + for (i = 0; i < rs->md.raid_disks; i++) + if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || + !rs->dev[i].rdev.sb_page) + rebuild_cnt++; + + switch (rs->raid_type->level) { + case 1: + if (rebuild_cnt >= rs->md.raid_disks) + goto too_many; + break; + case 4: + case 5: + case 6: + if (rebuild_cnt > rs->raid_type->parity_devs) + goto too_many; + break; + case 10: + copies = raid10_md_layout_to_copies(rs->md.layout); + if (rebuild_cnt < copies) + break; + + /* + * It is possible to have a higher rebuild count for RAID10, + * as long as the failed devices occur in different mirror + * groups (i.e. different stripes). + * + * When checking "near" format, make sure no adjacent devices + * have failed beyond what can be handled. In addition to the + * simple case where the number of devices is a multiple of the + * number of copies, we must also handle cases where the number + * of devices is not a multiple of the number of copies. + * E.g. dev1 dev2 dev3 dev4 dev5 + * A A B B C + * C D D E E + */ + if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) { + for (i = 0; i < rs->md.raid_disks * copies; i++) { + if (!(i % copies)) + rebuilds_per_group = 0; + d = i % rs->md.raid_disks; + if ((!rs->dev[d].rdev.sb_page || + !test_bit(In_sync, &rs->dev[d].rdev.flags)) && + (++rebuilds_per_group >= copies)) + goto too_many; + } + break; + } + + /* + * When checking "far" and "offset" formats, we need to ensure + * that the device that holds its copy is not also dead or + * being rebuilt. (Note that "far" and "offset" formats only + * support two copies right now. These formats also only ever + * use the 'use_far_sets' variant.) + * + * This check is somewhat complicated by the need to account + * for arrays that are not a multiple of (far) copies. This + * results in the need to treat the last (potentially larger) + * set differently. + */ + group_size = (rs->md.raid_disks / copies); + last_group_start = (rs->md.raid_disks / group_size) - 1; + last_group_start *= group_size; + for (i = 0; i < rs->md.raid_disks; i++) { + if (!(i % copies) && !(i > last_group_start)) + rebuilds_per_group = 0; + if ((!rs->dev[i].rdev.sb_page || + !test_bit(In_sync, &rs->dev[i].rdev.flags)) && + (++rebuilds_per_group >= copies)) + goto too_many; + } + break; + default: + if (rebuild_cnt) + return -EINVAL; + } + + return 0; + +too_many: + return -EINVAL; +} + +/* + * Possible arguments are... 
+ * <chunk_size> [optional_args] + * + * Argument definitions + * <chunk_size> The number of sectors per disk that + * will form the "stripe" + * [[no]sync] Force or prevent recovery of the + * entire array + * [rebuild <idx>] Rebuild the drive indicated by the index + * [daemon_sleep <ms>] Time between bitmap daemon work to + * clear bits + * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization + * [write_mostly <idx>] Indicate a write mostly drive via index + * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) + * [stripe_cache <sectors>] Stripe cache size for higher RAIDs + * [region_size <sectors>] Defines granularity of bitmap + * + * RAID10-only options: + * [raid10_copies <# copies>] Number of copies. (Default: 2) + * [raid10_format <near|far|offset>] Layout algorithm. (Default: near) + */ +static int parse_raid_params(struct raid_set *rs, char **argv, + unsigned num_raid_params) +{ + char *raid10_format = "near"; + unsigned raid10_copies = 2; + unsigned i; + unsigned long value, region_size = 0; + sector_t sectors_per_dev = rs->ti->len; + sector_t max_io_len; + char *key; + + /* + * First, parse the in-order required arguments + * "chunk_size" is the only argument of this type. + */ + if ((kstrtoul(argv[0], 10, &value) < 0)) { + rs->ti->error = "Bad chunk size"; + return -EINVAL; + } else if (rs->raid_type->level == 1) { + if (value) + DMERR("Ignoring chunk size parameter for RAID 1"); + value = 0; + } else if (!is_power_of_2(value)) { + rs->ti->error = "Chunk size must be a power of 2"; + return -EINVAL; + } else if (value < 8) { + rs->ti->error = "Chunk size value is too small"; + return -EINVAL; + } + + rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; + argv++; + num_raid_params--; + + /* + * We set each individual device as In_sync with a completed + * 'recovery_offset'. If there has been a device failure or + * replacement then one of the following cases applies: + * + * 1) User specifies 'rebuild'. + * - Device is reset when param is read. + * 2) A new device is supplied. + * - No matching superblock found, resets device. + * 3) Device failure was transient and returns on reload. + * - Failure noticed, resets device for bitmap replay. + * 4) Device hadn't completed recovery after previous failure. + * - Superblock is read and overrides recovery_offset. + * + * What is found in the superblocks of the devices is always + * authoritative, unless 'rebuild' or '[no]sync' was specified. + */ + for (i = 0; i < rs->md.raid_disks; i++) { + set_bit(In_sync, &rs->dev[i].rdev.flags); + rs->dev[i].rdev.recovery_offset = MaxSector; + } + + /* + * Second, parse the unordered optional arguments + */ + for (i = 0; i < num_raid_params; i++) { + if (!strcasecmp(argv[i], "nosync")) { + rs->md.recovery_cp = MaxSector; + rs->print_flags |= DMPF_NOSYNC; + continue; + } + if (!strcasecmp(argv[i], "sync")) { + rs->md.recovery_cp = 0; + rs->print_flags |= DMPF_SYNC; + continue; + } + + /* The rest of the optional arguments come in key/value pairs */ + if ((i + 1) >= num_raid_params) { + rs->ti->error = "Wrong number of raid parameters given"; + return -EINVAL; + } + + key = argv[i++]; + + /* Parameters that take a string value are checked here. 
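To make the optional-argument grammar documented at the top of parse_raid_params() concrete, a hypothetical <raid_params> section for a RAID1 set could read:

    0 region_size 8192 max_write_behind 256

i.e. a zero chunk size (ignored for RAID1), a 4 MiB bitmap region, and a write-behind limit given, like all device-mapper values, in sectors; as the numeric-parameter handling below notes, sector counts passed on to MD for max_write_behind and stripe_cache are halved to kB, so 256 sectors becomes 128 kB.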
*/ + if (!strcasecmp(key, "raid10_format")) { + if (rs->raid_type->level != 10) { + rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; + return -EINVAL; + } + if (strcmp("near", argv[i]) && + strcmp("far", argv[i]) && + strcmp("offset", argv[i])) { + rs->ti->error = "Invalid 'raid10_format' value given"; + return -EINVAL; + } + raid10_format = argv[i]; + rs->print_flags |= DMPF_RAID10_FORMAT; + continue; + } + + if (kstrtoul(argv[i], 10, &value) < 0) { + rs->ti->error = "Bad numerical argument given in raid params"; + return -EINVAL; + } + + /* Parameters that take a numeric value are checked here */ + if (!strcasecmp(key, "rebuild")) { + if (value >= rs->md.raid_disks) { + rs->ti->error = "Invalid rebuild index given"; + return -EINVAL; + } + clear_bit(In_sync, &rs->dev[value].rdev.flags); + rs->dev[value].rdev.recovery_offset = 0; + rs->print_flags |= DMPF_REBUILD; + } else if (!strcasecmp(key, "write_mostly")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "write_mostly option is only valid for RAID1"; + return -EINVAL; + } + if (value >= rs->md.raid_disks) { + rs->ti->error = "Invalid write_mostly drive index given"; + return -EINVAL; + } + set_bit(WriteMostly, &rs->dev[value].rdev.flags); + } else if (!strcasecmp(key, "max_write_behind")) { + if (rs->raid_type->level != 1) { + rs->ti->error = "max_write_behind option is only valid for RAID1"; + return -EINVAL; + } + rs->print_flags |= DMPF_MAX_WRITE_BEHIND; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + if (value > COUNTER_MAX) { + rs->ti->error = "Max write-behind limit out of range"; + return -EINVAL; + } + rs->md.bitmap_info.max_write_behind = value; + } else if (!strcasecmp(key, "daemon_sleep")) { + rs->print_flags |= DMPF_DAEMON_SLEEP; + if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { + rs->ti->error = "daemon sleep period out of range"; + return -EINVAL; + } + rs->md.bitmap_info.daemon_sleep = value; + } else if (!strcasecmp(key, "stripe_cache")) { + rs->print_flags |= DMPF_STRIPE_CACHE; + + /* + * In device-mapper, we specify things in sectors, but + * MD records this value in kB + */ + value /= 2; + + if ((rs->raid_type->level != 5) && + (rs->raid_type->level != 6)) { + rs->ti->error = "Inappropriate argument: stripe_cache"; + return -EINVAL; + } + if (raid5_set_cache_size(&rs->md, (int)value)) { + rs->ti->error = "Bad stripe_cache size"; + return -EINVAL; + } + } else if (!strcasecmp(key, "min_recovery_rate")) { + rs->print_flags |= DMPF_MIN_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "min_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_min = (int)value; + } else if (!strcasecmp(key, "max_recovery_rate")) { + rs->print_flags |= DMPF_MAX_RECOVERY_RATE; + if (value > INT_MAX) { + rs->ti->error = "max_recovery_rate out of range"; + return -EINVAL; + } + rs->md.sync_speed_max = (int)value; + } else if (!strcasecmp(key, "region_size")) { + rs->print_flags |= DMPF_REGION_SIZE; + region_size = value; + } else if (!strcasecmp(key, "raid10_copies") && + (rs->raid_type->level == 10)) { + if ((value < 2) || (value > 0xFF)) { + rs->ti->error = "Bad value for 'raid10_copies'"; + return -EINVAL; + } + rs->print_flags |= DMPF_RAID10_COPIES; + raid10_copies = value; + } else { + DMERR("Unable to parse RAID parameter: %s", key); + rs->ti->error = "Unable to parse RAID parameters"; + return -EINVAL; + } + } + + if (validate_region_size(rs, region_size)) + return -EINVAL; + + if (rs->md.chunk_sectors) + 
max_io_len = rs->md.chunk_sectors; + else + max_io_len = region_size; + + if (dm_set_target_max_io_len(rs->ti, max_io_len)) + return -EINVAL; + + if (rs->raid_type->level == 10) { + if (raid10_copies > rs->md.raid_disks) { + rs->ti->error = "Not enough devices to satisfy specification"; + return -EINVAL; + } + + /* + * If the format is not "near", we only support + * two copies at the moment. + */ + if (strcmp("near", raid10_format) && (raid10_copies > 2)) { + rs->ti->error = "Too many copies for given RAID10 format."; + return -EINVAL; + } + + /* (Len * #mirrors) / #devices */ + sectors_per_dev = rs->ti->len * raid10_copies; + sector_div(sectors_per_dev, rs->md.raid_disks); + + rs->md.layout = raid10_format_to_md_layout(raid10_format, + raid10_copies); + rs->md.new_layout = rs->md.layout; + } else if ((rs->raid_type->level > 1) && + sector_div(sectors_per_dev, + (rs->md.raid_disks - rs->raid_type->parity_devs))) { + rs->ti->error = "Target length not divisible by number of data devices"; + return -EINVAL; + } + rs->md.dev_sectors = sectors_per_dev; + + /* Assume there are no metadata devices until the drives are parsed */ + rs->md.persistent = 0; + rs->md.external = 1; + + return 0; +} + +static void do_table_event(struct work_struct *ws) +{ + struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); + + dm_table_event(rs->ti->table); +} + +static int raid_is_congested(struct dm_target_callbacks *cb, int bits) +{ + struct raid_set *rs = container_of(cb, struct raid_set, callbacks); + + if (rs->raid_type->level == 1) + return md_raid1_congested(&rs->md, bits); + + if (rs->raid_type->level == 10) + return md_raid10_congested(&rs->md, bits); + + return md_raid5_congested(&rs->md, bits); +} + +/* + * This structure is never routinely used by userspace, unlike md superblocks. + * Devices with this superblock should only ever be accessed via device-mapper. + */ +#define DM_RAID_MAGIC 0x64526D44 +struct dm_raid_superblock { + __le32 magic; /* "DmRd" */ + __le32 features; /* Used to indicate possible future changes */ + + __le32 num_devices; /* Number of devices in this array. (Max 64) */ + __le32 array_position; /* The position of this drive in the array */ + + __le64 events; /* Incremented by md when superblock updated */ + __le64 failed_devices; /* Bit field of devices to indicate failures */ + + /* + * This offset tracks the progress of the repair or replacement of + * an individual drive. + */ + __le64 disk_recovery_offset; + + /* + * This offset tracks the progress of the initial array + * synchronisation/parity calculation. + */ + __le64 array_resync_offset; + + /* + * RAID characteristics + */ + __le32 level; + __le32 layout; + __le32 stripe_sectors; + + __u8 pad[452]; /* Round struct to 512 bytes. */ + /* Always set to 0 when writing. 
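A quick size check on the structure above (an illustrative sum, not part of the patch): four __le32 fields, four __le64 fields and three further __le32 fields come to 4*4 + 4*8 + 3*4 = 60 bytes, and 60 + 452 bytes of padding gives exactly 512, matching the one-sector rdev->sb_size that super_load() below passes to read_disk_sb().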
*/ +} __packed; + +static int read_disk_sb(struct md_rdev *rdev, int size) +{ + BUG_ON(!rdev->sb_page); + + if (rdev->sb_loaded) + return 0; + + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { + DMERR("Failed to read superblock of device at position %d", + rdev->raid_disk); + md_error(rdev->mddev, rdev); + return -EINVAL; + } + + rdev->sb_loaded = 1; + + return 0; +} + +static void super_sync(struct mddev *mddev, struct md_rdev *rdev) +{ + int i; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + struct raid_set *rs = container_of(mddev, struct raid_set, md); + + sb = page_address(rdev->sb_page); + failed_devices = le64_to_cpu(sb->failed_devices); + + for (i = 0; i < mddev->raid_disks; i++) + if (!rs->dev[i].data_dev || + test_bit(Faulty, &(rs->dev[i].rdev.flags))) + failed_devices |= (1ULL << i); + + memset(sb, 0, sizeof(*sb)); + + sb->magic = cpu_to_le32(DM_RAID_MAGIC); + sb->features = cpu_to_le32(0); /* No features yet */ + + sb->num_devices = cpu_to_le32(mddev->raid_disks); + sb->array_position = cpu_to_le32(rdev->raid_disk); + + sb->events = cpu_to_le64(mddev->events); + sb->failed_devices = cpu_to_le64(failed_devices); + + sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); + sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); +} + +/* + * super_load + * + * This function creates a superblock if one is not found on the device + * and will decide which superblock to use if there's a choice. + * + * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise + */ +static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) +{ + int ret; + struct dm_raid_superblock *sb; + struct dm_raid_superblock *refsb; + uint64_t events_sb, events_refsb; + + rdev->sb_start = 0; + rdev->sb_size = sizeof(*sb); + + ret = read_disk_sb(rdev, rdev->sb_size); + if (ret) + return ret; + + sb = page_address(rdev->sb_page); + + /* + * Two cases that we want to write new superblocks and rebuild: + * 1) New device (no matching magic number) + * 2) Device specified for rebuild (!In_sync w/ offset == 0) + */ + if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) || + (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) { + super_sync(rdev->mddev, rdev); + + set_bit(FirstUse, &rdev->flags); + + /* Force writing of superblocks to disk */ + set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); + + /* Any superblock is better than none, choose that if given */ + return refdev ? 0 : 1; + } + + if (!refdev) + return 1; + + events_sb = le64_to_cpu(sb->events); + + refsb = page_address(refdev->sb_page); + events_refsb = le64_to_cpu(refsb->events); + + return (events_sb > events_refsb) ? 1 : 0; +} + +static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) +{ + int role; + struct raid_set *rs = container_of(mddev, struct raid_set, md); + uint64_t events_sb; + uint64_t failed_devices; + struct dm_raid_superblock *sb; + uint32_t new_devs = 0; + uint32_t rebuilds = 0; + struct md_rdev *r; + struct dm_raid_superblock *sb2; + + sb = page_address(rdev->sb_page); + events_sb = le64_to_cpu(sb->events); + failed_devices = le64_to_cpu(sb->failed_devices); + + /* + * Initialise to 1 if this is a new superblock. + */ + mddev->events = events_sb ? : 1; + + /* + * Reshaping is not currently allowed + */ + if (le32_to_cpu(sb->level) != mddev->level) { + DMERR("Reshaping arrays not yet supported. 
(RAID level change)"); + return -EINVAL; + } + if (le32_to_cpu(sb->layout) != mddev->layout) { + DMERR("Reshaping arrays not yet supported. (RAID layout change)"); + DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); + DMERR(" Old layout: %s w/ %d copies", + raid10_md_layout_to_format(le32_to_cpu(sb->layout)), + raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); + DMERR(" New layout: %s w/ %d copies", + raid10_md_layout_to_format(mddev->layout), + raid10_md_layout_to_copies(mddev->layout)); + return -EINVAL; + } + if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) { + DMERR("Reshaping arrays not yet supported. (stripe sectors change)"); + return -EINVAL; + } + + /* We can only change the number of devices in RAID1 right now */ + if ((rs->raid_type->level != 1) && + (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { + DMERR("Reshaping arrays not yet supported. (device count change)"); + return -EINVAL; + } + + if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))) + mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + + /* + * During load, we set FirstUse if a new superblock was written. + * There are two reasons we might not have a superblock: + * 1) The array is brand new - in which case, all of the + * devices must have their In_sync bit set. Also, + * recovery_cp must be 0, unless forced. + * 2) This is a new device being added to an old array + * and the new device needs to be rebuilt - in which + * case the In_sync bit will /not/ be set and + * recovery_cp must be MaxSector. + */ + rdev_for_each(r, mddev) { + if (!test_bit(In_sync, &r->flags)) { + DMINFO("Device %d specified for rebuild: " + "Clearing superblock", r->raid_disk); + rebuilds++; + } else if (test_bit(FirstUse, &r->flags)) + new_devs++; + } + + if (!rebuilds) { + if (new_devs == mddev->raid_disks) { + DMINFO("Superblocks created for new array"); + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + } else if (new_devs) { + DMERR("New device injected " + "into existing array without 'rebuild' " + "parameter specified"); + return -EINVAL; + } + } else if (new_devs) { + DMERR("'rebuild' devices cannot be " + "injected into an array with other first-time devices"); + return -EINVAL; + } else if (mddev->recovery_cp != MaxSector) { + DMERR("'rebuild' specified while array is not in-sync"); + return -EINVAL; + } + + /* + * Now we set the Faulty bit for those devices that are + * recorded in the superblock as failed. + */ + rdev_for_each(r, mddev) { + if (!r->sb_page) + continue; + sb2 = page_address(r->sb_page); + sb2->failed_devices = 0; + + /* + * Check for any device re-ordering. + */ + if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { + role = le32_to_cpu(sb2->array_position); + if (role != r->raid_disk) { + if (rs->raid_type->level != 1) { + rs->ti->error = "Cannot change device " + "positions in RAID array"; + return -EINVAL; + } + DMINFO("RAID1 device #%d now at position #%d", + role, r->raid_disk); + } + + /* + * Partial recovery is performed on + * returning failed devices. + */ + if (failed_devices & (1 << role)) + set_bit(Faulty, &r->flags); + } + } + + return 0; +} + +static int super_validate(struct mddev *mddev, struct md_rdev *rdev) +{ + struct dm_raid_superblock *sb = page_address(rdev->sb_page); + + /* + * If mddev->events is not set, we know we have not yet initialized + * the array. 
+ */ + if (!mddev->events && super_init_validation(mddev, rdev)) + return -EINVAL; + + mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */ + rdev->mddev->bitmap_info.default_offset = 4096 >> 9; + if (!test_bit(FirstUse, &rdev->flags)) { + rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); + if (rdev->recovery_offset != MaxSector) + clear_bit(In_sync, &rdev->flags); + } + + /* + * If a device comes back, set it as not In_sync and no longer faulty. + */ + if (test_bit(Faulty, &rdev->flags)) { + clear_bit(Faulty, &rdev->flags); + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->recovery_offset = 0; + } + + clear_bit(FirstUse, &rdev->flags); + + return 0; +} + +/* + * Analyse superblocks and select the freshest. + */ +static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) +{ + int ret; + struct raid_dev *dev; + struct md_rdev *rdev, *tmp, *freshest; + struct mddev *mddev = &rs->md; + + freshest = NULL; + rdev_for_each_safe(rdev, tmp, mddev) { + /* + * Skipping super_load due to DMPF_SYNC will cause + * the array to undergo initialization again as + * though it were new. This is the intended effect + * of the "sync" directive. + * + * When reshaping capability is added, we must ensure + * that the "sync" directive is disallowed during the + * reshape. + */ + if (rs->print_flags & DMPF_SYNC) + continue; + + if (!rdev->meta_bdev) + continue; + + ret = super_load(rdev, freshest); + + switch (ret) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: + dev = container_of(rdev, struct raid_dev, rdev); + if (dev->meta_dev) + dm_put_device(ti, dev->meta_dev); + + dev->meta_dev = NULL; + rdev->meta_bdev = NULL; + + if (rdev->sb_page) + put_page(rdev->sb_page); + + rdev->sb_page = NULL; + + rdev->sb_loaded = 0; + + /* + * We might be able to salvage the data device + * even though the meta device has failed. For + * now, we behave as though '- -' had been + * set for this device in the table. + */ + if (dev->data_dev) + dm_put_device(ti, dev->data_dev); + + dev->data_dev = NULL; + rdev->bdev = NULL; + + list_del(&rdev->same_set); + } + } + + if (!freshest) + return 0; + + if (validate_raid_redundancy(rs)) { + rs->ti->error = "Insufficient redundancy to activate array"; + return -EINVAL; + } + + /* + * Validation of the freshest device provides the source of + * validation for the remaining devices. + */ + ti->error = "Unable to assemble array: Invalid superblocks"; + if (super_validate(mddev, freshest)) + return -EINVAL; + + rdev_for_each(rdev, mddev) + if ((rdev != freshest) && super_validate(mddev, rdev)) + return -EINVAL; + + return 0; +} + +/* + * Construct a RAID4/5/6 mapping: + * Args: + * <raid_type> <#raid_params> <raid_params> \ + * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } + * + * <raid_params> varies by <raid_type>. See 'parse_raid_params' for + * details on possible <raid_params>. 
*/ +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int ret; + struct raid_type *rt; + unsigned long num_raid_params, num_raid_devs; + struct raid_set *rs = NULL; + + /* Must have at least <raid_type> <#raid_params> */ + if (argc < 2) { + ti->error = "Too few arguments"; + return -EINVAL; + } + + /* raid type */ + rt = get_raid_type(argv[0]); + if (!rt) { + ti->error = "Unrecognised raid_type"; + return -EINVAL; + } + argc--; + argv++; + + /* number of RAID parameters */ + if (kstrtoul(argv[0], 10, &num_raid_params) < 0) { + ti->error = "Cannot understand number of RAID parameters"; + return -EINVAL; + } + argc--; + argv++; + + /* Skip over RAID params for now and find out # of devices */ + if (num_raid_params + 1 > argc) { + ti->error = "Arguments do not agree with counts given"; + return -EINVAL; + } + + if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) || + (num_raid_devs >= INT_MAX)) { + ti->error = "Cannot understand number of raid devices"; + return -EINVAL; + } + + rs = context_alloc(ti, rt, (unsigned)num_raid_devs); + if (IS_ERR(rs)) + return PTR_ERR(rs); + + ret = parse_raid_params(rs, argv, (unsigned)num_raid_params); + if (ret) + goto bad; + + ret = -EINVAL; + + argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */ + argv += num_raid_params + 1; + + if (argc != (num_raid_devs * 2)) { + ti->error = "Supplied RAID devices do not match the count given"; + goto bad; + } + + ret = dev_parms(rs, argv); + if (ret) + goto bad; + + rs->md.sync_super = super_sync; + ret = analyse_superblocks(ti, rs); + if (ret) + goto bad; + + INIT_WORK(&rs->md.event_work, do_table_event); + ti->private = rs; + ti->num_flush_bios = 1; + + mutex_lock(&rs->md.reconfig_mutex); + ret = md_run(&rs->md); + rs->md.in_sync = 0; /* Assume already marked dirty */ + mutex_unlock(&rs->md.reconfig_mutex); + + if (ret) { + ti->error = "Failed to run raid array"; + goto bad; + } + + if (ti->len != rs->md.array_sectors) { + ti->error = "Array size does not match requested target length"; + ret = -EINVAL; + goto size_mismatch; + } + rs->callbacks.congested_fn = raid_is_congested; + dm_table_add_target_callbacks(ti->table, &rs->callbacks); + + mddev_suspend(&rs->md); + return 0; + +size_mismatch: + md_stop(&rs->md); +bad: + context_free(rs); + + return ret; +} + +static void raid_dtr(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + list_del_init(&rs->callbacks.list); + md_stop(&rs->md); + context_free(rs); +} + +static int raid_map(struct dm_target *ti, struct bio *bio) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + mddev->pers->make_request(mddev, bio); + + return DM_MAPIO_SUBMITTED; +} + +static const char *decipher_sync_action(struct mddev *mddev) +{ + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + return "frozen"; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return "reshape"; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + return "resync"; + else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + return "check"; + return "repair"; + } + + if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + return "recover"; + } + + return "idle"; +} + +static void raid_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct raid_set *rs = ti->private; + 
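+ /* + * Illustrative sketch, not part of the original patch: for a hypothetical, + * fully synced three-device "raid5_ls" set, the STATUSTYPE_INFO branch + * below would emit a line such as + * + * raid5_ls 3 AAA 2097152/2097152 idle 0 + * + * i.e. <raid_type> <#devices> <health chars> <in-sync>/<total sectors> + * <sync action> <mismatch count>, built up by the DMEMIT() calls that + * follow. The raid type, device count and sector figures here are assumed + * example values, not output captured from a real array. + */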
unsigned raid_param_cnt = 1; /* at least 1 for chunksize */ + unsigned sz = 0; + int i, array_in_sync = 0; + sector_t sync; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks); + + if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery)) + sync = rs->md.curr_resync_completed; + else + sync = rs->md.recovery_cp; + + if (sync >= rs->md.resync_max_sectors) { + /* + * Sync complete. + */ + array_in_sync = 1; + sync = rs->md.resync_max_sectors; + } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) { + /* + * If "check" or "repair" is occurring, the array has + * undergone an initial sync and the health characters + * should not be 'a' anymore. + */ + array_in_sync = 1; + } else { + /* + * The array may be doing an initial sync, or it may + * be rebuilding individual components. If all the + * devices are In_sync, then it is the array that is + * being initialized. + */ + for (i = 0; i < rs->md.raid_disks; i++) + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + array_in_sync = 1; + } + + /* + * Status characters: + * 'D' = Dead/Failed device + * 'a' = Alive but not in-sync + * 'A' = Alive and in-sync + */ + for (i = 0; i < rs->md.raid_disks; i++) { + if (test_bit(Faulty, &rs->dev[i].rdev.flags)) + DMEMIT("D"); + else if (!array_in_sync || + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT("a"); + else + DMEMIT("A"); + } + + /* + * In-sync ratio: + * The in-sync ratio shows the progress of: + * - Initializing the array + * - Rebuilding a subset of devices of the array + * The user can distinguish between the two by referring + * to the status characters. + */ + DMEMIT(" %llu/%llu", + (unsigned long long) sync, + (unsigned long long) rs->md.resync_max_sectors); + + /* + * Sync action: + * See Documentation/device-mapper/dm-raid.txt for + * information on each of these states. + */ + DMEMIT(" %s", decipher_sync_action(&rs->md)); + + /* + * resync_mismatches/mismatch_cnt + * This field shows the number of discrepancies found when + * performing a "check" of the array. + */ + DMEMIT(" %llu", + (strcmp(rs->md.last_sync_action, "check")) ? 
0 : + (unsigned long long) + atomic64_read(&rs->md.resync_mismatches)); + break; + case STATUSTYPE_TABLE: + /* The string you would use to construct this array */ + for (i = 0; i < rs->md.raid_disks; i++) { + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + raid_param_cnt += 2; /* for rebuilds */ + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + raid_param_cnt += 2; + } + + raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2); + if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)) + raid_param_cnt--; + + DMEMIT("%s %u %u", rs->raid_type->name, + raid_param_cnt, rs->md.chunk_sectors); + + if ((rs->print_flags & DMPF_SYNC) && + (rs->md.recovery_cp == MaxSector)) + DMEMIT(" sync"); + if (rs->print_flags & DMPF_NOSYNC) + DMEMIT(" nosync"); + + for (i = 0; i < rs->md.raid_disks; i++) + if ((rs->print_flags & DMPF_REBUILD) && + rs->dev[i].data_dev && + !test_bit(In_sync, &rs->dev[i].rdev.flags)) + DMEMIT(" rebuild %u", i); + + if (rs->print_flags & DMPF_DAEMON_SLEEP) + DMEMIT(" daemon_sleep %lu", + rs->md.bitmap_info.daemon_sleep); + + if (rs->print_flags & DMPF_MIN_RECOVERY_RATE) + DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min); + + if (rs->print_flags & DMPF_MAX_RECOVERY_RATE) + DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max); + + for (i = 0; i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev && + test_bit(WriteMostly, &rs->dev[i].rdev.flags)) + DMEMIT(" write_mostly %u", i); + + if (rs->print_flags & DMPF_MAX_WRITE_BEHIND) + DMEMIT(" max_write_behind %lu", + rs->md.bitmap_info.max_write_behind); + + if (rs->print_flags & DMPF_STRIPE_CACHE) { + struct r5conf *conf = rs->md.private; + + /* convert from kiB to sectors */ + DMEMIT(" stripe_cache %d", + conf ? 
conf->max_nr_stripes * 2 : 0); + } + + if (rs->print_flags & DMPF_REGION_SIZE) + DMEMIT(" region_size %lu", + rs->md.bitmap_info.chunksize >> 9); + + if (rs->print_flags & DMPF_RAID10_COPIES) + DMEMIT(" raid10_copies %u", + raid10_md_layout_to_copies(rs->md.layout)); + + if (rs->print_flags & DMPF_RAID10_FORMAT) + DMEMIT(" raid10_format %s", + raid10_md_layout_to_format(rs->md.layout)); + + DMEMIT(" %d", rs->md.raid_disks); + for (i = 0; i < rs->md.raid_disks; i++) { + if (rs->dev[i].meta_dev) + DMEMIT(" %s", rs->dev[i].meta_dev->name); + else + DMEMIT(" -"); + + if (rs->dev[i].data_dev) + DMEMIT(" %s", rs->dev[i].data_dev->name); + else + DMEMIT(" -"); + } + } +} + +static int raid_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + if (!strcasecmp(argv[0], "reshape")) { + DMERR("Reshape not supported."); + return -EINVAL; + } + + if (!mddev->pers || !mddev->pers->sync_request) + return -EINVAL; + + if (!strcasecmp(argv[0], "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); + } + } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + return -EBUSY; + else if (!strcasecmp(argv[0], "resync")) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + else if (!strcasecmp(argv[0], "recover")) { + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } else { + if (!strcasecmp(argv[0], "check")) + set_bit(MD_RECOVERY_CHECK, &mddev->recovery); + else if (!!strcasecmp(argv[0], "repair")) + return -EINVAL; + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + if (!mddev->suspended) + md_wakeup_thread(mddev->sync_thread); + } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (!mddev->suspended) + md_wakeup_thread(mddev->thread); + + return 0; +} + +static int raid_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct raid_set *rs = ti->private; + unsigned i; + int ret = 0; + + for (i = 0; !ret && i < rs->md.raid_disks; i++) + if (rs->dev[i].data_dev) + ret = fn(ti, + rs->dev[i].data_dev, + 0, /* No offset on data devs */ + rs->md.dev_sectors, + data); + + return ret; +} + +static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct raid_set *rs = ti->private; + unsigned chunk_size = rs->md.chunk_sectors << 9; + struct r5conf *conf = rs->md.private; + + blk_limits_io_min(limits, chunk_size); + blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded)); +} + +static void raid_presuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + md_stop_writes(&rs->md); +} + +static void raid_postsuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + mddev_suspend(&rs->md); +} + +static void attempt_restore_of_faulty_devices(struct raid_set *rs) +{ + int i; + uint64_t failed_devices, cleared_failed_devices = 0; + unsigned long flags; + struct dm_raid_superblock *sb; + struct md_rdev *r; + + for (i = 0; i < rs->md.raid_disks; i++) { + r = &rs->dev[i].rdev; + if (test_bit(Faulty, &r->flags) && 
r->sb_page && + sync_page_io(r, 0, r->sb_size, r->sb_page, READ, 1)) { + DMINFO("Faulty %s device #%d has readable super block." + " Attempting to revive it.", + rs->raid_type->name, i); + + /* + * Faulty bit may be set, but sometimes the array can + * be suspended before the personalities can respond + * by removing the device from the array (i.e. calling + * 'hot_remove_disk'). If they haven't yet removed + * the failed device, its 'raid_disk' number will be + * '>= 0' - meaning we must call this function + * ourselves. + */ + if ((r->raid_disk >= 0) && + (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0)) + /* Failed to revive this device, try next */ + continue; + + r->raid_disk = i; + r->saved_raid_disk = i; + flags = r->flags; + clear_bit(Faulty, &r->flags); + clear_bit(WriteErrorSeen, &r->flags); + clear_bit(In_sync, &r->flags); + if (r->mddev->pers->hot_add_disk(r->mddev, r)) { + r->raid_disk = -1; + r->saved_raid_disk = -1; + r->flags = flags; + } else { + r->recovery_offset = 0; + cleared_failed_devices |= 1 << i; + } + } + } + if (cleared_failed_devices) { + rdev_for_each(r, &rs->md) { + sb = page_address(r->sb_page); + failed_devices = le64_to_cpu(sb->failed_devices); + failed_devices &= ~cleared_failed_devices; + sb->failed_devices = cpu_to_le64(failed_devices); + } + } +} + +static void raid_resume(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + set_bit(MD_CHANGE_DEVS, &rs->md.flags); + if (!rs->bitmap_loaded) { + bitmap_load(&rs->md); + rs->bitmap_loaded = 1; + } else { + /* + * A secondary resume while the device is active. + * Take this opportunity to check whether any failed + * devices are reachable again. + */ + attempt_restore_of_faulty_devices(rs); + } + + clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); + mddev_resume(&rs->md); +} + +static struct target_type raid_target = { + .name = "raid", + .version = {1, 5, 2}, + .module = THIS_MODULE, + .ctr = raid_ctr, + .dtr = raid_dtr, + .map = raid_map, + .status = raid_status, + .message = raid_message, + .iterate_devices = raid_iterate_devices, + .io_hints = raid_io_hints, + .presuspend = raid_presuspend, + .postsuspend = raid_postsuspend, + .resume = raid_resume, +}; + +static int __init dm_raid_init(void) +{ + DMINFO("Loading target version %u.%u.%u", + raid_target.version[0], + raid_target.version[1], + raid_target.version[2]); + return dm_register_target(&raid_target); +} + +static void __exit dm_raid_exit(void) +{ + dm_unregister_target(&raid_target); +} + +module_init(dm_raid_init); +module_exit(dm_raid_exit); + +MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); +MODULE_ALIAS("dm-raid1"); +MODULE_ALIAS("dm-raid10"); +MODULE_ALIAS("dm-raid4"); +MODULE_ALIAS("dm-raid5"); +MODULE_ALIAS("dm-raid6"); +MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index ff05fe89308..7dfdb5c746d 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1,30 +1,27 @@ /* * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. 
*/ -#include "dm.h" -#include "dm-bio-list.h" #include "dm-bio-record.h" -#include <linux/ctype.h> #include <linux/init.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/slab.h> -#include <linux/time.h> -#include <linux/vmalloc.h> #include <linux/workqueue.h> -#include <linux/log2.h> -#include <linux/hardirq.h> +#include <linux/device-mapper.h> #include <linux/dm-io.h> #include <linux/dm-dirty-log.h> #include <linux/dm-kcopyd.h> +#include <linux/dm-region-hash.h> #define DM_MSG_PREFIX "raid1" -#define DM_IO_PAGES 64 + +#define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ #define DM_RAID1_HANDLE_ERRORS 0x01 #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) @@ -32,91 +29,11 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); /*----------------------------------------------------------------- - * Region hash - * - * The mirror splits itself up into discrete regions. Each - * region can be in one of three states: clean, dirty, - * nosync. There is no need to put clean regions in the hash. - * - * In addition to being present in the hash table a region _may_ - * be present on one of three lists. - * - * clean_regions: Regions on this list have no io pending to - * them, they are in sync, we are no longer interested in them, - * they are dull. rh_update_states() will remove them from the - * hash table. - * - * quiesced_regions: These regions have been spun down, ready - * for recovery. rh_recovery_start() will remove regions from - * this list and hand them to kmirrord, which will schedule the - * recovery io with kcopyd. - * - * recovered_regions: Regions that kcopyd has successfully - * recovered. rh_update_states() will now schedule any delayed - * io, up the recovery_count, and remove the region from the - * hash. - * - * There are 2 locks: - * A rw spin lock 'hash_lock' protects just the hash table, - * this is never held in write mode from interrupt context, - * which I believe means that we only have to disable irqs when - * doing a write lock. - * - * An ordinary spin lock 'region_lock' that protects the three - * lists in the region_hash, with the 'state', 'list' and - * 'bhs_delayed' fields of the regions. This is used from irq - * context, so all other uses will have to suspend local irqs. - *---------------------------------------------------------------*/ -struct mirror_set; -struct region_hash { - struct mirror_set *ms; - uint32_t region_size; - unsigned region_shift; - - /* holds persistent region state */ - struct dm_dirty_log *log; - - /* hash table */ - rwlock_t hash_lock; - mempool_t *region_pool; - unsigned int mask; - unsigned int nr_buckets; - struct list_head *buckets; - - spinlock_t region_lock; - atomic_t recovery_in_flight; - struct semaphore recovery_count; - struct list_head clean_regions; - struct list_head quiesced_regions; - struct list_head recovered_regions; - struct list_head failed_recovered_regions; -}; - -enum { - RH_CLEAN, - RH_DIRTY, - RH_NOSYNC, - RH_RECOVERING -}; - -struct region { - struct region_hash *rh; /* FIXME: can we get rid of this ? */ - region_t key; - int state; - - struct list_head hash_list; - struct list_head list; - - atomic_t pending; - struct bio_list delayed_bios; -}; - - -/*----------------------------------------------------------------- * Mirror set structures. 
*---------------------------------------------------------------*/ enum dm_raid1_error { DM_RAID1_WRITE_ERROR, + DM_RAID1_FLUSH_ERROR, DM_RAID1_SYNC_ERROR, DM_RAID1_READ_ERROR }; @@ -132,22 +49,24 @@ struct mirror { struct mirror_set { struct dm_target *ti; struct list_head list; - struct region_hash rh; - struct dm_kcopyd_client *kcopyd_client; + uint64_t features; spinlock_t lock; /* protects the lists */ struct bio_list reads; struct bio_list writes; struct bio_list failures; + struct bio_list holds; /* bios are waiting until suspend */ + struct dm_region_hash *rh; + struct dm_kcopyd_client *kcopyd_client; struct dm_io_client *io_client; - mempool_t *read_record_pool; /* recovery */ region_t nr_regions; int in_sync; int log_failure; + int leg_failure; atomic_t suspend; atomic_t default_mirror; /* Default mirror */ @@ -159,25 +78,17 @@ struct mirror_set { struct work_struct trigger_event; - unsigned int nr_mirrors; + unsigned nr_mirrors; struct mirror mirror[0]; }; -/* - * Conversion fns - */ -static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio) -{ - return (bio->bi_sector - rh->ms->ti->begin) >> rh->region_shift; -} +DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(raid1_resync_throttle, + "A percentage of time allocated for raid resynchronization"); -static inline sector_t region_to_sector(struct region_hash *rh, region_t region) +static void wakeup_mirrord(void *context) { - return region << rh->region_shift; -} + struct mirror_set *ms = context; -static void wake(struct mirror_set *ms) -{ queue_work(ms->kmirrord_wq, &ms->kmirrord_work); } @@ -186,7 +97,7 @@ static void delayed_wake_fn(unsigned long data) struct mirror_set *ms = (struct mirror_set *) data; clear_bit(0, &ms->timer_pending); - wake(ms); + wakeup_mirrord(ms); } static void delayed_wake(struct mirror_set *ms) @@ -200,479 +111,41 @@ static void delayed_wake(struct mirror_set *ms) add_timer(&ms->timer); } -/* FIXME move this */ -static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); - -#define MIN_REGIONS 64 -#define MAX_RECOVERY 1 -static int rh_init(struct region_hash *rh, struct mirror_set *ms, - struct dm_dirty_log *log, uint32_t region_size, - region_t nr_regions) -{ - unsigned int nr_buckets, max_buckets; - size_t i; - - /* - * Calculate a suitable number of buckets for our hash - * table. 
- */ - max_buckets = nr_regions >> 6; - for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) - ; - nr_buckets >>= 1; - - rh->ms = ms; - rh->log = log; - rh->region_size = region_size; - rh->region_shift = ffs(region_size) - 1; - rwlock_init(&rh->hash_lock); - rh->mask = nr_buckets - 1; - rh->nr_buckets = nr_buckets; - - rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); - if (!rh->buckets) { - DMERR("unable to allocate region hash memory"); - return -ENOMEM; - } - - for (i = 0; i < nr_buckets; i++) - INIT_LIST_HEAD(rh->buckets + i); - - spin_lock_init(&rh->region_lock); - sema_init(&rh->recovery_count, 0); - atomic_set(&rh->recovery_in_flight, 0); - INIT_LIST_HEAD(&rh->clean_regions); - INIT_LIST_HEAD(&rh->quiesced_regions); - INIT_LIST_HEAD(&rh->recovered_regions); - INIT_LIST_HEAD(&rh->failed_recovered_regions); - - rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, - sizeof(struct region)); - if (!rh->region_pool) { - vfree(rh->buckets); - rh->buckets = NULL; - return -ENOMEM; - } - - return 0; -} - -static void rh_exit(struct region_hash *rh) -{ - unsigned int h; - struct region *reg, *nreg; - - BUG_ON(!list_empty(&rh->quiesced_regions)); - for (h = 0; h < rh->nr_buckets; h++) { - list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) { - BUG_ON(atomic_read(&reg->pending)); - mempool_free(reg, rh->region_pool); - } - } - - if (rh->log) - dm_dirty_log_destroy(rh->log); - if (rh->region_pool) - mempool_destroy(rh->region_pool); - vfree(rh->buckets); -} - -#define RH_HASH_MULT 2654435387U - -static inline unsigned int rh_hash(struct region_hash *rh, region_t region) -{ - return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask; -} - -static struct region *__rh_lookup(struct region_hash *rh, region_t region) -{ - struct region *reg; - - list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list) - if (reg->key == region) - return reg; - - return NULL; -} - -static void __rh_insert(struct region_hash *rh, struct region *reg) -{ - unsigned int h = rh_hash(rh, reg->key); - list_add(&reg->hash_list, rh->buckets + h); -} - -static struct region *__rh_alloc(struct region_hash *rh, region_t region) +static void wakeup_all_recovery_waiters(void *context) { - struct region *reg, *nreg; - - read_unlock(&rh->hash_lock); - nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); - if (unlikely(!nreg)) - nreg = kmalloc(sizeof(struct region), GFP_NOIO); - nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? 
- RH_CLEAN : RH_NOSYNC; - nreg->rh = rh; - nreg->key = region; - - INIT_LIST_HEAD(&nreg->list); - - atomic_set(&nreg->pending, 0); - bio_list_init(&nreg->delayed_bios); - write_lock_irq(&rh->hash_lock); - - reg = __rh_lookup(rh, region); - if (reg) - /* we lost the race */ - mempool_free(nreg, rh->region_pool); - - else { - __rh_insert(rh, nreg); - if (nreg->state == RH_CLEAN) { - spin_lock(&rh->region_lock); - list_add(&nreg->list, &rh->clean_regions); - spin_unlock(&rh->region_lock); - } - reg = nreg; - } - write_unlock_irq(&rh->hash_lock); - read_lock(&rh->hash_lock); - - return reg; + wake_up_all(&_kmirrord_recovery_stopped); } -static inline struct region *__rh_find(struct region_hash *rh, region_t region) -{ - struct region *reg; - - reg = __rh_lookup(rh, region); - if (!reg) - reg = __rh_alloc(rh, region); - - return reg; -} - -static int rh_state(struct region_hash *rh, region_t region, int may_block) -{ - int r; - struct region *reg; - - read_lock(&rh->hash_lock); - reg = __rh_lookup(rh, region); - read_unlock(&rh->hash_lock); - - if (reg) - return reg->state; - - /* - * The region wasn't in the hash, so we fall back to the - * dirty log. - */ - r = rh->log->type->in_sync(rh->log, region, may_block); - - /* - * Any error from the dirty log (eg. -EWOULDBLOCK) gets - * taken as a RH_NOSYNC - */ - return r == 1 ? RH_CLEAN : RH_NOSYNC; -} - -static inline int rh_in_sync(struct region_hash *rh, - region_t region, int may_block) -{ - int state = rh_state(rh, region, may_block); - return state == RH_CLEAN || state == RH_DIRTY; -} - -static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list) -{ - struct bio *bio; - - while ((bio = bio_list_pop(bio_list))) { - queue_bio(ms, bio, WRITE); - } -} - -static void complete_resync_work(struct region *reg, int success) -{ - struct region_hash *rh = reg->rh; - - rh->log->type->set_region_sync(rh->log, reg->key, success); - - /* - * Dispatch the bios before we call 'wake_up_all'. - * This is important because if we are suspending, - * we want to know that recovery is complete and - * the work queue is flushed. If we wake_up_all - * before we dispatch_bios (queue bios and call wake()), - * then we risk suspending before the work queue - * has been properly flushed. - */ - dispatch_bios(rh->ms, &reg->delayed_bios); - if (atomic_dec_and_test(&rh->recovery_in_flight)) - wake_up_all(&_kmirrord_recovery_stopped); - up(&rh->recovery_count); -} - -static void rh_update_states(struct region_hash *rh) -{ - struct region *reg, *next; - - LIST_HEAD(clean); - LIST_HEAD(recovered); - LIST_HEAD(failed_recovered); - - /* - * Quickly grab the lists. - */ - write_lock_irq(&rh->hash_lock); - spin_lock(&rh->region_lock); - if (!list_empty(&rh->clean_regions)) { - list_splice_init(&rh->clean_regions, &clean); - - list_for_each_entry(reg, &clean, list) - list_del(&reg->hash_list); - } - - if (!list_empty(&rh->recovered_regions)) { - list_splice_init(&rh->recovered_regions, &recovered); - - list_for_each_entry (reg, &recovered, list) - list_del(&reg->hash_list); - } - - if (!list_empty(&rh->failed_recovered_regions)) { - list_splice_init(&rh->failed_recovered_regions, - &failed_recovered); - - list_for_each_entry(reg, &failed_recovered, list) - list_del(&reg->hash_list); - } - - spin_unlock(&rh->region_lock); - write_unlock_irq(&rh->hash_lock); - - /* - * All the regions on the recovered and clean lists have - * now been pulled out of the system, so no need to do - * any more locking. 
- */ - list_for_each_entry_safe (reg, next, &recovered, list) { - rh->log->type->clear_region(rh->log, reg->key); - complete_resync_work(reg, 1); - mempool_free(reg, rh->region_pool); - } - - list_for_each_entry_safe(reg, next, &failed_recovered, list) { - complete_resync_work(reg, errors_handled(rh->ms) ? 0 : 1); - mempool_free(reg, rh->region_pool); - } - - list_for_each_entry_safe(reg, next, &clean, list) { - rh->log->type->clear_region(rh->log, reg->key); - mempool_free(reg, rh->region_pool); - } - - rh->log->type->flush(rh->log); -} - -static void rh_inc(struct region_hash *rh, region_t region) -{ - struct region *reg; - - read_lock(&rh->hash_lock); - reg = __rh_find(rh, region); - - spin_lock_irq(&rh->region_lock); - atomic_inc(&reg->pending); - - if (reg->state == RH_CLEAN) { - reg->state = RH_DIRTY; - list_del_init(&reg->list); /* take off the clean list */ - spin_unlock_irq(&rh->region_lock); - - rh->log->type->mark_region(rh->log, reg->key); - } else - spin_unlock_irq(&rh->region_lock); - - - read_unlock(&rh->hash_lock); -} - -static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios) -{ - struct bio *bio; - - for (bio = bios->head; bio; bio = bio->bi_next) - rh_inc(rh, bio_to_region(rh, bio)); -} - -static void rh_dec(struct region_hash *rh, region_t region) +static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) { unsigned long flags; - struct region *reg; int should_wake = 0; + struct bio_list *bl; - read_lock(&rh->hash_lock); - reg = __rh_lookup(rh, region); - read_unlock(&rh->hash_lock); - - spin_lock_irqsave(&rh->region_lock, flags); - if (atomic_dec_and_test(&reg->pending)) { - /* - * There is no pending I/O for this region. - * We can move the region to corresponding list for next action. - * At this point, the region is not yet connected to any list. - * - * If the state is RH_NOSYNC, the region should be kept off - * from clean list. - * The hash entry for RH_NOSYNC will remain in memory - * until the region is recovered or the map is reloaded. - */ - - /* do nothing for RH_NOSYNC */ - if (reg->state == RH_RECOVERING) { - list_add_tail(&reg->list, &rh->quiesced_regions); - } else if (reg->state == RH_DIRTY) { - reg->state = RH_CLEAN; - list_add(&reg->list, &rh->clean_regions); - } - should_wake = 1; - } - spin_unlock_irqrestore(&rh->region_lock, flags); + bl = (rw == WRITE) ? &ms->writes : &ms->reads; + spin_lock_irqsave(&ms->lock, flags); + should_wake = !(bl->head); + bio_list_add(bl, bio); + spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) - wake(rh->ms); -} - -/* - * Starts quiescing a region in preparation for recovery. - */ -static int __rh_recovery_prepare(struct region_hash *rh) -{ - int r; - struct region *reg; - region_t region; - - /* - * Ask the dirty log what's next. - */ - r = rh->log->type->get_resync_work(rh->log, &region); - if (r <= 0) - return r; - - /* - * Get this region, and start it quiescing by setting the - * recovering flag. - */ - read_lock(&rh->hash_lock); - reg = __rh_find(rh, region); - read_unlock(&rh->hash_lock); - - spin_lock_irq(&rh->region_lock); - reg->state = RH_RECOVERING; - - /* Already quiesced ? 
- */ - if (atomic_read(&reg->pending)) - list_del_init(&reg->list); - else - list_move(&reg->list, &rh->quiesced_regions); - - spin_unlock_irq(&rh->region_lock); - - return 1; -} - -static void rh_recovery_prepare(struct region_hash *rh) -{ - /* Extra reference to avoid race with rh_stop_recovery */ - atomic_inc(&rh->recovery_in_flight); - - while (!down_trylock(&rh->recovery_count)) { - atomic_inc(&rh->recovery_in_flight); - if (__rh_recovery_prepare(rh) <= 0) { - atomic_dec(&rh->recovery_in_flight); - up(&rh->recovery_count); - break; - } - } - - /* Drop the extra reference */ - if (atomic_dec_and_test(&rh->recovery_in_flight)) - wake_up_all(&_kmirrord_recovery_stopped); -} - -/* - * Returns any quiesced regions. - */ -static struct region *rh_recovery_start(struct region_hash *rh) -{ - struct region *reg = NULL; - - spin_lock_irq(&rh->region_lock); - if (!list_empty(&rh->quiesced_regions)) { - reg = list_entry(rh->quiesced_regions.next, - struct region, list); - list_del_init(&reg->list); /* remove from the quiesced list */ - } - spin_unlock_irq(&rh->region_lock); - - return reg; -} - -static void rh_recovery_end(struct region *reg, int success) -{ - struct region_hash *rh = reg->rh; - - spin_lock_irq(&rh->region_lock); - if (success) - list_add(&reg->list, &reg->rh->recovered_regions); - else { - reg->state = RH_NOSYNC; - list_add(&reg->list, &reg->rh->failed_recovered_regions); - } - spin_unlock_irq(&rh->region_lock); - - wake(rh->ms); -} - -static int rh_flush(struct region_hash *rh) -{ - return rh->log->type->flush(rh->log); + wakeup_mirrord(ms); } -static void rh_delay(struct region_hash *rh, struct bio *bio) +static void dispatch_bios(void *context, struct bio_list *bio_list) { - struct region *reg; - - read_lock(&rh->hash_lock); - reg = __rh_find(rh, bio_to_region(rh, bio)); - bio_list_add(&reg->delayed_bios, bio); - read_unlock(&rh->hash_lock); -} - -static void rh_stop_recovery(struct region_hash *rh) -{ - int i; - - /* wait for any recovering regions */ - for (i = 0; i < MAX_RECOVERY; i++) - down(&rh->recovery_count); -} - -static void rh_start_recovery(struct region_hash *rh) -{ - int i; - - for (i = 0; i < MAX_RECOVERY; i++) - up(&rh->recovery_count); + struct mirror_set *ms = context; + struct bio *bio; - wake(rh->ms); + while ((bio = bio_list_pop(bio_list))) + queue_bio(ms, bio, WRITE); } -#define MIN_READ_RECORDS 20 -struct dm_raid1_read_record { +struct dm_raid1_bio_record { struct mirror *m; + /* if details->bi_bdev == NULL, details were not saved */ struct dm_bio_details details; + region_t write_region; }; /* @@ -708,6 +181,17 @@ static void set_default_mirror(struct mirror *m) atomic_set(&ms->default_mirror, m - m0); } +static struct mirror *get_valid_mirror(struct mirror_set *ms) +{ + struct mirror *m; + + for (m = ms->mirror; m < ms->mirror + ms->nr_mirrors; m++) + if (!atomic_read(&m->error_count)) + return m; + + return NULL; +} + /* fail_mirror * @m: mirror device to fail * @error_type: one of the enum's, DM_RAID1_*_ERROR @@ -727,8 +211,7 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) struct mirror_set *ms = m->ms; struct mirror *new; - if (!errors_handled(ms)) - return; + ms->leg_failure = 1; /* * error_count is used for nothing more than a @@ -740,6 +223,9 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) if (test_and_set_bit(error_type, &m->error_type)) return; + if (!errors_handled(ms)) + return; + if (m != get_default_mirror(ms)) goto out; @@ -753,19 +239,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error 
error_type) goto out; } - for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) - if (!atomic_read(&new->error_count)) { - set_default_mirror(new); - break; - } - - if (unlikely(new == ms->mirror + ms->nr_mirrors)) + new = get_valid_mirror(ms); + if (new) + set_default_mirror(new); + else DMWARN("All sides of mirror have failed."); out: schedule_work(&ms->trigger_event); } +static int mirror_flush(struct dm_target *ti) +{ + struct mirror_set *ms = ti->private; + unsigned long error_bits; + + unsigned int i; + struct dm_io_region io[ms->nr_mirrors]; + struct mirror *m; + struct dm_io_request io_req = { + .bi_rw = WRITE_FLUSH, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = ms->io_client, + }; + + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) { + io[i].bdev = m->dev->bdev; + io[i].sector = 0; + io[i].count = 0; + } + + error_bits = -1; + dm_io(&io_req, ms->nr_mirrors, io, &error_bits); + if (unlikely(error_bits != 0)) { + for (i = 0; i < ms->nr_mirrors; i++) + if (test_bit(i, &error_bits)) + fail_mirror(ms->mirror + i, + DM_RAID1_FLUSH_ERROR); + return -EIO; + } + + return 0; +} + /*----------------------------------------------------------------- * Recovery. * @@ -776,8 +293,8 @@ out: static void recovery_complete(int read_err, unsigned long write_err, void *context) { - struct region *reg = (struct region *)context; - struct mirror_set *ms = reg->rh->ms; + struct dm_region *reg = context; + struct mirror_set *ms = dm_rh_region_context(reg); int m, bit = 0; if (read_err) { @@ -803,31 +320,33 @@ static void recovery_complete(int read_err, unsigned long write_err, } } - rh_recovery_end(reg, !(read_err || write_err)); + dm_rh_recovery_end(reg, !(read_err || write_err)); } -static int recover(struct mirror_set *ms, struct region *reg) +static int recover(struct mirror_set *ms, struct dm_region *reg) { int r; - unsigned int i; + unsigned i; struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; struct mirror *m; unsigned long flags = 0; + region_t key = dm_rh_get_region_key(reg); + sector_t region_size = dm_rh_get_region_size(ms->rh); /* fill in the source */ m = get_default_mirror(ms); from.bdev = m->dev->bdev; - from.sector = m->offset + region_to_sector(reg->rh, reg->key); - if (reg->key == (ms->nr_regions - 1)) { + from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); + if (key == (ms->nr_regions - 1)) { /* * The final region may be smaller than * region_size. 
*/ - from.count = ms->ti->len & (reg->rh->region_size - 1); + from.count = ms->ti->len & (region_size - 1); if (!from.count) - from.count = reg->rh->region_size; + from.count = region_size; } else - from.count = reg->rh->region_size; + from.count = region_size; /* fill in the destinations */ for (i = 0, dest = to; i < ms->nr_mirrors; i++) { @@ -836,13 +355,15 @@ static int recover(struct mirror_set *ms, struct region *reg) m = ms->mirror + i; dest->bdev = m->dev->bdev; - dest->sector = m->offset + region_to_sector(reg->rh, reg->key); + dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key); dest->count = from.count; dest++; } /* hand to kcopyd */ - set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); + if (!errors_handled(ms)) + set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); + r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, recovery_complete, reg); @@ -851,22 +372,22 @@ static int recover(struct mirror_set *ms, struct region *reg) static void do_recovery(struct mirror_set *ms) { + struct dm_region *reg; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); int r; - struct region *reg; - struct dm_dirty_log *log = ms->rh.log; /* * Start quiescing some regions. */ - rh_recovery_prepare(&ms->rh); + dm_rh_recovery_prepare(ms->rh); /* * Copy any already quiesced regions. */ - while ((reg = rh_recovery_start(&ms->rh))) { + while ((reg = dm_rh_recovery_start(ms->rh))) { r = recover(ms, reg); if (r) - rh_recovery_end(reg, 0); + dm_rh_recovery_end(reg, 0); } /* @@ -907,10 +428,11 @@ static int default_ok(struct mirror *m) static int mirror_available(struct mirror_set *ms, struct bio *bio) { - region_t region = bio_to_region(&ms->rh, bio); + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + region_t region = dm_rh_bio_to_region(ms->rh, bio); - if (ms->rh.log->type->in_sync(ms->rh.log, region, 0)) - return choose_mirror(ms, bio->bi_sector) ? 1 : 0; + if (log->type->in_sync(log, region, 0)) + return choose_mirror(ms, bio->bi_iter.bi_sector) ? 1 : 0; return 0; } @@ -920,13 +442,15 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio) */ static sector_t map_sector(struct mirror *m, struct bio *bio) { - return m->offset + (bio->bi_sector - m->ms->ti->begin); + if (unlikely(!bio->bi_iter.bi_size)) + return 0; + return m->offset + dm_target_offset(m->ms->ti, bio->bi_iter.bi_sector); } static void map_bio(struct mirror *m, struct bio *bio) { bio->bi_bdev = m->dev->bdev; - bio->bi_sector = map_sector(m, bio); + bio->bi_iter.bi_sector = map_sector(m, bio); } static void map_region(struct dm_io_region *io, struct mirror *m, @@ -934,7 +458,35 @@ static void map_region(struct dm_io_region *io, struct mirror *m, { io->bdev = m->dev->bdev; io->sector = map_sector(m, bio); - io->count = bio->bi_size >> 9; + io->count = bio_sectors(bio); +} + +static void hold_bio(struct mirror_set *ms, struct bio *bio) +{ + /* + * Lock is required to avoid race condition during suspend + * process. + */ + spin_lock_irq(&ms->lock); + + if (atomic_read(&ms->suspend)) { + spin_unlock_irq(&ms->lock); + + /* + * If device is suspended, complete the bio. + */ + if (dm_noflush_suspending(ms->ti)) + bio_endio(bio, DM_ENDIO_REQUEUE); + else + bio_endio(bio, -EIO); + return; + } + + /* + * Hold bio until the suspend is complete. 
+ */ + bio_list_add(&ms->holds, bio); + spin_unlock_irq(&ms->lock); } /*----------------------------------------------------------------- @@ -974,8 +526,8 @@ static void read_async_bio(struct mirror *m, struct bio *bio) struct dm_io_region io; struct dm_io_request io_req = { .bi_rw = READ, - .mem.type = DM_IO_BVEC, - .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, + .mem.type = DM_IO_BIO, + .mem.ptr.bio = bio, .notify.fn = read_callback, .notify.context = bio, .client = m->ms->io_client, @@ -983,7 +535,14 @@ static void read_async_bio(struct mirror *m, struct bio *bio) map_region(&io, m, bio); bio_set_m(bio, m); - (void) dm_io(&io_req, 1, &io, NULL); + BUG_ON(dm_io(&io_req, 1, &io, NULL)); +} + +static inline int region_in_sync(struct mirror_set *ms, region_t region, + int may_block) +{ + int state = dm_rh_get_state(ms->rh, region, may_block); + return state == DM_RH_CLEAN || state == DM_RH_DIRTY; } static void do_reads(struct mirror_set *ms, struct bio_list *reads) @@ -993,14 +552,14 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) struct mirror *m; while ((bio = bio_list_pop(reads))) { - region = bio_to_region(&ms->rh, bio); + region = dm_rh_bio_to_region(ms->rh, bio); m = get_default_mirror(ms); /* * We can only read balance if the region is in sync. */ - if (likely(rh_in_sync(&ms->rh, region, 1))) - m = choose_mirror(ms, bio->bi_sector); + if (likely(region_in_sync(ms, region, 1))) + m = choose_mirror(ms, bio->bi_iter.bi_sector); else if (m && atomic_read(&m->error_count)) m = NULL; @@ -1022,64 +581,12 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads) * NOSYNC: increment pending, just write to the default mirror *---------------------------------------------------------------*/ -/* __bio_mark_nosync - * @ms - * @bio - * @done - * @error - * - * The bio was written on some mirror(s) but failed on other mirror(s). - * We can successfully endio the bio but should avoid the region being - * marked clean by setting the state RH_NOSYNC. - * - * This function is _not_ safe in interrupt context! - */ -static void __bio_mark_nosync(struct mirror_set *ms, - struct bio *bio, unsigned done, int error) -{ - unsigned long flags; - struct region_hash *rh = &ms->rh; - struct dm_dirty_log *log = ms->rh.log; - struct region *reg; - region_t region = bio_to_region(rh, bio); - int recovering = 0; - - /* We must inform the log that the sync count has changed. */ - log->type->set_region_sync(log, region, 0); - ms->in_sync = 0; - - read_lock(&rh->hash_lock); - reg = __rh_find(rh, region); - read_unlock(&rh->hash_lock); - - /* region hash entry should exist because write was in-flight */ - BUG_ON(!reg); - BUG_ON(!list_empty(&reg->list)); - - spin_lock_irqsave(&rh->region_lock, flags); - /* - * Possible cases: - * 1) RH_DIRTY - * 2) RH_NOSYNC: was dirty, other preceeding writes failed - * 3) RH_RECOVERING: flushing pending writes - * Either case, the region should have not been connected to list. 
- */ - recovering = (reg->state == RH_RECOVERING); - reg->state = RH_NOSYNC; - BUG_ON(!list_empty(&reg->list)); - spin_unlock_irqrestore(&rh->region_lock, flags); - - bio_endio(bio, error); - if (recovering) - complete_resync_work(reg, 0); -} static void write_callback(unsigned long error, void *context) { unsigned i, ret = 0; struct bio *bio = (struct bio *) context; struct mirror_set *ms; - int uptodate = 0; int should_wake = 0; unsigned long flags; @@ -1092,36 +599,27 @@ static void write_callback(unsigned long error, void *context) * This way we handle both writes to SYNC and NOSYNC * regions with the same code. */ - if (likely(!error)) - goto out; + if (likely(!error)) { + bio_endio(bio, ret); + return; + } for (i = 0; i < ms->nr_mirrors; i++) if (test_bit(i, &error)) fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); - else - uptodate = 1; - if (unlikely(!uptodate)) { - DMERR("All replicated volumes dead, failing I/O"); - /* None of the writes succeeded, fail the I/O. */ - ret = -EIO; - } else if (errors_handled(ms)) { - /* - * Need to raise event. Since raising - * events can block, we need to do it in - * the main thread. - */ - spin_lock_irqsave(&ms->lock, flags); - if (!ms->failures.head) - should_wake = 1; - bio_list_add(&ms->failures, bio); - spin_unlock_irqrestore(&ms->lock, flags); - if (should_wake) - wake(ms); - return; - } -out: - bio_endio(bio, ret); + /* + * Need to raise event. Since raising + * events can block, we need to do it in + * the main thread. + */ + spin_lock_irqsave(&ms->lock, flags); + if (!ms->failures.head) + should_wake = 1; + bio_list_add(&ms->failures, bio); + spin_unlock_irqrestore(&ms->lock, flags); + if (should_wake) + wakeup_mirrord(ms); } static void do_write(struct mirror_set *ms, struct bio *bio) @@ -1130,14 +628,20 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct dm_io_region io[ms->nr_mirrors], *dest = io; struct mirror *m; struct dm_io_request io_req = { - .bi_rw = WRITE, - .mem.type = DM_IO_BVEC, - .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, + .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA), + .mem.type = DM_IO_BIO, + .mem.ptr.bio = bio, .notify.fn = write_callback, .notify.context = bio, .client = ms->io_client, }; + if (bio->bi_rw & REQ_DISCARD) { + io_req.bi_rw |= REQ_DISCARD; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = NULL; + } + for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) map_region(dest++, m, bio); @@ -1147,7 +651,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) */ bio_set_m(bio, get_default_mirror(ms)); - (void) dm_io(&io_req, ms->nr_mirrors, io, NULL); + BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); } static void do_writes(struct mirror_set *ms, struct bio_list *writes) @@ -1155,6 +659,9 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) int state; struct bio *bio; struct bio_list sync, nosync, recover, *this_list = NULL; + struct bio_list requeue; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + region_t region; if (!writes->head) return; @@ -1165,20 +672,35 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) bio_list_init(&sync); bio_list_init(&nosync); bio_list_init(&recover); + bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1); + if ((bio->bi_rw & REQ_FLUSH) || + (bio->bi_rw & REQ_DISCARD)) { + bio_list_add(&sync, bio); + continue; + } + + region = dm_rh_bio_to_region(ms->rh, bio); + + if (log->type->is_remote_recovering && + 
log->type->is_remote_recovering(log, region)) { + bio_list_add(&requeue, bio); + continue; + } + + state = dm_rh_get_state(ms->rh, region, 1); switch (state) { - case RH_CLEAN: - case RH_DIRTY: + case DM_RH_CLEAN: + case DM_RH_DIRTY: this_list = &sync; break; - case RH_NOSYNC: + case DM_RH_NOSYNC: this_list = &nosync; break; - case RH_RECOVERING: + case DM_RH_RECOVERING: this_list = &recover; break; } @@ -1187,32 +709,56 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) } /* + * Add bios that are delayed due to remote recovery + * back on to the write queue + */ + if (unlikely(requeue.head)) { + spin_lock_irq(&ms->lock); + bio_list_merge(&ms->writes, &requeue); + spin_unlock_irq(&ms->lock); + delayed_wake(ms); + } + + /* * Increment the pending counts for any regions that will * be written to (writes to recover regions are going to * be delayed). */ - rh_inc_pending(&ms->rh, &sync); - rh_inc_pending(&ms->rh, &nosync); - ms->log_failure = rh_flush(&ms->rh) ? 1 : 0; + dm_rh_inc_pending(ms->rh, &sync); + dm_rh_inc_pending(ms->rh, &nosync); + + /* + * If the flush fails on a previous call and succeeds here, + * we must not reset the log_failure variable. We need + * userspace interaction to do that. + */ + ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure; /* * Dispatch io. */ - if (unlikely(ms->log_failure)) { + if (unlikely(ms->log_failure) && errors_handled(ms)) { spin_lock_irq(&ms->lock); bio_list_merge(&ms->failures, &sync); spin_unlock_irq(&ms->lock); - wake(ms); + wakeup_mirrord(ms); } else while ((bio = bio_list_pop(&sync))) do_write(ms, bio); while ((bio = bio_list_pop(&recover))) - rh_delay(&ms->rh, bio); + dm_rh_delay(ms->rh, bio); while ((bio = bio_list_pop(&nosync))) { - map_bio(get_default_mirror(ms), bio); - generic_make_request(bio); + if (unlikely(ms->leg_failure) && errors_handled(ms)) { + spin_lock_irq(&ms->lock); + bio_list_add(&ms->failures, bio); + spin_unlock_irq(&ms->lock); + wakeup_mirrord(ms); + } else { + map_bio(get_default_mirror(ms), bio); + generic_make_request(bio); + } } } @@ -1220,18 +766,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) { struct bio *bio; - if (!failures->head) - return; - - if (!ms->log_failure) { - while ((bio = bio_list_pop(failures))) - __bio_mark_nosync(ms, bio, bio->bi_size, 0); + if (likely(!failures->head)) return; - } /* * If the log has failed, unattempted writes are being - * put on the failures list. We can't issue those writes + * put on the holds list. We can't issue those writes * until a log has been marked, so we must store them. * * If a 'noflush' suspend is in progress, we can requeue @@ -1246,23 +786,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures) * for us to treat them the same and requeue them * as well. */ - if (dm_noflush_suspending(ms->ti)) { - while ((bio = bio_list_pop(failures))) - bio_endio(bio, DM_ENDIO_REQUEUE); - return; - } + while ((bio = bio_list_pop(failures))) { + if (!ms->log_failure) { + ms->in_sync = 0; + dm_rh_mark_nosync(ms->rh, bio); + } - if (atomic_read(&ms->suspend)) { - while ((bio = bio_list_pop(failures))) + /* + * If all the legs are dead, fail the I/O. + * If we have been told to handle errors, hold the bio + * and wait for userspace to deal with the problem. + * Otherwise pretend that the I/O succeeded. (This would + * be wrong if the failed leg returned after reboot and + * got replicated back to the good legs.) 
+ */ + if (!get_valid_mirror(ms)) bio_endio(bio, -EIO); - return; + else if (errors_handled(ms)) + hold_bio(ms, bio); + else + bio_endio(bio, 0); } - - spin_lock_irq(&ms->lock); - bio_list_merge(&ms->failures, failures); - spin_unlock_irq(&ms->lock); - - delayed_wake(ms); } static void trigger_event(struct work_struct *work) @@ -1278,8 +822,8 @@ static void trigger_event(struct work_struct *work) *---------------------------------------------------------------*/ static void do_mirror(struct work_struct *work) { - struct mirror_set *ms =container_of(work, struct mirror_set, - kmirrord_work); + struct mirror_set *ms = container_of(work, struct mirror_set, + kmirrord_work); struct bio_list reads, writes, failures; unsigned long flags; @@ -1292,16 +836,13 @@ static void do_mirror(struct work_struct *work) bio_list_init(&ms->failures); spin_unlock_irqrestore(&ms->lock, flags); - rh_update_states(&ms->rh); + dm_rh_update_states(ms->rh, errors_handled(ms)); do_recovery(ms); do_reads(ms, &reads); do_writes(ms, &writes); do_failures(ms, &failures); - - dm_table_unplug_all(ms->ti->table); } - /*----------------------------------------------------------------- * Target functions *---------------------------------------------------------------*/ @@ -1313,9 +854,6 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, size_t len; struct mirror_set *ms = NULL; - if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors)) - return NULL; - len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); ms = kzalloc(len, GFP_KERNEL); @@ -1325,36 +863,34 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors, } spin_lock_init(&ms->lock); + bio_list_init(&ms->reads); + bio_list_init(&ms->writes); + bio_list_init(&ms->failures); + bio_list_init(&ms->holds); ms->ti = ti; ms->nr_mirrors = nr_mirrors; ms->nr_regions = dm_sector_div_up(ti->len, region_size); ms->in_sync = 0; ms->log_failure = 0; + ms->leg_failure = 0; atomic_set(&ms->suspend, 0); atomic_set(&ms->default_mirror, DEFAULT_MIRROR); - len = sizeof(struct dm_raid1_read_record); - ms->read_record_pool = mempool_create_kmalloc_pool(MIN_READ_RECORDS, - len); - if (!ms->read_record_pool) { - ti->error = "Error creating mirror read_record_pool"; - kfree(ms); - return NULL; - } - - ms->io_client = dm_io_client_create(DM_IO_PAGES); + ms->io_client = dm_io_client_create(); if (IS_ERR(ms->io_client)) { ti->error = "Error creating dm_io client"; - mempool_destroy(ms->read_record_pool); kfree(ms); return NULL; } - if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { + ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord, + wakeup_all_recovery_waiters, + ms->ti->begin, MAX_RECOVERY, + dl, region_size, ms->nr_regions); + if (IS_ERR(ms->rh)) { ti->error = "Error creating dirty region hash"; dm_io_client_destroy(ms->io_client); - mempool_destroy(ms->read_record_pool); kfree(ms); return NULL; } @@ -1369,29 +905,22 @@ static void free_context(struct mirror_set *ms, struct dm_target *ti, dm_put_device(ti, ms->mirror[m].dev); dm_io_client_destroy(ms->io_client); - rh_exit(&ms->rh); - mempool_destroy(ms->read_record_pool); + dm_region_hash_destroy(ms->rh); kfree(ms); } -static inline int _check_region_size(struct dm_target *ti, uint32_t size) -{ - return !(size % (PAGE_SIZE >> 9) || !is_power_of_2(size) || - size > ti->len); -} - static int get_mirror(struct mirror_set *ms, struct dm_target *ti, unsigned int mirror, char **argv) { unsigned long long offset; + char dummy; - if (sscanf(argv[1], "%llu", &offset) != 1) { + if 
(sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { ti->error = "Invalid offset"; return -EINVAL; } - if (dm_get_device(ti, argv[0], offset, ti->len, - dm_table_get_mode(ti->table), + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ms->mirror[mirror].dev)) { ti->error = "Device lookup failure"; return -ENXIO; @@ -1409,18 +938,19 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, * Create dirty log: log_type #log_params <log_params> */ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, - unsigned int argc, char **argv, - unsigned int *args_used) + unsigned argc, char **argv, + unsigned *args_used) { - unsigned int param_count; + unsigned param_count; struct dm_dirty_log *dl; + char dummy; if (argc < 2) { ti->error = "Insufficient mirror log arguments"; return NULL; } - if (sscanf(argv[1], "%u", ¶m_count) != 1) { + if (sscanf(argv[1], "%u%c", ¶m_count, &dummy) != 1) { ti->error = "Invalid mirror log argument count"; return NULL; } @@ -1432,18 +962,13 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, return NULL; } - dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); + dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count, + argv + 2); if (!dl) { ti->error = "Error creating mirror dirty log"; return NULL; } - if (!_check_region_size(ti, dl->type->get_region_size(dl))) { - ti->error = "Invalid region size"; - dm_dirty_log_destroy(dl); - return NULL; - } - return dl; } @@ -1452,13 +977,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv, { unsigned num_features; struct dm_target *ti = ms->ti; + char dummy; *args_used = 0; if (!argc) return 0; - if (sscanf(argv[0], "%u", &num_features) != 1) { + if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) { ti->error = "Invalid number of features"; return -EINVAL; } @@ -1502,6 +1028,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned int nr_mirrors, m, args_used; struct mirror_set *ms; struct dm_dirty_log *dl; + char dummy; dl = create_dirty_log(ti, argc, argv, &args_used); if (!dl) @@ -1510,7 +1037,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) argv += args_used; argc -= args_used; - if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || + if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 || nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { ti->error = "Invalid number of mirrors"; dm_dirty_log_destroy(dl); @@ -1543,9 +1070,17 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->private = ms; - ti->split_io = ms->rh.region_size; - ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); + r = dm_set_target_max_io_len(ti, dm_rh_get_region_size(ms->rh)); + if (r) + goto err_free_context; + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record); + ti->discard_zeroes_data_unsupported = true; + + ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { DMERR("couldn't start kmirrord"); r = -ENOMEM; @@ -1578,11 +1113,13 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err_destroy_wq; } - r = dm_kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); - if (r) + ms->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(ms->kcopyd_client)) { + r = PTR_ERR(ms->kcopyd_client); goto err_destroy_wq; + } - wake(ms); + wakeup_mirrord(ms); return 0; err_destroy_wq: @@ 
-1598,47 +1135,34 @@ static void mirror_dtr(struct dm_target *ti) del_timer_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); + flush_work(&ms->trigger_event); dm_kcopyd_client_destroy(ms->kcopyd_client); destroy_workqueue(ms->kmirrord_wq); free_context(ms, ti, ms->nr_mirrors); } -static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) -{ - unsigned long flags; - int should_wake = 0; - struct bio_list *bl; - - bl = (rw == WRITE) ? &ms->writes : &ms->reads; - spin_lock_irqsave(&ms->lock, flags); - should_wake = !(bl->head); - bio_list_add(bl, bio); - spin_unlock_irqrestore(&ms->lock, flags); - - if (should_wake) - wake(ms); -} - /* * Mirror mapping function */ -static int mirror_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static int mirror_map(struct dm_target *ti, struct bio *bio) { int r, rw = bio_rw(bio); struct mirror *m; struct mirror_set *ms = ti->private; - struct dm_raid1_read_record *read_record = NULL; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + struct dm_raid1_bio_record *bio_record = + dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); + + bio_record->details.bi_bdev = NULL; if (rw == WRITE) { /* Save region for mirror_end_io() handler */ - map_context->ll = bio_to_region(&ms->rh, bio); + bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio); queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; } - r = ms->rh.log->type->in_sync(ms->rh.log, - bio_to_region(&ms->rh, bio), 0); + r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); if (r < 0 && r != -EWOULDBLOCK) return r; @@ -1657,47 +1181,44 @@ static int mirror_map(struct dm_target *ti, struct bio *bio, * The region is in-sync and we can perform reads directly. * Store enough information so we can retry if it fails. */ - m = choose_mirror(ms, bio->bi_sector); + m = choose_mirror(ms, bio->bi_iter.bi_sector); if (unlikely(!m)) return -EIO; - read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); - if (likely(read_record)) { - dm_bio_record(&read_record->details, bio); - map_context->ptr = read_record; - read_record->m = m; - } + dm_bio_record(&bio_record->details, bio); + bio_record->m = m; map_bio(m, bio); return DM_MAPIO_REMAPPED; } -static int mirror_end_io(struct dm_target *ti, struct bio *bio, - int error, union map_info *map_context) +static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) { int rw = bio_rw(bio); struct mirror_set *ms = (struct mirror_set *) ti->private; struct mirror *m = NULL; struct dm_bio_details *bd = NULL; - struct dm_raid1_read_record *read_record = map_context->ptr; + struct dm_raid1_bio_record *bio_record = + dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); /* * We need to dec pending if this was a write. */ if (rw == WRITE) { - rh_dec(&ms->rh, map_context->ll); + if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) + dm_rh_dec(ms->rh, bio_record->write_region); return error; } if (error == -EOPNOTSUPP) goto out; - if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) goto out; if (unlikely(error)) { - if (!read_record) { + if (!bio_record->details.bi_bdev) { /* * There wasn't enough memory to record necessary * information for a retry or there was no other @@ -1707,7 +1228,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, return -EIO; } - m = read_record->m; + m = bio_record->m; DMERR("Mirror read failed from %s. 
Trying alternative device.", m->dev->name); @@ -1719,22 +1240,21 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, * mirror. */ if (default_ok(m) || mirror_available(ms, bio)) { - bd = &read_record->details; + bd = &bio_record->details; dm_bio_restore(bd, bio); - mempool_free(read_record, ms->read_record_pool); - map_context->ptr = NULL; + bio_record->details.bi_bdev = NULL; + + atomic_inc(&bio->bi_remaining); + queue_bio(ms, bio, rw); - return 1; + return DM_ENDIO_INCOMPLETE; } DMERR("All replicated volumes dead, failing I/O"); } out: - if (read_record) { - mempool_free(read_record, ms->read_record_pool); - map_context->ptr = NULL; - } + bio_record->details.bi_bdev = NULL; return error; } @@ -1742,18 +1262,35 @@ out: static void mirror_presuspend(struct dm_target *ti) { struct mirror_set *ms = (struct mirror_set *) ti->private; - struct dm_dirty_log *log = ms->rh.log; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); + + struct bio_list holds; + struct bio *bio; atomic_set(&ms->suspend, 1); /* + * Process bios in the hold list to start recovery waiting + * for bios in the hold list. After the process, no bio has + * a chance to be added in the hold list because ms->suspend + * is set. + */ + spin_lock_irq(&ms->lock); + holds = ms->holds; + bio_list_init(&ms->holds); + spin_unlock_irq(&ms->lock); + + while ((bio = bio_list_pop(&holds))) + hold_bio(ms, bio); + + /* * We must finish up all the work that we've * generated (i.e. recovery work). */ - rh_stop_recovery(&ms->rh); + dm_rh_stop_recovery(ms->rh); wait_event(_kmirrord_recovery_stopped, - !atomic_read(&ms->rh.recovery_in_flight)); + !dm_rh_recovery_in_flight(ms->rh)); if (log->type->presuspend && log->type->presuspend(log)) /* FIXME: need better error handling */ @@ -1771,7 +1308,7 @@ static void mirror_presuspend(struct dm_target *ti) static void mirror_postsuspend(struct dm_target *ti) { struct mirror_set *ms = ti->private; - struct dm_dirty_log *log = ms->rh.log; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); if (log->type->postsuspend && log->type->postsuspend(log)) /* FIXME: need better error handling */ @@ -1781,13 +1318,13 @@ static void mirror_postsuspend(struct dm_target *ti) static void mirror_resume(struct dm_target *ti) { struct mirror_set *ms = ti->private; - struct dm_dirty_log *log = ms->rh.log; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); atomic_set(&ms->suspend, 0); if (log->type->resume && log->type->resume(log)) /* FIXME: need better error handling */ DMWARN("log resume failed"); - rh_start_recovery(&ms->rh); + dm_rh_start_recovery(ms->rh); } /* @@ -1808,18 +1345,19 @@ static char device_status_char(struct mirror *m) if (!atomic_read(&(m->error_count))) return 'A'; - return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : + return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' : + (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 
'R' : 'U'; } -static int mirror_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) +static void mirror_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) { unsigned int m, sz = 0; struct mirror_set *ms = (struct mirror_set *) ti->private; - struct dm_dirty_log *log = ms->rh.log; + struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); char buffer[ms->nr_mirrors + 1]; switch (type) { @@ -1832,15 +1370,15 @@ static int mirror_status(struct dm_target *ti, status_type_t type, buffer[m] = '\0'; DMEMIT("%llu/%llu 1 %s ", - (unsigned long long)log->type->get_sync_count(ms->rh.log), + (unsigned long long)log->type->get_sync_count(log), (unsigned long long)ms->nr_regions, buffer); - sz += log->type->status(ms->rh.log, type, result+sz, maxlen-sz); + sz += log->type->status(log, type, result+sz, maxlen-sz); break; case STATUSTYPE_TABLE: - sz = log->type->status(ms->rh.log, type, result, maxlen); + sz = log->type->status(log, type, result, maxlen); DMEMIT("%d", ms->nr_mirrors); for (m = 0; m < ms->nr_mirrors; m++) @@ -1850,13 +1388,25 @@ static int mirror_status(struct dm_target *ti, status_type_t type, if (ms->features & DM_RAID1_HANDLE_ERRORS) DMEMIT(" 1 handle_errors"); } +} - return 0; +static int mirror_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct mirror_set *ms = ti->private; + int ret = 0; + unsigned i; + + for (i = 0; !ret && i < ms->nr_mirrors; i++) + ret = fn(ti, ms->mirror[i].dev, + ms->mirror[i].offset, ti->len, data); + + return ret; } static struct target_type mirror_target = { .name = "mirror", - .version = {1, 0, 20}, + .version = {1, 13, 2}, .module = THIS_MODULE, .ctr = mirror_ctr, .dtr = mirror_dtr, @@ -1866,6 +1416,7 @@ static struct target_type mirror_target = { .postsuspend = mirror_postsuspend, .resume = mirror_resume, .status = mirror_status, + .iterate_devices = mirror_iterate_devices, }; static int __init dm_mirror_init(void) @@ -1873,19 +1424,20 @@ static int __init dm_mirror_init(void) int r; r = dm_register_target(&mirror_target); - if (r < 0) + if (r < 0) { DMERR("Failed to register mirror target"); + goto bad_target; + } + + return 0; +bad_target: return r; } static void __exit dm_mirror_exit(void) { - int r; - - r = dm_unregister_target(&mirror_target); - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_target(&mirror_target); } /* Module hooks */ diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c new file mode 100644 index 00000000000..b929fd5f498 --- /dev/null +++ b/drivers/md/dm-region-hash.c @@ -0,0 +1,724 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. + */ + +#include <linux/dm-dirty-log.h> +#include <linux/dm-region-hash.h> + +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include "dm.h" + +#define DM_MSG_PREFIX "region hash" + +/*----------------------------------------------------------------- + * Region hash + * + * The mirror splits itself up into discrete regions. Each + * region can be in one of three states: clean, dirty, + * nosync. There is no need to put clean regions in the hash. + * + * In addition to being present in the hash table a region _may_ + * be present on one of three lists. 
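The comment opening dm-region-hash.c above describes how the mirror is split into fixed-size regions that are clean, dirty or nosync, with only the interesting regions kept in the hash. As a rough, self-contained userspace sketch (not the kernel code; region_shift() and sector_to_region() here are illustrative stand-ins for helpers defined further down in this file), the sector-to-region mapping reduces to a shift because the region size is a power of two:

#include <stdint.h>
#include <stdio.h>

enum region_state { RH_CLEAN, RH_DIRTY, RH_NOSYNC };

typedef uint64_t sector_t;
typedef uint64_t region_t;

/* region_size is in sectors and must be a power of two, so mapping a
 * sector to its region is a plain shift: region = sector >> log2(size). */
static unsigned region_shift(uint32_t region_size)
{
	unsigned shift = 0;

	while ((1u << shift) < region_size)
		shift++;
	return shift;
}

static region_t sector_to_region(sector_t sector, unsigned shift)
{
	return sector >> shift;
}

int main(void)
{
	unsigned shift = region_shift(1024);	/* 1024-sector (512 KiB) regions */
	enum region_state state = RH_CLEAN;	/* clean regions stay out of the hash */

	/* sector 5000 falls into region 4 */
	printf("sector 5000 -> region %llu (state %d)\n",
	       (unsigned long long)sector_to_region(5000, shift), state);
	return 0;
}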
+ * + * clean_regions: Regions on this list have no io pending to + * them, they are in sync, we are no longer interested in them, + * they are dull. dm_rh_update_states() will remove them from the + * hash table. + * + * quiesced_regions: These regions have been spun down, ready + * for recovery. rh_recovery_start() will remove regions from + * this list and hand them to kmirrord, which will schedule the + * recovery io with kcopyd. + * + * recovered_regions: Regions that kcopyd has successfully + * recovered. dm_rh_update_states() will now schedule any delayed + * io, up the recovery_count, and remove the region from the + * hash. + * + * There are 2 locks: + * A rw spin lock 'hash_lock' protects just the hash table, + * this is never held in write mode from interrupt context, + * which I believe means that we only have to disable irqs when + * doing a write lock. + * + * An ordinary spin lock 'region_lock' that protects the three + * lists in the region_hash, with the 'state', 'list' and + * 'delayed_bios' fields of the regions. This is used from irq + * context, so all other uses will have to suspend local irqs. + *---------------------------------------------------------------*/ +struct dm_region_hash { + uint32_t region_size; + unsigned region_shift; + + /* holds persistent region state */ + struct dm_dirty_log *log; + + /* hash table */ + rwlock_t hash_lock; + mempool_t *region_pool; + unsigned mask; + unsigned nr_buckets; + unsigned prime; + unsigned shift; + struct list_head *buckets; + + unsigned max_recovery; /* Max # of regions to recover in parallel */ + + spinlock_t region_lock; + atomic_t recovery_in_flight; + struct semaphore recovery_count; + struct list_head clean_regions; + struct list_head quiesced_regions; + struct list_head recovered_regions; + struct list_head failed_recovered_regions; + + /* + * If there was a flush failure no regions can be marked clean. + */ + int flush_failure; + + void *context; + sector_t target_begin; + + /* Callback function to schedule bios writes */ + void (*dispatch_bios)(void *context, struct bio_list *bios); + + /* Callback function to wakeup callers worker thread. */ + void (*wakeup_workers)(void *context); + + /* Callback function to wakeup callers recovery waiters. */ + void (*wakeup_all_recovery_waiters)(void *context); +}; + +struct dm_region { + struct dm_region_hash *rh; /* FIXME: can we get rid of this ? 
*/ + region_t key; + int state; + + struct list_head hash_list; + struct list_head list; + + atomic_t pending; + struct bio_list delayed_bios; +}; + +/* + * Conversion fns + */ +static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector) +{ + return sector >> rh->region_shift; +} + +sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) +{ + return region << rh->region_shift; +} +EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); + +region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) +{ + return dm_rh_sector_to_region(rh, bio->bi_iter.bi_sector - + rh->target_begin); +} +EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); + +void *dm_rh_region_context(struct dm_region *reg) +{ + return reg->rh->context; +} +EXPORT_SYMBOL_GPL(dm_rh_region_context); + +region_t dm_rh_get_region_key(struct dm_region *reg) +{ + return reg->key; +} +EXPORT_SYMBOL_GPL(dm_rh_get_region_key); + +sector_t dm_rh_get_region_size(struct dm_region_hash *rh) +{ + return rh->region_size; +} +EXPORT_SYMBOL_GPL(dm_rh_get_region_size); + +/* + * FIXME: shall we pass in a structure instead of all these args to + * dm_region_hash_create()???? + */ +#define RH_HASH_MULT 2654435387U +#define RH_HASH_SHIFT 12 + +#define MIN_REGIONS 64 +struct dm_region_hash *dm_region_hash_create( + void *context, void (*dispatch_bios)(void *context, + struct bio_list *bios), + void (*wakeup_workers)(void *context), + void (*wakeup_all_recovery_waiters)(void *context), + sector_t target_begin, unsigned max_recovery, + struct dm_dirty_log *log, uint32_t region_size, + region_t nr_regions) +{ + struct dm_region_hash *rh; + unsigned nr_buckets, max_buckets; + size_t i; + + /* + * Calculate a suitable number of buckets for our hash + * table. + */ + max_buckets = nr_regions >> 6; + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) + ; + nr_buckets >>= 1; + + rh = kmalloc(sizeof(*rh), GFP_KERNEL); + if (!rh) { + DMERR("unable to allocate region hash memory"); + return ERR_PTR(-ENOMEM); + } + + rh->context = context; + rh->dispatch_bios = dispatch_bios; + rh->wakeup_workers = wakeup_workers; + rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; + rh->target_begin = target_begin; + rh->max_recovery = max_recovery; + rh->log = log; + rh->region_size = region_size; + rh->region_shift = ffs(region_size) - 1; + rwlock_init(&rh->hash_lock); + rh->mask = nr_buckets - 1; + rh->nr_buckets = nr_buckets; + + rh->shift = RH_HASH_SHIFT; + rh->prime = RH_HASH_MULT; + + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); + if (!rh->buckets) { + DMERR("unable to allocate region hash bucket memory"); + kfree(rh); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(rh->buckets + i); + + spin_lock_init(&rh->region_lock); + sema_init(&rh->recovery_count, 0); + atomic_set(&rh->recovery_in_flight, 0); + INIT_LIST_HEAD(&rh->clean_regions); + INIT_LIST_HEAD(&rh->quiesced_regions); + INIT_LIST_HEAD(&rh->recovered_regions); + INIT_LIST_HEAD(&rh->failed_recovered_regions); + rh->flush_failure = 0; + + rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, + sizeof(struct dm_region)); + if (!rh->region_pool) { + vfree(rh->buckets); + kfree(rh); + rh = ERR_PTR(-ENOMEM); + } + + return rh; +} +EXPORT_SYMBOL_GPL(dm_region_hash_create); + +void dm_region_hash_destroy(struct dm_region_hash *rh) +{ + unsigned h; + struct dm_region *reg, *nreg; + + BUG_ON(!list_empty(&rh->quiesced_regions)); + for (h = 0; h < rh->nr_buckets; h++) { + list_for_each_entry_safe(reg, nreg, 
rh->buckets + h, + hash_list) { + BUG_ON(atomic_read(®->pending)); + mempool_free(reg, rh->region_pool); + } + } + + if (rh->log) + dm_dirty_log_destroy(rh->log); + + if (rh->region_pool) + mempool_destroy(rh->region_pool); + + vfree(rh->buckets); + kfree(rh); +} +EXPORT_SYMBOL_GPL(dm_region_hash_destroy); + +struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh) +{ + return rh->log; +} +EXPORT_SYMBOL_GPL(dm_rh_dirty_log); + +static unsigned rh_hash(struct dm_region_hash *rh, region_t region) +{ + return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask; +} + +static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + struct list_head *bucket = rh->buckets + rh_hash(rh, region); + + list_for_each_entry(reg, bucket, hash_list) + if (reg->key == region) + return reg; + + return NULL; +} + +static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg) +{ + list_add(®->hash_list, rh->buckets + rh_hash(rh, reg->key)); +} + +static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg, *nreg; + + nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); + if (unlikely(!nreg)) + nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); + + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? + DM_RH_CLEAN : DM_RH_NOSYNC; + nreg->rh = rh; + nreg->key = region; + INIT_LIST_HEAD(&nreg->list); + atomic_set(&nreg->pending, 0); + bio_list_init(&nreg->delayed_bios); + + write_lock_irq(&rh->hash_lock); + reg = __rh_lookup(rh, region); + if (reg) + /* We lost the race. */ + mempool_free(nreg, rh->region_pool); + else { + __rh_insert(rh, nreg); + if (nreg->state == DM_RH_CLEAN) { + spin_lock(&rh->region_lock); + list_add(&nreg->list, &rh->clean_regions); + spin_unlock(&rh->region_lock); + } + + reg = nreg; + } + write_unlock_irq(&rh->hash_lock); + + return reg; +} + +static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + + reg = __rh_lookup(rh, region); + if (!reg) { + read_unlock(&rh->hash_lock); + reg = __rh_alloc(rh, region); + read_lock(&rh->hash_lock); + } + + return reg; +} + +int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block) +{ + int r; + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + if (reg) + return reg->state; + + /* + * The region wasn't in the hash, so we fall back to the + * dirty log. + */ + r = rh->log->type->in_sync(rh->log, region, may_block); + + /* + * Any error from the dirty log (eg. -EWOULDBLOCK) gets + * taken as a DM_RH_NOSYNC + */ + return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC; +} +EXPORT_SYMBOL_GPL(dm_rh_get_state); + +static void complete_resync_work(struct dm_region *reg, int success) +{ + struct dm_region_hash *rh = reg->rh; + + rh->log->type->set_region_sync(rh->log, reg->key, success); + + /* + * Dispatch the bios before we call 'wake_up_all'. + * This is important because if we are suspending, + * we want to know that recovery is complete and + * the work queue is flushed. If we wake_up_all + * before we dispatch_bios (queue bios and call wake()), + * then we risk suspending before the work queue + * has been properly flushed. 
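dm_region_hash_create() above sizes the bucket array to a power of two and rh_hash() spreads region numbers across it with a multiplicative hash. A small standalone sketch of that bucket computation (the two constants are taken from the patch; everything else is illustrative):

#include <stdint.h>
#include <stdio.h>

#define RH_HASH_MULT  2654435387U
#define RH_HASH_SHIFT 12

/* bucket = ((region * prime) >> shift) & (nr_buckets - 1); nr_buckets is
 * always a power of two, so the mask keeps only the low-order bits. */
static unsigned rh_hash(uint64_t region, unsigned mask)
{
	return (unsigned)((region * RH_HASH_MULT) >> RH_HASH_SHIFT) & mask;
}

int main(void)
{
	unsigned nr_buckets = 64;	/* a small power-of-two bucket array */
	unsigned mask = nr_buckets - 1;
	uint64_t region;

	for (region = 0; region < 5; region++)
		printf("region %llu -> bucket %u\n",
		       (unsigned long long)region, rh_hash(region, mask));
	return 0;
}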
+ */ + rh->dispatch_bios(rh->context, ®->delayed_bios); + if (atomic_dec_and_test(&rh->recovery_in_flight)) + rh->wakeup_all_recovery_waiters(rh->context); + up(&rh->recovery_count); +} + +/* dm_rh_mark_nosync + * @ms + * @bio + * + * The bio was written on some mirror(s) but failed on other mirror(s). + * We can successfully endio the bio but should avoid the region being + * marked clean by setting the state DM_RH_NOSYNC. + * + * This function is _not_ safe in interrupt context! + */ +void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) +{ + unsigned long flags; + struct dm_dirty_log *log = rh->log; + struct dm_region *reg; + region_t region = dm_rh_bio_to_region(rh, bio); + int recovering = 0; + + if (bio->bi_rw & REQ_FLUSH) { + rh->flush_failure = 1; + return; + } + + if (bio->bi_rw & REQ_DISCARD) + return; + + /* We must inform the log that the sync count has changed. */ + log->type->set_region_sync(log, region, 0); + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + /* region hash entry should exist because write was in-flight */ + BUG_ON(!reg); + BUG_ON(!list_empty(®->list)); + + spin_lock_irqsave(&rh->region_lock, flags); + /* + * Possible cases: + * 1) DM_RH_DIRTY + * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed + * 3) DM_RH_RECOVERING: flushing pending writes + * Either case, the region should have not been connected to list. + */ + recovering = (reg->state == DM_RH_RECOVERING); + reg->state = DM_RH_NOSYNC; + BUG_ON(!list_empty(®->list)); + spin_unlock_irqrestore(&rh->region_lock, flags); + + if (recovering) + complete_resync_work(reg, 0); +} +EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); + +void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) +{ + struct dm_region *reg, *next; + + LIST_HEAD(clean); + LIST_HEAD(recovered); + LIST_HEAD(failed_recovered); + + /* + * Quickly grab the lists. + */ + write_lock_irq(&rh->hash_lock); + spin_lock(&rh->region_lock); + if (!list_empty(&rh->clean_regions)) { + list_splice_init(&rh->clean_regions, &clean); + + list_for_each_entry(reg, &clean, list) + list_del(®->hash_list); + } + + if (!list_empty(&rh->recovered_regions)) { + list_splice_init(&rh->recovered_regions, &recovered); + + list_for_each_entry(reg, &recovered, list) + list_del(®->hash_list); + } + + if (!list_empty(&rh->failed_recovered_regions)) { + list_splice_init(&rh->failed_recovered_regions, + &failed_recovered); + + list_for_each_entry(reg, &failed_recovered, list) + list_del(®->hash_list); + } + + spin_unlock(&rh->region_lock); + write_unlock_irq(&rh->hash_lock); + + /* + * All the regions on the recovered and clean lists have + * now been pulled out of the system, so no need to do + * any more locking. + */ + list_for_each_entry_safe(reg, next, &recovered, list) { + rh->log->type->clear_region(rh->log, reg->key); + complete_resync_work(reg, 1); + mempool_free(reg, rh->region_pool); + } + + list_for_each_entry_safe(reg, next, &failed_recovered, list) { + complete_resync_work(reg, errors_handled ? 
0 : 1); + mempool_free(reg, rh->region_pool); + } + + list_for_each_entry_safe(reg, next, &clean, list) { + rh->log->type->clear_region(rh->log, reg->key); + mempool_free(reg, rh->region_pool); + } + + rh->log->type->flush(rh->log); +} +EXPORT_SYMBOL_GPL(dm_rh_update_states); + +static void rh_inc(struct dm_region_hash *rh, region_t region) +{ + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + + spin_lock_irq(&rh->region_lock); + atomic_inc(®->pending); + + if (reg->state == DM_RH_CLEAN) { + reg->state = DM_RH_DIRTY; + list_del_init(®->list); /* take off the clean list */ + spin_unlock_irq(&rh->region_lock); + + rh->log->type->mark_region(rh->log, reg->key); + } else + spin_unlock_irq(&rh->region_lock); + + + read_unlock(&rh->hash_lock); +} + +void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) +{ + struct bio *bio; + + for (bio = bios->head; bio; bio = bio->bi_next) { + if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)) + continue; + rh_inc(rh, dm_rh_bio_to_region(rh, bio)); + } +} +EXPORT_SYMBOL_GPL(dm_rh_inc_pending); + +void dm_rh_dec(struct dm_region_hash *rh, region_t region) +{ + unsigned long flags; + struct dm_region *reg; + int should_wake = 0; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irqsave(&rh->region_lock, flags); + if (atomic_dec_and_test(®->pending)) { + /* + * There is no pending I/O for this region. + * We can move the region to corresponding list for next action. + * At this point, the region is not yet connected to any list. + * + * If the state is DM_RH_NOSYNC, the region should be kept off + * from clean list. + * The hash entry for DM_RH_NOSYNC will remain in memory + * until the region is recovered or the map is reloaded. + */ + + /* do nothing for DM_RH_NOSYNC */ + if (unlikely(rh->flush_failure)) { + /* + * If a write flush failed some time ago, we + * don't know whether or not this write made it + * to the disk, so we must resync the device. + */ + reg->state = DM_RH_NOSYNC; + } else if (reg->state == DM_RH_RECOVERING) { + list_add_tail(®->list, &rh->quiesced_regions); + } else if (reg->state == DM_RH_DIRTY) { + reg->state = DM_RH_CLEAN; + list_add(®->list, &rh->clean_regions); + } + should_wake = 1; + } + spin_unlock_irqrestore(&rh->region_lock, flags); + + if (should_wake) + rh->wakeup_workers(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_dec); + +/* + * Starts quiescing a region in preparation for recovery. + */ +static int __rh_recovery_prepare(struct dm_region_hash *rh) +{ + int r; + region_t region; + struct dm_region *reg; + + /* + * Ask the dirty log what's next. + */ + r = rh->log->type->get_resync_work(rh->log, ®ion); + if (r <= 0) + return r; + + /* + * Get this region, and start it quiescing by setting the + * recovering flag. + */ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irq(&rh->region_lock); + reg->state = DM_RH_RECOVERING; + + /* Already quiesced ? 
*/ + if (atomic_read(®->pending)) + list_del_init(®->list); + else + list_move(®->list, &rh->quiesced_regions); + + spin_unlock_irq(&rh->region_lock); + + return 1; +} + +void dm_rh_recovery_prepare(struct dm_region_hash *rh) +{ + /* Extra reference to avoid race with dm_rh_stop_recovery */ + atomic_inc(&rh->recovery_in_flight); + + while (!down_trylock(&rh->recovery_count)) { + atomic_inc(&rh->recovery_in_flight); + if (__rh_recovery_prepare(rh) <= 0) { + atomic_dec(&rh->recovery_in_flight); + up(&rh->recovery_count); + break; + } + } + + /* Drop the extra reference */ + if (atomic_dec_and_test(&rh->recovery_in_flight)) + rh->wakeup_all_recovery_waiters(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); + +/* + * Returns any quiesced regions. + */ +struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh) +{ + struct dm_region *reg = NULL; + + spin_lock_irq(&rh->region_lock); + if (!list_empty(&rh->quiesced_regions)) { + reg = list_entry(rh->quiesced_regions.next, + struct dm_region, list); + list_del_init(®->list); /* remove from the quiesced list */ + } + spin_unlock_irq(&rh->region_lock); + + return reg; +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_start); + +void dm_rh_recovery_end(struct dm_region *reg, int success) +{ + struct dm_region_hash *rh = reg->rh; + + spin_lock_irq(&rh->region_lock); + if (success) + list_add(®->list, ®->rh->recovered_regions); + else + list_add(®->list, ®->rh->failed_recovered_regions); + + spin_unlock_irq(&rh->region_lock); + + rh->wakeup_workers(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_end); + +/* Return recovery in flight count. */ +int dm_rh_recovery_in_flight(struct dm_region_hash *rh) +{ + return atomic_read(&rh->recovery_in_flight); +} +EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); + +int dm_rh_flush(struct dm_region_hash *rh) +{ + return rh->log->type->flush(rh->log); +} +EXPORT_SYMBOL_GPL(dm_rh_flush); + +void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) +{ + struct dm_region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); + bio_list_add(®->delayed_bios, bio); + read_unlock(&rh->hash_lock); +} +EXPORT_SYMBOL_GPL(dm_rh_delay); + +void dm_rh_stop_recovery(struct dm_region_hash *rh) +{ + int i; + + /* wait for any recovering regions */ + for (i = 0; i < rh->max_recovery; i++) + down(&rh->recovery_count); +} +EXPORT_SYMBOL_GPL(dm_rh_stop_recovery); + +void dm_rh_start_recovery(struct dm_region_hash *rh) +{ + int i; + + for (i = 0; i < rh->max_recovery; i++) + up(&rh->recovery_count); + + rh->wakeup_workers(rh->context); +} +EXPORT_SYMBOL_GPL(dm_rh_start_recovery); + +MODULE_DESCRIPTION(DM_NAME " region hash"); +MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 391dfa2ad43..6ab1192cdd5 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -9,10 +9,12 @@ * Round-robin path selector. 
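A change that recurs throughout this patch - in get_mirror(), create_dirty_log(), parse_features() and mirror_ctr() above, and in rr_add_path() and st_add_path() below - is switching sscanf(str, "%u") to sscanf(str, "%u%c", ..., &dummy) so that arguments with trailing garbage are rejected. A minimal userspace illustration of the idiom (parse_uint_strict is a made-up name):

#include <stdio.h>

/* Returns 1 only if the whole string is a clean unsigned integer:
 * with "%u%c", any trailing character makes sscanf() return 2. */
static int parse_uint_strict(const char *arg, unsigned *value)
{
	char dummy;

	return sscanf(arg, "%u%c", value, &dummy) == 1;
}

int main(void)
{
	unsigned v;

	printf("\"42\"   -> %d\n", parse_uint_strict("42", &v));   /* 1: accepted */
	printf("\"42x\"  -> %d\n", parse_uint_strict("42x", &v));  /* 0: rejected */
	printf("\"42 7\" -> %d\n", parse_uint_strict("42 7", &v)); /* 0: rejected */
	return 0;
}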
*/ -#include "dm.h" +#include <linux/device-mapper.h> + #include "dm-path-selector.h" #include <linux/slab.h> +#include <linux/module.h> #define DM_MSG_PREFIX "multipath round-robin" @@ -112,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, struct selector *s = (struct selector *) ps->context; struct path_info *pi; unsigned repeat_count = RR_MIN_IO; + char dummy; if (argc > 1) { *error = "round-robin ps: incorrect number of arguments"; @@ -119,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, } /* First path argument is number of I/Os before switching path */ - if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { *error = "round-robin ps: invalid repeat count"; return -EINVAL; } @@ -160,7 +163,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) } static struct dm_path *rr_select_path(struct path_selector *ps, - unsigned *repeat_count) + unsigned *repeat_count, size_t nr_bytes) { struct selector *s = (struct selector *) ps->context; struct path_info *pi = NULL; diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c new file mode 100644 index 00000000000..9df8f6bd641 --- /dev/null +++ b/drivers/md/dm-service-time.c @@ -0,0 +1,343 @@ +/* + * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. + * + * Module Author: Kiyoshi Ueda + * + * This file is released under the GPL. + * + * Throughput oriented path selector. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include <linux/slab.h> +#include <linux/module.h> + +#define DM_MSG_PREFIX "multipath service-time" +#define ST_MIN_IO 1 +#define ST_MAX_RELATIVE_THROUGHPUT 100 +#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 +#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) +#define ST_VERSION "0.2.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; + unsigned relative_throughput; + atomic_t in_flight_size; /* Total size of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + } + + return s; +} + +static int st_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void st_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int st_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct path_info *pi; + + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), + pi->relative_throughput); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u %u ", pi->repeat_count, + pi->relative_throughput); + break; + } + } + + return sz; +} + +static int st_add_path(struct path_selector *ps, struct dm_path *path, + int argc, 
char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = ST_MIN_IO; + unsigned relative_throughput = 1; + char dummy; + + /* + * Arguments: [<repeat_count> [<relative_throughput>]] + * <repeat_count>: The number of I/Os before switching path. + * If not given, default (ST_MIN_IO) is used. + * <relative_throughput>: The relative throughput value of + * the path among all paths in the path-group. + * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> + * If not given, minimum value '1' is used. + * If '0' is given, the path isn't selected while + * other paths having a positive value are + * available. + */ + if (argc > 2) { + *error = "service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "service-time ps: invalid repeat count"; + return -EINVAL; + } + + if ((argc == 2) && + (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || + relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { + *error = "service-time ps: invalid relative_throughput value"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + pi->relative_throughput = relative_throughput; + atomic_set(&pi->in_flight_size, 0); + + path->pscontext = pi; + + list_add_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void st_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move(&pi->list, &s->failed_paths); +} + +static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + + list_move_tail(&pi->list, &s->valid_paths); + + return 0; +} + +/* + * Compare the estimated service time of 2 paths, pi1 and pi2, + * for the incoming I/O. + * + * Returns: + * < 0 : pi1 is better + * 0 : no difference between pi1 and pi2 + * > 0 : pi2 is better + * + * Description: + * Basically, the service time is estimated by: + * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' + * To reduce the calculation, some optimizations are made. + * (See comments inline) + */ +static int st_compare_load(struct path_info *pi1, struct path_info *pi2, + size_t incoming) +{ + size_t sz1, sz2, st1, st2; + + sz1 = atomic_read(&pi1->in_flight_size); + sz2 = atomic_read(&pi2->in_flight_size); + + /* + * Case 1: Both have same throughput value. Choose less loaded path. + */ + if (pi1->relative_throughput == pi2->relative_throughput) + return sz1 - sz2; + + /* + * Case 2a: Both have same load. Choose higher throughput path. + * Case 2b: One path has no throughput value. Choose the other one. + */ + if (sz1 == sz2 || + !pi1->relative_throughput || !pi2->relative_throughput) + return pi2->relative_throughput - pi1->relative_throughput; + + /* + * Case 3: Calculate service time. Choose faster path. + * Service time using pi1: + * st1 = (sz1 + incoming) / pi1->relative_throughput + * Service time using pi2: + * st2 = (sz2 + incoming) / pi2->relative_throughput + * + * To avoid the division, transform the expression to use + * multiplication. 
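The st_compare_load() comment here argues that the two estimated service times can be compared without dividing, by cross-multiplying with the (positive) relative throughputs. A small worked sketch of that comparison, leaving out the overflow shift the real function applies to very large in-flight sizes:

#include <stddef.h>
#include <stdio.h>

/* Compares (sz1 + incoming) / t1 against (sz2 + incoming) / t2 without
 * dividing, by cross-multiplying with the positive throughputs.
 * Returns < 0 if path 1 is the better choice, > 0 for path 2, 0 for a tie. */
static int compare_service_time(size_t sz1, unsigned t1,
				size_t sz2, unsigned t2, size_t incoming)
{
	size_t st1 = (sz1 + incoming) * t2;
	size_t st2 = (sz2 + incoming) * t1;

	return (st1 > st2) - (st1 < st2);
}

int main(void)
{
	/* Path 1: 64 KiB in flight, relative throughput 1.
	 * Path 2: 96 KiB in flight, relative throughput 2.
	 * For a 32 KiB bio: (96+32)*1 = 128 beats (64+32)*2 = 192, so the
	 * faster path 2 is chosen despite its deeper queue. */
	int r = compare_service_time(64 * 1024, 1, 96 * 1024, 2, 32 * 1024);

	printf("%s\n", r < 0 ? "path 1" : r > 0 ? "path 2" : "tie");
	return 0;
}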
+ * Because ->relative_throughput > 0 here, if st1 < st2, + * the expressions below are the same meaning: + * (sz1 + incoming) / pi1->relative_throughput < + * (sz2 + incoming) / pi2->relative_throughput + * (sz1 + incoming) * pi2->relative_throughput < + * (sz2 + incoming) * pi1->relative_throughput + * So use the later one. + */ + sz1 += incoming; + sz2 += incoming; + if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || + sz2 >= ST_MAX_INFLIGHT_SIZE)) { + /* + * Size may be too big for multiplying pi->relative_throughput + * and overflow. + * To avoid the overflow and mis-selection, shift down both. + */ + sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + } + st1 = sz1 * pi2->relative_throughput; + st2 = sz2 * pi1->relative_throughput; + if (st1 != st2) + return st1 - st2; + + /* + * Case 4: Service time is equal. Choose higher throughput path. + */ + return pi2->relative_throughput - pi1->relative_throughput; +} + +static struct dm_path *st_select_path(struct path_selector *ps, + unsigned *repeat_count, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + + if (list_empty(&s->valid_paths)) + return NULL; + + /* Change preferred (first in list) path to evenly balance. */ + list_move_tail(s->valid_paths.next, &s->valid_paths); + + list_for_each_entry(pi, &s->valid_paths, list) + if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) + best = pi; + + if (!best) + return NULL; + + *repeat_count = best->repeat_count; + + return best->path; +} + +static int st_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_add(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static int st_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_sub(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static struct path_selector_type st_ps = { + .name = "service-time", + .module = THIS_MODULE, + .table_args = 2, + .info_args = 2, + .create = st_create, + .destroy = st_destroy, + .status = st_status, + .add_path = st_add_path, + .fail_path = st_fail_path, + .reinstate_path = st_reinstate_path, + .select_path = st_select_path, + .start_io = st_start_io, + .end_io = st_end_io, +}; + +static int __init dm_st_init(void) +{ + int r = dm_register_path_selector(&st_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " ST_VERSION " loaded"); + + return r; +} + +static void __exit dm_st_exit(void) +{ + int r = dm_unregister_path_selector(&st_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_st_init); +module_exit(dm_st_exit); + +MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); +MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c new file mode 100644 index 00000000000..d6e88178d22 --- /dev/null +++ b/drivers/md/dm-snap-persistent.c @@ -0,0 +1,958 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2006-2008 Red Hat GmbH + * + * This file is released under the GPL. 
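The load figure used above comes from st_start_io()/st_end_io(), which simply add and subtract the request size on a per-path in-flight byte counter. A rough userspace equivalent using C11 atomics in place of the kernel's atomic_t (illustrative only):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct path_info {
	atomic_size_t in_flight_size;	/* total bytes currently in flight */
};

static void st_start_io(struct path_info *pi, size_t nr_bytes)
{
	atomic_fetch_add(&pi->in_flight_size, nr_bytes);
}

static void st_end_io(struct path_info *pi, size_t nr_bytes)
{
	atomic_fetch_sub(&pi->in_flight_size, nr_bytes);
}

int main(void)
{
	struct path_info pi = { .in_flight_size = 0 };

	st_start_io(&pi, 4096);
	st_start_io(&pi, 8192);
	st_end_io(&pi, 4096);

	printf("in flight: %zu bytes\n", atomic_load(&pi.in_flight_size)); /* 8192 */
	return 0;
}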
+ */ + +#include "dm-exception-store.h" + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/vmalloc.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/dm-io.h> +#include "dm-bufio.h" + +#define DM_MSG_PREFIX "persistent snapshot" +#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ + +#define DM_PREFETCH_CHUNKS 12 + +/*----------------------------------------------------------------- + * Persistent snapshots, by persistent we mean that the snapshot + * will survive a reboot. + *---------------------------------------------------------------*/ + +/* + * We need to store a record of which parts of the origin have + * been copied to the snapshot device. The snapshot code + * requires that we copy exception chunks to chunk aligned areas + * of the COW store. It makes sense therefore, to store the + * metadata in chunk size blocks. + * + * There is no backward or forward compatibility implemented, + * snapshots with different disk versions than the kernel will + * not be usable. It is expected that "lvcreate" will blank out + * the start of a fresh COW device before calling the snapshot + * constructor. + * + * The first chunk of the COW device just contains the header. + * After this there is a chunk filled with exception metadata, + * followed by as many exception chunks as can fit in the + * metadata areas. + * + * All on disk structures are in little-endian format. The end + * of the exceptions info is indicated by an exception with a + * new_chunk of 0, which is invalid since it would point to the + * header chunk. + */ + +/* + * Magic for persistent snapshots: "SnAp" - Feeble isn't it. + */ +#define SNAP_MAGIC 0x70416e53 + +/* + * The on-disk version of the metadata. + */ +#define SNAPSHOT_DISK_VERSION 1 + +#define NUM_SNAPSHOT_HDR_CHUNKS 1 + +struct disk_header { + __le32 magic; + + /* + * Is this snapshot valid. There is no way of recovering + * an invalid snapshot. + */ + __le32 valid; + + /* + * Simple, incrementing version. no backward + * compatibility. + */ + __le32 version; + + /* In sectors */ + __le32 chunk_size; +} __packed; + +struct disk_exception { + __le64 old_chunk; + __le64 new_chunk; +} __packed; + +struct core_exception { + uint64_t old_chunk; + uint64_t new_chunk; +}; + +struct commit_callback { + void (*callback)(void *, int success); + void *context; +}; + +/* + * The top level structure for a persistent exception store. + */ +struct pstore { + struct dm_exception_store *store; + int version; + int valid; + uint32_t exceptions_per_area; + + /* + * Now that we have an asynchronous kcopyd there is no + * need for large chunk sizes, so it wont hurt to have a + * whole chunks worth of metadata in memory at once. + */ + void *area; + + /* + * An area of zeros used to clear the next area. + */ + void *zero_area; + + /* + * An area used for header. The header can be written + * concurrently with metadata (when invalidating the snapshot), + * so it needs a separate buffer. + */ + void *header_area; + + /* + * Used to keep track of which metadata area the data in + * 'chunk' refers to. + */ + chunk_t current_area; + + /* + * The next free chunk for an exception. + * + * When creating exceptions, all the chunks here and above are + * free. It holds the next chunk to be allocated. On rare + * occasions (e.g. after a system crash) holes can be left in + * the exception store because chunks can be committed out of + * order. + * + * When merging exceptions, it does not necessarily mean all the + * chunks here and above are free. 
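The dm-snap-persistent.c header comment above lays the COW device out as one header chunk followed by repeating groups of one metadata chunk plus a full area of data chunks; area_location() later in this file encodes exactly that. A small sketch of the chunk arithmetic (the 1024 exceptions-per-area figure assumes the default 16 KiB chunk and the 16-byte on-disk exception):

#include <stdint.h>
#include <stdio.h>

#define NUM_SNAPSHOT_HDR_CHUNKS 1

typedef uint64_t chunk_t;

/* Chunk index where metadata area 'area' starts: skip the header chunk,
 * then each earlier area contributes 1 metadata chunk + its data chunks. */
static chunk_t area_location(uint32_t exceptions_per_area, chunk_t area)
{
	return NUM_SNAPSHOT_HDR_CHUNKS + (chunk_t)(exceptions_per_area + 1) * area;
}

int main(void)
{
	uint32_t epa = 1024;	/* 16 KiB chunk / 16-byte on-disk exception */
	chunk_t area;

	for (area = 0; area < 3; area++)
		printf("metadata area %llu starts at chunk %llu\n",
		       (unsigned long long)area,
		       (unsigned long long)area_location(epa, area));
	/* prints chunks 1, 1026 and 2051 */
	return 0;
}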
It holds the value it would + * have held if all chunks had been committed in order of + * allocation. Consequently the value may occasionally be + * slightly too low, but since it's only used for 'status' and + * it can never reach its minimum value too early this doesn't + * matter. + */ + + chunk_t next_free; + + /* + * The index of next free exception in the current + * metadata area. + */ + uint32_t current_committed; + + atomic_t pending_count; + uint32_t callback_count; + struct commit_callback *callbacks; + struct dm_io_client *io_client; + + struct workqueue_struct *metadata_wq; +}; + +static int alloc_area(struct pstore *ps) +{ + int r = -ENOMEM; + size_t len; + + len = ps->store->chunk_size << SECTOR_SHIFT; + + /* + * Allocate the chunk_size block of memory that will hold + * a single metadata area. + */ + ps->area = vmalloc(len); + if (!ps->area) + goto err_area; + + ps->zero_area = vzalloc(len); + if (!ps->zero_area) + goto err_zero_area; + + ps->header_area = vmalloc(len); + if (!ps->header_area) + goto err_header_area; + + return 0; + +err_header_area: + vfree(ps->zero_area); + +err_zero_area: + vfree(ps->area); + +err_area: + return r; +} + +static void free_area(struct pstore *ps) +{ + if (ps->area) + vfree(ps->area); + ps->area = NULL; + + if (ps->zero_area) + vfree(ps->zero_area); + ps->zero_area = NULL; + + if (ps->header_area) + vfree(ps->header_area); + ps->header_area = NULL; +} + +struct mdata_req { + struct dm_io_region *where; + struct dm_io_request *io_req; + struct work_struct work; + int result; +}; + +static void do_metadata(struct work_struct *work) +{ + struct mdata_req *req = container_of(work, struct mdata_req, work); + + req->result = dm_io(req->io_req, 1, req->where, NULL); +} + +/* + * Read or write a chunk aligned and sized block of data from a device. + */ +static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw, + int metadata) +{ + struct dm_io_region where = { + .bdev = dm_snap_cow(ps->store->snap)->bdev, + .sector = ps->store->chunk_size * chunk, + .count = ps->store->chunk_size, + }; + struct dm_io_request io_req = { + .bi_rw = rw, + .mem.type = DM_IO_VMA, + .mem.ptr.vma = area, + .client = ps->io_client, + .notify.fn = NULL, + }; + struct mdata_req req; + + if (!metadata) + return dm_io(&io_req, 1, &where, NULL); + + req.where = &where; + req.io_req = &io_req; + + /* + * Issue the synchronous I/O from a different thread + * to avoid generic_make_request recursion. + */ + INIT_WORK_ONSTACK(&req.work, do_metadata); + queue_work(ps->metadata_wq, &req.work); + flush_workqueue(ps->metadata_wq); + destroy_work_on_stack(&req.work); + + return req.result; +} + +/* + * Convert a metadata area index to a chunk index. + */ +static chunk_t area_location(struct pstore *ps, chunk_t area) +{ + return NUM_SNAPSHOT_HDR_CHUNKS + ((ps->exceptions_per_area + 1) * area); +} + +static void skip_metadata(struct pstore *ps) +{ + uint32_t stride = ps->exceptions_per_area + 1; + chunk_t next_free = ps->next_free; + if (sector_div(next_free, stride) == NUM_SNAPSHOT_HDR_CHUNKS) + ps->next_free++; +} + +/* + * Read or write a metadata area. Remembering to skip the first + * chunk which holds the header. 
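skip_metadata() above keeps the next_free allocation pointer off the metadata chunks: within every stride of exceptions_per_area + 1 chunks, the chunk at offset NUM_SNAPSHOT_HDR_CHUNKS is metadata and must be stepped over. A userspace sketch of that adjustment (the kernel's sector_div() is replaced by a plain modulo here):

#include <stdint.h>
#include <stdio.h>

#define NUM_SNAPSHOT_HDR_CHUNKS 1

typedef uint64_t chunk_t;

/* If next_free landed on a metadata chunk (offset NUM_SNAPSHOT_HDR_CHUNKS
 * within each stride of exceptions_per_area + 1 chunks), step over it. */
static chunk_t skip_metadata(chunk_t next_free, uint32_t exceptions_per_area)
{
	uint32_t stride = exceptions_per_area + 1;

	if (next_free % stride == NUM_SNAPSHOT_HDR_CHUNKS)
		next_free++;
	return next_free;
}

int main(void)
{
	uint32_t epa = 1024;

	/* Chunk 1026 is the second metadata area, so allocation moves on to
	 * 1027; chunk 1027 is an ordinary data chunk and is left alone. */
	printf("%llu\n", (unsigned long long)skip_metadata(1026, epa)); /* 1027 */
	printf("%llu\n", (unsigned long long)skip_metadata(1027, epa)); /* 1027 */
	return 0;
}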
+ */ +static int area_io(struct pstore *ps, int rw) +{ + int r; + chunk_t chunk; + + chunk = area_location(ps, ps->current_area); + + r = chunk_io(ps, ps->area, chunk, rw, 0); + if (r) + return r; + + return 0; +} + +static void zero_memory_area(struct pstore *ps) +{ + memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); +} + +static int zero_disk_area(struct pstore *ps, chunk_t area) +{ + return chunk_io(ps, ps->zero_area, area_location(ps, area), WRITE, 0); +} + +static int read_header(struct pstore *ps, int *new_snapshot) +{ + int r; + struct disk_header *dh; + unsigned chunk_size; + int chunk_size_supplied = 1; + char *chunk_err; + + /* + * Use default chunk size (or logical_block_size, if larger) + * if none supplied + */ + if (!ps->store->chunk_size) { + ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, + bdev_logical_block_size(dm_snap_cow(ps->store->snap)-> + bdev) >> 9); + ps->store->chunk_mask = ps->store->chunk_size - 1; + ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; + chunk_size_supplied = 0; + } + + ps->io_client = dm_io_client_create(); + if (IS_ERR(ps->io_client)) + return PTR_ERR(ps->io_client); + + r = alloc_area(ps); + if (r) + return r; + + r = chunk_io(ps, ps->header_area, 0, READ, 1); + if (r) + goto bad; + + dh = ps->header_area; + + if (le32_to_cpu(dh->magic) == 0) { + *new_snapshot = 1; + return 0; + } + + if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { + DMWARN("Invalid or corrupt snapshot"); + r = -ENXIO; + goto bad; + } + + *new_snapshot = 0; + ps->valid = le32_to_cpu(dh->valid); + ps->version = le32_to_cpu(dh->version); + chunk_size = le32_to_cpu(dh->chunk_size); + + if (ps->store->chunk_size == chunk_size) + return 0; + + if (chunk_size_supplied) + DMWARN("chunk size %u in device metadata overrides " + "table chunk size of %u.", + chunk_size, ps->store->chunk_size); + + /* We had a bogus chunk_size. Fix stuff up. */ + free_area(ps); + + r = dm_exception_store_set_chunk_size(ps->store, chunk_size, + &chunk_err); + if (r) { + DMERR("invalid on-disk chunk size %u: %s.", + chunk_size, chunk_err); + return r; + } + + r = alloc_area(ps); + return r; + +bad: + free_area(ps); + return r; +} + +static int write_header(struct pstore *ps) +{ + struct disk_header *dh; + + memset(ps->header_area, 0, ps->store->chunk_size << SECTOR_SHIFT); + + dh = ps->header_area; + dh->magic = cpu_to_le32(SNAP_MAGIC); + dh->valid = cpu_to_le32(ps->valid); + dh->version = cpu_to_le32(ps->version); + dh->chunk_size = cpu_to_le32(ps->store->chunk_size); + + return chunk_io(ps, ps->header_area, 0, WRITE, 1); +} + +/* + * Access functions for the disk exceptions, these do the endian conversions. 
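The exception records are stored on disk as pairs of little-endian 64-bit chunk numbers, which read_exception()/write_exception() below convert with le64_to_cpu()/cpu_to_le64(). A portable userspace sketch of the same round trip using explicit byte shifts instead of the kernel helpers:

#include <stdint.h>
#include <stdio.h>

struct core_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

/* Encode/decode a 64-bit value as little-endian bytes, independent of
 * the host byte order (stand-ins for cpu_to_le64()/le64_to_cpu()). */
static void put_le64(uint8_t *p, uint64_t v)
{
	int i;

	for (i = 0; i < 8; i++)
		p[i] = (uint8_t)(v >> (8 * i));
}

static uint64_t get_le64(const uint8_t *p)
{
	uint64_t v = 0;
	int i;

	for (i = 0; i < 8; i++)
		v |= (uint64_t)p[i] << (8 * i);
	return v;
}

int main(void)
{
	uint8_t disk[16];	/* one on-disk exception: old_chunk, new_chunk */
	struct core_exception e = { .old_chunk = 7, .new_chunk = 1042 };
	struct core_exception back;

	put_le64(disk, e.old_chunk);
	put_le64(disk + 8, e.new_chunk);

	back.old_chunk = get_le64(disk);
	back.new_chunk = get_le64(disk + 8);
	printf("old %llu new %llu\n",
	       (unsigned long long)back.old_chunk,
	       (unsigned long long)back.new_chunk);
	return 0;
}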
+ */ +static struct disk_exception *get_exception(struct pstore *ps, void *ps_area, + uint32_t index) +{ + BUG_ON(index >= ps->exceptions_per_area); + + return ((struct disk_exception *) ps_area) + index; +} + +static void read_exception(struct pstore *ps, void *ps_area, + uint32_t index, struct core_exception *result) +{ + struct disk_exception *de = get_exception(ps, ps_area, index); + + /* copy it */ + result->old_chunk = le64_to_cpu(de->old_chunk); + result->new_chunk = le64_to_cpu(de->new_chunk); +} + +static void write_exception(struct pstore *ps, + uint32_t index, struct core_exception *e) +{ + struct disk_exception *de = get_exception(ps, ps->area, index); + + /* copy it */ + de->old_chunk = cpu_to_le64(e->old_chunk); + de->new_chunk = cpu_to_le64(e->new_chunk); +} + +static void clear_exception(struct pstore *ps, uint32_t index) +{ + struct disk_exception *de = get_exception(ps, ps->area, index); + + /* clear it */ + de->old_chunk = 0; + de->new_chunk = 0; +} + +/* + * Registers the exceptions that are present in the current area. + * 'full' is filled in to indicate if the area has been + * filled. + */ +static int insert_exceptions(struct pstore *ps, void *ps_area, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context, + int *full) +{ + int r; + unsigned int i; + struct core_exception e; + + /* presume the area is full */ + *full = 1; + + for (i = 0; i < ps->exceptions_per_area; i++) { + read_exception(ps, ps_area, i, &e); + + /* + * If the new_chunk is pointing at the start of + * the COW device, where the first metadata area + * is we know that we've hit the end of the + * exceptions. Therefore the area is not full. + */ + if (e.new_chunk == 0LL) { + ps->current_committed = i; + *full = 0; + break; + } + + /* + * Keep track of the start of the free chunks. + */ + if (ps->next_free <= e.new_chunk) + ps->next_free = e.new_chunk + 1; + + /* + * Otherwise we add the exception to the snapshot. + */ + r = callback(callback_context, e.old_chunk, e.new_chunk); + if (r) + return r; + } + + return 0; +} + +static int read_exceptions(struct pstore *ps, + int (*callback)(void *callback_context, chunk_t old, + chunk_t new), + void *callback_context) +{ + int r, full = 1; + struct dm_bufio_client *client; + chunk_t prefetch_area = 0; + + client = dm_bufio_client_create(dm_snap_cow(ps->store->snap)->bdev, + ps->store->chunk_size << SECTOR_SHIFT, + 1, 0, NULL, NULL); + + if (IS_ERR(client)) + return PTR_ERR(client); + + /* + * Setup for one current buffer + desired readahead buffers. + */ + dm_bufio_set_minimum_buffers(client, 1 + DM_PREFETCH_CHUNKS); + + /* + * Keeping reading chunks and inserting exceptions until + * we find a partially full area. 
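insert_exceptions() above stops at the first record whose new_chunk is 0, since chunk 0 holds the header and can never be a copied chunk, and it advances next_free past the highest chunk it sees. A sketch of that scan over one decoded metadata area (scan_area is a made-up name):

#include <stdint.h>
#include <stdio.h>

struct core_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

/* Scan one metadata area; returns the number of valid exceptions and
 * updates *next_free to one past the highest new_chunk seen. */
static uint32_t scan_area(const struct core_exception *area,
			  uint32_t exceptions_per_area, uint64_t *next_free)
{
	uint32_t i;

	for (i = 0; i < exceptions_per_area; i++) {
		if (area[i].new_chunk == 0)	/* would point at the header */
			break;
		if (*next_free <= area[i].new_chunk)
			*next_free = area[i].new_chunk + 1;
	}
	return i;
}

int main(void)
{
	struct core_exception area[4] = {
		{ .old_chunk = 10, .new_chunk = 2 },
		{ .old_chunk = 99, .new_chunk = 3 },
		{ 0, 0 },			/* end marker */
		{ 0, 0 },
	};
	uint64_t next_free = 0;
	uint32_t n = scan_area(area, 4, &next_free);

	printf("%u exceptions, next_free = %llu\n",
	       (unsigned)n, (unsigned long long)next_free); /* 2 exceptions, 4 */
	return 0;
}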
+ */ + for (ps->current_area = 0; full; ps->current_area++) { + struct dm_buffer *bp; + void *area; + chunk_t chunk; + + if (unlikely(prefetch_area < ps->current_area)) + prefetch_area = ps->current_area; + + if (DM_PREFETCH_CHUNKS) do { + chunk_t pf_chunk = area_location(ps, prefetch_area); + if (unlikely(pf_chunk >= dm_bufio_get_device_size(client))) + break; + dm_bufio_prefetch(client, pf_chunk, 1); + prefetch_area++; + if (unlikely(!prefetch_area)) + break; + } while (prefetch_area <= ps->current_area + DM_PREFETCH_CHUNKS); + + chunk = area_location(ps, ps->current_area); + + area = dm_bufio_read(client, chunk, &bp); + if (unlikely(IS_ERR(area))) { + r = PTR_ERR(area); + goto ret_destroy_bufio; + } + + r = insert_exceptions(ps, area, callback, callback_context, + &full); + + if (!full) + memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT); + + dm_bufio_release(bp); + + dm_bufio_forget(client, chunk); + + if (unlikely(r)) + goto ret_destroy_bufio; + } + + ps->current_area--; + + skip_metadata(ps); + + r = 0; + +ret_destroy_bufio: + dm_bufio_client_destroy(client); + + return r; +} + +static struct pstore *get_info(struct dm_exception_store *store) +{ + return (struct pstore *) store->context; +} + +static void persistent_usage(struct dm_exception_store *store, + sector_t *total_sectors, + sector_t *sectors_allocated, + sector_t *metadata_sectors) +{ + struct pstore *ps = get_info(store); + + *sectors_allocated = ps->next_free * store->chunk_size; + *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); + + /* + * First chunk is the fixed header. + * Then there are (ps->current_area + 1) metadata chunks, each one + * separated from the next by ps->exceptions_per_area data chunks. + */ + *metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * + store->chunk_size; +} + +static void persistent_dtr(struct dm_exception_store *store) +{ + struct pstore *ps = get_info(store); + + destroy_workqueue(ps->metadata_wq); + + /* Created in read_header */ + if (ps->io_client) + dm_io_client_destroy(ps->io_client); + free_area(ps); + + /* Allocated in persistent_read_metadata */ + if (ps->callbacks) + vfree(ps->callbacks); + + kfree(ps); +} + +static int persistent_read_metadata(struct dm_exception_store *store, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context) +{ + int r, uninitialized_var(new_snapshot); + struct pstore *ps = get_info(store); + + /* + * Read the snapshot header. + */ + r = read_header(ps, &new_snapshot); + if (r) + return r; + + /* + * Now we know correct chunk_size, complete the initialisation. + */ + ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / + sizeof(struct disk_exception); + ps->callbacks = dm_vcalloc(ps->exceptions_per_area, + sizeof(*ps->callbacks)); + if (!ps->callbacks) + return -ENOMEM; + + /* + * Do we need to setup a new snapshot ? + */ + if (new_snapshot) { + r = write_header(ps); + if (r) { + DMWARN("write_header failed"); + return r; + } + + ps->current_area = 0; + zero_memory_area(ps); + r = zero_disk_area(ps, 0); + if (r) + DMWARN("zero_disk_area(0) failed"); + return r; + } + /* + * Sanity checks. + */ + if (ps->version != SNAPSHOT_DISK_VERSION) { + DMWARN("unable to handle snapshot disk version %d", + ps->version); + return -EINVAL; + } + + /* + * Metadata are valid, but snapshot is invalidated + */ + if (!ps->valid) + return 1; + + /* + * Read the metadata. 
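persistent_read_metadata() derives exceptions_per_area from the chunk size, and persistent_usage() above reports the metadata overhead as one chunk per used area plus the header. A worked-arithmetic sketch with the default 32-sector chunk (the 16-byte exception size matches the two __le64 fields of struct disk_exception):

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9
#define NUM_SNAPSHOT_HDR_CHUNKS 1
#define DISK_EXCEPTION_SIZE 16		/* two little-endian 64-bit chunk numbers */

int main(void)
{
	uint32_t chunk_size = 32;	/* sectors, i.e. 16 KiB */
	uint32_t exceptions_per_area =
		(chunk_size << SECTOR_SHIFT) / DISK_EXCEPTION_SIZE;
	uint64_t current_area = 2;	/* metadata areas 0, 1 and 2 in use */
	uint64_t metadata_sectors =
		(current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) * chunk_size;

	printf("exceptions per area: %u\n", exceptions_per_area);	/* 1024 */
	printf("metadata sectors:    %llu\n",
	       (unsigned long long)metadata_sectors);			/* 128 */
	return 0;
}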
+ */ + r = read_exceptions(ps, callback, callback_context); + + return r; +} + +static int persistent_prepare_exception(struct dm_exception_store *store, + struct dm_exception *e) +{ + struct pstore *ps = get_info(store); + sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); + + /* Is there enough room ? */ + if (size < ((ps->next_free + 1) * store->chunk_size)) + return -ENOSPC; + + e->new_chunk = ps->next_free; + + /* + * Move onto the next free pending, making sure to take + * into account the location of the metadata chunks. + */ + ps->next_free++; + skip_metadata(ps); + + atomic_inc(&ps->pending_count); + return 0; +} + +static void persistent_commit_exception(struct dm_exception_store *store, + struct dm_exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + unsigned int i; + struct pstore *ps = get_info(store); + struct core_exception ce; + struct commit_callback *cb; + + ce.old_chunk = e->old_chunk; + ce.new_chunk = e->new_chunk; + write_exception(ps, ps->current_committed++, &ce); + + /* + * Add the callback to the back of the array. This code + * is the only place where the callback array is + * manipulated, and we know that it will never be called + * multiple times concurrently. + */ + cb = ps->callbacks + ps->callback_count++; + cb->callback = callback; + cb->context = callback_context; + + /* + * If there are exceptions in flight and we have not yet + * filled this metadata area there's nothing more to do. + */ + if (!atomic_dec_and_test(&ps->pending_count) && + (ps->current_committed != ps->exceptions_per_area)) + return; + + /* + * If we completely filled the current area, then wipe the next one. + */ + if ((ps->current_committed == ps->exceptions_per_area) && + zero_disk_area(ps, ps->current_area + 1)) + ps->valid = 0; + + /* + * Commit exceptions to disk. + */ + if (ps->valid && area_io(ps, WRITE_FLUSH_FUA)) + ps->valid = 0; + + /* + * Advance to the next area if this one is full. + */ + if (ps->current_committed == ps->exceptions_per_area) { + ps->current_committed = 0; + ps->current_area++; + zero_memory_area(ps); + } + + for (i = 0; i < ps->callback_count; i++) { + cb = ps->callbacks + i; + cb->callback(cb->context, ps->valid); + } + + ps->callback_count = 0; +} + +static int persistent_prepare_merge(struct dm_exception_store *store, + chunk_t *last_old_chunk, + chunk_t *last_new_chunk) +{ + struct pstore *ps = get_info(store); + struct core_exception ce; + int nr_consecutive; + int r; + + /* + * When current area is empty, move back to preceding area. + */ + if (!ps->current_committed) { + /* + * Have we finished? + */ + if (!ps->current_area) + return 0; + + ps->current_area--; + r = area_io(ps, READ); + if (r < 0) + return r; + ps->current_committed = ps->exceptions_per_area; + } + + read_exception(ps, ps->area, ps->current_committed - 1, &ce); + *last_old_chunk = ce.old_chunk; + *last_new_chunk = ce.new_chunk; + + /* + * Find number of consecutive chunks within the current area, + * working backwards. 
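persistent_prepare_merge() walks backwards from the most recently committed exception, counting how long the run of exceptions with consecutively decreasing old_chunk and new_chunk values is, so commit_merge() can retire that many chunks in one go. A userspace sketch of the backwards scan (consecutive_exceptions is a made-up name):

#include <stdint.h>
#include <stdio.h>

struct core_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

/* Count how many committed exceptions, ending at the last one, form a run
 * in which old_chunk and new_chunk both decrease by exactly one per step. */
static int consecutive_exceptions(const struct core_exception *area,
				  uint32_t current_committed)
{
	const struct core_exception *last = &area[current_committed - 1];
	int nr = 1;

	while (nr < (int)current_committed) {
		const struct core_exception *e =
			&area[current_committed - 1 - nr];

		if (e->old_chunk != last->old_chunk - nr ||
		    e->new_chunk != last->new_chunk - nr)
			break;
		nr++;
	}
	return nr;
}

int main(void)
{
	struct core_exception area[] = {
		{ .old_chunk = 5,  .new_chunk = 2 },	/* not part of the run */
		{ .old_chunk = 20, .new_chunk = 3 },
		{ .old_chunk = 21, .new_chunk = 4 },
		{ .old_chunk = 22, .new_chunk = 5 },
	};

	/* The last three exceptions are consecutive, the first is not. */
	printf("%d consecutive exceptions\n", consecutive_exceptions(area, 4));
	return 0;
}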
+ */ + for (nr_consecutive = 1; nr_consecutive < ps->current_committed; + nr_consecutive++) { + read_exception(ps, ps->area, + ps->current_committed - 1 - nr_consecutive, &ce); + if (ce.old_chunk != *last_old_chunk - nr_consecutive || + ce.new_chunk != *last_new_chunk - nr_consecutive) + break; + } + + return nr_consecutive; +} + +static int persistent_commit_merge(struct dm_exception_store *store, + int nr_merged) +{ + int r, i; + struct pstore *ps = get_info(store); + + BUG_ON(nr_merged > ps->current_committed); + + for (i = 0; i < nr_merged; i++) + clear_exception(ps, ps->current_committed - 1 - i); + + r = area_io(ps, WRITE_FLUSH_FUA); + if (r < 0) + return r; + + ps->current_committed -= nr_merged; + + /* + * At this stage, only persistent_usage() uses ps->next_free, so + * we make no attempt to keep ps->next_free strictly accurate + * as exceptions may have been committed out-of-order originally. + * Once a snapshot has become merging, we set it to the value it + * would have held had all the exceptions been committed in order. + * + * ps->current_area does not get reduced by prepare_merge() until + * after commit_merge() has removed the nr_merged previous exceptions. + */ + ps->next_free = area_location(ps, ps->current_area) + + ps->current_committed + 1; + + return 0; +} + +static void persistent_drop_snapshot(struct dm_exception_store *store) +{ + struct pstore *ps = get_info(store); + + ps->valid = 0; + if (write_header(ps)) + DMWARN("write header failed"); +} + +static int persistent_ctr(struct dm_exception_store *store, + unsigned argc, char **argv) +{ + struct pstore *ps; + + /* allocate the pstore */ + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + ps->store = store; + ps->valid = 1; + ps->version = SNAPSHOT_DISK_VERSION; + ps->area = NULL; + ps->zero_area = NULL; + ps->header_area = NULL; + ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */ + ps->current_committed = 0; + + ps->callback_count = 0; + atomic_set(&ps->pending_count, 0); + ps->callbacks = NULL; + + ps->metadata_wq = alloc_workqueue("ksnaphd", WQ_MEM_RECLAIM, 0); + if (!ps->metadata_wq) { + kfree(ps); + DMERR("couldn't start header metadata update thread"); + return -ENOMEM; + } + + store->context = ps; + + return 0; +} + +static unsigned persistent_status(struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen) +{ + unsigned sz = 0; + + switch (status) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + DMEMIT(" P %llu", (unsigned long long)store->chunk_size); + } + + return sz; +} + +static struct dm_exception_store_type _persistent_type = { + .name = "persistent", + .module = THIS_MODULE, + .ctr = persistent_ctr, + .dtr = persistent_dtr, + .read_metadata = persistent_read_metadata, + .prepare_exception = persistent_prepare_exception, + .commit_exception = persistent_commit_exception, + .prepare_merge = persistent_prepare_merge, + .commit_merge = persistent_commit_merge, + .drop_snapshot = persistent_drop_snapshot, + .usage = persistent_usage, + .status = persistent_status, +}; + +static struct dm_exception_store_type _persistent_compat_type = { + .name = "P", + .module = THIS_MODULE, + .ctr = persistent_ctr, + .dtr = persistent_dtr, + .read_metadata = persistent_read_metadata, + .prepare_exception = persistent_prepare_exception, + .commit_exception = persistent_commit_exception, + .prepare_merge = persistent_prepare_merge, + .commit_merge = persistent_commit_merge, + .drop_snapshot = persistent_drop_snapshot, + .usage = 
persistent_usage, + .status = persistent_status, +}; + +int dm_persistent_snapshot_init(void) +{ + int r; + + r = dm_exception_store_type_register(&_persistent_type); + if (r) { + DMERR("Unable to register persistent exception store type"); + return r; + } + + r = dm_exception_store_type_register(&_persistent_compat_type); + if (r) { + DMERR("Unable to register old-style persistent exception " + "store type"); + dm_exception_store_type_unregister(&_persistent_type); + return r; + } + + return r; +} + +void dm_persistent_snapshot_exit(void) +{ + dm_exception_store_type_unregister(&_persistent_type); + dm_exception_store_type_unregister(&_persistent_compat_type); +} diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c new file mode 100644 index 00000000000..1ce9a2586e4 --- /dev/null +++ b/drivers/md/dm-snap-transient.c @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * Copyright (C) 2006-2008 Red Hat GmbH + * + * This file is released under the GPL. + */ + +#include "dm-exception-store.h" + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/vmalloc.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/dm-io.h> + +#define DM_MSG_PREFIX "transient snapshot" + +/*----------------------------------------------------------------- + * Implementation of the store for non-persistent snapshots. + *---------------------------------------------------------------*/ +struct transient_c { + sector_t next_free; +}; + +static void transient_dtr(struct dm_exception_store *store) +{ + kfree(store->context); +} + +static int transient_read_metadata(struct dm_exception_store *store, + int (*callback)(void *callback_context, + chunk_t old, chunk_t new), + void *callback_context) +{ + return 0; +} + +static int transient_prepare_exception(struct dm_exception_store *store, + struct dm_exception *e) +{ + struct transient_c *tc = store->context; + sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev); + + if (size < (tc->next_free + store->chunk_size)) + return -1; + + e->new_chunk = sector_to_chunk(store, tc->next_free); + tc->next_free += store->chunk_size; + + return 0; +} + +static void transient_commit_exception(struct dm_exception_store *store, + struct dm_exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + /* Just succeed */ + callback(callback_context, 1); +} + +static void transient_usage(struct dm_exception_store *store, + sector_t *total_sectors, + sector_t *sectors_allocated, + sector_t *metadata_sectors) +{ + *sectors_allocated = ((struct transient_c *) store->context)->next_free; + *total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev); + *metadata_sectors = 0; +} + +static int transient_ctr(struct dm_exception_store *store, + unsigned argc, char **argv) +{ + struct transient_c *tc; + + tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); + if (!tc) + return -ENOMEM; + + tc->next_free = 0; + store->context = tc; + + return 0; +} + +static unsigned transient_status(struct dm_exception_store *store, + status_type_t status, char *result, + unsigned maxlen) +{ + unsigned sz = 0; + + switch (status) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + DMEMIT(" N %llu", (unsigned long long)store->chunk_size); + } + + return sz; +} + +static struct dm_exception_store_type _transient_type = { + .name = "transient", + .module = THIS_MODULE, + .ctr = transient_ctr, + .dtr = transient_dtr, + .read_metadata = transient_read_metadata, + .prepare_exception = 
transient_prepare_exception, + .commit_exception = transient_commit_exception, + .usage = transient_usage, + .status = transient_status, +}; + +static struct dm_exception_store_type _transient_compat_type = { + .name = "N", + .module = THIS_MODULE, + .ctr = transient_ctr, + .dtr = transient_dtr, + .read_metadata = transient_read_metadata, + .prepare_exception = transient_prepare_exception, + .commit_exception = transient_commit_exception, + .usage = transient_usage, + .status = transient_status, +}; + +int dm_transient_snapshot_init(void) +{ + int r; + + r = dm_exception_store_type_register(&_transient_type); + if (r) { + DMWARN("Unable to register transient exception store type"); + return r; + } + + r = dm_exception_store_type_register(&_transient_compat_type); + if (r) { + DMWARN("Unable to register old-style transient " + "exception store type"); + dm_exception_store_type_unregister(&_transient_type); + return r; + } + + return r; +} + +void dm_transient_snapshot_exit(void) +{ + dm_exception_store_type_unregister(&_transient_type); + dm_exception_store_type_unregister(&_transient_compat_type); +} diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 6e5528aecc9..5bd2290cfb1 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -7,8 +7,8 @@ */ #include <linux/blkdev.h> -#include <linux/ctype.h> #include <linux/device-mapper.h> +#include <linux/delay.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kdev_t.h> @@ -20,65 +20,161 @@ #include <linux/log2.h> #include <linux/dm-kcopyd.h> -#include "dm-snap.h" -#include "dm-bio-list.h" +#include "dm-exception-store.h" #define DM_MSG_PREFIX "snapshots" -/* - * The percentage increment we will wake up users at - */ -#define WAKE_UP_PERCENT 5 +static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; -/* - * kcopyd priority of snapshot operations - */ -#define SNAPSHOT_COPY_PRIORITY 2 - -/* - * Reserve 1MB for each snapshot initially (with minimum of 1 page). - */ -#define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1) +#define dm_target_is_snapshot_merge(ti) \ + ((ti)->type->name == dm_snapshot_merge_target_name) /* * The size of the mempool used to track chunks in use. */ #define MIN_IOS 256 -static struct workqueue_struct *ksnapd; -static void flush_queued_bios(struct work_struct *work); +#define DM_TRACKED_CHUNK_HASH_SIZE 16 +#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ + (DM_TRACKED_CHUNK_HASH_SIZE - 1)) -struct dm_snap_pending_exception { - struct dm_snap_exception e; +struct dm_exception_table { + uint32_t hash_mask; + unsigned hash_shift; + struct list_head *table; +}; + +struct dm_snapshot { + struct rw_semaphore lock; + + struct dm_dev *origin; + struct dm_dev *cow; + + struct dm_target *ti; + + /* List of snapshots per Origin */ + struct list_head list; /* - * Origin buffers waiting for this to complete are held - * in a bio list + * You can't use a snapshot if this is 0 (e.g. if full). + * A snapshot-merge target never clears this. */ - struct bio_list origin_bios; - struct bio_list snapshot_bios; + int valid; + + /* Origin writes don't trigger exceptions until this is set */ + int active; + + atomic_t pending_exceptions_count; + + /* Protected by "lock" */ + sector_t exception_start_sequence; + + /* Protected by kcopyd single-threaded callback */ + sector_t exception_complete_sequence; /* - * Short-term queue of pending exceptions prior to submission. + * A list of pending exceptions that completed out of order. + * Protected by kcopyd single-threaded callback. 
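+	 * Entries are kept sorted by exception_sequence and drained by
+	 * copy_callback() once the missing earlier sequence numbers
+	 * complete (e.g. if copies 5 and 6 finish before 4, they wait
+	 * here until 4 is done and are then completed in order).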
*/ - struct list_head list; + struct list_head out_of_order_list; + + mempool_t *pending_pool; + + struct dm_exception_table pending; + struct dm_exception_table complete; + + /* + * pe_lock protects all pending_exception operations and access + * as well as the snapshot_bios list. + */ + spinlock_t pe_lock; + + /* Chunks with outstanding reads */ + spinlock_t tracked_chunk_lock; + struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; + + /* The on disk metadata handler */ + struct dm_exception_store *store; + + struct dm_kcopyd_client *kcopyd_client; + + /* Wait for events based on state_bits */ + unsigned long state_bits; + + /* Range of chunks currently being merged. */ + chunk_t first_merging_chunk; + int num_merging_chunks; + + /* + * The merge operation failed if this flag is set. + * Failure modes are handled as follows: + * - I/O error reading the header + * => don't load the target; abort. + * - Header does not have "valid" flag set + * => use the origin; forget about the snapshot. + * - I/O error when reading exceptions + * => don't load the target; abort. + * (We can't use the intermediate origin state.) + * - I/O error while merging + * => stop merging; set merge_failed; process I/O normally. + */ + int merge_failed; + + /* + * Incoming bios that overlap with chunks being merged must wait + * for them to be committed. + */ + struct bio_list bios_queued_during_merge; +}; + +/* + * state_bits: + * RUNNING_MERGE - Merge operation is in progress. + * SHUTDOWN_MERGE - Set to signal that merge needs to be stopped; + * cleared afterwards. + */ +#define RUNNING_MERGE 0 +#define SHUTDOWN_MERGE 1 + +DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, + "A percentage of time allocated for copy on write"); + +struct dm_dev *dm_snap_origin(struct dm_snapshot *s) +{ + return s->origin; +} +EXPORT_SYMBOL(dm_snap_origin); + +struct dm_dev *dm_snap_cow(struct dm_snapshot *s) +{ + return s->cow; +} +EXPORT_SYMBOL(dm_snap_cow); + +static sector_t chunk_to_sector(struct dm_exception_store *store, + chunk_t chunk) +{ + return chunk << store->chunk_shift; +} +static int bdev_equal(struct block_device *lhs, struct block_device *rhs) +{ /* - * The primary pending_exception is the one that holds - * the ref_count and the list of origin_bios for a - * group of pending_exceptions. It is always last to get freed. - * These fields get set up when writing to the origin. + * There is only ever one instance of a particular block + * device so we can compare pointers safely. */ - struct dm_snap_pending_exception *primary_pe; + return lhs == rhs; +} + +struct dm_snap_pending_exception { + struct dm_exception e; /* - * Number of pending_exceptions processing this chunk. - * When this drops to zero we must complete the origin bios. - * If incrementing or decrementing this, hold pe->snap->lock for - * the sibling concerned and not pe->primary_pe->snap->lock unless - * they are the same. + * Origin buffers waiting for this to complete are held + * in a bio list */ - atomic_t ref_count; + struct bio_list origin_bios; + struct bio_list snapshot_bios; /* Pointer back to snapshot context */ struct dm_snapshot *snap; @@ -88,6 +184,21 @@ struct dm_snap_pending_exception { * kcopyd. */ int started; + + /* There was copying error. */ + int copy_error; + + /* A sequence number, it is used for in-order completion. */ + sector_t exception_sequence; + + struct list_head out_of_order_entry; + + /* + * For writing a complete chunk, bypassing the copy. 
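+	 * start_full_bio() saves the bio's original bi_end_io/bi_private
+	 * here and pending_complete() restores them once the exception
+	 * has been committed.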
+ */ + struct bio *full_bio; + bio_end_io_t *full_bio_end_io; + void *full_bio_private; }; /* @@ -102,46 +213,48 @@ struct dm_snap_tracked_chunk { chunk_t chunk; }; -static struct kmem_cache *tracked_chunk_cache; +static void init_tracked_chunk(struct bio *bio) +{ + struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); + INIT_HLIST_NODE(&c->node); +} -static struct dm_snap_tracked_chunk *track_chunk(struct dm_snapshot *s, - chunk_t chunk) +static bool is_bio_tracked(struct bio *bio) { - struct dm_snap_tracked_chunk *c = mempool_alloc(s->tracked_chunk_pool, - GFP_NOIO); - unsigned long flags; + struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); + return !hlist_unhashed(&c->node); +} + +static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk) +{ + struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); c->chunk = chunk; - spin_lock_irqsave(&s->tracked_chunk_lock, flags); + spin_lock_irq(&s->tracked_chunk_lock); hlist_add_head(&c->node, &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]); - spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); - - return c; + spin_unlock_irq(&s->tracked_chunk_lock); } -static void stop_tracking_chunk(struct dm_snapshot *s, - struct dm_snap_tracked_chunk *c) +static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio) { + struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk)); unsigned long flags; spin_lock_irqsave(&s->tracked_chunk_lock, flags); hlist_del(&c->node); spin_unlock_irqrestore(&s->tracked_chunk_lock, flags); - - mempool_free(c, s->tracked_chunk_pool); } static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) { struct dm_snap_tracked_chunk *c; - struct hlist_node *hn; int found = 0; spin_lock_irq(&s->tracked_chunk_lock); - hlist_for_each_entry(c, hn, + hlist_for_each_entry(c, &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) { if (c->chunk == chunk) { found = 1; @@ -155,6 +268,16 @@ static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk) } /* + * This conflicting I/O is extremely improbable in the caller, + * so msleep(1) is sufficient and there is no need for a wait queue. + */ +static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk) +{ + while (__chunk_is_tracked(s, chunk)) + msleep(1); +} + +/* * One of these per registered origin, held in the snapshot_origins hash */ struct origin { @@ -176,6 +299,10 @@ struct origin { static struct list_head *_origins; static struct rw_semaphore _origins_lock; +static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done); +static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock); +static uint64_t _pending_exceptions_done_count; + static int init_origin_hash(void) { int i; @@ -224,24 +351,149 @@ static void __insert_origin(struct origin *o) } /* + * _origins_lock must be held when calling this function. + * Returns number of snapshots registered using the supplied cow device, plus: + * snap_src - a snapshot suitable for use as a source of exception handover + * snap_dest - a snapshot capable of receiving exception handover. + * snap_merge - an existing snapshot-merge target linked to the same origin. + * There can be at most one snapshot-merge target. The parameter is optional. + * + * Possible return values and states of snap_src and snap_dest. 
+ * 0: NULL, NULL - first new snapshot + * 1: snap_src, NULL - normal snapshot + * 2: snap_src, snap_dest - waiting for handover + * 2: snap_src, NULL - handed over, waiting for old to be deleted + * 1: NULL, snap_dest - source got destroyed without handover + */ +static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, + struct dm_snapshot **snap_src, + struct dm_snapshot **snap_dest, + struct dm_snapshot **snap_merge) +{ + struct dm_snapshot *s; + struct origin *o; + int count = 0; + int active; + + o = __lookup_origin(snap->origin->bdev); + if (!o) + goto out; + + list_for_each_entry(s, &o->snapshots, list) { + if (dm_target_is_snapshot_merge(s->ti) && snap_merge) + *snap_merge = s; + if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) + continue; + + down_read(&s->lock); + active = s->active; + up_read(&s->lock); + + if (active) { + if (snap_src) + *snap_src = s; + } else if (snap_dest) + *snap_dest = s; + + count++; + } + +out: + return count; +} + +/* + * On success, returns 1 if this snapshot is a handover destination, + * otherwise returns 0. + */ +static int __validate_exception_handover(struct dm_snapshot *snap) +{ + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + struct dm_snapshot *snap_merge = NULL; + + /* Does snapshot need exceptions handed over to it? */ + if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, + &snap_merge) == 2) || + snap_dest) { + snap->ti->error = "Snapshot cow pairing for exception " + "table handover failed"; + return -EINVAL; + } + + /* + * If no snap_src was found, snap cannot become a handover + * destination. + */ + if (!snap_src) + return 0; + + /* + * Non-snapshot-merge handover? + */ + if (!dm_target_is_snapshot_merge(snap->ti)) + return 1; + + /* + * Do not allow more than one merging snapshot. + */ + if (snap_merge) { + snap->ti->error = "A snapshot is already merging."; + return -EINVAL; + } + + if (!snap_src->store->type->prepare_merge || + !snap_src->store->type->commit_merge) { + snap->ti->error = "Snapshot exception store does not " + "support snapshot-merge."; + return -EINVAL; + } + + return 1; +} + +static void __insert_snapshot(struct origin *o, struct dm_snapshot *s) +{ + struct dm_snapshot *l; + + /* Sort the list according to chunk size, largest-first smallest-last */ + list_for_each_entry(l, &o->snapshots, list) + if (l->store->chunk_size < s->store->chunk_size) + break; + list_add_tail(&s->list, &l->list); +} + +/* * Make a note of the snapshot and its origin so we can look it * up when the origin has a write on it. + * + * Also validate snapshot exception store handovers. + * On success, returns 1 if this registration is a handover destination, + * otherwise returns 0. 
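+ * On failure a negative errno is returned (-ENOMEM if the origin
+ * structure cannot be allocated, or the result of a failed
+ * __validate_exception_handover()).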
*/ static int register_snapshot(struct dm_snapshot *snap) { - struct origin *o; + struct origin *o, *new_o = NULL; struct block_device *bdev = snap->origin->bdev; + int r = 0; + + new_o = kmalloc(sizeof(*new_o), GFP_KERNEL); + if (!new_o) + return -ENOMEM; down_write(&_origins_lock); - o = __lookup_origin(bdev); - if (!o) { + r = __validate_exception_handover(snap); + if (r < 0) { + kfree(new_o); + goto out; + } + + o = __lookup_origin(bdev); + if (o) + kfree(new_o); + else { /* New origin */ - o = kmalloc(sizeof(*o), GFP_KERNEL); - if (!o) { - up_write(&_origins_lock); - return -ENOMEM; - } + o = new_o; /* Initialise the struct */ INIT_LIST_HEAD(&o->snapshots); @@ -250,10 +502,27 @@ static int register_snapshot(struct dm_snapshot *snap) __insert_origin(o); } - list_add_tail(&snap->list, &o->snapshots); + __insert_snapshot(o, snap); + +out: + up_write(&_origins_lock); + + return r; +} + +/* + * Move snapshot to correct place in list according to chunk size. + */ +static void reregister_snapshot(struct dm_snapshot *s) +{ + struct block_device *bdev = s->origin->bdev; + + down_write(&_origins_lock); + + list_del(&s->list); + __insert_snapshot(__lookup_origin(bdev), s); up_write(&_origins_lock); - return 0; } static void unregister_snapshot(struct dm_snapshot *s) @@ -264,7 +533,7 @@ static void unregister_snapshot(struct dm_snapshot *s) o = __lookup_origin(s->origin->bdev); list_del(&s->list); - if (list_empty(&o->snapshots)) { + if (o && list_empty(&o->snapshots)) { list_del(&o->hash_list); kfree(o); } @@ -277,8 +546,8 @@ static void unregister_snapshot(struct dm_snapshot *s) * The lowest hash_shift bits of the chunk number are ignored, allowing * some consecutive chunks to be grouped together. */ -static int init_exception_table(struct exception_table *et, uint32_t size, - unsigned hash_shift) +static int dm_exception_table_init(struct dm_exception_table *et, + uint32_t size, unsigned hash_shift) { unsigned int i; @@ -294,10 +563,11 @@ static int init_exception_table(struct exception_table *et, uint32_t size, return 0; } -static void exit_exception_table(struct exception_table *et, struct kmem_cache *mem) +static void dm_exception_table_exit(struct dm_exception_table *et, + struct kmem_cache *mem) { struct list_head *slot; - struct dm_snap_exception *ex, *next; + struct dm_exception *ex, *next; int i, size; size = et->hash_mask + 1; @@ -311,19 +581,12 @@ static void exit_exception_table(struct exception_table *et, struct kmem_cache * vfree(et->table); } -static uint32_t exception_hash(struct exception_table *et, chunk_t chunk) +static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) { return (chunk >> et->hash_shift) & et->hash_mask; } -static void insert_exception(struct exception_table *eh, - struct dm_snap_exception *e) -{ - struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; - list_add(&e->hash_list, l); -} - -static void remove_exception(struct dm_snap_exception *e) +static void dm_remove_exception(struct dm_exception *e) { list_del(&e->hash_list); } @@ -332,11 +595,11 @@ static void remove_exception(struct dm_snap_exception *e) * Return the exception data for a sector, or NULL if not * remapped. 
*/ -static struct dm_snap_exception *lookup_exception(struct exception_table *et, - chunk_t chunk) +static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, + chunk_t chunk) { struct list_head *slot; - struct dm_snap_exception *e; + struct dm_exception *e; slot = &et->table[exception_hash(et, chunk)]; list_for_each_entry (e, slot, hash_list) @@ -347,18 +610,18 @@ static struct dm_snap_exception *lookup_exception(struct exception_table *et, return NULL; } -static struct dm_snap_exception *alloc_exception(void) +static struct dm_exception *alloc_completed_exception(gfp_t gfp) { - struct dm_snap_exception *e; + struct dm_exception *e; - e = kmem_cache_alloc(exception_cache, GFP_NOIO); - if (!e) + e = kmem_cache_alloc(exception_cache, gfp); + if (!e && gfp == GFP_NOIO) e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); return e; } -static void free_exception(struct dm_snap_exception *e) +static void free_completed_exception(struct dm_exception *e) { kmem_cache_free(exception_cache, e); } @@ -368,6 +631,7 @@ static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snaps struct dm_snap_pending_exception *pe = mempool_alloc(s->pending_pool, GFP_NOIO); + atomic_inc(&s->pending_exceptions_count); pe->snap = s; return pe; @@ -375,15 +639,18 @@ static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snaps static void free_pending_exception(struct dm_snap_pending_exception *pe) { - mempool_free(pe, pe->snap->pending_pool); + struct dm_snapshot *s = pe->snap; + + mempool_free(pe, s->pending_pool); + smp_mb__before_atomic(); + atomic_dec(&s->pending_exceptions_count); } -static void insert_completed_exception(struct dm_snapshot *s, - struct dm_snap_exception *new_e) +static void dm_insert_exception(struct dm_exception_table *eh, + struct dm_exception *new_e) { - struct exception_table *eh = &s->complete; struct list_head *l; - struct dm_snap_exception *e = NULL; + struct dm_exception *e = NULL; l = &eh->table[exception_hash(eh, new_e->old_chunk)]; @@ -399,7 +666,7 @@ static void insert_completed_exception(struct dm_snapshot *s, new_e->new_chunk == (dm_chunk_number(e->new_chunk) + dm_consecutive_chunk_count(e) + 1)) { dm_consecutive_chunk_count_inc(e); - free_exception(new_e); + free_completed_exception(new_e); return; } @@ -409,7 +676,7 @@ static void insert_completed_exception(struct dm_snapshot *s, dm_consecutive_chunk_count_inc(e); e->old_chunk--; e->new_chunk--; - free_exception(new_e); + free_completed_exception(new_e); return; } @@ -421,11 +688,16 @@ out: list_add(&new_e->hash_list, e ? &e->hash_list : l); } -int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) +/* + * Callback used by the exception stores to load exceptions when + * initialising. + */ +static int dm_add_exception(void *context, chunk_t old, chunk_t new) { - struct dm_snap_exception *e; + struct dm_snapshot *s = context; + struct dm_exception *e; - e = alloc_exception(); + e = alloc_completed_exception(GFP_KERNEL); if (!e) return -ENOMEM; @@ -434,12 +706,29 @@ int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) /* Consecutive_count is implicitly initialised to zero */ e->new_chunk = new; - insert_completed_exception(s, e); + dm_insert_exception(&s->complete, e); return 0; } /* + * Return a minimum chunk size of all snapshots that have the specified origin. + * Return zero if the origin has no snapshots. 
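+ * The result is used to cap ti->max_io_len (see snapshot_merge_resume())
+ * so that a single bio never spans more than one chunk of any snapshot
+ * attached to this origin.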
+ */ +static uint32_t __minimum_chunk_size(struct origin *o) +{ + struct dm_snapshot *snap; + unsigned chunk_size = 0; + + if (o) + list_for_each_entry(snap, &o->snapshots, list) + chunk_size = min_not_zero(chunk_size, + snap->store->chunk_size); + + return (uint32_t) chunk_size; +} + +/* * Hard coded magic. */ static int calc_max_buckets(void) @@ -456,22 +745,23 @@ static int calc_max_buckets(void) */ static int init_hash_tables(struct dm_snapshot *s) { - sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; + sector_t hash_size, cow_dev_size, max_buckets; /* * Calculate based on the size of the original volume or * the COW volume... */ cow_dev_size = get_dev_size(s->cow->bdev); - origin_dev_size = get_dev_size(s->origin->bdev); max_buckets = calc_max_buckets(); - hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift; + hash_size = cow_dev_size >> s->store->chunk_shift; hash_size = min(hash_size, max_buckets); + if (hash_size < 64) + hash_size = 64; hash_size = rounddown_pow_of_two(hash_size); - if (init_exception_table(&s->complete, hash_size, - DM_CHUNK_CONSECUTIVE_BITS)) + if (dm_exception_table_init(&s->complete, hash_size, + DM_CHUNK_CONSECUTIVE_BITS)) return -ENOMEM; /* @@ -482,67 +772,285 @@ static int init_hash_tables(struct dm_snapshot *s) if (hash_size < 64) hash_size = 64; - if (init_exception_table(&s->pending, hash_size, 0)) { - exit_exception_table(&s->complete, exception_cache); + if (dm_exception_table_init(&s->pending, hash_size, 0)) { + dm_exception_table_exit(&s->complete, exception_cache); return -ENOMEM; } return 0; } -/* - * Round a number up to the nearest 'size' boundary. size must - * be a power of 2. - */ -static ulong round_up(ulong n, ulong size) +static void merge_shutdown(struct dm_snapshot *s) +{ + clear_bit_unlock(RUNNING_MERGE, &s->state_bits); + smp_mb__after_atomic(); + wake_up_bit(&s->state_bits, RUNNING_MERGE); +} + +static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s) { - size--; - return (n + size) & ~size; + s->first_merging_chunk = 0; + s->num_merging_chunks = 0; + + return bio_list_get(&s->bios_queued_during_merge); } -static int set_chunk_size(struct dm_snapshot *s, const char *chunk_size_arg, - char **error) +/* + * Remove one chunk from the index of completed exceptions. + */ +static int __remove_single_exception_chunk(struct dm_snapshot *s, + chunk_t old_chunk) { - unsigned long chunk_size; - char *value; + struct dm_exception *e; - chunk_size = simple_strtoul(chunk_size_arg, &value, 10); - if (*chunk_size_arg == '\0' || *value != '\0') { - *error = "Invalid chunk size"; + e = dm_lookup_exception(&s->complete, old_chunk); + if (!e) { + DMERR("Corruption detected: exception for block %llu is " + "on disk but not in memory", + (unsigned long long)old_chunk); return -EINVAL; } - if (!chunk_size) { - s->chunk_size = s->chunk_mask = s->chunk_shift = 0; + /* + * If this is the only chunk using this exception, remove exception. + */ + if (!dm_consecutive_chunk_count(e)) { + dm_remove_exception(e); + free_completed_exception(e); return 0; } /* - * Chunk size must be multiple of page size. Silently - * round up if it's not. + * The chunk may be either at the beginning or the end of a + * group of consecutive chunks - never in the middle. We are + * removing chunks in the opposite order to that in which they + * were added, so this should always be true. + * Decrement the consecutive chunk counter and adjust the + * starting point if necessary. 
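+	 * E.g. removing chunk 12 from the range [10 - 12] leaves
+	 * [10 - 11], while removing chunk 10 from [10 - 12] shifts the
+	 * range to [11 - 12]; both cases just decrement the consecutive
+	 * count.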
*/ - chunk_size = round_up(chunk_size, PAGE_SIZE >> 9); - - /* Check chunk_size is a power of 2 */ - if (!is_power_of_2(chunk_size)) { - *error = "Chunk size is not a power of 2"; + if (old_chunk == e->old_chunk) { + e->old_chunk++; + e->new_chunk++; + } else if (old_chunk != e->old_chunk + + dm_consecutive_chunk_count(e)) { + DMERR("Attempt to merge block %llu from the " + "middle of a chunk range [%llu - %llu]", + (unsigned long long)old_chunk, + (unsigned long long)e->old_chunk, + (unsigned long long) + e->old_chunk + dm_consecutive_chunk_count(e)); return -EINVAL; } - /* Validate the chunk size against the device block size */ - if (chunk_size % (bdev_hardsect_size(s->cow->bdev) >> 9)) { - *error = "Chunk size is not a multiple of device blocksize"; - return -EINVAL; + dm_consecutive_chunk_count_dec(e); + + return 0; +} + +static void flush_bios(struct bio *bio); + +static int remove_single_exception_chunk(struct dm_snapshot *s) +{ + struct bio *b = NULL; + int r; + chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; + + down_write(&s->lock); + + /* + * Process chunks (and associated exceptions) in reverse order + * so that dm_consecutive_chunk_count_dec() accounting works. + */ + do { + r = __remove_single_exception_chunk(s, old_chunk); + if (r) + goto out; + } while (old_chunk-- > s->first_merging_chunk); + + b = __release_queued_bios_after_merge(s); + +out: + up_write(&s->lock); + if (b) + flush_bios(b); + + return r; +} + +static int origin_write_extent(struct dm_snapshot *merging_snap, + sector_t sector, unsigned chunk_size); + +static void merge_callback(int read_err, unsigned long write_err, + void *context); + +static uint64_t read_pending_exceptions_done_count(void) +{ + uint64_t pending_exceptions_done; + + spin_lock(&_pending_exceptions_done_spinlock); + pending_exceptions_done = _pending_exceptions_done_count; + spin_unlock(&_pending_exceptions_done_spinlock); + + return pending_exceptions_done; +} + +static void increment_pending_exceptions_done_count(void) +{ + spin_lock(&_pending_exceptions_done_spinlock); + _pending_exceptions_done_count++; + spin_unlock(&_pending_exceptions_done_spinlock); + + wake_up_all(&_pending_exceptions_done); +} + +static void snapshot_merge_next_chunks(struct dm_snapshot *s) +{ + int i, linear_chunks; + chunk_t old_chunk, new_chunk; + struct dm_io_region src, dest; + sector_t io_size; + uint64_t previous_count; + + BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits)); + if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits))) + goto shut; + + /* + * valid flag never changes during merge, so no lock required. 
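+	 * (A snapshot-merge target never clears s->valid once it is
+	 * running; see the comment in struct dm_snapshot.)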
+ */ + if (!s->valid) { + DMERR("Snapshot is invalid: can't merge"); + goto shut; + } + + linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk, + &new_chunk); + if (linear_chunks <= 0) { + if (linear_chunks < 0) { + DMERR("Read error in exception store: " + "shutting down merge"); + down_write(&s->lock); + s->merge_failed = 1; + up_write(&s->lock); + } + goto shut; + } + + /* Adjust old_chunk and new_chunk to reflect start of linear region */ + old_chunk = old_chunk + 1 - linear_chunks; + new_chunk = new_chunk + 1 - linear_chunks; + + /* + * Use one (potentially large) I/O to copy all 'linear_chunks' + * from the exception store to the origin + */ + io_size = linear_chunks * s->store->chunk_size; + + dest.bdev = s->origin->bdev; + dest.sector = chunk_to_sector(s->store, old_chunk); + dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector); + + src.bdev = s->cow->bdev; + src.sector = chunk_to_sector(s->store, new_chunk); + src.count = dest.count; + + /* + * Reallocate any exceptions needed in other snapshots then + * wait for the pending exceptions to complete. + * Each time any pending exception (globally on the system) + * completes we are woken and repeat the process to find out + * if we can proceed. While this may not seem a particularly + * efficient algorithm, it is not expected to have any + * significant impact on performance. + */ + previous_count = read_pending_exceptions_done_count(); + while (origin_write_extent(s, dest.sector, io_size)) { + wait_event(_pending_exceptions_done, + (read_pending_exceptions_done_count() != + previous_count)); + /* Retry after the wait, until all exceptions are done. */ + previous_count = read_pending_exceptions_done_count(); + } + + down_write(&s->lock); + s->first_merging_chunk = old_chunk; + s->num_merging_chunks = linear_chunks; + up_write(&s->lock); + + /* Wait until writes to all 'linear_chunks' drain */ + for (i = 0; i < linear_chunks; i++) + __check_for_conflicting_io(s, old_chunk + i); + + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s); + return; + +shut: + merge_shutdown(s); +} + +static void error_bios(struct bio *bio); + +static void merge_callback(int read_err, unsigned long write_err, void *context) +{ + struct dm_snapshot *s = context; + struct bio *b = NULL; + + if (read_err || write_err) { + if (read_err) + DMERR("Read error: shutting down merge."); + else + DMERR("Write error: shutting down merge."); + goto shut; } - s->chunk_size = chunk_size; - s->chunk_mask = chunk_size - 1; - s->chunk_shift = ffs(chunk_size) - 1; + if (s->store->type->commit_merge(s->store, + s->num_merging_chunks) < 0) { + DMERR("Write error in exception store: shutting down merge"); + goto shut; + } + + if (remove_single_exception_chunk(s) < 0) + goto shut; + + snapshot_merge_next_chunks(s); + + return; + +shut: + down_write(&s->lock); + s->merge_failed = 1; + b = __release_queued_bios_after_merge(s); + up_write(&s->lock); + error_bios(b); + + merge_shutdown(s); +} + +static void start_merge(struct dm_snapshot *s) +{ + if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits)) + snapshot_merge_next_chunks(s); +} + +static int wait_schedule(void *ptr) +{ + schedule(); return 0; } /* + * Stop the merging process and wait until it finishes. 
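+ * Setting SHUTDOWN_MERGE asks snapshot_merge_next_chunks() to stop
+ * issuing further copies; we then wait for merge_shutdown() to clear
+ * RUNNING_MERGE before returning.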
+ */ +static void stop_merge(struct dm_snapshot *s) +{ + set_bit(SHUTDOWN_MERGE, &s->state_bits); + wait_on_bit(&s->state_bits, RUNNING_MERGE, wait_schedule, + TASK_UNINTERRUPTIBLE); + clear_bit(SHUTDOWN_MERGE, &s->state_bits); +} + +/* * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> */ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) @@ -550,99 +1058,93 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct dm_snapshot *s; int i; int r = -EINVAL; - char persistent; - char *origin_path; - char *cow_path; + char *origin_path, *cow_path; + unsigned args_used, num_flush_bios = 1; + fmode_t origin_mode = FMODE_READ; if (argc != 4) { ti->error = "requires exactly 4 arguments"; r = -EINVAL; - goto bad1; + goto bad; } - origin_path = argv[0]; - cow_path = argv[1]; - persistent = toupper(*argv[2]); - - if (persistent != 'P' && persistent != 'N') { - ti->error = "Persistent flag is not P or N"; - r = -EINVAL; - goto bad1; + if (dm_target_is_snapshot_merge(ti)) { + num_flush_bios = 2; + origin_mode = FMODE_WRITE; } s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) { - ti->error = "Cannot allocate snapshot context private " - "structure"; + if (!s) { + ti->error = "Cannot allocate private snapshot structure"; r = -ENOMEM; - goto bad1; + goto bad; } - r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); + origin_path = argv[0]; + argv++; + argc--; + + r = dm_get_device(ti, origin_path, origin_mode, &s->origin); if (r) { ti->error = "Cannot get origin device"; - goto bad2; + goto bad_origin; } - r = dm_get_device(ti, cow_path, 0, 0, - FMODE_READ | FMODE_WRITE, &s->cow); + cow_path = argv[0]; + argv++; + argc--; + + r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); if (r) { - dm_put_device(ti, s->origin); ti->error = "Cannot get COW device"; - goto bad2; + goto bad_cow; } - r = set_chunk_size(s, argv[3], &ti->error); - if (r) - goto bad3; + r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); + if (r) { + ti->error = "Couldn't create exception store"; + r = -EINVAL; + goto bad_store; + } - s->type = persistent; + argv += args_used; + argc -= args_used; + s->ti = ti; s->valid = 1; s->active = 0; - s->last_percent = 0; + atomic_set(&s->pending_exceptions_count, 0); + s->exception_start_sequence = 0; + s->exception_complete_sequence = 0; + INIT_LIST_HEAD(&s->out_of_order_list); init_rwsem(&s->lock); + INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); - s->ti = ti; + s->state_bits = 0; + s->merge_failed = 0; + s->first_merging_chunk = 0; + s->num_merging_chunks = 0; + bio_list_init(&s->bios_queued_during_merge); /* Allocate hash table for COW data */ if (init_hash_tables(s)) { ti->error = "Unable to allocate hash table space"; r = -ENOMEM; - goto bad3; + goto bad_hash_tables; } - s->store.snap = s; - - if (persistent == 'P') - r = dm_create_persistent(&s->store); - else - r = dm_create_transient(&s->store); - - if (r) { - ti->error = "Couldn't create exception store"; - r = -EINVAL; - goto bad4; - } - - r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); - if (r) { + s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(s->kcopyd_client)) { + r = PTR_ERR(s->kcopyd_client); ti->error = "Could not create kcopyd client"; - goto bad5; + goto bad_kcopyd; } s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache); if (!s->pending_pool) { ti->error = "Could not allocate mempool for pending exceptions"; - goto 
bad6; - } - - s->tracked_chunk_pool = mempool_create_slab_pool(MIN_IOS, - tracked_chunk_cache); - if (!s->tracked_chunk_pool) { - ti->error = "Could not allocate tracked_chunk mempool for " - "tracking reads"; - goto bad_tracked_chunk_pool; + r = -ENOMEM; + goto bad_pending_pool; } for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) @@ -650,56 +1152,79 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&s->tracked_chunk_lock); - /* Metadata must only be loaded into one table at once */ - r = s->store.read_metadata(&s->store); + ti->private = s; + ti->num_flush_bios = num_flush_bios; + ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk); + + /* Add snapshot to the list of snapshots for this origin */ + /* Exceptions aren't triggered till snapshot_resume() is called */ + r = register_snapshot(s); + if (r == -ENOMEM) { + ti->error = "Snapshot origin struct allocation failed"; + goto bad_load_and_register; + } else if (r < 0) { + /* invalid handover, register_snapshot has set ti->error */ + goto bad_load_and_register; + } + + /* + * Metadata must only be loaded into one table at once, so skip this + * if metadata will be handed over during resume. + * Chunk size will be set during the handover - set it to zero to + * ensure it's ignored. + */ + if (r > 0) { + s->store->chunk_size = 0; + return 0; + } + + r = s->store->type->read_metadata(s->store, dm_add_exception, + (void *)s); if (r < 0) { ti->error = "Failed to read snapshot metadata"; - goto bad_load_and_register; + goto bad_read_metadata; } else if (r > 0) { s->valid = 0; DMWARN("Snapshot is marked invalid."); } - bio_list_init(&s->queued_bios); - INIT_WORK(&s->queued_bios_work, flush_queued_bios); - - /* Add snapshot to the list of snapshots for this origin */ - /* Exceptions aren't triggered till snapshot_resume() is called */ - if (register_snapshot(s)) { - r = -EINVAL; - ti->error = "Cannot register snapshot origin"; - goto bad_load_and_register; + if (!s->store->chunk_size) { + ti->error = "Chunk size not set"; + goto bad_read_metadata; } - ti->private = s; - ti->split_io = s->chunk_size; + r = dm_set_target_max_io_len(ti, s->store->chunk_size); + if (r) + goto bad_read_metadata; return 0; - bad_load_and_register: - mempool_destroy(s->tracked_chunk_pool); +bad_read_metadata: + unregister_snapshot(s); - bad_tracked_chunk_pool: +bad_load_and_register: mempool_destroy(s->pending_pool); - bad6: +bad_pending_pool: dm_kcopyd_client_destroy(s->kcopyd_client); - bad5: - s->store.destroy(&s->store); +bad_kcopyd: + dm_exception_table_exit(&s->pending, pending_cache); + dm_exception_table_exit(&s->complete, exception_cache); - bad4: - exit_exception_table(&s->pending, pending_cache); - exit_exception_table(&s->complete, exception_cache); +bad_hash_tables: + dm_exception_store_destroy(s->store); - bad3: +bad_store: dm_put_device(ti, s->cow); + +bad_cow: dm_put_device(ti, s->origin); - bad2: +bad_origin: kfree(s); - bad1: +bad: return r; } @@ -708,10 +1233,39 @@ static void __free_exceptions(struct dm_snapshot *s) dm_kcopyd_client_destroy(s->kcopyd_client); s->kcopyd_client = NULL; - exit_exception_table(&s->pending, pending_cache); - exit_exception_table(&s->complete, exception_cache); + dm_exception_table_exit(&s->pending, pending_cache); + dm_exception_table_exit(&s->complete, exception_cache); +} + +static void __handover_exceptions(struct dm_snapshot *snap_src, + struct dm_snapshot *snap_dest) +{ + union { + struct dm_exception_table table_swap; + struct dm_exception_store *store_swap; + } 
u; + + /* + * Swap all snapshot context information between the two instances. + */ + u.table_swap = snap_dest->complete; + snap_dest->complete = snap_src->complete; + snap_src->complete = u.table_swap; + + u.store_swap = snap_dest->store; + snap_dest->store = snap_src->store; + snap_src->store = u.store_swap; + + snap_dest->store->snap = snap_dest; + snap_src->store->snap = snap_src; - s->store.destroy(&s->store); + snap_dest->ti->max_io_len = snap_dest->store->chunk_size; + snap_dest->valid = snap_src->valid; + + /* + * Set source invalid to ensure it receives no further I/O. + */ + snap_src->valid = 0; } static void snapshot_dtr(struct dm_target *ti) @@ -720,27 +1274,49 @@ static void snapshot_dtr(struct dm_target *ti) int i; #endif struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + /* Check whether exception handover must be cancelled */ + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest && (s == snap_src)) { + down_write(&snap_dest->lock); + snap_dest->valid = 0; + up_write(&snap_dest->lock); + DMERR("Cancelling snapshot handover."); + } + up_read(&_origins_lock); - flush_workqueue(ksnapd); + if (dm_target_is_snapshot_merge(ti)) + stop_merge(s); /* Prevent further origin writes from using this snapshot. */ /* After this returns there can be no new kcopyd jobs. */ unregister_snapshot(s); + while (atomic_read(&s->pending_exceptions_count)) + msleep(1); + /* + * Ensure instructions in mempool_destroy aren't reordered + * before atomic_read. + */ + smp_mb(); + #ifdef CONFIG_DM_DEBUG for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++) BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i])); #endif - mempool_destroy(s->tracked_chunk_pool); - __free_exceptions(s); mempool_destroy(s->pending_pool); - dm_put_device(ti, s->origin); + dm_exception_store_destroy(s->store); + dm_put_device(ti, s->cow); + dm_put_device(ti, s->origin); + kfree(s); } @@ -759,18 +1335,24 @@ static void flush_bios(struct bio *bio) } } -static void flush_queued_bios(struct work_struct *work) -{ - struct dm_snapshot *s = - container_of(work, struct dm_snapshot, queued_bios_work); - struct bio *queued_bios; - unsigned long flags; +static int do_origin(struct dm_dev *origin, struct bio *bio); - spin_lock_irqsave(&s->pe_lock, flags); - queued_bios = bio_list_get(&s->queued_bios); - spin_unlock_irqrestore(&s->pe_lock, flags); +/* + * Flush a list of buffers. 
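+ * Each bio is pushed back through do_origin() so that any exceptions
+ * still needed in other snapshots are created first; bios that come
+ * back as DM_MAPIO_REMAPPED are submitted directly.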
+ */ +static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) +{ + struct bio *n; + int r; - flush_bios(queued_bios); + while (bio) { + n = bio->bi_next; + bio->bi_next = NULL; + r = do_origin(s->origin, bio); + if (r == DM_MAPIO_REMAPPED) + generic_make_request(bio); + bio = n; + } } /* @@ -798,57 +1380,21 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) else if (err == -ENOMEM) DMERR("Invalidating snapshot: Unable to allocate exception."); - if (s->store.drop_snapshot) - s->store.drop_snapshot(&s->store); + if (s->store->type->drop_snapshot) + s->store->type->drop_snapshot(s->store); s->valid = 0; dm_table_event(s->ti->table); } -static void get_pending_exception(struct dm_snap_pending_exception *pe) -{ - atomic_inc(&pe->ref_count); -} - -static struct bio *put_pending_exception(struct dm_snap_pending_exception *pe) -{ - struct dm_snap_pending_exception *primary_pe; - struct bio *origin_bios = NULL; - - primary_pe = pe->primary_pe; - - /* - * If this pe is involved in a write to the origin and - * it is the last sibling to complete then release - * the bios for the original write to the origin. - */ - if (primary_pe && - atomic_dec_and_test(&primary_pe->ref_count)) - origin_bios = bio_list_get(&primary_pe->origin_bios); - - /* - * Free the pe if it's not linked to an origin write or if - * it's not itself a primary pe. - */ - if (!primary_pe || primary_pe != pe) - free_pending_exception(pe); - - /* - * Free the primary pe if nothing references it. - */ - if (primary_pe && !atomic_read(&primary_pe->ref_count)) - free_pending_exception(primary_pe); - - return origin_bios; -} - static void pending_complete(struct dm_snap_pending_exception *pe, int success) { - struct dm_snap_exception *e; + struct dm_exception *e; struct dm_snapshot *s = pe->snap; struct bio *origin_bios = NULL; struct bio *snapshot_bios = NULL; + struct bio *full_bio = NULL; int error = 0; if (!success) { @@ -859,7 +1405,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) goto out; } - e = alloc_exception(); + e = alloc_completed_exception(GFP_NOIO); if (!e) { down_write(&s->lock); __invalidate_snapshot(s, -ENOMEM); @@ -870,38 +1416,48 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success) down_write(&s->lock); if (!s->valid) { - free_exception(e); + free_completed_exception(e); error = 1; goto out; } - /* - * Check for conflicting reads. This is extremely improbable, - * so yield() is sufficient and there is no need for a wait queue. - */ - while (__chunk_is_tracked(s, pe->e.old_chunk)) - yield(); + /* Check for conflicting reads */ + __check_for_conflicting_io(s, pe->e.old_chunk); /* * Add a proper exception, and remove the * in-flight exception from the list. 
*/ - insert_completed_exception(s, e); + dm_insert_exception(&s->complete, e); - out: - remove_exception(&pe->e); +out: + dm_remove_exception(&pe->e); snapshot_bios = bio_list_get(&pe->snapshot_bios); - origin_bios = put_pending_exception(pe); + origin_bios = bio_list_get(&pe->origin_bios); + full_bio = pe->full_bio; + if (full_bio) { + full_bio->bi_end_io = pe->full_bio_end_io; + full_bio->bi_private = pe->full_bio_private; + atomic_inc(&full_bio->bi_remaining); + } + free_pending_exception(pe); + + increment_pending_exceptions_done_count(); up_write(&s->lock); /* Submit any pending write bios */ - if (error) + if (error) { + if (full_bio) + bio_io_error(full_bio); error_bios(snapshot_bios); - else + } else { + if (full_bio) + bio_endio(full_bio, 0); flush_bios(snapshot_bios); + } - flush_bios(origin_bios); + retry_origin_bios(s, origin_bios); } static void commit_callback(void *context, int success) @@ -911,6 +1467,19 @@ static void commit_callback(void *context, int success) pending_complete(pe, success); } +static void complete_exception(struct dm_snap_pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + + if (unlikely(pe->copy_error)) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + s->store->type->commit_exception(s->store, &pe->e, + commit_callback, pe); +} + /* * Called when the copy I/O has finished. kcopyd actually runs * this code so don't block. @@ -920,13 +1489,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) struct dm_snap_pending_exception *pe = context; struct dm_snapshot *s = pe->snap; - if (read_err || write_err) - pending_complete(pe, 0); + pe->copy_error = read_err || write_err; - else - /* Update the metadata if we are persistent */ - s->store.commit_exception(&s->store, &pe->e, commit_callback, - pe); + if (pe->exception_sequence == s->exception_complete_sequence) { + s->exception_complete_sequence++; + complete_exception(pe); + + while (!list_empty(&s->out_of_order_list)) { + pe = list_entry(s->out_of_order_list.next, + struct dm_snap_pending_exception, out_of_order_entry); + if (pe->exception_sequence != s->exception_complete_sequence) + break; + s->exception_complete_sequence++; + list_del(&pe->out_of_order_entry); + complete_exception(pe); + } + } else { + struct list_head *lh; + struct dm_snap_pending_exception *pe2; + + list_for_each_prev(lh, &s->out_of_order_list) { + pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry); + if (pe2->exception_sequence < pe->exception_sequence) + break; + } + list_add(&pe->out_of_order_entry, lh); + } } /* @@ -942,16 +1530,52 @@ static void start_copy(struct dm_snap_pending_exception *pe) dev_size = get_dev_size(bdev); src.bdev = bdev; - src.sector = chunk_to_sector(s, pe->e.old_chunk); - src.count = min(s->chunk_size, dev_size - src.sector); + src.sector = chunk_to_sector(s->store, pe->e.old_chunk); + src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector); dest.bdev = s->cow->bdev; - dest.sector = chunk_to_sector(s, pe->e.new_chunk); + dest.sector = chunk_to_sector(s->store, pe->e.new_chunk); dest.count = src.count; /* Hand over to kcopyd */ - dm_kcopyd_copy(s->kcopyd_client, - &src, 1, &dest, 0, copy_callback, pe); + dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); +} + +static void full_bio_end_io(struct bio *bio, int error) +{ + void *callback_data = bio->bi_private; + + dm_kcopyd_do_callback(callback_data, 0, error ? 
1 : 0); +} + +static void start_full_bio(struct dm_snap_pending_exception *pe, + struct bio *bio) +{ + struct dm_snapshot *s = pe->snap; + void *callback_data; + + pe->full_bio = bio; + pe->full_bio_end_io = bio->bi_end_io; + pe->full_bio_private = bio->bi_private; + + callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, + copy_callback, pe); + + bio->bi_end_io = full_bio_end_io; + bio->bi_private = callback_data; + + generic_make_request(bio); +} + +static struct dm_snap_pending_exception * +__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) +{ + struct dm_exception *e = dm_lookup_exception(&s->pending, chunk); + + if (!e) + return NULL; + + return container_of(e, struct dm_snap_pending_exception, e); } /* @@ -963,80 +1587,61 @@ static void start_copy(struct dm_snap_pending_exception *pe) * this. */ static struct dm_snap_pending_exception * -__find_pending_exception(struct dm_snapshot *s, struct bio *bio) +__find_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) { - struct dm_snap_exception *e; - struct dm_snap_pending_exception *pe; - chunk_t chunk = sector_to_chunk(s, bio->bi_sector); - - /* - * Is there a pending exception for this already ? - */ - e = lookup_exception(&s->pending, chunk); - if (e) { - /* cast the exception to a pending exception */ - pe = container_of(e, struct dm_snap_pending_exception, e); - goto out; - } - - /* - * Create a new pending exception, we don't want - * to hold the lock while we do this. - */ - up_write(&s->lock); - pe = alloc_pending_exception(s); - down_write(&s->lock); - - if (!s->valid) { - free_pending_exception(pe); - return NULL; - } + struct dm_snap_pending_exception *pe2; - e = lookup_exception(&s->pending, chunk); - if (e) { + pe2 = __lookup_pending_exception(s, chunk); + if (pe2) { free_pending_exception(pe); - pe = container_of(e, struct dm_snap_pending_exception, e); - goto out; + return pe2; } pe->e.old_chunk = chunk; bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); - pe->primary_pe = NULL; - atomic_set(&pe->ref_count, 0); pe->started = 0; + pe->full_bio = NULL; - if (s->store.prepare_exception(&s->store, &pe->e)) { + if (s->store->type->prepare_exception(s->store, &pe->e)) { free_pending_exception(pe); return NULL; } - get_pending_exception(pe); - insert_exception(&s->pending, &pe->e); + pe->exception_sequence = s->exception_start_sequence++; + + dm_insert_exception(&s->pending, &pe->e); - out: return pe; } -static void remap_exception(struct dm_snapshot *s, struct dm_snap_exception *e, +static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, struct bio *bio, chunk_t chunk) { bio->bi_bdev = s->cow->bdev; - bio->bi_sector = chunk_to_sector(s, dm_chunk_number(e->new_chunk) + - (chunk - e->old_chunk)) + - (bio->bi_sector & s->chunk_mask); + bio->bi_iter.bi_sector = + chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) + + (chunk - e->old_chunk)) + + (bio->bi_iter.bi_sector & s->store->chunk_mask); } -static int snapshot_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static int snapshot_map(struct dm_target *ti, struct bio *bio) { - struct dm_snap_exception *e; + struct dm_exception *e; struct dm_snapshot *s = ti->private; int r = DM_MAPIO_REMAPPED; chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; - chunk = sector_to_chunk(s, bio->bi_sector); + init_tracked_chunk(bio); + + if (bio->bi_rw & REQ_FLUSH) { + bio->bi_bdev = s->cow->bdev; + return DM_MAPIO_REMAPPED; + } + + chunk = 
sector_to_chunk(s->store, bio->bi_iter.bi_sector); /* Full snapshots are not usable */ /* To get here the table must be live so s->active is always set. */ @@ -1053,7 +1658,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, } /* If the block is already remapped - use that, else remap it */ - e = lookup_exception(&s->complete, chunk); + e = dm_lookup_exception(&s->complete, chunk); if (e) { remap_exception(s, e, bio, chunk); goto out_unlock; @@ -1065,18 +1670,48 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, * writeable. */ if (bio_rw(bio) == WRITE) { - pe = __find_pending_exception(s, bio); + pe = __lookup_pending_exception(s, chunk); if (!pe) { - __invalidate_snapshot(s, -ENOMEM); - r = -EIO; - goto out_unlock; + up_write(&s->lock); + pe = alloc_pending_exception(s); + down_write(&s->lock); + + if (!s->valid) { + free_pending_exception(pe); + r = -EIO; + goto out_unlock; + } + + e = dm_lookup_exception(&s->complete, chunk); + if (e) { + free_pending_exception(pe); + remap_exception(s, e, bio, chunk); + goto out_unlock; + } + + pe = __find_pending_exception(s, pe, chunk); + if (!pe) { + __invalidate_snapshot(s, -ENOMEM); + r = -EIO; + goto out_unlock; + } } remap_exception(s, &pe->e, bio, chunk); - bio_list_add(&pe->snapshot_bios, bio); r = DM_MAPIO_SUBMITTED; + if (!pe->started && + bio->bi_iter.bi_size == + (s->store->chunk_size << SECTOR_SHIFT)) { + pe->started = 1; + up_write(&s->lock); + start_full_bio(pe, bio); + goto out; + } + + bio_list_add(&pe->snapshot_bios, bio); + if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; @@ -1086,58 +1721,216 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio, } } else { bio->bi_bdev = s->origin->bdev; - map_context->ptr = track_chunk(s, chunk); + track_chunk(s, bio, chunk); } - out_unlock: +out_unlock: up_write(&s->lock); - out: +out: + return r; +} + +/* + * A snapshot-merge target behaves like a combination of a snapshot + * target and a snapshot-origin target. It only generates new + * exceptions in other snapshots and not in the one that is being + * merged. + * + * For each chunk, if there is an existing exception, it is used to + * redirect I/O to the cow device. Otherwise I/O is sent to the origin, + * which in turn might generate exceptions in other snapshots. + * If merging is currently taking place on the chunk in question, the + * I/O is deferred by adding it to s->bios_queued_during_merge. 
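+ * Deferred bios are released again by flush_bios() once the merge of
+ * those chunks has been committed (see remove_single_exception_chunk()).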
+ */ +static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_exception *e; + struct dm_snapshot *s = ti->private; + int r = DM_MAPIO_REMAPPED; + chunk_t chunk; + + init_tracked_chunk(bio); + + if (bio->bi_rw & REQ_FLUSH) { + if (!dm_bio_get_target_bio_nr(bio)) + bio->bi_bdev = s->origin->bdev; + else + bio->bi_bdev = s->cow->bdev; + return DM_MAPIO_REMAPPED; + } + + chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); + + down_write(&s->lock); + + /* Full merging snapshots are redirected to the origin */ + if (!s->valid) + goto redirect_to_origin; + + /* If the block is already remapped - use that */ + e = dm_lookup_exception(&s->complete, chunk); + if (e) { + /* Queue writes overlapping with chunks being merged */ + if (bio_rw(bio) == WRITE && + chunk >= s->first_merging_chunk && + chunk < (s->first_merging_chunk + + s->num_merging_chunks)) { + bio->bi_bdev = s->origin->bdev; + bio_list_add(&s->bios_queued_during_merge, bio); + r = DM_MAPIO_SUBMITTED; + goto out_unlock; + } + + remap_exception(s, e, bio, chunk); + + if (bio_rw(bio) == WRITE) + track_chunk(s, bio, chunk); + goto out_unlock; + } + +redirect_to_origin: + bio->bi_bdev = s->origin->bdev; + + if (bio_rw(bio) == WRITE) { + up_write(&s->lock); + return do_origin(s->origin, bio); + } + +out_unlock: + up_write(&s->lock); + return r; } -static int snapshot_end_io(struct dm_target *ti, struct bio *bio, - int error, union map_info *map_context) +static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int error) { struct dm_snapshot *s = ti->private; - struct dm_snap_tracked_chunk *c = map_context->ptr; - if (c) - stop_tracking_chunk(s, c); + if (is_bio_tracked(bio)) + stop_tracking_chunk(s, bio); return 0; } +static void snapshot_merge_presuspend(struct dm_target *ti) +{ + struct dm_snapshot *s = ti->private; + + stop_merge(s); +} + +static int snapshot_preresume(struct dm_target *ti) +{ + int r = 0; + struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest) { + down_read(&snap_src->lock); + if (s == snap_src) { + DMERR("Unable to resume snapshot source until " + "handover completes."); + r = -EINVAL; + } else if (!dm_suspended(snap_src->ti)) { + DMERR("Unable to perform snapshot handover until " + "source is suspended."); + r = -EINVAL; + } + up_read(&snap_src->lock); + } + up_read(&_origins_lock); + + return r; +} + static void snapshot_resume(struct dm_target *ti) { struct dm_snapshot *s = ti->private; + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + down_read(&_origins_lock); + (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest) { + down_write(&snap_src->lock); + down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); + __handover_exceptions(snap_src, snap_dest); + up_write(&snap_dest->lock); + up_write(&snap_src->lock); + } + up_read(&_origins_lock); + + /* Now we have correct chunk size, reregister */ + reregister_snapshot(s); down_write(&s->lock); s->active = 1; up_write(&s->lock); } -static int snapshot_status(struct dm_target *ti, status_type_t type, - char *result, unsigned int maxlen) +static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) { + uint32_t min_chunksize; + + down_read(&_origins_lock); + min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); + up_read(&_origins_lock); + + return min_chunksize; +} + +static void 
snapshot_merge_resume(struct dm_target *ti) +{ + struct dm_snapshot *s = ti->private; + + /* + * Handover exceptions from existing snapshot. + */ + snapshot_resume(ti); + + /* + * snapshot-merge acts as an origin, so set ti->max_io_len + */ + ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev); + + start_merge(s); +} + +static void snapshot_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + unsigned sz = 0; struct dm_snapshot *snap = ti->private; switch (type) { case STATUSTYPE_INFO: + + down_write(&snap->lock); + if (!snap->valid) - snprintf(result, maxlen, "Invalid"); + DMEMIT("Invalid"); + else if (snap->merge_failed) + DMEMIT("Merge failed"); else { - if (snap->store.fraction_full) { - sector_t numerator, denominator; - snap->store.fraction_full(&snap->store, - &numerator, - &denominator); - snprintf(result, maxlen, "%llu/%llu", - (unsigned long long)numerator, - (unsigned long long)denominator); + if (snap->store->type->usage) { + sector_t total_sectors, sectors_allocated, + metadata_sectors; + snap->store->type->usage(snap->store, + &total_sectors, + &sectors_allocated, + &metadata_sectors); + DMEMIT("%llu/%llu %llu", + (unsigned long long)sectors_allocated, + (unsigned long long)total_sectors, + (unsigned long long)metadata_sectors); } else - snprintf(result, maxlen, "Unknown"); + DMEMIT("Unknown"); } + + up_write(&snap->lock); + break; case STATUSTYPE_TABLE: @@ -1146,30 +1939,61 @@ static int snapshot_status(struct dm_target *ti, status_type_t type, * to make private copies if the output is to * make sense. */ - snprintf(result, maxlen, "%s %s %c %llu", - snap->origin->name, snap->cow->name, - snap->type, - (unsigned long long)snap->chunk_size); + DMEMIT("%s %s", snap->origin->name, snap->cow->name); + snap->store->type->status(snap->store, type, result + sz, + maxlen - sz); break; } +} - return 0; +static int snapshot_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_snapshot *snap = ti->private; + int r; + + r = fn(ti, snap->origin, 0, ti->len, data); + + if (!r) + r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data); + + return r; } + /*----------------------------------------------------------------- * Origin methods *---------------------------------------------------------------*/ -static int __origin_write(struct list_head *snapshots, struct bio *bio) + +/* + * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any + * supplied bio was ignored. The caller may submit it immediately. + * (No remapping actually occurs as the origin is always a direct linear + * map.) + * + * If further exceptions are required, DM_MAPIO_SUBMITTED is returned + * and any supplied bio is added to a list to be submitted once all + * the necessary exceptions exist.
+ */ +static int __origin_write(struct list_head *snapshots, sector_t sector, + struct bio *bio) { - int r = DM_MAPIO_REMAPPED, first = 0; + int r = DM_MAPIO_REMAPPED; struct dm_snapshot *snap; - struct dm_snap_exception *e; - struct dm_snap_pending_exception *pe, *next_pe, *primary_pe = NULL; + struct dm_exception *e; + struct dm_snap_pending_exception *pe; + struct dm_snap_pending_exception *pe_to_start_now = NULL; + struct dm_snap_pending_exception *pe_to_start_last = NULL; chunk_t chunk; - LIST_HEAD(pe_queue); /* Do all the snapshots on this origin */ list_for_each_entry (snap, snapshots, list) { + /* + * Don't make new exceptions in a merging snapshot + * because it has effectively been deleted + */ + if (dm_target_is_snapshot_merge(snap->ti)) + continue; down_write(&snap->lock); @@ -1178,86 +2002,85 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio) goto next_snapshot; /* Nothing to do if writing beyond end of snapshot */ - if (bio->bi_sector >= dm_table_get_size(snap->ti->table)) + if (sector >= dm_table_get_size(snap->ti->table)) goto next_snapshot; /* * Remember, different snapshots can have * different chunk sizes. */ - chunk = sector_to_chunk(snap, bio->bi_sector); + chunk = sector_to_chunk(snap->store, sector); /* * Check exception table to see if block * is already remapped in this snapshot * and trigger an exception if not. - * - * ref_count is initialised to 1 so pending_complete() - * won't destroy the primary_pe while we're inside this loop. */ - e = lookup_exception(&snap->complete, chunk); + e = dm_lookup_exception(&snap->complete, chunk); if (e) goto next_snapshot; - pe = __find_pending_exception(snap, bio); + pe = __lookup_pending_exception(snap, chunk); if (!pe) { - __invalidate_snapshot(snap, -ENOMEM); - goto next_snapshot; - } + up_write(&snap->lock); + pe = alloc_pending_exception(snap); + down_write(&snap->lock); - if (!primary_pe) { - /* - * Either every pe here has same - * primary_pe or none has one yet. - */ - if (pe->primary_pe) - primary_pe = pe->primary_pe; - else { - primary_pe = pe; - first = 1; + if (!snap->valid) { + free_pending_exception(pe); + goto next_snapshot; } - bio_list_add(&primary_pe->origin_bios, bio); + e = dm_lookup_exception(&snap->complete, chunk); + if (e) { + free_pending_exception(pe); + goto next_snapshot; + } - r = DM_MAPIO_SUBMITTED; + pe = __find_pending_exception(snap, pe, chunk); + if (!pe) { + __invalidate_snapshot(snap, -ENOMEM); + goto next_snapshot; + } } - if (!pe->primary_pe) { - pe->primary_pe = primary_pe; - get_pending_exception(primary_pe); + r = DM_MAPIO_SUBMITTED; + + /* + * If an origin bio was supplied, queue it to wait for the + * completion of this exception, and start this one last, + * at the end of the function. + */ + if (bio) { + bio_list_add(&pe->origin_bios, bio); + bio = NULL; + + if (!pe->started) { + pe->started = 1; + pe_to_start_last = pe; + } } if (!pe->started) { pe->started = 1; - list_add_tail(&pe->list, &pe_queue); + pe_to_start_now = pe; } - next_snapshot: +next_snapshot: up_write(&snap->lock); - } - - if (!primary_pe) - return r; - - /* - * If this is the first time we're processing this chunk and - * ref_count is now 1 it means all the pending exceptions - * got completed while we were in the loop above, so it falls to - * us here to remove the primary_pe and submit any origin_bios. 
- */ - if (first && atomic_dec_and_test(&primary_pe->ref_count)) { - flush_bios(bio_list_get(&primary_pe->origin_bios)); - free_pending_exception(primary_pe); - /* If we got here, pe_queue is necessarily empty. */ - return r; + if (pe_to_start_now) { + start_copy(pe_to_start_now); + pe_to_start_now = NULL; + } } /* - * Now that we have a complete pe list we can start the copying. + * Submit the exception against which the bio is queued last, + * to give the other exceptions a head start. */ - list_for_each_entry_safe(pe, next_pe, &pe_queue, list) - start_copy(pe); + if (pe_to_start_last) + start_copy(pe_to_start_last); return r; } @@ -1273,16 +2096,56 @@ static int do_origin(struct dm_dev *origin, struct bio *bio) down_read(&_origins_lock); o = __lookup_origin(origin->bdev); if (o) - r = __origin_write(&o->snapshots, bio); + r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio); up_read(&_origins_lock); return r; } /* + * Trigger exceptions in all non-merging snapshots. + * + * The chunk size of the merging snapshot may be larger than the chunk + * size of some other snapshot so we may need to reallocate multiple + * chunks in other snapshots. + * + * We scan all the overlapping exceptions in the other snapshots. + * Returns 1 if anything was reallocated and must be waited for, + * otherwise returns 0. + * + * size must be a multiple of merging_snap's chunk_size. + */ +static int origin_write_extent(struct dm_snapshot *merging_snap, + sector_t sector, unsigned size) +{ + int must_wait = 0; + sector_t n; + struct origin *o; + + /* + * The origin's __minimum_chunk_size() got stored in max_io_len + * by snapshot_merge_resume(). + */ + down_read(&_origins_lock); + o = __lookup_origin(merging_snap->origin->bdev); + for (n = 0; n < size; n += merging_snap->ti->max_io_len) + if (__origin_write(&o->snapshots, sector + n, NULL) == + DM_MAPIO_SUBMITTED) + must_wait = 1; + up_read(&_origins_lock); + + return must_wait; +} + +/* * Origin: maps a linear range of a device, with hooks for snapshotting. 
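origin_write_extent() above walks the extent in steps of ti->max_io_len, which snapshot_merge_resume() sets to the smallest chunk size of any snapshot on the origin, so every overlapping chunk of every non-merging snapshot is probed at least once. A minimal sketch of that stepping with assumed sizes (a 64-sector extent at sector 128, an 8-sector minimum chunk size, and one other snapshot using 16-sector chunks; none of these figures come from the patch):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sector = 128, size = 64;   /* extent start and length; size is a multiple of the merging chunk size */
	unsigned max_io_len = 8;            /* minimum chunk size across all snapshots on the origin */
	unsigned other_chunk_size = 16;     /* chunk size of some other (non-merging) snapshot */
	uint64_t n;

	/* Each probe may trigger an exception for the chunk containing that sector. */
	for (n = 0; n < size; n += max_io_len)
		printf("probe sector %llu -> chunk %llu of the 16-sector snapshot\n",
		       (unsigned long long)(sector + n),
		       (unsigned long long)((sector + n) / other_chunk_size));
	return 0;   /* chunks 8..11 each get visited, i.e. multiple chunks are reallocated */
}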
*/ +struct dm_origin { + struct dm_dev *dev; + unsigned split_boundary; +}; + /* * Construct an origin mapping: <dev_path> * The context for an origin is merely a 'struct dm_dev *' @@ -1291,67 +2154,82 @@ static int do_origin(struct dm_dev *origin, struct bio *bio) static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int r; - struct dm_dev *dev; + struct dm_origin *o; if (argc != 1) { ti->error = "origin: incorrect number of arguments"; return -EINVAL; } - r = dm_get_device(ti, argv[0], 0, ti->len, - dm_table_get_mode(ti->table), &dev); + o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL); + if (!o) { + ti->error = "Cannot allocate private origin structure"; + r = -ENOMEM; + goto bad_alloc; + } + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev); if (r) { ti->error = "Cannot get target device"; - return r; + goto bad_open; } - ti->private = dev; + ti->private = o; + ti->num_flush_bios = 1; + return 0; + +bad_open: + kfree(o); +bad_alloc: + return r; } static void origin_dtr(struct dm_target *ti) { - struct dm_dev *dev = ti->private; - dm_put_device(ti, dev); + struct dm_origin *o = ti->private; + dm_put_device(ti, o->dev); + kfree(o); } -static int origin_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static int origin_map(struct dm_target *ti, struct bio *bio) { - struct dm_dev *dev = ti->private; - bio->bi_bdev = dev->bdev; + struct dm_origin *o = ti->private; + unsigned available_sectors; + + bio->bi_bdev = o->dev->bdev; + + if (unlikely(bio->bi_rw & REQ_FLUSH)) + return DM_MAPIO_REMAPPED; + + if (bio_rw(bio) != WRITE) + return DM_MAPIO_REMAPPED; + + available_sectors = o->split_boundary - + ((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1)); + + if (bio_sectors(bio) > available_sectors) + dm_accept_partial_bio(bio, available_sectors); /* Only tell snapshots if this is a write */ - return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; + return do_origin(o->dev, bio); } -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - /* - * Set the target "split_io" field to the minimum of all the snapshots' + * Set the target "max_io_len" field to the minimum of all the snapshots' * chunk sizes. 
*/ static void origin_resume(struct dm_target *ti) { - struct dm_dev *dev = ti->private; - struct dm_snapshot *snap; - struct origin *o; - chunk_t chunk_size = 0; - - down_read(&_origins_lock); - o = __lookup_origin(dev->bdev); - if (o) - list_for_each_entry (snap, &o->snapshots, list) - chunk_size = min_not_zero(chunk_size, snap->chunk_size); - up_read(&_origins_lock); + struct dm_origin *o = ti->private; - ti->split_io = chunk_size; + o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev); } -static int origin_status(struct dm_target *ti, status_type_t type, char *result, - unsigned int maxlen) +static void origin_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) { - struct dm_dev *dev = ti->private; + struct dm_origin *o = ti->private; switch (type) { case STATUSTYPE_INFO: @@ -1359,121 +2237,152 @@ static int origin_status(struct dm_target *ti, status_type_t type, char *result, break; case STATUSTYPE_TABLE: - snprintf(result, maxlen, "%s", dev->name); + snprintf(result, maxlen, "%s", o->dev->name); break; } +} - return 0; +static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct dm_origin *o = ti->private; + struct request_queue *q = bdev_get_queue(o->dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = o->dev->bdev; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int origin_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_origin *o = ti->private; + + return fn(ti, o->dev, 0, ti->len, data); } static struct target_type origin_target = { .name = "snapshot-origin", - .version = {1, 6, 0}, + .version = {1, 8, 1}, .module = THIS_MODULE, .ctr = origin_ctr, .dtr = origin_dtr, .map = origin_map, .resume = origin_resume, .status = origin_status, + .merge = origin_merge, + .iterate_devices = origin_iterate_devices, }; static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 6, 0}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, .map = snapshot_map, .end_io = snapshot_end_io, + .preresume = snapshot_preresume, .resume = snapshot_resume, .status = snapshot_status, + .iterate_devices = snapshot_iterate_devices, +}; + +static struct target_type merge_target = { + .name = dm_snapshot_merge_target_name, + .version = {1, 2, 0}, + .module = THIS_MODULE, + .ctr = snapshot_ctr, + .dtr = snapshot_dtr, + .map = snapshot_merge_map, + .end_io = snapshot_end_io, + .presuspend = snapshot_merge_presuspend, + .preresume = snapshot_preresume, + .resume = snapshot_merge_resume, + .status = snapshot_status, + .iterate_devices = snapshot_iterate_devices, }; static int __init dm_snapshot_init(void) { int r; - r = dm_register_target(&snapshot_target); + r = dm_exception_store_init(); if (r) { - DMERR("snapshot target register failed %d", r); + DMERR("Failed to initialize exception stores"); return r; } + r = dm_register_target(&snapshot_target); + if (r < 0) { + DMERR("snapshot target register failed %d", r); + goto bad_register_snapshot_target; + } + r = dm_register_target(&origin_target); if (r < 0) { DMERR("Origin target register failed %d", r); - goto bad1; + goto bad_register_origin_target; + } + + r = dm_register_target(&merge_target); + if (r < 0) { + DMERR("Merge target register failed %d", r); + goto bad_register_merge_target; } r = init_origin_hash(); if (r) { DMERR("init_origin_hash failed."); - goto bad2; + goto 
bad_origin_hash; } - exception_cache = KMEM_CACHE(dm_snap_exception, 0); + exception_cache = KMEM_CACHE(dm_exception, 0); if (!exception_cache) { DMERR("Couldn't create exception cache."); r = -ENOMEM; - goto bad3; + goto bad_exception_cache; } pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0); if (!pending_cache) { DMERR("Couldn't create pending cache."); r = -ENOMEM; - goto bad4; - } - - tracked_chunk_cache = KMEM_CACHE(dm_snap_tracked_chunk, 0); - if (!tracked_chunk_cache) { - DMERR("Couldn't create cache to track chunks in use."); - r = -ENOMEM; - goto bad5; - } - - ksnapd = create_singlethread_workqueue("ksnapd"); - if (!ksnapd) { - DMERR("Failed to create ksnapd workqueue."); - r = -ENOMEM; - goto bad_pending_pool; + goto bad_pending_cache; } return 0; - bad_pending_pool: - kmem_cache_destroy(tracked_chunk_cache); - bad5: - kmem_cache_destroy(pending_cache); - bad4: +bad_pending_cache: kmem_cache_destroy(exception_cache); - bad3: +bad_exception_cache: exit_origin_hash(); - bad2: +bad_origin_hash: + dm_unregister_target(&merge_target); +bad_register_merge_target: dm_unregister_target(&origin_target); - bad1: +bad_register_origin_target: dm_unregister_target(&snapshot_target); +bad_register_snapshot_target: + dm_exception_store_exit(); + return r; } static void __exit dm_snapshot_exit(void) { - int r; - - destroy_workqueue(ksnapd); - - r = dm_unregister_target(&snapshot_target); - if (r) - DMERR("snapshot unregister failed %d", r); - - r = dm_unregister_target(&origin_target); - if (r) - DMERR("origin unregister failed %d", r); + dm_unregister_target(&snapshot_target); + dm_unregister_target(&origin_target); + dm_unregister_target(&merge_target); exit_origin_hash(); kmem_cache_destroy(pending_cache); kmem_cache_destroy(exception_cache); - kmem_cache_destroy(tracked_chunk_cache); + + dm_exception_store_exit(); } /* Module hooks */ @@ -1483,3 +2392,5 @@ module_exit(dm_snapshot_exit); MODULE_DESCRIPTION(DM_NAME " snapshot target"); MODULE_AUTHOR("Joe Thornber"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("dm-snapshot-origin"); +MODULE_ALIAS("dm-snapshot-merge"); diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h deleted file mode 100644 index 292c15609ae..00000000000 --- a/drivers/md/dm-snap.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * dm-snapshot.c - * - * Copyright (C) 2001-2002 Sistina Software (UK) Limited. - * - * This file is released under the GPL. - */ - -#ifndef DM_SNAPSHOT_H -#define DM_SNAPSHOT_H - -#include "dm.h" -#include "dm-bio-list.h" -#include <linux/blkdev.h> -#include <linux/workqueue.h> - -struct exception_table { - uint32_t hash_mask; - unsigned hash_shift; - struct list_head *table; -}; - -/* - * The snapshot code deals with largish chunks of the disk at a - * time. Typically 32k - 512k. - */ -typedef sector_t chunk_t; - -/* - * An exception is used where an old chunk of data has been - * replaced by a new one. - * If chunk_t is 64 bits in size, the top 8 bits of new_chunk hold the number - * of chunks that follow contiguously. Remaining bits hold the number of the - * chunk within the device. 
- */ -struct dm_snap_exception { - struct list_head hash_list; - - chunk_t old_chunk; - chunk_t new_chunk; -}; - -/* - * Funtions to manipulate consecutive chunks - */ -# if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) -# define DM_CHUNK_CONSECUTIVE_BITS 8 -# define DM_CHUNK_NUMBER_BITS 56 - -static inline chunk_t dm_chunk_number(chunk_t chunk) -{ - return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL); -} - -static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) -{ - return e->new_chunk >> DM_CHUNK_NUMBER_BITS; -} - -static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) -{ - e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS); - - BUG_ON(!dm_consecutive_chunk_count(e)); -} - -# else -# define DM_CHUNK_CONSECUTIVE_BITS 0 - -static inline chunk_t dm_chunk_number(chunk_t chunk) -{ - return chunk; -} - -static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e) -{ - return 0; -} - -static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e) -{ -} - -# endif - -/* - * Abstraction to handle the meta/layout of exception stores (the - * COW device). - */ -struct exception_store { - - /* - * Destroys this object when you've finished with it. - */ - void (*destroy) (struct exception_store *store); - - /* - * The target shouldn't read the COW device until this is - * called. - */ - int (*read_metadata) (struct exception_store *store); - - /* - * Find somewhere to store the next exception. - */ - int (*prepare_exception) (struct exception_store *store, - struct dm_snap_exception *e); - - /* - * Update the metadata with this exception. - */ - void (*commit_exception) (struct exception_store *store, - struct dm_snap_exception *e, - void (*callback) (void *, int success), - void *callback_context); - - /* - * The snapshot is invalid, note this in the metadata. - */ - void (*drop_snapshot) (struct exception_store *store); - - /* - * Return how full the snapshot is. - */ - void (*fraction_full) (struct exception_store *store, - sector_t *numerator, - sector_t *denominator); - - struct dm_snapshot *snap; - void *context; -}; - -#define DM_TRACKED_CHUNK_HASH_SIZE 16 -#define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ - (DM_TRACKED_CHUNK_HASH_SIZE - 1)) - -struct dm_snapshot { - struct rw_semaphore lock; - struct dm_target *ti; - - struct dm_dev *origin; - struct dm_dev *cow; - - /* List of snapshots per Origin */ - struct list_head list; - - /* Size of data blocks saved - must be a power of 2 */ - chunk_t chunk_size; - chunk_t chunk_mask; - chunk_t chunk_shift; - - /* You can't use a snapshot if this is 0 (e.g. if full) */ - int valid; - - /* Origin writes don't trigger exceptions until this is set */ - int active; - - /* Used for display of table */ - char type; - - /* The last percentage we notified */ - int last_percent; - - mempool_t *pending_pool; - - struct exception_table pending; - struct exception_table complete; - - /* - * pe_lock protects all pending_exception operations and access - * as well as the snapshot_bios list. 
- */ - spinlock_t pe_lock; - - /* The on disk metadata handler */ - struct exception_store store; - - struct dm_kcopyd_client *kcopyd_client; - - /* Queue of snapshot writes for ksnapd to flush */ - struct bio_list queued_bios; - struct work_struct queued_bios_work; - - /* Chunks with outstanding reads */ - mempool_t *tracked_chunk_pool; - spinlock_t tracked_chunk_lock; - struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE]; -}; - -/* - * Used by the exception stores to load exceptions hen - * initialising. - */ -int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); - -/* - * Constructor and destructor for the default persistent - * store. - */ -int dm_create_persistent(struct exception_store *store); - -int dm_create_transient(struct exception_store *store); - -/* - * Return the number of sectors in the device. - */ -static inline sector_t get_dev_size(struct block_device *bdev) -{ - return bdev->bd_inode->i_size >> SECTOR_SHIFT; -} - -static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) -{ - return (sector & ~s->chunk_mask) >> s->chunk_shift; -} - -static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) -{ - return chunk << s->chunk_shift; -} - -static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs) -{ - /* - * There is only ever one instance of a particular block - * device so we can compare pointers safely. - */ - return lhs == rhs; -} - -#endif diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c new file mode 100644 index 00000000000..28a90122a5a --- /dev/null +++ b/drivers/md/dm-stats.c @@ -0,0 +1,981 @@ +#include <linux/errno.h> +#include <linux/numa.h> +#include <linux/slab.h> +#include <linux/rculist.h> +#include <linux/threads.h> +#include <linux/preempt.h> +#include <linux/irqflags.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/device-mapper.h> + +#include "dm.h" +#include "dm-stats.h" + +#define DM_MSG_PREFIX "stats" + +static int dm_stat_need_rcu_barrier; + +/* + * Using 64-bit values to avoid overflow (which is a + * problem that block/genhd.c's IO accounting has). + */ +struct dm_stat_percpu { + unsigned long long sectors[2]; + unsigned long long ios[2]; + unsigned long long merges[2]; + unsigned long long ticks[2]; + unsigned long long io_ticks[2]; + unsigned long long io_ticks_total; + unsigned long long time_in_queue; +}; + +struct dm_stat_shared { + atomic_t in_flight[2]; + unsigned long stamp; + struct dm_stat_percpu tmp; +}; + +struct dm_stat { + struct list_head list_entry; + int id; + size_t n_entries; + sector_t start; + sector_t end; + sector_t step; + const char *program_id; + const char *aux_data; + struct rcu_head rcu_head; + size_t shared_alloc_size; + size_t percpu_alloc_size; + struct dm_stat_percpu *stat_percpu[NR_CPUS]; + struct dm_stat_shared stat_shared[0]; +}; + +struct dm_stats_last_position { + sector_t last_sector; + unsigned last_rw; +}; + +/* + * A typo on the command line could possibly make the kernel run out of memory + * and crash. To prevent the crash we account all used memory. We fail if we + * exhaust 1/4 of all memory or 1/2 of vmalloc space. 
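For a rough sense of scale of the bound described above, a minimal sketch with assumed figures (8 GiB of RAM and 4 KiB pages; neither number comes from the patch):

#include <stdio.h>

int main(void)
{
	unsigned long long totalram_pages = (8ULL << 30) >> 12;     /* assumed: 8 GiB of 4 KiB pages */
	unsigned long long cap_bytes = (totalram_pages / 4) << 12;  /* DM_STATS_MEMORY_FACTOR == 4 */

	printf("dm-stats allocations capped at %llu MiB\n", cap_bytes >> 20);  /* 2048 MiB */
	return 0;
}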
+ */ +#define DM_STATS_MEMORY_FACTOR 4 +#define DM_STATS_VMALLOC_FACTOR 2 + +static DEFINE_SPINLOCK(shared_memory_lock); + +static unsigned long shared_memory_amount; + +static bool __check_shared_memory(size_t alloc_size) +{ + size_t a; + + a = shared_memory_amount + alloc_size; + if (a < shared_memory_amount) + return false; + if (a >> PAGE_SHIFT > totalram_pages / DM_STATS_MEMORY_FACTOR) + return false; +#ifdef CONFIG_MMU + if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR) + return false; +#endif + return true; +} + +static bool check_shared_memory(size_t alloc_size) +{ + bool ret; + + spin_lock_irq(&shared_memory_lock); + + ret = __check_shared_memory(alloc_size); + + spin_unlock_irq(&shared_memory_lock); + + return ret; +} + +static bool claim_shared_memory(size_t alloc_size) +{ + spin_lock_irq(&shared_memory_lock); + + if (!__check_shared_memory(alloc_size)) { + spin_unlock_irq(&shared_memory_lock); + return false; + } + + shared_memory_amount += alloc_size; + + spin_unlock_irq(&shared_memory_lock); + + return true; +} + +static void free_shared_memory(size_t alloc_size) +{ + unsigned long flags; + + spin_lock_irqsave(&shared_memory_lock, flags); + + if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) { + spin_unlock_irqrestore(&shared_memory_lock, flags); + DMCRIT("Memory usage accounting bug."); + return; + } + + shared_memory_amount -= alloc_size; + + spin_unlock_irqrestore(&shared_memory_lock, flags); +} + +static void *dm_kvzalloc(size_t alloc_size, int node) +{ + void *p; + + if (!claim_shared_memory(alloc_size)) + return NULL; + + if (alloc_size <= KMALLOC_MAX_SIZE) { + p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node); + if (p) + return p; + } + p = vzalloc_node(alloc_size, node); + if (p) + return p; + + free_shared_memory(alloc_size); + + return NULL; +} + +static void dm_kvfree(void *ptr, size_t alloc_size) +{ + if (!ptr) + return; + + free_shared_memory(alloc_size); + + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + +static void dm_stat_free(struct rcu_head *head) +{ + int cpu; + struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); + + kfree(s->program_id); + kfree(s->aux_data); + for_each_possible_cpu(cpu) + dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size); + dm_kvfree(s, s->shared_alloc_size); +} + +static int dm_stat_in_flight(struct dm_stat_shared *shared) +{ + return atomic_read(&shared->in_flight[READ]) + + atomic_read(&shared->in_flight[WRITE]); +} + +void dm_stats_init(struct dm_stats *stats) +{ + int cpu; + struct dm_stats_last_position *last; + + mutex_init(&stats->mutex); + INIT_LIST_HEAD(&stats->list); + stats->last = alloc_percpu(struct dm_stats_last_position); + for_each_possible_cpu(cpu) { + last = per_cpu_ptr(stats->last, cpu); + last->last_sector = (sector_t)ULLONG_MAX; + last->last_rw = UINT_MAX; + } +} + +void dm_stats_cleanup(struct dm_stats *stats) +{ + size_t ni; + struct dm_stat *s; + struct dm_stat_shared *shared; + + while (!list_empty(&stats->list)) { + s = container_of(stats->list.next, struct dm_stat, list_entry); + list_del(&s->list_entry); + for (ni = 0; ni < s->n_entries; ni++) { + shared = &s->stat_shared[ni]; + if (WARN_ON(dm_stat_in_flight(shared))) { + DMCRIT("leaked in-flight counter at index %lu " + "(start %llu, end %llu, step %llu): reads %d, writes %d", + (unsigned long)ni, + (unsigned long long)s->start, + (unsigned long long)s->end, + (unsigned long long)s->step, + atomic_read(&shared->in_flight[READ]), + 
atomic_read(&shared->in_flight[WRITE])); + } + } + dm_stat_free(&s->rcu_head); + } + free_percpu(stats->last); +} + +static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, + sector_t step, const char *program_id, const char *aux_data, + void (*suspend_callback)(struct mapped_device *), + void (*resume_callback)(struct mapped_device *), + struct mapped_device *md) +{ + struct list_head *l; + struct dm_stat *s, *tmp_s; + sector_t n_entries; + size_t ni; + size_t shared_alloc_size; + size_t percpu_alloc_size; + struct dm_stat_percpu *p; + int cpu; + int ret_id; + int r; + + if (end < start || !step) + return -EINVAL; + + n_entries = end - start; + if (dm_sector_div64(n_entries, step)) + n_entries++; + + if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1)) + return -EOVERFLOW; + + shared_alloc_size = sizeof(struct dm_stat) + (size_t)n_entries * sizeof(struct dm_stat_shared); + if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries) + return -EOVERFLOW; + + percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu); + if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries) + return -EOVERFLOW; + + if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size)) + return -ENOMEM; + + s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE); + if (!s) + return -ENOMEM; + + s->n_entries = n_entries; + s->start = start; + s->end = end; + s->step = step; + s->shared_alloc_size = shared_alloc_size; + s->percpu_alloc_size = percpu_alloc_size; + + s->program_id = kstrdup(program_id, GFP_KERNEL); + if (!s->program_id) { + r = -ENOMEM; + goto out; + } + s->aux_data = kstrdup(aux_data, GFP_KERNEL); + if (!s->aux_data) { + r = -ENOMEM; + goto out; + } + + for (ni = 0; ni < n_entries; ni++) { + atomic_set(&s->stat_shared[ni].in_flight[READ], 0); + atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); + } + + for_each_possible_cpu(cpu) { + p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu)); + if (!p) { + r = -ENOMEM; + goto out; + } + s->stat_percpu[cpu] = p; + } + + /* + * Suspend/resume to make sure there is no i/o in flight, + * so that newly created statistics will be exact. 
+ * + * (note: we couldn't suspend earlier because we must not + * allocate memory while suspended) + */ + suspend_callback(md); + + mutex_lock(&stats->mutex); + s->id = 0; + list_for_each(l, &stats->list) { + tmp_s = container_of(l, struct dm_stat, list_entry); + if (WARN_ON(tmp_s->id < s->id)) { + r = -EINVAL; + goto out_unlock_resume; + } + if (tmp_s->id > s->id) + break; + if (unlikely(s->id == INT_MAX)) { + r = -ENFILE; + goto out_unlock_resume; + } + s->id++; + } + ret_id = s->id; + list_add_tail_rcu(&s->list_entry, l); + mutex_unlock(&stats->mutex); + + resume_callback(md); + + return ret_id; + +out_unlock_resume: + mutex_unlock(&stats->mutex); + resume_callback(md); +out: + dm_stat_free(&s->rcu_head); + return r; +} + +static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id) +{ + struct dm_stat *s; + + list_for_each_entry(s, &stats->list, list_entry) { + if (s->id > id) + break; + if (s->id == id) + return s; + } + + return NULL; +} + +static int dm_stats_delete(struct dm_stats *stats, int id) +{ + struct dm_stat *s; + int cpu; + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + list_del_rcu(&s->list_entry); + mutex_unlock(&stats->mutex); + + /* + * vfree can't be called from RCU callback + */ + for_each_possible_cpu(cpu) + if (is_vmalloc_addr(s->stat_percpu)) + goto do_sync_free; + if (is_vmalloc_addr(s)) { +do_sync_free: + synchronize_rcu_expedited(); + dm_stat_free(&s->rcu_head); + } else { + ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1; + call_rcu(&s->rcu_head, dm_stat_free); + } + return 0; +} + +static int dm_stats_list(struct dm_stats *stats, const char *program, + char *result, unsigned maxlen) +{ + struct dm_stat *s; + sector_t len; + unsigned sz = 0; + + /* + * Output format: + * <region_id>: <start_sector>+<length> <step> <program_id> <aux_data> + */ + + mutex_lock(&stats->mutex); + list_for_each_entry(s, &stats->list, list_entry) { + if (!program || !strcmp(program, s->program_id)) { + len = s->end - s->start; + DMEMIT("%d: %llu+%llu %llu %s %s\n", s->id, + (unsigned long long)s->start, + (unsigned long long)len, + (unsigned long long)s->step, + s->program_id, + s->aux_data); + } + } + mutex_unlock(&stats->mutex); + + return 1; +} + +static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p) +{ + /* + * This is racy, but so is part_round_stats_single. + */ + unsigned long now = jiffies; + unsigned in_flight_read; + unsigned in_flight_write; + unsigned long difference = now - shared->stamp; + + if (!difference) + return; + in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]); + in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]); + if (in_flight_read) + p->io_ticks[READ] += difference; + if (in_flight_write) + p->io_ticks[WRITE] += difference; + if (in_flight_read + in_flight_write) { + p->io_ticks_total += difference; + p->time_in_queue += (in_flight_read + in_flight_write) * difference; + } + shared->stamp = now; +} + +static void dm_stat_for_entry(struct dm_stat *s, size_t entry, + unsigned long bi_rw, sector_t len, bool merged, + bool end, unsigned long duration) +{ + unsigned long idx = bi_rw & REQ_WRITE; + struct dm_stat_shared *shared = &s->stat_shared[entry]; + struct dm_stat_percpu *p; + + /* + * For strict correctness we should use local_irq_save/restore + * instead of preempt_disable/enable. 
+ * + * preempt_disable/enable is racy if the driver finishes bios + * from non-interrupt context as well as from interrupt context + * or from more different interrupts. + * + * On 64-bit architectures the race only results in not counting some + * events, so it is acceptable. On 32-bit architectures the race could + * cause the counter going off by 2^32, so we need to do proper locking + * there. + * + * part_stat_lock()/part_stat_unlock() have this race too. + */ +#if BITS_PER_LONG == 32 + unsigned long flags; + local_irq_save(flags); +#else + preempt_disable(); +#endif + p = &s->stat_percpu[smp_processor_id()][entry]; + + if (!end) { + dm_stat_round(shared, p); + atomic_inc(&shared->in_flight[idx]); + } else { + dm_stat_round(shared, p); + atomic_dec(&shared->in_flight[idx]); + p->sectors[idx] += len; + p->ios[idx] += 1; + p->merges[idx] += merged; + p->ticks[idx] += duration; + } + +#if BITS_PER_LONG == 32 + local_irq_restore(flags); +#else + preempt_enable(); +#endif +} + +static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw, + sector_t bi_sector, sector_t end_sector, + bool end, unsigned long duration, + struct dm_stats_aux *stats_aux) +{ + sector_t rel_sector, offset, todo, fragment_len; + size_t entry; + + if (end_sector <= s->start || bi_sector >= s->end) + return; + if (unlikely(bi_sector < s->start)) { + rel_sector = 0; + todo = end_sector - s->start; + } else { + rel_sector = bi_sector - s->start; + todo = end_sector - bi_sector; + } + if (unlikely(end_sector > s->end)) + todo -= (end_sector - s->end); + + offset = dm_sector_div64(rel_sector, s->step); + entry = rel_sector; + do { + if (WARN_ON_ONCE(entry >= s->n_entries)) { + DMCRIT("Invalid area access in region id %d", s->id); + return; + } + fragment_len = todo; + if (fragment_len > s->step - offset) + fragment_len = s->step - offset; + dm_stat_for_entry(s, entry, bi_rw, fragment_len, + stats_aux->merged, end, duration); + todo -= fragment_len; + entry++; + offset = 0; + } while (unlikely(todo != 0)); +} + +void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, + sector_t bi_sector, unsigned bi_sectors, bool end, + unsigned long duration, struct dm_stats_aux *stats_aux) +{ + struct dm_stat *s; + sector_t end_sector; + struct dm_stats_last_position *last; + + if (unlikely(!bi_sectors)) + return; + + end_sector = bi_sector + bi_sectors; + + if (!end) { + /* + * A race condition can at worst result in the merged flag being + * misrepresented, so we don't have to disable preemption here. 
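The splitting loop in __dm_stat_bio() above charges a bio to every region entry it overlaps. A minimal stand-alone model of that loop with assumed values (region starting at sector 0, an 8-sector step, and a 10-sector bio at sector 21):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t step = 8;                    /* region granularity in sectors */
	uint64_t rel_sector = 21;             /* bio start, already relative to the region start */
	uint64_t todo = 10;                   /* sectors of the bio that fall inside the region */
	uint64_t offset = rel_sector % step;
	uint64_t entry = rel_sector / step;

	while (todo) {
		uint64_t fragment = todo;

		if (fragment > step - offset)
			fragment = step - offset;
		printf("entry %llu gets %llu sectors\n",
		       (unsigned long long)entry, (unsigned long long)fragment);
		todo -= fragment;
		entry++;
		offset = 0;
	}
	return 0;   /* prints: entry 2 gets 3 sectors, entry 3 gets 7 sectors */
}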
+ */ + last = __this_cpu_ptr(stats->last); + stats_aux->merged = + (bi_sector == (ACCESS_ONCE(last->last_sector) && + ((bi_rw & (REQ_WRITE | REQ_DISCARD)) == + (ACCESS_ONCE(last->last_rw) & (REQ_WRITE | REQ_DISCARD))) + )); + ACCESS_ONCE(last->last_sector) = end_sector; + ACCESS_ONCE(last->last_rw) = bi_rw; + } + + rcu_read_lock(); + + list_for_each_entry_rcu(s, &stats->list, list_entry) + __dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux); + + rcu_read_unlock(); +} + +static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared, + struct dm_stat *s, size_t x) +{ + int cpu; + struct dm_stat_percpu *p; + + local_irq_disable(); + p = &s->stat_percpu[smp_processor_id()][x]; + dm_stat_round(shared, p); + local_irq_enable(); + + memset(&shared->tmp, 0, sizeof(shared->tmp)); + for_each_possible_cpu(cpu) { + p = &s->stat_percpu[cpu][x]; + shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]); + shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]); + shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]); + shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]); + shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]); + shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]); + shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]); + shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]); + shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]); + shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]); + shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total); + shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue); + } +} + +static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, + bool init_tmp_percpu_totals) +{ + size_t x; + struct dm_stat_shared *shared; + struct dm_stat_percpu *p; + + for (x = idx_start; x < idx_end; x++) { + shared = &s->stat_shared[x]; + if (init_tmp_percpu_totals) + __dm_stat_init_temporary_percpu_totals(shared, s, x); + local_irq_disable(); + p = &s->stat_percpu[smp_processor_id()][x]; + p->sectors[READ] -= shared->tmp.sectors[READ]; + p->sectors[WRITE] -= shared->tmp.sectors[WRITE]; + p->ios[READ] -= shared->tmp.ios[READ]; + p->ios[WRITE] -= shared->tmp.ios[WRITE]; + p->merges[READ] -= shared->tmp.merges[READ]; + p->merges[WRITE] -= shared->tmp.merges[WRITE]; + p->ticks[READ] -= shared->tmp.ticks[READ]; + p->ticks[WRITE] -= shared->tmp.ticks[WRITE]; + p->io_ticks[READ] -= shared->tmp.io_ticks[READ]; + p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE]; + p->io_ticks_total -= shared->tmp.io_ticks_total; + p->time_in_queue -= shared->tmp.time_in_queue; + local_irq_enable(); + } +} + +static int dm_stats_clear(struct dm_stats *stats, int id) +{ + struct dm_stat *s; + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + __dm_stat_clear(s, 0, s->n_entries, true); + + mutex_unlock(&stats->mutex); + + return 1; +} + +/* + * This is like jiffies_to_msec, but works for 64-bit values. 
+ */ +static unsigned long long dm_jiffies_to_msec64(unsigned long long j) +{ + unsigned long long result = 0; + unsigned mult; + + if (j) + result = jiffies_to_msecs(j & 0x3fffff); + if (j >= 1 << 22) { + mult = jiffies_to_msecs(1 << 22); + result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff); + } + if (j >= 1ULL << 44) + result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44); + + return result; +} + +static int dm_stats_print(struct dm_stats *stats, int id, + size_t idx_start, size_t idx_len, + bool clear, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct dm_stat *s; + size_t x; + sector_t start, end, step; + size_t idx_end; + struct dm_stat_shared *shared; + + /* + * Output format: + * <start_sector>+<length> counters + */ + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + idx_end = idx_start + idx_len; + if (idx_end < idx_start || + idx_end > s->n_entries) + idx_end = s->n_entries; + + if (idx_start > idx_end) + idx_start = idx_end; + + step = s->step; + start = s->start + (step * idx_start); + + for (x = idx_start; x < idx_end; x++, start = end) { + shared = &s->stat_shared[x]; + end = start + step; + if (unlikely(end > s->end)) + end = s->end; + + __dm_stat_init_temporary_percpu_totals(shared, s, x); + + DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n", + (unsigned long long)start, + (unsigned long long)step, + shared->tmp.ios[READ], + shared->tmp.merges[READ], + shared->tmp.sectors[READ], + dm_jiffies_to_msec64(shared->tmp.ticks[READ]), + shared->tmp.ios[WRITE], + shared->tmp.merges[WRITE], + shared->tmp.sectors[WRITE], + dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]), + dm_stat_in_flight(shared), + dm_jiffies_to_msec64(shared->tmp.io_ticks_total), + dm_jiffies_to_msec64(shared->tmp.time_in_queue), + dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]), + dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE])); + + if (unlikely(sz + 1 >= maxlen)) + goto buffer_overflow; + } + + if (clear) + __dm_stat_clear(s, idx_start, idx_end, false); + +buffer_overflow: + mutex_unlock(&stats->mutex); + + return 1; +} + +static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data) +{ + struct dm_stat *s; + const char *new_aux_data; + + mutex_lock(&stats->mutex); + + s = __dm_stats_find(stats, id); + if (!s) { + mutex_unlock(&stats->mutex); + return -ENOENT; + } + + new_aux_data = kstrdup(aux_data, GFP_KERNEL); + if (!new_aux_data) { + mutex_unlock(&stats->mutex); + return -ENOMEM; + } + + kfree(s->aux_data); + s->aux_data = new_aux_data; + + mutex_unlock(&stats->mutex); + + return 0; +} + +static int message_stats_create(struct mapped_device *md, + unsigned argc, char **argv, + char *result, unsigned maxlen) +{ + int id; + char dummy; + unsigned long long start, end, len, step; + unsigned divisor; + const char *program_id, *aux_data; + + /* + * Input format: + * <range> <step> [<program_id> [<aux_data>]] + */ + + if (argc < 3 || argc > 5) + return -EINVAL; + + if (!strcmp(argv[1], "-")) { + start = 0; + len = dm_get_size(md); + if (!len) + len = 1; + } else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 || + start != (sector_t)start || len != (sector_t)len) + return -EINVAL; + + end = start + len; + if (start >= end) + return -EINVAL; + + if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) { + step = end - start; + if (do_div(step, divisor)) + 
step++; + if (!step) + step = 1; + } else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 || + step != (sector_t)step || !step) + return -EINVAL; + + program_id = "-"; + aux_data = "-"; + + if (argc > 3) + program_id = argv[3]; + + if (argc > 4) + aux_data = argv[4]; + + /* + * If a buffer overflow happens after we created the region, + * it's too late (the userspace would retry with a larger + * buffer, but the region id that caused the overflow is already + * leaked). So we must detect buffer overflow in advance. + */ + snprintf(result, maxlen, "%d", INT_MAX); + if (dm_message_test_buffer_overflow(result, maxlen)) + return 1; + + id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data, + dm_internal_suspend, dm_internal_resume, md); + if (id < 0) + return id; + + snprintf(result, maxlen, "%d", id); + + return 1; +} + +static int message_stats_delete(struct mapped_device *md, + unsigned argc, char **argv) +{ + int id; + char dummy; + + if (argc != 2) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + return dm_stats_delete(dm_get_stats(md), id); +} + +static int message_stats_clear(struct mapped_device *md, + unsigned argc, char **argv) +{ + int id; + char dummy; + + if (argc != 2) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + return dm_stats_clear(dm_get_stats(md), id); +} + +static int message_stats_list(struct mapped_device *md, + unsigned argc, char **argv, + char *result, unsigned maxlen) +{ + int r; + const char *program = NULL; + + if (argc < 1 || argc > 2) + return -EINVAL; + + if (argc > 1) { + program = kstrdup(argv[1], GFP_KERNEL); + if (!program) + return -ENOMEM; + } + + r = dm_stats_list(dm_get_stats(md), program, result, maxlen); + + kfree(program); + + return r; +} + +static int message_stats_print(struct mapped_device *md, + unsigned argc, char **argv, bool clear, + char *result, unsigned maxlen) +{ + int id; + char dummy; + unsigned long idx_start = 0, idx_len = ULONG_MAX; + + if (argc != 2 && argc != 4) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + if (argc > 3) { + if (strcmp(argv[2], "-") && + sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1) + return -EINVAL; + if (strcmp(argv[3], "-") && + sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1) + return -EINVAL; + } + + return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear, + result, maxlen); +} + +static int message_stats_set_aux(struct mapped_device *md, + unsigned argc, char **argv) +{ + int id; + char dummy; + + if (argc != 3) + return -EINVAL; + + if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0) + return -EINVAL; + + return dm_stats_set_aux(dm_get_stats(md), id, argv[2]); +} + +int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, + char *result, unsigned maxlen) +{ + int r; + + if (dm_request_based(md)) { + DMWARN("Statistics are only supported for bio-based devices"); + return -EOPNOTSUPP; + } + + /* All messages here must start with '@' */ + if (!strcasecmp(argv[0], "@stats_create")) + r = message_stats_create(md, argc, argv, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_delete")) + r = message_stats_delete(md, argc, argv); + else if (!strcasecmp(argv[0], "@stats_clear")) + r = message_stats_clear(md, argc, argv); + else if (!strcasecmp(argv[0], "@stats_list")) + r = message_stats_list(md, argc, argv, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_print")) + r = 
message_stats_print(md, argc, argv, false, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_print_clear")) + r = message_stats_print(md, argc, argv, true, result, maxlen); + else if (!strcasecmp(argv[0], "@stats_set_aux")) + r = message_stats_set_aux(md, argc, argv); + else + return 2; /* this wasn't a stats message */ + + if (r == -EINVAL) + DMWARN("Invalid parameters for message %s", argv[0]); + + return r; +} + +int __init dm_statistics_init(void) +{ + shared_memory_amount = 0; + dm_stat_need_rcu_barrier = 0; + return 0; +} + +void dm_statistics_exit(void) +{ + if (dm_stat_need_rcu_barrier) + rcu_barrier(); + if (WARN_ON(shared_memory_amount)) + DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount); +} + +module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO); +MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics"); diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h new file mode 100644 index 00000000000..e7c4984bf23 --- /dev/null +++ b/drivers/md/dm-stats.h @@ -0,0 +1,40 @@ +#ifndef DM_STATS_H +#define DM_STATS_H + +#include <linux/types.h> +#include <linux/mutex.h> +#include <linux/list.h> + +int dm_statistics_init(void); +void dm_statistics_exit(void); + +struct dm_stats { + struct mutex mutex; + struct list_head list; /* list of struct dm_stat */ + struct dm_stats_last_position __percpu *last; + sector_t last_sector; + unsigned last_rw; +}; + +struct dm_stats_aux { + bool merged; +}; + +void dm_stats_init(struct dm_stats *st); +void dm_stats_cleanup(struct dm_stats *st); + +struct mapped_device; + +int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv, + char *result, unsigned maxlen); + +void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, + sector_t bi_sector, unsigned bi_sectors, bool end, + unsigned long duration, struct dm_stats_aux *aux); + +static inline bool dm_stats_used(struct dm_stats *st) +{ + return !list_empty(&st->list); +} + +#endif diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 4de90ab3968..d1600d2aa2e 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -5,6 +5,7 @@ */ #include "dm.h" +#include <linux/device-mapper.h> #include <linux/module.h> #include <linux/init.h> @@ -25,43 +26,40 @@ struct stripe { struct stripe_c { uint32_t stripes; + int stripes_shift; /* The size of this target / num. stripes */ sector_t stripe_width; - /* stripe chunk size */ - uint32_t chunk_shift; - sector_t chunk_mask; + uint32_t chunk_size; + int chunk_size_shift; /* Needed for handling events */ struct dm_target *ti; /* Work struct used for triggering events*/ - struct work_struct kstriped_ws; + struct work_struct trigger_event; struct stripe stripe[0]; }; -static struct workqueue_struct *kstriped; - /* * An event is triggered whenever a drive * drops out of a stripe volume. 
*/ static void trigger_event(struct work_struct *work) { - struct stripe_c *sc = container_of(work, struct stripe_c, kstriped_ws); - + struct stripe_c *sc = container_of(work, struct stripe_c, + trigger_event); dm_table_event(sc->ti->table); - } static inline struct stripe_c *alloc_context(unsigned int stripes) { size_t len; - if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), - stripes)) + if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), + stripes)) return NULL; len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); @@ -76,12 +74,12 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, unsigned int stripe, char **argv) { unsigned long long start; + char dummy; - if (sscanf(argv[1], "%llu", &start) != 1) + if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) return -EINVAL; - if (dm_get_device(ti, argv[0], start, sc->stripe_width, - dm_table_get_mode(ti->table), + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &sc->stripe[stripe].dev)) return -ENXIO; @@ -92,15 +90,14 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, /* * Construct a striped mapping. - * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+ + * <number of stripes> <chunk size> [<dev_path> <offset>]+ */ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct stripe_c *sc; - sector_t width; + sector_t width, tmp_len; uint32_t stripes; uint32_t chunk_size; - char *end; int r; unsigned int i; @@ -109,37 +106,27 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - stripes = simple_strtoul(argv[0], &end, 10); - if (*end) { + if (kstrtouint(argv[0], 10, &stripes) || !stripes) { ti->error = "Invalid stripe count"; return -EINVAL; } - chunk_size = simple_strtoul(argv[1], &end, 10); - if (*end) { + if (kstrtouint(argv[1], 10, &chunk_size) || !chunk_size) { ti->error = "Invalid chunk_size"; return -EINVAL; } - /* - * chunk_size is a power of two - */ - if (!is_power_of_2(chunk_size) || - (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { - ti->error = "Invalid chunk size"; - return -EINVAL; - } - - if (ti->len & (chunk_size - 1)) { + width = ti->len; + if (sector_div(width, stripes)) { ti->error = "Target length not divisible by " - "chunk size"; + "number of stripes"; return -EINVAL; } - width = ti->len; - if (sector_div(width, stripes)) { + tmp_len = width; + if (sector_div(tmp_len, chunk_size)) { ti->error = "Target length not divisible by " - "number of stripes"; + "chunk size"; return -EINVAL; } @@ -159,19 +146,31 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -ENOMEM; } - INIT_WORK(&sc->kstriped_ws, trigger_event); + INIT_WORK(&sc->trigger_event, trigger_event); /* Set pointer to dm target; used in trigger_event */ sc->ti = ti; - sc->stripes = stripes; sc->stripe_width = width; - ti->split_io = chunk_size; - sc->chunk_mask = ((sector_t) chunk_size) - 1; - for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) - chunk_size >>= 1; - sc->chunk_shift--; + if (stripes & (stripes - 1)) + sc->stripes_shift = -1; + else + sc->stripes_shift = __ffs(stripes); + + r = dm_set_target_max_io_len(ti, chunk_size); + if (r) + return r; + + ti->num_flush_bios = stripes; + ti->num_discard_bios = stripes; + ti->num_write_same_bios = stripes; + + sc->chunk_size = chunk_size; + if (chunk_size & (chunk_size - 1)) + sc->chunk_size_shift = -1; + else + sc->chunk_size_shift = __ffs(chunk_size); /* * Get the stripe destinations. 
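The next hunk replaces stripe_map() with a stripe_map_sector() helper that decomposes a target-relative sector into a stripe number and a device-relative sector. A minimal model of that decomposition with assumed geometry (4 stripes, 16-sector chunks; both powers of two, so the kernel path would use the *_shift fast paths prepared in stripe_ctr() above):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t stripes = 4, chunk_size = 16;        /* assumed geometry */
	uint64_t sector = 100;                        /* offset within the striped target */

	uint64_t chunk = sector / chunk_size;         /* 6 */
	uint64_t chunk_offset = sector % chunk_size;  /* 4 */
	uint32_t stripe = chunk % stripes;            /* 2 */

	chunk /= stripes;                             /* chunk 1 within that stripe */

	printf("target sector %llu -> stripe %u, sector %llu before adding physical_start\n",
	       (unsigned long long)sector, (unsigned)stripe,
	       (unsigned long long)(chunk * chunk_size + chunk_offset));   /* 20 */
	return 0;
}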
@@ -203,22 +202,105 @@ static void stripe_dtr(struct dm_target *ti) for (i = 0; i < sc->stripes; i++) dm_put_device(ti, sc->stripe[i].dev); - flush_workqueue(kstriped); + flush_work(&sc->trigger_event); kfree(sc); } -static int stripe_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static void stripe_map_sector(struct stripe_c *sc, sector_t sector, + uint32_t *stripe, sector_t *result) { - struct stripe_c *sc = (struct stripe_c *) ti->private; + sector_t chunk = dm_target_offset(sc->ti, sector); + sector_t chunk_offset; + + if (sc->chunk_size_shift < 0) + chunk_offset = sector_div(chunk, sc->chunk_size); + else { + chunk_offset = chunk & (sc->chunk_size - 1); + chunk >>= sc->chunk_size_shift; + } + + if (sc->stripes_shift < 0) + *stripe = sector_div(chunk, sc->stripes); + else { + *stripe = chunk & (sc->stripes - 1); + chunk >>= sc->stripes_shift; + } + + if (sc->chunk_size_shift < 0) + chunk *= sc->chunk_size; + else + chunk <<= sc->chunk_size_shift; + + *result = chunk + chunk_offset; +} + +static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, + uint32_t target_stripe, sector_t *result) +{ + uint32_t stripe; + + stripe_map_sector(sc, sector, &stripe, result); + if (stripe == target_stripe) + return; + + /* round down */ + sector = *result; + if (sc->chunk_size_shift < 0) + *result -= sector_div(sector, sc->chunk_size); + else + *result = sector & ~(sector_t)(sc->chunk_size - 1); + + if (target_stripe < stripe) + *result += sc->chunk_size; /* next chunk */ +} - sector_t offset = bio->bi_sector - ti->begin; - sector_t chunk = offset >> sc->chunk_shift; - uint32_t stripe = sector_div(chunk, sc->stripes); +static int stripe_map_range(struct stripe_c *sc, struct bio *bio, + uint32_t target_stripe) +{ + sector_t begin, end; + + stripe_map_range_sector(sc, bio->bi_iter.bi_sector, + target_stripe, &begin); + stripe_map_range_sector(sc, bio_end_sector(bio), + target_stripe, &end); + if (begin < end) { + bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; + bio->bi_iter.bi_sector = begin + + sc->stripe[target_stripe].physical_start; + bio->bi_iter.bi_size = to_bytes(end - begin); + return DM_MAPIO_REMAPPED; + } else { + /* The range doesn't map to the target stripe */ + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } +} + +static int stripe_map(struct dm_target *ti, struct bio *bio) +{ + struct stripe_c *sc = ti->private; + uint32_t stripe; + unsigned target_bio_nr; + + if (bio->bi_rw & REQ_FLUSH) { + target_bio_nr = dm_bio_get_target_bio_nr(bio); + BUG_ON(target_bio_nr >= sc->stripes); + bio->bi_bdev = sc->stripe[target_bio_nr].dev->bdev; + return DM_MAPIO_REMAPPED; + } + if (unlikely(bio->bi_rw & REQ_DISCARD) || + unlikely(bio->bi_rw & REQ_WRITE_SAME)) { + target_bio_nr = dm_bio_get_target_bio_nr(bio); + BUG_ON(target_bio_nr >= sc->stripes); + return stripe_map_range(sc, bio, target_bio_nr); + } + + stripe_map_sector(sc, bio->bi_iter.bi_sector, + &stripe, &bio->bi_iter.bi_sector); + bio->bi_iter.bi_sector += sc->stripe[stripe].physical_start; bio->bi_bdev = sc->stripe[stripe].dev->bdev; - bio->bi_sector = sc->stripe[stripe].physical_start + - (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); + return DM_MAPIO_REMAPPED; } @@ -235,8 +317,8 @@ static int stripe_map(struct dm_target *ti, struct bio *bio, * */ -static int stripe_status(struct dm_target *ti, - status_type_t type, char *result, unsigned int maxlen) +static void stripe_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) { struct 
stripe_c *sc = (struct stripe_c *) ti->private; char buffer[sc->stripes + 1]; @@ -257,17 +339,15 @@ static int stripe_status(struct dm_target *ti, case STATUSTYPE_TABLE: DMEMIT("%d %llu", sc->stripes, - (unsigned long long)sc->chunk_mask + 1); + (unsigned long long)sc->chunk_size); for (i = 0; i < sc->stripes; i++) DMEMIT(" %s %llu", sc->stripe[i].dev->name, (unsigned long long)sc->stripe[i].physical_start); break; } - return 0; } -static int stripe_end_io(struct dm_target *ti, struct bio *bio, - int error, union map_info *map_context) +static int stripe_end_io(struct dm_target *ti, struct bio *bio, int error) { unsigned i; char major_minor[16]; @@ -276,7 +356,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, if (!error) return 0; /* I/O complete */ - if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio)) + if ((error == -EWOULDBLOCK) && (bio->bi_rw & REQ_RAHEAD)) return error; if (error == -EOPNOTSUPP) @@ -284,8 +364,8 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, memset(major_minor, 0, sizeof(major_minor)); sprintf(major_minor, "%d:%d", - bio->bi_bdev->bd_disk->major, - bio->bi_bdev->bd_disk->first_minor); + MAJOR(disk_devt(bio->bi_bdev->bd_disk)), + MINOR(disk_devt(bio->bi_bdev->bd_disk))); /* * Test to see which stripe drive triggered the event @@ -298,21 +378,70 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, atomic_inc(&(sc->stripe[i].error_count)); if (atomic_read(&(sc->stripe[i].error_count)) < DM_IO_ERROR_THRESHOLD) - queue_work(kstriped, &sc->kstriped_ws); + schedule_work(&sc->trigger_event); } return error; } +static int stripe_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct stripe_c *sc = ti->private; + int ret = 0; + unsigned i = 0; + + do { + ret = fn(ti, sc->stripe[i].dev, + sc->stripe[i].physical_start, + sc->stripe_width, data); + } while (!ret && ++i < sc->stripes); + + return ret; +} + +static void stripe_io_hints(struct dm_target *ti, + struct queue_limits *limits) +{ + struct stripe_c *sc = ti->private; + unsigned chunk_size = sc->chunk_size << SECTOR_SHIFT; + + blk_limits_io_min(limits, chunk_size); + blk_limits_io_opt(limits, chunk_size * sc->stripes); +} + +static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct stripe_c *sc = ti->private; + sector_t bvm_sector = bvm->bi_sector; + uint32_t stripe; + struct request_queue *q; + + stripe_map_sector(sc, bvm_sector, &stripe, &bvm_sector); + + q = bdev_get_queue(sc->stripe[stripe].dev->bdev); + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = sc->stripe[stripe].dev->bdev; + bvm->bi_sector = sc->stripe[stripe].physical_start + bvm_sector; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static struct target_type stripe_target = { .name = "striped", - .version = {1, 1, 0}, + .version = {1, 5, 1}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, .map = stripe_map, .end_io = stripe_end_io, .status = stripe_status, + .iterate_devices = stripe_iterate_devices, + .io_hints = stripe_io_hints, + .merge = stripe_merge, }; int __init dm_stripe_init(void) @@ -320,14 +449,9 @@ int __init dm_stripe_init(void) int r; r = dm_register_target(&stripe_target); - if (r < 0) + if (r < 0) { DMWARN("target registration failed"); - - kstriped = create_singlethread_workqueue("kstriped"); - if (!kstriped) { - DMERR("failed to create workqueue kstriped"); - dm_unregister_target(&stripe_target); - return -ENOMEM; + return r; } return r; @@ 
-335,10 +459,5 @@ int __init dm_stripe_init(void) void dm_stripe_exit(void) { - if (dm_unregister_target(&stripe_target)) - DMWARN("target unregistration failed"); - - destroy_workqueue(kstriped); - - return; + dm_unregister_target(&stripe_target); } diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c new file mode 100644 index 00000000000..09a688b3d48 --- /dev/null +++ b/drivers/md/dm-switch.c @@ -0,0 +1,538 @@ +/* + * Copyright (C) 2010-2012 by Dell Inc. All rights reserved. + * Copyright (C) 2011-2013 Red Hat, Inc. + * + * This file is released under the GPL. + * + * dm-switch is a device-mapper target that maps IO to underlying block + * devices efficiently when there are a large number of fixed-sized + * address regions but there is no simple pattern to allow for a compact + * mapping representation such as dm-stripe. + */ + +#include <linux/device-mapper.h> + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/vmalloc.h> + +#define DM_MSG_PREFIX "switch" + +/* + * One region_table_slot_t holds <region_entries_per_slot> region table + * entries each of which is <region_table_entry_bits> in size. + */ +typedef unsigned long region_table_slot_t; + +/* + * A device with the offset to its start sector. + */ +struct switch_path { + struct dm_dev *dmdev; + sector_t start; +}; + +/* + * Context block for a dm switch device. + */ +struct switch_ctx { + struct dm_target *ti; + + unsigned nr_paths; /* Number of paths in path_list. */ + + unsigned region_size; /* Region size in 512-byte sectors */ + unsigned long nr_regions; /* Number of regions making up the device */ + signed char region_size_bits; /* log2 of region_size or -1 */ + + unsigned char region_table_entry_bits; /* Number of bits in one region table entry */ + unsigned char region_entries_per_slot; /* Number of entries in one region table slot */ + signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */ + + region_table_slot_t *region_table; /* Region table */ + + /* + * Array of dm devices to switch between. 
+ */ + struct switch_path path_list[0]; +}; + +static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths, + unsigned region_size) +{ + struct switch_ctx *sctx; + + sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_path), + GFP_KERNEL); + if (!sctx) + return NULL; + + sctx->ti = ti; + sctx->region_size = region_size; + + ti->private = sctx; + + return sctx; +} + +static int alloc_region_table(struct dm_target *ti, unsigned nr_paths) +{ + struct switch_ctx *sctx = ti->private; + sector_t nr_regions = ti->len; + sector_t nr_slots; + + if (!(sctx->region_size & (sctx->region_size - 1))) + sctx->region_size_bits = __ffs(sctx->region_size); + else + sctx->region_size_bits = -1; + + sctx->region_table_entry_bits = 1; + while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 && + (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths) + sctx->region_table_entry_bits++; + + sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits; + if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1))) + sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot); + else + sctx->region_entries_per_slot_bits = -1; + + if (sector_div(nr_regions, sctx->region_size)) + nr_regions++; + + sctx->nr_regions = nr_regions; + if (sctx->nr_regions != nr_regions || sctx->nr_regions >= ULONG_MAX) { + ti->error = "Region table too large"; + return -EINVAL; + } + + nr_slots = nr_regions; + if (sector_div(nr_slots, sctx->region_entries_per_slot)) + nr_slots++; + + if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) { + ti->error = "Region table too large"; + return -EINVAL; + } + + sctx->region_table = vmalloc(nr_slots * sizeof(region_table_slot_t)); + if (!sctx->region_table) { + ti->error = "Cannot allocate region table"; + return -ENOMEM; + } + + return 0; +} + +static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr, + unsigned long *region_index, unsigned *bit) +{ + if (sctx->region_entries_per_slot_bits >= 0) { + *region_index = region_nr >> sctx->region_entries_per_slot_bits; + *bit = region_nr & (sctx->region_entries_per_slot - 1); + } else { + *region_index = region_nr / sctx->region_entries_per_slot; + *bit = region_nr % sctx->region_entries_per_slot; + } + + *bit *= sctx->region_table_entry_bits; +} + +/* + * Find which path to use at given offset. + */ +static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) +{ + unsigned long region_index; + unsigned bit, path_nr; + sector_t p; + + p = offset; + if (sctx->region_size_bits >= 0) + p >>= sctx->region_size_bits; + else + sector_div(p, sctx->region_size); + + switch_get_position(sctx, p, ®ion_index, &bit); + path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & + ((1 << sctx->region_table_entry_bits) - 1); + + /* This can only happen if the processor uses non-atomic stores. 
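 *
 * Worked example of the packing used by switch_get_position() and this
 * lookup (illustrative numbers only): with nr_paths = 3, each region
 * table entry needs region_table_entry_bits = 2, so on a 64-bit machine
 * one region_table_slot_t holds 32 entries; region 100 therefore lives
 * in slot 100 / 32 = 3 at bit offset (100 % 32) * 2 = 8, and its path
 * number is (region_table[3] >> 8) & 3.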
*/ + if (unlikely(path_nr >= sctx->nr_paths)) + path_nr = 0; + + return path_nr; +} + +static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr, + unsigned value) +{ + unsigned long region_index; + unsigned bit; + region_table_slot_t pte; + + switch_get_position(sctx, region_nr, ®ion_index, &bit); + + pte = sctx->region_table[region_index]; + pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit); + pte |= (region_table_slot_t)value << bit; + sctx->region_table[region_index] = pte; +} + +/* + * Fill the region table with an initial round robin pattern. + */ +static void initialise_region_table(struct switch_ctx *sctx) +{ + unsigned path_nr = 0; + unsigned long region_nr; + + for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) { + switch_region_table_write(sctx, region_nr, path_nr); + if (++path_nr >= sctx->nr_paths) + path_nr = 0; + } +} + +static int parse_path(struct dm_arg_set *as, struct dm_target *ti) +{ + struct switch_ctx *sctx = ti->private; + unsigned long long start; + int r; + + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &sctx->path_list[sctx->nr_paths].dmdev); + if (r) { + ti->error = "Device lookup failed"; + return r; + } + + if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) { + ti->error = "Invalid device starting offset"; + dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); + return -EINVAL; + } + + sctx->path_list[sctx->nr_paths].start = start; + + sctx->nr_paths++; + + return 0; +} + +/* + * Destructor: Don't free the dm_target, just the ti->private data (if any). + */ +static void switch_dtr(struct dm_target *ti) +{ + struct switch_ctx *sctx = ti->private; + + while (sctx->nr_paths--) + dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); + + vfree(sctx->region_table); + kfree(sctx); +} + +/* + * Constructor arguments: + * <num_paths> <region_size> <num_optional_args> [<optional_args>...] + * [<dev_path> <offset>]+ + * + * Optional args are to allow for future extension: currently this + * parameter must be 0. 
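 *
 * Illustrative invocation (hypothetical device names and sizes): a 2 GiB
 * switch device with two paths and 64 MiB (131072-sector) regions could
 * be created with
 *
 *     dmsetup create sw0 --table "0 4194304 switch 2 131072 0 /dev/mapper/pathA 0 /dev/mapper/pathB 0"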
+ */ +static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + static struct dm_arg _args[] = { + {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"}, + {1, UINT_MAX, "Invalid region size"}, + {0, 0, "Invalid number of optional args"}, + }; + + struct switch_ctx *sctx; + struct dm_arg_set as; + unsigned nr_paths, region_size, nr_optional_args; + int r; + + as.argc = argc; + as.argv = argv; + + r = dm_read_arg(_args, &as, &nr_paths, &ti->error); + if (r) + return -EINVAL; + + r = dm_read_arg(_args + 1, &as, ®ion_size, &ti->error); + if (r) + return r; + + r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error); + if (r) + return r; + /* parse optional arguments here, if we add any */ + + if (as.argc != nr_paths * 2) { + ti->error = "Incorrect number of path arguments"; + return -EINVAL; + } + + sctx = alloc_switch_ctx(ti, nr_paths, region_size); + if (!sctx) { + ti->error = "Cannot allocate redirection context"; + return -ENOMEM; + } + + r = dm_set_target_max_io_len(ti, region_size); + if (r) + goto error; + + while (as.argc) { + r = parse_path(&as, ti); + if (r) + goto error; + } + + r = alloc_region_table(ti, nr_paths); + if (r) + goto error; + + initialise_region_table(sctx); + + /* For UNMAP, sending the request down any path is sufficient */ + ti->num_discard_bios = 1; + + return 0; + +error: + switch_dtr(ti); + + return r; +} + +static int switch_map(struct dm_target *ti, struct bio *bio) +{ + struct switch_ctx *sctx = ti->private; + sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector); + unsigned path_nr = switch_get_path_nr(sctx, offset); + + bio->bi_bdev = sctx->path_list[path_nr].dmdev->bdev; + bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset; + + return DM_MAPIO_REMAPPED; +} + +/* + * We need to parse hex numbers in the message as quickly as possible. + * + * This table-based hex parser improves performance. + * It improves a time to load 1000000 entries compared to the condition-based + * parser. 
+ * table-based parser condition-based parser + * PA-RISC 0.29s 0.31s + * Opteron 0.0495s 0.0498s + */ +static const unsigned char hex_table[256] = { +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, +255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 +}; + +static __always_inline unsigned long parse_hex(const char **string) +{ + unsigned char d; + unsigned long r = 0; + + while ((d = hex_table[(unsigned char)**string]) < 16) { + r = (r << 4) | d; + (*string)++; + } + + return r; +} + +static int process_set_region_mappings(struct switch_ctx *sctx, + unsigned argc, char **argv) +{ + unsigned i; + unsigned long region_index = 0; + + for (i = 1; i < argc; i++) { + unsigned long path_nr; + const char *string = argv[i]; + + if (*string == ':') + region_index++; + else { + region_index = parse_hex(&string); + if (unlikely(*string != ':')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + } + + string++; + if (unlikely(!*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + + path_nr = parse_hex(&string); + if (unlikely(*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + if (unlikely(region_index >= sctx->nr_regions)) { + DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions); + return -EINVAL; + } + if (unlikely(path_nr >= sctx->nr_paths)) { + DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths); + return -EINVAL; + } + + switch_region_table_write(sctx, region_index, path_nr); + } + + return 0; +} + +/* + * Messages are processed one-at-a-time. + * + * Only set_region_mappings is supported. 
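 *
 * Illustrative usage (same hypothetical "sw0" device as above): region
 * and path numbers are hexadecimal, and an argument starting with ':'
 * means "the region after the previous one", so
 *
 *     dmsetup message sw0 0 set_region_mappings 0:0 :1 :0 7b:1
 *
 * maps regions 0x0, 0x1 and 0x2 to paths 0, 1 and 0, and region 0x7b to
 * path 1.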
+ */ +static int switch_message(struct dm_target *ti, unsigned argc, char **argv) +{ + static DEFINE_MUTEX(message_mutex); + + struct switch_ctx *sctx = ti->private; + int r = -EINVAL; + + mutex_lock(&message_mutex); + + if (!strcasecmp(argv[0], "set_region_mappings")) + r = process_set_region_mappings(sctx, argc, argv); + else + DMWARN("Unrecognised message received."); + + mutex_unlock(&message_mutex); + + return r; +} + +static void switch_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct switch_ctx *sctx = ti->private; + unsigned sz = 0; + int path_nr; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size); + for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) + DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name, + (unsigned long long)sctx->path_list[path_nr].start); + break; + } +} + +/* + * Switch ioctl: + * + * Passthrough all ioctls to the path for sector 0 + */ +static int switch_ioctl(struct dm_target *ti, unsigned cmd, + unsigned long arg) +{ + struct switch_ctx *sctx = ti->private; + struct block_device *bdev; + fmode_t mode; + unsigned path_nr; + int r = 0; + + path_nr = switch_get_path_nr(sctx, 0); + + bdev = sctx->path_list[path_nr].dmdev->bdev; + mode = sctx->path_list[path_nr].dmdev->mode; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (ti->len + sctx->path_list[path_nr].start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); + + return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg); +} + +static int switch_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct switch_ctx *sctx = ti->private; + int path_nr; + int r; + + for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) { + r = fn(ti, sctx->path_list[path_nr].dmdev, + sctx->path_list[path_nr].start, ti->len, data); + if (r) + return r; + } + + return 0; +} + +static struct target_type switch_target = { + .name = "switch", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = switch_ctr, + .dtr = switch_dtr, + .map = switch_map, + .message = switch_message, + .status = switch_status, + .ioctl = switch_ioctl, + .iterate_devices = switch_iterate_devices, +}; + +static int __init dm_switch_init(void) +{ + int r; + + r = dm_register_target(&switch_target); + if (r < 0) + DMERR("dm_register_target() failed %d", r); + + return r; +} + +static void __exit dm_switch_exit(void) +{ + dm_unregister_target(&switch_target); +} + +module_init(dm_switch_init); +module_exit(dm_switch_exit); + +MODULE_DESCRIPTION(DM_NAME " dynamic path switching target"); +MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>"); +MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>"); +MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>"); +MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c new file mode 100644 index 00000000000..c62c5ab6aed --- /dev/null +++ b/drivers/md/dm-sysfs.c @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2008 Red Hat, Inc. All rights reserved. + * + * This file is released under the GPL. 
+ */ + +#include <linux/sysfs.h> +#include <linux/dm-ioctl.h> +#include "dm.h" + +struct dm_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct mapped_device *, char *); + ssize_t (*store)(struct mapped_device *, char *); +}; + +#define DM_ATTR_RO(_name) \ +struct dm_sysfs_attr dm_attr_##_name = \ + __ATTR(_name, S_IRUGO, dm_attr_##_name##_show, NULL) + +static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct dm_sysfs_attr *dm_attr; + struct mapped_device *md; + ssize_t ret; + + dm_attr = container_of(attr, struct dm_sysfs_attr, attr); + if (!dm_attr->show) + return -EIO; + + md = dm_get_from_kobject(kobj); + if (!md) + return -EINVAL; + + ret = dm_attr->show(md, page); + dm_put(md); + + return ret; +} + +static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf) +{ + if (dm_copy_name_and_uuid(md, buf, NULL)) + return -EIO; + + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) +{ + if (dm_copy_name_and_uuid(md, NULL, buf)) + return -EIO; + + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) +{ + sprintf(buf, "%d\n", dm_suspended_md(md)); + + return strlen(buf); +} + +static DM_ATTR_RO(name); +static DM_ATTR_RO(uuid); +static DM_ATTR_RO(suspended); + +static struct attribute *dm_attrs[] = { + &dm_attr_name.attr, + &dm_attr_uuid.attr, + &dm_attr_suspended.attr, + NULL, +}; + +static const struct sysfs_ops dm_sysfs_ops = { + .show = dm_attr_show, +}; + +/* + * dm kobject is embedded in mapped_device structure + * no need to define release function here + */ +static struct kobj_type dm_ktype = { + .sysfs_ops = &dm_sysfs_ops, + .default_attrs = dm_attrs, + .release = dm_kobject_release, +}; + +/* + * Initialize kobj + * because nobody using md yet, no need to call explicit dm_get/put + */ +int dm_sysfs_init(struct mapped_device *md) +{ + return kobject_init_and_add(dm_kobject(md), &dm_ktype, + &disk_to_dev(dm_disk(md))->kobj, + "%s", "dm"); +} + +/* + * Remove kobj, called after all references removed + */ +void dm_sysfs_exit(struct mapped_device *md) +{ + struct kobject *kobj = dm_kobject(md); + kobject_put(kobj); + wait_for_completion(dm_get_completion_from_kobject(kobj)); +} diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 61f44140923..5f59f1e3e5b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2001 Sistina Software (UK) Limited. - * Copyright (C) 2004 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -12,10 +12,12 @@ #include <linux/blkdev.h> #include <linux/namei.h> #include <linux/ctype.h> +#include <linux/string.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/mutex.h> -#include <asm/atomic.h> +#include <linux/delay.h> +#include <linux/atomic.h> #define DM_MSG_PREFIX "table" @@ -26,7 +28,7 @@ struct dm_table { struct mapped_device *md; - atomic_t holders; + unsigned type; /* btree table */ unsigned int depth; @@ -38,25 +40,27 @@ struct dm_table { sector_t *highs; struct dm_target *targets; + struct target_type *immutable_target_type; + unsigned integrity_supported:1; + unsigned singleton:1; + /* * Indicates the rw permissions for the new logical * device. This should be a combination of FMODE_READ * and FMODE_WRITE. 
*/ - int mode; + fmode_t mode; /* a list of devices used by this table */ struct list_head devices; - /* - * These are optimistic limits taken from all the - * targets, some targets will need smaller limits. - */ - struct io_restrictions limits; - /* events get handed up using this callback */ void (*event_fn)(void *); void *event_context; + + struct dm_md_mempools *mempools; + + struct list_head target_callbacks; }; /* @@ -75,42 +79,6 @@ static unsigned int int_log(unsigned int n, unsigned int base) } /* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - -/* - * Combine two io_restrictions, always taking the lower value. - */ -static void combine_restrictions_low(struct io_restrictions *lhs, - struct io_restrictions *rhs) -{ - lhs->max_sectors = - min_not_zero(lhs->max_sectors, rhs->max_sectors); - - lhs->max_phys_segments = - min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments); - - lhs->max_hw_segments = - min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); - - lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); - - lhs->max_segment_size = - min_not_zero(lhs->max_segment_size, rhs->max_segment_size); - - lhs->max_hw_sectors = - min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors); - - lhs->seg_boundary_mask = - min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask); - - lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn); - - lhs->no_cluster |= rhs->no_cluster; -} - -/* * Calculate the index of the child node of the n'th node k'th key. */ static inline unsigned int get_child(unsigned int n, unsigned int k) @@ -173,12 +141,11 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size) return NULL; size = nmemb * elem_size; - addr = vmalloc(size); - if (addr) - memset(addr, 0, size); + addr = vzalloc(size); return addr; } +EXPORT_SYMBOL(dm_vcalloc); /* * highs, and targets are managed as dynamic arrays during a @@ -188,7 +155,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num) { sector_t *n_highs; struct dm_target *n_targets; - int n = t->num_targets; /* * Allocate both the target array and offset array at once. 
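 *
 * (Layout sketch, for illustration: the single allocation holds the
 *  sector_t highs[num] array immediately followed by the
 *  struct dm_target targets[num] array -- see the
 *  n_targets = (struct dm_target *) (n_highs + num) assignment below --
 *  which is why dm_table_destroy() only needs vfree(t->highs) to release
 *  both.)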
@@ -202,12 +168,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num) n_targets = (struct dm_target *) (n_highs + num); - if (n) { - memcpy(n_highs, t->highs, sizeof(*n_highs) * n); - memcpy(n_targets, t->targets, sizeof(*n_targets) * n); - } - - memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); + memset(n_highs, -1, sizeof(*n_highs) * num); vfree(t->highs); t->num_allocated = num; @@ -217,7 +178,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num) return 0; } -int dm_table_create(struct dm_table **result, int mode, +int dm_table_create(struct dm_table **result, fmode_t mode, unsigned num_targets, struct mapped_device *md) { struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL); @@ -226,16 +187,20 @@ int dm_table_create(struct dm_table **result, int mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); - atomic_set(&t->holders, 1); + INIT_LIST_HEAD(&t->target_callbacks); if (!num_targets) num_targets = KEYS_PER_NODE; num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + if (!num_targets) { + kfree(t); + return -ENOMEM; + } + if (alloc_targets(t, num_targets)) { kfree(t); - t = NULL; return -ENOMEM; } @@ -250,16 +215,22 @@ static void free_devices(struct list_head *devices) struct list_head *tmp, *next; list_for_each_safe(tmp, next, devices) { - struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + struct dm_dev_internal *dd = + list_entry(tmp, struct dm_dev_internal, list); + DMWARN("dm_table_destroy: dm_put_device call missing for %s", + dd->dm_dev.name); kfree(dd); } } -static void table_destroy(struct dm_table *t) +void dm_table_destroy(struct dm_table *t) { unsigned int i; - /* free the indexes (see dm_table_complete) */ + if (!t) + return; + + /* free the indexes */ if (t->depth >= 2) vfree(t->index[t->depth - 2]); @@ -276,63 +247,22 @@ static void table_destroy(struct dm_table *t) vfree(t->highs); /* free the device list */ - if (t->devices.next != &t->devices) { - DMWARN("devices still present during destroy: " - "dm_table_remove_device calls missing"); + free_devices(&t->devices); - free_devices(&t->devices); - } + dm_free_md_mempools(t->mempools); kfree(t); } -void dm_table_get(struct dm_table *t) -{ - atomic_inc(&t->holders); -} - -void dm_table_put(struct dm_table *t) -{ - if (!t) - return; - - if (atomic_dec_and_test(&t->holders)) - table_destroy(t); -} - -/* - * Checks to see if we need to extend highs or targets. - */ -static inline int check_space(struct dm_table *t) -{ - if (t->num_targets >= t->num_allocated) - return alloc_targets(t, t->num_allocated * 2); - - return 0; -} - -/* - * Convert a device path to a dev_t. - */ -static int lookup_device(const char *path, dev_t *dev) -{ - struct block_device *bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - *dev = bdev->bd_dev; - bdput(bdev); - return 0; -} - /* * See if we've already got a device in the list. */ -static struct dm_dev *find_device(struct list_head *l, dev_t dev) +static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) { - struct dm_dev *dd; + struct dm_dev_internal *dd; list_for_each_entry (dd, l, list) - if (dd->bdev->bd_dev == dev) + if (dd->dm_dev.bdev->bd_dev == dev) return dd; return NULL; @@ -341,100 +271,167 @@ static struct dm_dev *find_device(struct list_head *l, dev_t dev) /* * Open a device so we can use it as a map destination. 
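 *
 * Targets do not call this helper directly; they go through
 * dm_get_device(), whose reworked fmode_t-based signature appears further
 * down.  An illustrative fragment from a hypothetical target constructor:
 *
 *     struct dm_dev *dev;
 *
 *     if (dm_get_device(ti, path, dm_table_get_mode(ti->table), &dev)) {
 *             ti->error = "Device lookup failed";
 *             return -EINVAL;
 *     }
 *     ...
 *     dm_put_device(ti, dev);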
*/ -static int open_dev(struct dm_dev *d, dev_t dev, struct mapped_device *md) +static int open_dev(struct dm_dev_internal *d, dev_t dev, + struct mapped_device *md) { static char *_claim_ptr = "I belong to device-mapper"; struct block_device *bdev; int r; - BUG_ON(d->bdev); + BUG_ON(d->dm_dev.bdev); - bdev = open_by_devnum(dev, d->mode); + bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); if (IS_ERR(bdev)) return PTR_ERR(bdev); - r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md)); - if (r) - blkdev_put(bdev); - else - d->bdev = bdev; - return r; + + r = bd_link_disk_holder(bdev, dm_disk(md)); + if (r) { + blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL); + return r; + } + + d->dm_dev.bdev = bdev; + return 0; } /* * Close a device that we've been using. */ -static void close_dev(struct dm_dev *d, struct mapped_device *md) +static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) { - if (!d->bdev) + if (!d->dm_dev.bdev) return; - bd_release_from_disk(d->bdev, dm_disk(md)); - blkdev_put(d->bdev); - d->bdev = NULL; + bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md)); + blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); + d->dm_dev.bdev = NULL; } /* - * If possible, this checks an area of a destination device is valid. + * If possible, this checks an area of a destination device is invalid. */ -static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len) +static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { - sector_t dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT; + struct request_queue *q; + struct queue_limits *limits = data; + struct block_device *bdev = dev->bdev; + sector_t dev_size = + i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + unsigned short logical_block_size_sectors = + limits->logical_block_size >> SECTOR_SHIFT; + char b[BDEVNAME_SIZE]; + + /* + * Some devices exist without request functions, + * such as loop devices not yet bound to backing files. + * Forbid the use of such devices. + */ + q = bdev_get_queue(bdev); + if (!q || !q->make_request_fn) { + DMWARN("%s: %s is not yet initialised: " + "start=%llu, len=%llu, dev_size=%llu", + dm_device_name(ti->table->md), bdevname(bdev, b), + (unsigned long long)start, + (unsigned long long)len, + (unsigned long long)dev_size); + return 1; + } if (!dev_size) + return 0; + + if ((start >= dev_size) || (start + len > dev_size)) { + DMWARN("%s: %s too small for target: " + "start=%llu, len=%llu, dev_size=%llu", + dm_device_name(ti->table->md), bdevname(bdev, b), + (unsigned long long)start, + (unsigned long long)len, + (unsigned long long)dev_size); + return 1; + } + + if (logical_block_size_sectors <= 1) + return 0; + + if (start & (logical_block_size_sectors - 1)) { + DMWARN("%s: start=%llu not aligned to h/w " + "logical block size %u of %s", + dm_device_name(ti->table->md), + (unsigned long long)start, + limits->logical_block_size, bdevname(bdev, b)); return 1; + } - return ((start < dev_size) && (len <= (dev_size - start))); + if (len & (logical_block_size_sectors - 1)) { + DMWARN("%s: len=%llu not aligned to h/w " + "logical block size %u of %s", + dm_device_name(ti->table->md), + (unsigned long long)len, + limits->logical_block_size, bdevname(bdev, b)); + return 1; + } + + return 0; } /* - * This upgrades the mode on an already open dm_dev. Being + * This upgrades the mode on an already open dm_dev, being * careful to leave things as they were if we fail to reopen the - * device. 
+ * device and not to touch the existing bdev field in case + * it is accessed concurrently inside dm_table_any_congested(). */ -static int upgrade_mode(struct dm_dev *dd, int new_mode, struct mapped_device *md) +static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, + struct mapped_device *md) { int r; - struct dm_dev dd_copy; - dev_t dev = dd->bdev->bd_dev; + struct dm_dev_internal dd_new, dd_old; - dd_copy = *dd; + dd_new = dd_old = *dd; - dd->mode |= new_mode; - dd->bdev = NULL; - r = open_dev(dd, dev, md); - if (!r) - close_dev(&dd_copy, md); - else - *dd = dd_copy; + dd_new.dm_dev.mode |= new_mode; + dd_new.dm_dev.bdev = NULL; - return r; + r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md); + if (r) + return r; + + dd->dm_dev.mode |= new_mode; + close_dev(&dd_old, md); + + return 0; } /* * Add a device to the list, or just increment the usage count if * it's already present. */ -static int __table_get_device(struct dm_table *t, struct dm_target *ti, - const char *path, sector_t start, sector_t len, - int mode, struct dm_dev **result) +int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result) { int r; dev_t uninitialized_var(dev); - struct dm_dev *dd; + struct dm_dev_internal *dd; unsigned int major, minor; + struct dm_table *t = ti->table; + char dummy; BUG_ON(!t); - if (sscanf(path, "%u:%u", &major, &minor) == 2) { + if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { /* Extract the major/minor numbers */ dev = MKDEV(major, minor); if (MAJOR(dev) != major || MINOR(dev) != minor) return -EOVERFLOW; } else { /* convert the path to a device */ - if ((r = lookup_device(path, &dev))) - return r; + struct block_device *bdev = lookup_bdev(path); + + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + dev = bdev->bd_dev; + bdput(bdev); } dd = find_device(&t->devices, dev); @@ -443,111 +440,81 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, if (!dd) return -ENOMEM; - dd->mode = mode; - dd->bdev = NULL; + dd->dm_dev.mode = mode; + dd->dm_dev.bdev = NULL; if ((r = open_dev(dd, dev, t->md))) { kfree(dd); return r; } - format_dev_t(dd->name, dev); + format_dev_t(dd->dm_dev.name, dev); atomic_set(&dd->count, 0); list_add(&dd->list, &t->devices); - } else if (dd->mode != (mode | dd->mode)) { + } else if (dd->dm_dev.mode != (mode | dd->dm_dev.mode)) { r = upgrade_mode(dd, mode, t->md); if (r) return r; } atomic_inc(&dd->count); - if (!check_device_area(dd, start, len)) { - DMWARN("device %s too small for target", path); - dm_put_device(ti, dd); - return -EINVAL; - } - - *result = dd; - + *result = &dd->dm_dev; return 0; } +EXPORT_SYMBOL(dm_get_device); -void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) +static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { + struct queue_limits *limits = data; + struct block_device *bdev = dev->bdev; struct request_queue *q = bdev_get_queue(bdev); - struct io_restrictions *rs = &ti->limits; + char b[BDEVNAME_SIZE]; - /* - * Combine the device limits low. 
- * - * FIXME: if we move an io_restriction struct - * into q this would just be a call to - * combine_restrictions_low() - */ - rs->max_sectors = - min_not_zero(rs->max_sectors, q->max_sectors); + if (unlikely(!q)) { + DMWARN("%s: Cannot set limits for nonexistent device %s", + dm_device_name(ti->table->md), bdevname(bdev, b)); + return 0; + } + + if (bdev_stack_limits(limits, bdev, start) < 0) + DMWARN("%s: adding target device %s caused an alignment inconsistency: " + "physical_block_size=%u, logical_block_size=%u, " + "alignment_offset=%u, start=%llu", + dm_device_name(ti->table->md), bdevname(bdev, b), + q->limits.physical_block_size, + q->limits.logical_block_size, + q->limits.alignment_offset, + (unsigned long long) start << SECTOR_SHIFT); /* * Check if merge fn is supported. * If not we'll force DM to use PAGE_SIZE or * smaller I/O, just to be safe. */ - - if (q->merge_bvec_fn && !ti->type->merge) - rs->max_sectors = - min_not_zero(rs->max_sectors, - (unsigned int) (PAGE_SIZE >> 9)); - - rs->max_phys_segments = - min_not_zero(rs->max_phys_segments, - q->max_phys_segments); - - rs->max_hw_segments = - min_not_zero(rs->max_hw_segments, q->max_hw_segments); - - rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); - - rs->max_segment_size = - min_not_zero(rs->max_segment_size, q->max_segment_size); - - rs->max_hw_sectors = - min_not_zero(rs->max_hw_sectors, q->max_hw_sectors); - - rs->seg_boundary_mask = - min_not_zero(rs->seg_boundary_mask, - q->seg_boundary_mask); - - rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn); - - rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); -} -EXPORT_SYMBOL_GPL(dm_set_device_limits); - -int dm_get_device(struct dm_target *ti, const char *path, sector_t start, - sector_t len, int mode, struct dm_dev **result) -{ - int r = __table_get_device(ti->table, ti, path, - start, len, mode, result); - - if (!r) - dm_set_device_limits(ti, (*result)->bdev); - - return r; + if (dm_queue_merge_is_compulsory(q) && !ti->type->merge) + blk_limits_max_hw_sectors(limits, + (unsigned int) (PAGE_SIZE >> 9)); + return 0; } /* - * Decrement a devices use count and remove it if necessary. + * Decrement a device's use count and remove it if necessary. */ -void dm_put_device(struct dm_target *ti, struct dm_dev *dd) +void dm_put_device(struct dm_target *ti, struct dm_dev *d) { + struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal, + dm_dev); + if (atomic_dec_and_test(&dd->count)) { close_dev(dd, ti->table->md); list_del(&dd->list); kfree(dd); } } +EXPORT_SYMBOL(dm_put_device); /* * Checks to see if the target joins onto the end of the table. @@ -565,14 +532,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti) /* * Used to dynamically allocate the arg array. + * + * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must + * process messages even if some device is suspended. These messages have a + * small fixed number of arguments. + * + * On the other hand, dm-switch needs to process bulk data using messages and + * excessive use of GFP_NOIO could cause trouble. */ static char **realloc_argv(unsigned *array_size, char **old_argv) { char **argv; unsigned new_size; + gfp_t gfp; - new_size = *array_size ? 
*array_size * 2 : 64; - argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); + if (*array_size) { + new_size = *array_size * 2; + gfp = GFP_KERNEL; + } else { + new_size = 8; + gfp = GFP_NOIO; + } + argv = kmalloc(new_size * sizeof(*argv), gfp); if (argv) { memcpy(argv, old_argv, *array_size * sizeof(*argv)); *array_size = new_size; @@ -602,11 +583,8 @@ int dm_split_args(int *argc, char ***argvp, char *input) return -ENOMEM; while (1) { - start = end; - /* Skip whitespace */ - while (*start && isspace(*start)) - start++; + start = skip_spaces(end); if (!*start) break; /* success, we hit the end */ @@ -648,24 +626,78 @@ int dm_split_args(int *argc, char ***argvp, char *input) return 0; } -static void check_for_valid_limits(struct io_restrictions *rs) +/* + * Impose necessary and sufficient conditions on a devices's table such + * that any incoming bio which respects its logical_block_size can be + * processed successfully. If it falls across the boundary between + * two or more targets, the size of each piece it gets split into must + * be compatible with the logical_block_size of the target processing it. + */ +static int validate_hardware_logical_block_alignment(struct dm_table *table, + struct queue_limits *limits) { - if (!rs->max_sectors) - rs->max_sectors = SAFE_MAX_SECTORS; - if (!rs->max_hw_sectors) - rs->max_hw_sectors = SAFE_MAX_SECTORS; - if (!rs->max_phys_segments) - rs->max_phys_segments = MAX_PHYS_SEGMENTS; - if (!rs->max_hw_segments) - rs->max_hw_segments = MAX_HW_SEGMENTS; - if (!rs->hardsect_size) - rs->hardsect_size = 1 << SECTOR_SHIFT; - if (!rs->max_segment_size) - rs->max_segment_size = MAX_SEGMENT_SIZE; - if (!rs->seg_boundary_mask) - rs->seg_boundary_mask = -1; - if (!rs->bounce_pfn) - rs->bounce_pfn = -1; + /* + * This function uses arithmetic modulo the logical_block_size + * (in units of 512-byte sectors). + */ + unsigned short device_logical_block_size_sects = + limits->logical_block_size >> SECTOR_SHIFT; + + /* + * Offset of the start of the next table entry, mod logical_block_size. + */ + unsigned short next_target_start = 0; + + /* + * Given an aligned bio that extends beyond the end of a + * target, how many sectors must the next target handle? + */ + unsigned short remaining = 0; + + struct dm_target *uninitialized_var(ti); + struct queue_limits ti_limits; + unsigned i = 0; + + /* + * Check each entry in the table in turn. + */ + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + blk_set_stacking_limits(&ti_limits); + + /* combine all target devices' limits */ + if (ti->type->iterate_devices) + ti->type->iterate_devices(ti, dm_set_device_limits, + &ti_limits); + + /* + * If the remaining sectors fall entirely within this + * table entry are they compatible with its logical_block_size? + */ + if (remaining < ti->len && + remaining & ((ti_limits.logical_block_size >> + SECTOR_SHIFT) - 1)) + break; /* Error */ + + next_target_start = + (unsigned short) ((next_target_start + ti->len) & + (device_logical_block_size_sects - 1)); + remaining = next_target_start ? 
+ device_logical_block_size_sects - next_target_start : 0; + } + + if (remaining) { + DMWARN("%s: table line %u (start sect %llu len %llu) " + "not aligned to h/w logical block size %u", + dm_device_name(table->md), i, + (unsigned long long) ti->begin, + (unsigned long long) ti->len, + limits->logical_block_size); + return -EINVAL; + } + + return 0; } int dm_table_add_target(struct dm_table *t, const char *type, @@ -675,8 +707,13 @@ int dm_table_add_target(struct dm_table *t, const char *type, char **argv; struct dm_target *tgt; - if ((r = check_space(t))) - return r; + if (t->singleton) { + DMERR("%s: target type %s must appear alone in table", + dm_device_name(t->md), t->targets->type->name); + return -EINVAL; + } + + BUG_ON(t->num_targets >= t->num_allocated); tgt = t->targets + t->num_targets; memset(tgt, 0, sizeof(*tgt)); @@ -693,6 +730,36 @@ int dm_table_add_target(struct dm_table *t, const char *type, return -EINVAL; } + if (dm_target_needs_singleton(tgt->type)) { + if (t->num_targets) { + DMERR("%s: target type %s must appear alone in table", + dm_device_name(t->md), type); + return -EINVAL; + } + t->singleton = 1; + } + + if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) { + DMERR("%s: target type %s may not be included in read-only tables", + dm_device_name(t->md), type); + return -EINVAL; + } + + if (t->immutable_target_type) { + if (t->immutable_target_type != tgt->type) { + DMERR("%s: immutable target type %s cannot be mixed with other target types", + dm_device_name(t->md), t->immutable_target_type->name); + return -EINVAL; + } + } else if (dm_target_is_immutable(tgt->type)) { + if (t->num_targets) { + DMERR("%s: immutable target type %s cannot be mixed with other target types", + dm_device_name(t->md), tgt->type->name); + return -EINVAL; + } + t->immutable_target_type = tgt->type; + } + tgt->table = t; tgt->begin = start; tgt->len = len; @@ -720,9 +787,10 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; - /* FIXME: the plan is to combine high here and then have - * the merge fn apply the target level restrictions. */ - combine_restrictions_low(&t->limits, &tgt->limits); + if (!tgt->num_discard_bios && tgt->discards_supported) + DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", + dm_device_name(t->md), type); + return 0; bad: @@ -731,6 +799,187 @@ int dm_table_add_target(struct dm_table *t, const char *type, return r; } +/* + * Target argument parsing helpers. 
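 *
 * Illustrative use from a hypothetical target constructor (names made
 * up), parsing a "<#feature args> [<arg>...]" group:
 *
 *     static struct dm_arg _args[] = {
 *             {0, 2, "Invalid number of feature args"},
 *     };
 *     struct dm_arg_set as;
 *     unsigned num_features;
 *     const char *arg_name;
 *     int r;
 *
 *     as.argc = argc;
 *     as.argv = argv;
 *
 *     r = dm_read_arg_group(_args, &as, &num_features, &ti->error);
 *     if (r)
 *             return r;
 *
 *     while (num_features--) {
 *             arg_name = dm_shift_arg(&as);
 *             ... interpret arg_name, or skip grouped values with
 *             dm_consume_args(&as, n) ...
 *     }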
+ */ +static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error, unsigned grouped) +{ + const char *arg_str = dm_shift_arg(arg_set); + char dummy; + + if (!arg_str || + (sscanf(arg_str, "%u%c", value, &dummy) != 1) || + (*value < arg->min) || + (*value > arg->max) || + (grouped && arg_set->argc < *value)) { + *error = arg->error; + return -EINVAL; + } + + return 0; +} + +int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 0); +} +EXPORT_SYMBOL(dm_read_arg); + +int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set, + unsigned *value, char **error) +{ + return validate_next_arg(arg, arg_set, value, error, 1); +} +EXPORT_SYMBOL(dm_read_arg_group); + +const char *dm_shift_arg(struct dm_arg_set *as) +{ + char *r; + + if (as->argc) { + as->argc--; + r = *as->argv; + as->argv++; + return r; + } + + return NULL; +} +EXPORT_SYMBOL(dm_shift_arg); + +void dm_consume_args(struct dm_arg_set *as, unsigned num_args) +{ + BUG_ON(as->argc < num_args); + as->argc -= num_args; + as->argv += num_args; +} +EXPORT_SYMBOL(dm_consume_args); + +static int dm_table_set_type(struct dm_table *t) +{ + unsigned i; + unsigned bio_based = 0, request_based = 0, hybrid = 0; + struct dm_target *tgt; + struct dm_dev_internal *dd; + struct list_head *devices; + unsigned live_md_type; + + for (i = 0; i < t->num_targets; i++) { + tgt = t->targets + i; + if (dm_target_hybrid(tgt)) + hybrid = 1; + else if (dm_target_request_based(tgt)) + request_based = 1; + else + bio_based = 1; + + if (bio_based && request_based) { + DMWARN("Inconsistent table: different target types" + " can't be mixed up"); + return -EINVAL; + } + } + + if (hybrid && !bio_based && !request_based) { + /* + * The targets can work either way. + * Determine the type from the live device. + * Default to bio-based if device is new. + */ + live_md_type = dm_get_md_type(t->md); + if (live_md_type == DM_TYPE_REQUEST_BASED) + request_based = 1; + else + bio_based = 1; + } + + if (bio_based) { + /* We must use this table as bio-based */ + t->type = DM_TYPE_BIO_BASED; + return 0; + } + + BUG_ON(!request_based); /* No targets in this table */ + + /* Non-request-stackable devices can't be used for request-based dm */ + devices = dm_table_get_devices(t); + list_for_each_entry(dd, devices, list) { + if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { + DMWARN("table load rejected: including" + " non-request-stackable devices"); + return -EINVAL; + } + } + + /* + * Request-based dm supports only tables that have a single target now. + * To support multiple targets, request splitting support is needed, + * and that needs lots of changes in the block-layer. + * (e.g. request completion process for partial completion.) 
+ */ + if (t->num_targets > 1) { + DMWARN("Request-based dm doesn't support multiple targets yet"); + return -EINVAL; + } + + t->type = DM_TYPE_REQUEST_BASED; + + return 0; +} + +unsigned dm_table_get_type(struct dm_table *t) +{ + return t->type; +} + +struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) +{ + return t->immutable_target_type; +} + +bool dm_table_request_based(struct dm_table *t) +{ + return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED; +} + +static int dm_table_alloc_md_mempools(struct dm_table *t) +{ + unsigned type = dm_table_get_type(t); + unsigned per_bio_data_size = 0; + struct dm_target *tgt; + unsigned i; + + if (unlikely(type == DM_TYPE_NONE)) { + DMWARN("no table type is set, can't allocate mempools"); + return -EINVAL; + } + + if (type == DM_TYPE_BIO_BASED) + for (i = 0; i < t->num_targets; i++) { + tgt = t->targets + i; + per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size); + } + + t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size); + if (!t->mempools) + return -ENOMEM; + + return 0; +} + +void dm_table_free_md_mempools(struct dm_table *t) +{ + dm_free_md_mempools(t->mempools); + t->mempools = NULL; +} + +struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t) +{ + return t->mempools; +} + static int setup_indexes(struct dm_table *t) { int i; @@ -760,13 +1009,11 @@ static int setup_indexes(struct dm_table *t) /* * Builds the btree to index the map. */ -int dm_table_complete(struct dm_table *t) +static int dm_table_build_index(struct dm_table *t) { int r = 0; unsigned int leaf_nodes; - check_for_valid_limits(&t->limits); - /* how many indexes will the btree have ? */ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); @@ -781,6 +1028,117 @@ int dm_table_complete(struct dm_table *t) return r; } +/* + * Get a disk whose integrity profile reflects the table's profile. + * If %match_all is true, all devices' profiles must match. + * If %match_all is false, all devices must at least have an + * allocated integrity profile; but uninitialized is ok. + * Returns NULL if integrity support was inconsistent or unavailable. + */ +static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, + bool match_all) +{ + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *dd = NULL; + struct gendisk *prev_disk = NULL, *template_disk = NULL; + + list_for_each_entry(dd, devices, list) { + template_disk = dd->dm_dev.bdev->bd_disk; + if (!blk_get_integrity(template_disk)) + goto no_integrity; + if (!match_all && !blk_integrity_is_initialized(template_disk)) + continue; /* skip uninitialized profiles */ + else if (prev_disk && + blk_integrity_compare(prev_disk, template_disk) < 0) + goto no_integrity; + prev_disk = template_disk; + } + + return template_disk; + +no_integrity: + if (prev_disk) + DMWARN("%s: integrity not set: %s and %s profile mismatch", + dm_device_name(t->md), + prev_disk->disk_name, + template_disk->disk_name); + return NULL; +} + +/* + * Register the mapped device for blk_integrity support if + * the underlying devices have an integrity profile. But all devices + * may not have matching profiles (checking all devices isn't reliable + * during table load because this table may use other DM device(s) which + * must be resumed before they will have an initialized integity profile). 
+ * Stacked DM devices force a 2 stage integrity profile validation: + * 1 - during load, validate all initialized integrity profiles match + * 2 - during resume, validate all integrity profiles match + */ +static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md) +{ + struct gendisk *template_disk = NULL; + + template_disk = dm_table_get_integrity_disk(t, false); + if (!template_disk) + return 0; + + if (!blk_integrity_is_initialized(dm_disk(md))) { + t->integrity_supported = 1; + return blk_integrity_register(dm_disk(md), NULL); + } + + /* + * If DM device already has an initalized integrity + * profile the new profile should not conflict. + */ + if (blk_integrity_is_initialized(template_disk) && + blk_integrity_compare(dm_disk(md), template_disk) < 0) { + DMWARN("%s: conflict with existing integrity profile: " + "%s profile mismatch", + dm_device_name(t->md), + template_disk->disk_name); + return 1; + } + + /* Preserve existing initialized integrity profile */ + t->integrity_supported = 1; + return 0; +} + +/* + * Prepares the table for use by building the indices, + * setting the type, and allocating mempools. + */ +int dm_table_complete(struct dm_table *t) +{ + int r; + + r = dm_table_set_type(t); + if (r) { + DMERR("unable to set table type"); + return r; + } + + r = dm_table_build_index(t); + if (r) { + DMERR("unable to build btrees"); + return r; + } + + r = dm_table_prealloc_integrity(t, t->md); + if (r) { + DMERR("could not register integrity profile."); + return r; + } + + r = dm_table_alloc_md_mempools(t); + if (r) + DMERR("unable to allocate mempools"); + + return r; +} + static DEFINE_MUTEX(_event_lock); void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context) @@ -804,11 +1162,13 @@ void dm_table_event(struct dm_table *t) t->event_fn(t->event_context); mutex_unlock(&_event_lock); } +EXPORT_SYMBOL(dm_table_event); sector_t dm_table_get_size(struct dm_table *t) { return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; } +EXPORT_SYMBOL(dm_table_get_size); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) { @@ -841,26 +1201,292 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) +static int count_device(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { + unsigned *num_devices = data; + + (*num_devices)++; + + return 0; +} + +/* + * Check whether a table has no data devices attached using each + * target's iterate_devices method. + * Returns false if the result is unknown because a target doesn't + * support iterate_devices. + */ +bool dm_table_has_no_data_devices(struct dm_table *table) +{ + struct dm_target *uninitialized_var(ti); + unsigned i = 0, num_devices = 0; + + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + if (!ti->type->iterate_devices) + return false; + + ti->type->iterate_devices(ti, count_device, &num_devices); + if (num_devices) + return false; + } + + return true; +} + +/* + * Establish the new table's queue_limits and validate them. 
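 *
 * For example, for the striped target (dm-stripe.c, above),
 * stripe_io_hints() feeds io_min = chunk_size and
 * io_opt = chunk_size * stripes into ti_limits via the ->io_hints hook
 * below, and blk_stack_limits() then merges those values into the
 * table-wide limits.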
+ */ +int dm_calculate_queue_limits(struct dm_table *table, + struct queue_limits *limits) +{ + struct dm_target *uninitialized_var(ti); + struct queue_limits ti_limits; + unsigned i = 0; + + blk_set_stacking_limits(limits); + + while (i < dm_table_get_num_targets(table)) { + blk_set_stacking_limits(&ti_limits); + + ti = dm_table_get_target(table, i++); + + if (!ti->type->iterate_devices) + goto combine_limits; + + /* + * Combine queue limits of all the devices this target uses. + */ + ti->type->iterate_devices(ti, dm_set_device_limits, + &ti_limits); + + /* Set I/O hints portion of queue limits */ + if (ti->type->io_hints) + ti->type->io_hints(ti, &ti_limits); + + /* + * Check each device area is consistent with the target's + * overall queue limits. + */ + if (ti->type->iterate_devices(ti, device_area_is_invalid, + &ti_limits)) + return -EINVAL; + +combine_limits: + /* + * Merge this target's queue limits into the overall limits + * for the table. + */ + if (blk_stack_limits(limits, &ti_limits, 0) < 0) + DMWARN("%s: adding target device " + "(start sect %llu len %llu) " + "caused an alignment inconsistency", + dm_device_name(table->md), + (unsigned long long) ti->begin, + (unsigned long long) ti->len); + } + + return validate_hardware_logical_block_alignment(table, limits); +} + +/* + * Set the integrity profile for this device if all devices used have + * matching profiles. We're quite deep in the resume path but still + * don't know if all devices (particularly DM devices this device + * may be stacked on) have matching profiles. Even if the profiles + * don't match we have no way to fail (to resume) at this point. + */ +static void dm_table_set_integrity(struct dm_table *t) +{ + struct gendisk *template_disk = NULL; + + if (!blk_get_integrity(dm_disk(t->md))) + return; + + template_disk = dm_table_get_integrity_disk(t, true); + if (template_disk) + blk_integrity_register(dm_disk(t->md), + blk_get_integrity(template_disk)); + else if (blk_integrity_is_initialized(dm_disk(t->md))) + DMWARN("%s: device no longer has a valid integrity profile", + dm_device_name(t->md)); + else + DMWARN("%s: unable to establish an integrity profile", + dm_device_name(t->md)); +} + +static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned flush = (*(unsigned *)data); + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && (q->flush_flags & flush); +} + +static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) +{ + struct dm_target *ti; + unsigned i = 0; + /* - * Make sure we obey the optimistic sub devices - * restrictions. + * Require at least one underlying device to support flushes. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting flushes must provide. 
*/ - blk_queue_max_sectors(q, t->limits.max_sectors); - q->max_phys_segments = t->limits.max_phys_segments; - q->max_hw_segments = t->limits.max_hw_segments; - q->hardsect_size = t->limits.hardsect_size; - q->max_segment_size = t->limits.max_segment_size; - q->max_hw_sectors = t->limits.max_hw_sectors; - q->seg_boundary_mask = t->limits.seg_boundary_mask; - q->bounce_pfn = t->limits.bounce_pfn; - - if (t->limits.no_cluster) - queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_flush_bios) + continue; + + if (ti->flush_supported) + return 1; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_flush_capable, &flush)) + return 1; + } + + return 0; +} + +static bool dm_table_discard_zeroes_data(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + /* Ensure that all targets supports discard_zeroes_data. */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (ti->discard_zeroes_data_unsupported) + return 0; + } + + return 1; +} + +static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && blk_queue_nonrot(q); +} + +static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !blk_queue_add_random(q); +} + +static bool dm_table_all_devices_attribute(struct dm_table *t, + iterate_devices_callout_fn func) +{ + struct dm_target *ti; + unsigned i = 0; + + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->type->iterate_devices || + !ti->type->iterate_devices(ti, func, NULL)) + return 0; + } + + return 1; +} + +static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !q->limits.max_write_same_sectors; +} + +static bool dm_table_supports_write_same(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_write_same_bios) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_write_same_capable, NULL)) + return false; + } + + return true; +} + +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits) +{ + unsigned flush = 0; + + /* + * Copy table's limits to the DM device's request_queue + */ + q->limits = *limits; + + if (!dm_table_supports_discards(t)) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); else - queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + + if (dm_table_supports_flush(t, REQ_FLUSH)) { + flush |= REQ_FLUSH; + if (dm_table_supports_flush(t, REQ_FUA)) + flush |= REQ_FUA; + } + blk_queue_flush(q, flush); + + if (!dm_table_discard_zeroes_data(t)) + q->limits.discard_zeroes_data = 0; + /* Ensure that all underlying devices are non-rotational. 
*/ + if (dm_table_all_devices_attribute(t, device_is_nonrot)) + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + else + queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q); + + if (!dm_table_supports_write_same(t)) + q->limits.max_write_same_sectors = 0; + + dm_table_set_integrity(t); + + /* + * Determine whether or not this queue's I/O timings contribute + * to the entropy pool, Only request-based targets use this. + * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not + * have it set. + */ + if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); + + /* + * QUEUE_FLAG_STACKABLE must be set after all queue settings are + * visible to other CPUs because, once the flag is set, incoming bios + * are processed by request-based dm, which refers to the queue + * settings. + * Until the flag set, bios are passed to bio-based dm and queued to + * md->deferred where queue settings are not needed yet. + * Those bios are passed to request-based dm at the resume time. + */ + smp_mb(); + if (dm_table_request_based(t)) + queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) @@ -873,10 +1499,11 @@ struct list_head *dm_table_get_devices(struct dm_table *t) return &t->devices; } -int dm_table_get_mode(struct dm_table *t) +fmode_t dm_table_get_mode(struct dm_table *t) { return t->mode; } +EXPORT_SYMBOL(dm_table_get_mode); static void suspend_targets(struct dm_table *t, unsigned postsuspend) { @@ -921,8 +1548,11 @@ int dm_table_resume_targets(struct dm_table *t) continue; r = ti->type->preresume(ti); - if (r) + if (r) { + DMERR("%s: %s: preresume failed, error = %d", + dm_device_name(t->md), ti->type->name, r); return r; + } } for (i = 0; i < t->num_targets; i++) { @@ -935,46 +1565,110 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } +void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) +{ + list_add(&cb->list, &t->target_callbacks); +} +EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); + int dm_table_any_congested(struct dm_table *t, int bdi_bits) { - struct dm_dev *dd; + struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); + struct dm_target_callbacks *cb; int r = 0; list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->bdev); - r |= bdi_congested(&q->backing_dev_info, bdi_bits); + struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); + char b[BDEVNAME_SIZE]; + + if (likely(q)) + r |= bdi_congested(&q->backing_dev_info, bdi_bits); + else + DMWARN_LIMIT("%s: any_congested: nonexistent device %s", + dm_device_name(t->md), + bdevname(dd->dm_dev.bdev, b)); } + list_for_each_entry(cb, &t->target_callbacks, list) + if (cb->congested_fn) + r |= cb->congested_fn(cb, bdi_bits); + return r; } -void dm_table_unplug_all(struct dm_table *t) +int dm_table_any_busy_target(struct dm_table *t) { - struct dm_dev *dd; - struct list_head *devices = dm_table_get_devices(t); - - list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->bdev); + unsigned i; + struct dm_target *ti; - blk_unplug(q); + for (i = 0; i < t->num_targets; i++) { + ti = t->targets + i; + if (ti->type->busy && ti->type->busy(ti)) + return 1; } + + return 0; } struct mapped_device *dm_table_get_md(struct dm_table *t) { - dm_get(t->md); - return t->md; } - -EXPORT_SYMBOL(dm_vcalloc); -EXPORT_SYMBOL(dm_get_device); -EXPORT_SYMBOL(dm_put_device); 
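The limit stacking, flush and discard checks above all funnel through each target's .iterate_devices hook. As a rough illustration only (not part of this patch), a single-device target would typically wire that hook up as below; "struct my_ctx" and "my_iterate_devices" are hypothetical names, and the underlying device/offset are assumed to have been set up by the target's constructor via dm_get_device().

/* Hypothetical per-target context, populated by the target's ctr. */
struct my_ctx {
	struct dm_dev *dev;	/* underlying device from dm_get_device() */
	sector_t start;		/* offset of this target within ->dev */
};

static int my_iterate_devices(struct dm_target *ti,
			      iterate_devices_callout_fn fn, void *data)
{
	struct my_ctx *mc = ti->private;

	/*
	 * Hand the single underlying device to the callout, e.g.
	 * dm_set_device_limits() or device_flush_capable() above.
	 */
	return fn(ti, mc->dev, mc->start, ti->len, data);
}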
-EXPORT_SYMBOL(dm_table_event); -EXPORT_SYMBOL(dm_table_get_size); -EXPORT_SYMBOL(dm_table_get_mode); EXPORT_SYMBOL(dm_table_get_md); -EXPORT_SYMBOL(dm_table_put); -EXPORT_SYMBOL(dm_table_get); -EXPORT_SYMBOL(dm_table_unplug_all); + +void dm_table_run_md_queue_async(struct dm_table *t) +{ + struct mapped_device *md; + struct request_queue *queue; + unsigned long flags; + + if (!dm_table_request_based(t)) + return; + + md = dm_table_get_md(t); + queue = dm_get_md_queue(md); + if (queue) { + spin_lock_irqsave(queue->queue_lock, flags); + blk_run_queue_async(queue); + spin_unlock_irqrestore(queue->queue_lock, flags); + } +} +EXPORT_SYMBOL(dm_table_run_md_queue_async); + +static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && blk_queue_discard(q); +} + +bool dm_table_supports_discards(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + /* + * Unless any target used by the table set discards_supported, + * require at least one underlying device to support discards. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting discard selectively must provide. + */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_discard_bios) + continue; + + if (ti->discards_supported) + return 1; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_discard_capable, NULL)) + return 1; + } + + return 0; +} diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 835cf95b857..242e3cec397 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -10,49 +10,37 @@ #include <linux/init.h> #include <linux/kmod.h> #include <linux/bio.h> -#include <linux/slab.h> #define DM_MSG_PREFIX "target" -struct tt_internal { - struct target_type tt; - - struct list_head list; - long use; -}; - static LIST_HEAD(_targets); static DECLARE_RWSEM(_lock); #define DM_MOD_NAME_SIZE 32 -static inline struct tt_internal *__find_target_type(const char *name) +static inline struct target_type *__find_target_type(const char *name) { - struct tt_internal *ti; + struct target_type *tt; - list_for_each_entry (ti, &_targets, list) - if (!strcmp(name, ti->tt.name)) - return ti; + list_for_each_entry(tt, &_targets, list) + if (!strcmp(name, tt->name)) + return tt; return NULL; } -static struct tt_internal *get_target_type(const char *name) +static struct target_type *get_target_type(const char *name) { - struct tt_internal *ti; + struct target_type *tt; down_read(&_lock); - ti = __find_target_type(name); - if (ti) { - if ((ti->use == 0) && !try_module_get(ti->tt.module)) - ti = NULL; - else - ti->use++; - } + tt = __find_target_type(name); + if (tt && !try_module_get(tt->module)) + tt = NULL; up_read(&_lock); - return ti; + return tt; } static void load_module(const char *name) @@ -62,122 +50,100 @@ static void load_module(const char *name) struct target_type *dm_get_target_type(const char *name) { - struct tt_internal *ti = get_target_type(name); + struct target_type *tt = get_target_type(name); - if (!ti) { + if (!tt) { load_module(name); - ti = get_target_type(name); + tt = get_target_type(name); } - return ti ? 
&ti->tt : NULL; + return tt; } -void dm_put_target_type(struct target_type *t) +void dm_put_target_type(struct target_type *tt) { - struct tt_internal *ti = (struct tt_internal *) t; - down_read(&_lock); - if (--ti->use == 0) - module_put(ti->tt.module); - - BUG_ON(ti->use < 0); + module_put(tt->module); up_read(&_lock); - - return; -} - -static struct tt_internal *alloc_target(struct target_type *t) -{ - struct tt_internal *ti = kzalloc(sizeof(*ti), GFP_KERNEL); - - if (ti) - ti->tt = *t; - - return ti; } - int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param) { - struct tt_internal *ti; + struct target_type *tt; down_read(&_lock); - list_for_each_entry (ti, &_targets, list) - iter_func(&ti->tt, param); + list_for_each_entry(tt, &_targets, list) + iter_func(tt, param); up_read(&_lock); return 0; } -int dm_register_target(struct target_type *t) +int dm_register_target(struct target_type *tt) { int rv = 0; - struct tt_internal *ti = alloc_target(t); - - if (!ti) - return -ENOMEM; down_write(&_lock); - if (__find_target_type(t->name)) + if (__find_target_type(tt->name)) rv = -EEXIST; else - list_add(&ti->list, &_targets); + list_add(&tt->list, &_targets); up_write(&_lock); - if (rv) - kfree(ti); return rv; } -int dm_unregister_target(struct target_type *t) +void dm_unregister_target(struct target_type *tt) { - struct tt_internal *ti; - down_write(&_lock); - if (!(ti = __find_target_type(t->name))) { - up_write(&_lock); - return -EINVAL; + if (!__find_target_type(tt->name)) { + DMCRIT("Unregistering unrecognised target: %s", tt->name); + BUG(); } - if (ti->use) { - up_write(&_lock); - return -ETXTBSY; - } - - list_del(&ti->list); - kfree(ti); + list_del(&tt->list); up_write(&_lock); - return 0; } /* * io-err: always fails an io, useful for bringing * up LVs that have holes in them. */ -static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) +static int io_err_ctr(struct dm_target *tt, unsigned int argc, char **args) { + /* + * Return error for discards instead of -EOPNOTSUPP + */ + tt->num_discard_bios = 1; + return 0; } -static void io_err_dtr(struct dm_target *ti) +static void io_err_dtr(struct dm_target *tt) { /* empty */ } -static int io_err_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static int io_err_map(struct dm_target *tt, struct bio *bio) +{ + return -EIO; +} + +static int io_err_map_rq(struct dm_target *ti, struct request *clone, + union map_info *map_context) { return -EIO; } static struct target_type error_target = { .name = "error", - .version = {1, 0, 1}, + .version = {1, 2, 0}, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, + .map_rq = io_err_map_rq, }; int __init dm_target_init(void) @@ -187,8 +153,7 @@ int __init dm_target_init(void) void dm_target_exit(void) { - if (dm_unregister_target(&error_target)) - DMWARN("error target unregistration failed"); + dm_unregister_target(&error_target); } EXPORT_SYMBOL(dm_register_target); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c new file mode 100644 index 00000000000..e9d33ad59df --- /dev/null +++ b/drivers/md/dm-thin-metadata.c @@ -0,0 +1,1815 @@ +/* + * Copyright (C) 2011-2012 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#include "dm-thin-metadata.h" +#include "persistent-data/dm-btree.h" +#include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-space-map-disk.h" +#include "persistent-data/dm-transaction-manager.h" + +#include <linux/list.h> +#include <linux/device-mapper.h> +#include <linux/workqueue.h> + +/*-------------------------------------------------------------------------- + * As far as the metadata goes, there is: + * + * - A superblock in block zero, taking up fewer than 512 bytes for + * atomic writes. + * + * - A space map managing the metadata blocks. + * + * - A space map managing the data blocks. + * + * - A btree mapping our internal thin dev ids onto struct disk_device_details. + * + * - A hierarchical btree, with 2 levels which effectively maps (thin + * dev id, virtual block) -> block_time. Block time is a 64-bit + * field holding the time in the low 24 bits, and block in the top 48 + * bits. + * + * BTrees consist solely of btree_nodes, that fill a block. Some are + * internal nodes, as such their values are a __le64 pointing to other + * nodes. Leaf nodes can store data of any reasonable size (ie. much + * smaller than the block size). The nodes consist of the header, + * followed by an array of keys, followed by an array of values. We have + * to binary search on the keys so they're all held together to help the + * cpu cache. + * + * Space maps have 2 btrees: + * + * - One maps a uint64_t onto a struct index_entry. Which points to a + * bitmap block, and has some details about how many free entries there + * are etc. + * + * - The bitmap blocks have a header (for the checksum). Then the rest + * of the block is pairs of bits. With the meaning being: + * + * 0 - ref count is 0 + * 1 - ref count is 1 + * 2 - ref count is 2 + * 3 - ref count is higher than 2 + * + * - If the count is higher than 2 then the ref count is entered in a + * second btree that directly maps the block_address to a uint32_t ref + * count. + * + * The space map metadata variant doesn't have a bitmaps btree. Instead + * it has one single blocks worth of index_entries. This avoids + * recursive issues with the bitmap btree needing to allocate space in + * order to insert. With a small data block size such as 64k the + * metadata support data devices that are hundreds of terrabytes. + * + * The space maps allocate space linearly from front to back. Space that + * is freed in a transaction is never recycled within that transaction. + * To try and avoid fragmenting _free_ space the allocator always goes + * back and fills in gaps. + * + * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks + * from the block manager. + *--------------------------------------------------------------------------*/ + +#define DM_MSG_PREFIX "thin metadata" + +#define THIN_SUPERBLOCK_MAGIC 27022010 +#define THIN_SUPERBLOCK_LOCATION 0 +#define THIN_VERSION 2 +#define THIN_METADATA_CACHE_SIZE 64 +#define SECTOR_TO_BLOCK_SHIFT 3 + +/* + * 3 for btree insert + + * 2 for btree lookup used within space map + */ +#define THIN_MAX_CONCURRENT_LOCKS 5 + +/* This should be plenty */ +#define SPACE_MAP_ROOT_SIZE 128 + +/* + * Little endian on-disk superblock and device details. + */ +struct thin_disk_superblock { + __le32 csum; /* Checksum of superblock except for this field. */ + __le32 flags; + __le64 blocknr; /* This block number, dm_block_t. */ + + __u8 uuid[16]; + __le64 magic; + __le32 version; + __le32 time; + + __le64 trans_id; + + /* + * Root held by userspace transactions. 
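+	 * Zero when no metadata snapshot is held; set by __reserve_metadata_snap()
+	 * and cleared again by __release_metadata_snap().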
+ */ + __le64 held_root; + + __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; + + /* + * 2-level btree mapping (dev_id, (dev block, time)) -> data block + */ + __le64 data_mapping_root; + + /* + * Device detail root mapping dev_id -> device_details + */ + __le64 device_details_root; + + __le32 data_block_size; /* In 512-byte sectors. */ + + __le32 metadata_block_size; /* In 512-byte sectors. */ + __le64 metadata_nr_blocks; + + __le32 compat_flags; + __le32 compat_ro_flags; + __le32 incompat_flags; +} __packed; + +struct disk_device_details { + __le64 mapped_blocks; + __le64 transaction_id; /* When created. */ + __le32 creation_time; + __le32 snapshotted_time; +} __packed; + +struct dm_pool_metadata { + struct hlist_node hash; + + struct block_device *bdev; + struct dm_block_manager *bm; + struct dm_space_map *metadata_sm; + struct dm_space_map *data_sm; + struct dm_transaction_manager *tm; + struct dm_transaction_manager *nb_tm; + + /* + * Two-level btree. + * First level holds thin_dev_t. + * Second level holds mappings. + */ + struct dm_btree_info info; + + /* + * Non-blocking version of the above. + */ + struct dm_btree_info nb_info; + + /* + * Just the top level for deleting whole devices. + */ + struct dm_btree_info tl_info; + + /* + * Just the bottom level for creating new devices. + */ + struct dm_btree_info bl_info; + + /* + * Describes the device details btree. + */ + struct dm_btree_info details_info; + + struct rw_semaphore root_lock; + uint32_t time; + dm_block_t root; + dm_block_t details_root; + struct list_head thin_devices; + uint64_t trans_id; + unsigned long flags; + sector_t data_block_size; + bool read_only:1; + + /* + * Set if a transaction has to be aborted but the attempt to roll back + * to the previous (good) transaction failed. The only pool metadata + * operation possible in this state is the closing of the device. + */ + bool fail_io:1; + + /* + * Reading the space map roots can fail, so we read it into these + * buffers before the superblock is locked and updated. 
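+	 * (Filled in by save_sm_roots() and copied into the superblock by
+	 * copy_sm_roots().)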
+ */ + __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; + __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; +}; + +struct dm_thin_device { + struct list_head list; + struct dm_pool_metadata *pmd; + dm_thin_id id; + + int open_count; + bool changed:1; + bool aborted_with_changes:1; + uint64_t mapped_blocks; + uint64_t transaction_id; + uint32_t creation_time; + uint32_t snapshotted_time; +}; + +/*---------------------------------------------------------------- + * superblock validator + *--------------------------------------------------------------*/ + +#define SUPERBLOCK_CSUM_XOR 160774 + +static void sb_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct thin_disk_superblock *disk_super = dm_block_data(b); + + disk_super->blocknr = cpu_to_le64(dm_block_location(b)); + disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); +} + +static int sb_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct thin_disk_superblock *disk_super = dm_block_data(b); + __le32 csum_le; + + if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { + DMERR("sb_check failed: blocknr %llu: " + "wanted %llu", le64_to_cpu(disk_super->blocknr), + (unsigned long long)dm_block_location(b)); + return -ENOTBLK; + } + + if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) { + DMERR("sb_check failed: magic %llu: " + "wanted %llu", le64_to_cpu(disk_super->magic), + (unsigned long long)THIN_SUPERBLOCK_MAGIC); + return -EILSEQ; + } + + csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); + if (csum_le != disk_super->csum) { + DMERR("sb_check failed: csum %u: wanted %u", + le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator sb_validator = { + .name = "superblock", + .prepare_for_write = sb_prepare_for_write, + .check = sb_check +}; + +/*---------------------------------------------------------------- + * Methods for the btree value types + *--------------------------------------------------------------*/ + +static uint64_t pack_block_time(dm_block_t b, uint32_t t) +{ + return (b << 24) | t; +} + +static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) +{ + *b = v >> 24; + *t = v & ((1 << 24) - 1); +} + +static void data_block_inc(void *context, const void *value_le) +{ + struct dm_space_map *sm = context; + __le64 v_le; + uint64_t b; + uint32_t t; + + memcpy(&v_le, value_le, sizeof(v_le)); + unpack_block_time(le64_to_cpu(v_le), &b, &t); + dm_sm_inc_block(sm, b); +} + +static void data_block_dec(void *context, const void *value_le) +{ + struct dm_space_map *sm = context; + __le64 v_le; + uint64_t b; + uint32_t t; + + memcpy(&v_le, value_le, sizeof(v_le)); + unpack_block_time(le64_to_cpu(v_le), &b, &t); + dm_sm_dec_block(sm, b); +} + +static int data_block_equal(void *context, const void *value1_le, const void *value2_le) +{ + __le64 v1_le, v2_le; + uint64_t b1, b2; + uint32_t t; + + memcpy(&v1_le, value1_le, sizeof(v1_le)); + memcpy(&v2_le, value2_le, sizeof(v2_le)); + unpack_block_time(le64_to_cpu(v1_le), &b1, &t); + unpack_block_time(le64_to_cpu(v2_le), &b2, &t); + + return b1 == b2; +} + +static void subtree_inc(void *context, const void *value) +{ + struct dm_btree_info *info = context; + __le64 root_le; + uint64_t root; + + memcpy(&root_le, value, sizeof(root_le)); + root = le64_to_cpu(root_le); + 
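+	/*
+	 * Taking an extra reference on the subtree root is all that is needed
+	 * to share it: the btree nodes underneath are only shadowed (copied)
+	 * later, if and when they are written to again.
+	 */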
dm_tm_inc(info->tm, root); +} + +static void subtree_dec(void *context, const void *value) +{ + struct dm_btree_info *info = context; + __le64 root_le; + uint64_t root; + + memcpy(&root_le, value, sizeof(root_le)); + root = le64_to_cpu(root_le); + if (dm_btree_del(info, root)) + DMERR("btree delete failed\n"); +} + +static int subtree_equal(void *context, const void *value1_le, const void *value2_le) +{ + __le64 v1_le, v2_le; + memcpy(&v1_le, value1_le, sizeof(v1_le)); + memcpy(&v2_le, value2_le, sizeof(v2_le)); + + return v1_le == v2_le; +} + +/*----------------------------------------------------------------*/ + +static int superblock_lock_zero(struct dm_pool_metadata *pmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int superblock_lock(struct dm_pool_metadata *pmd, + struct dm_block **sblock) +{ + return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, sblock); +} + +static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) +{ + int r; + unsigned i; + struct dm_block *b; + __le64 *data_le, zero = cpu_to_le64(0); + unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64); + + /* + * We can't use a validator here - it may be all zeroes. + */ + r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b); + if (r) + return r; + + data_le = dm_block_data(b); + *result = 1; + for (i = 0; i < block_size; i++) { + if (data_le[i] != zero) { + *result = 0; + break; + } + } + + return dm_bm_unlock(b); +} + +static void __setup_btree_details(struct dm_pool_metadata *pmd) +{ + pmd->info.tm = pmd->tm; + pmd->info.levels = 2; + pmd->info.value_type.context = pmd->data_sm; + pmd->info.value_type.size = sizeof(__le64); + pmd->info.value_type.inc = data_block_inc; + pmd->info.value_type.dec = data_block_dec; + pmd->info.value_type.equal = data_block_equal; + + memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); + pmd->nb_info.tm = pmd->nb_tm; + + pmd->tl_info.tm = pmd->tm; + pmd->tl_info.levels = 1; + pmd->tl_info.value_type.context = &pmd->bl_info; + pmd->tl_info.value_type.size = sizeof(__le64); + pmd->tl_info.value_type.inc = subtree_inc; + pmd->tl_info.value_type.dec = subtree_dec; + pmd->tl_info.value_type.equal = subtree_equal; + + pmd->bl_info.tm = pmd->tm; + pmd->bl_info.levels = 1; + pmd->bl_info.value_type.context = pmd->data_sm; + pmd->bl_info.value_type.size = sizeof(__le64); + pmd->bl_info.value_type.inc = data_block_inc; + pmd->bl_info.value_type.dec = data_block_dec; + pmd->bl_info.value_type.equal = data_block_equal; + + pmd->details_info.tm = pmd->tm; + pmd->details_info.levels = 1; + pmd->details_info.value_type.context = NULL; + pmd->details_info.value_type.size = sizeof(struct disk_device_details); + pmd->details_info.value_type.inc = NULL; + pmd->details_info.value_type.dec = NULL; + pmd->details_info.value_type.equal = NULL; +} + +static int save_sm_roots(struct dm_pool_metadata *pmd) +{ + int r; + size_t len; + + r = dm_sm_root_size(pmd->metadata_sm, &len); + if (r < 0) + return r; + + r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len); + if (r < 0) + return r; + + r = dm_sm_root_size(pmd->data_sm, &len); + if (r < 0) + return r; + + return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len); +} + +static void copy_sm_roots(struct dm_pool_metadata *pmd, + struct thin_disk_superblock *disk) +{ + memcpy(&disk->metadata_space_map_root, + &pmd->metadata_space_map_root, + sizeof(pmd->metadata_space_map_root)); + + 
memcpy(&disk->data_space_map_root, + &pmd->data_space_map_root, + sizeof(pmd->data_space_map_root)); +} + +static int __write_initial_superblock(struct dm_pool_metadata *pmd) +{ + int r; + struct dm_block *sblock; + struct thin_disk_superblock *disk_super; + sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT; + + if (bdev_size > THIN_METADATA_MAX_SECTORS) + bdev_size = THIN_METADATA_MAX_SECTORS; + + r = dm_sm_commit(pmd->data_sm); + if (r < 0) + return r; + + r = save_sm_roots(pmd); + if (r < 0) + return r; + + r = dm_tm_pre_commit(pmd->tm); + if (r < 0) + return r; + + r = superblock_lock_zero(pmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + disk_super->flags = 0; + memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); + disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); + disk_super->version = cpu_to_le32(THIN_VERSION); + disk_super->time = 0; + disk_super->trans_id = 0; + disk_super->held_root = 0; + + copy_sm_roots(pmd, disk_super); + + disk_super->data_mapping_root = cpu_to_le64(pmd->root); + disk_super->device_details_root = cpu_to_le64(pmd->details_root); + disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE); + disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); + disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); + + return dm_tm_commit(pmd->tm, sblock); +} + +static int __format_metadata(struct dm_pool_metadata *pmd) +{ + int r; + + r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &pmd->tm, &pmd->metadata_sm); + if (r < 0) { + DMERR("tm_create_with_sm failed"); + return r; + } + + pmd->data_sm = dm_sm_disk_create(pmd->tm, 0); + if (IS_ERR(pmd->data_sm)) { + DMERR("sm_disk_create failed"); + r = PTR_ERR(pmd->data_sm); + goto bad_cleanup_tm; + } + + pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm); + if (!pmd->nb_tm) { + DMERR("could not create non-blocking clone tm"); + r = -ENOMEM; + goto bad_cleanup_data_sm; + } + + __setup_btree_details(pmd); + + r = dm_btree_empty(&pmd->info, &pmd->root); + if (r < 0) + goto bad_cleanup_nb_tm; + + r = dm_btree_empty(&pmd->details_info, &pmd->details_root); + if (r < 0) { + DMERR("couldn't create devices root"); + goto bad_cleanup_nb_tm; + } + + r = __write_initial_superblock(pmd); + if (r) + goto bad_cleanup_nb_tm; + + return 0; + +bad_cleanup_nb_tm: + dm_tm_destroy(pmd->nb_tm); +bad_cleanup_data_sm: + dm_sm_destroy(pmd->data_sm); +bad_cleanup_tm: + dm_tm_destroy(pmd->tm); + dm_sm_destroy(pmd->metadata_sm); + + return r; +} + +static int __check_incompat_features(struct thin_disk_superblock *disk_super, + struct dm_pool_metadata *pmd) +{ + uint32_t features; + + features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; + if (features) { + DMERR("could not access metadata due to unsupported optional features (%lx).", + (unsigned long)features); + return -EINVAL; + } + + /* + * Check for read-only metadata to skip the following RDWR checks. 
+ */ + if (get_disk_ro(pmd->bdev->bd_disk)) + return 0; + + features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; + if (features) { + DMERR("could not access metadata RDWR due to unsupported optional features (%lx).", + (unsigned long)features); + return -EINVAL; + } + + return 0; +} + +static int __open_metadata(struct dm_pool_metadata *pmd) +{ + int r; + struct dm_block *sblock; + struct thin_disk_superblock *disk_super; + + r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r < 0) { + DMERR("couldn't read superblock"); + return r; + } + + disk_super = dm_block_data(sblock); + + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk_super->data_block_size), + (unsigned long long)pmd->data_block_size); + r = -EINVAL; + goto bad_unlock_sblock; + } + + r = __check_incompat_features(disk_super, pmd); + if (r < 0) + goto bad_unlock_sblock; + + r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION, + disk_super->metadata_space_map_root, + sizeof(disk_super->metadata_space_map_root), + &pmd->tm, &pmd->metadata_sm); + if (r < 0) { + DMERR("tm_open_with_sm failed"); + goto bad_unlock_sblock; + } + + pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root, + sizeof(disk_super->data_space_map_root)); + if (IS_ERR(pmd->data_sm)) { + DMERR("sm_disk_open failed"); + r = PTR_ERR(pmd->data_sm); + goto bad_cleanup_tm; + } + + pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm); + if (!pmd->nb_tm) { + DMERR("could not create non-blocking clone tm"); + r = -ENOMEM; + goto bad_cleanup_data_sm; + } + + __setup_btree_details(pmd); + return dm_bm_unlock(sblock); + +bad_cleanup_data_sm: + dm_sm_destroy(pmd->data_sm); +bad_cleanup_tm: + dm_tm_destroy(pmd->tm); + dm_sm_destroy(pmd->metadata_sm); +bad_unlock_sblock: + dm_bm_unlock(sblock); + + return r; +} + +static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device) +{ + int r, unformatted; + + r = __superblock_all_zeroes(pmd->bm, &unformatted); + if (r) + return r; + + if (unformatted) + return format_device ? __format_metadata(pmd) : -EPERM; + + return __open_metadata(pmd); +} + +static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device) +{ + int r; + + pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, + THIN_METADATA_CACHE_SIZE, + THIN_MAX_CONCURRENT_LOCKS); + if (IS_ERR(pmd->bm)) { + DMERR("could not create block manager"); + return PTR_ERR(pmd->bm); + } + + r = __open_or_format_metadata(pmd, format_device); + if (r) + dm_block_manager_destroy(pmd->bm); + + return r; +} + +static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd) +{ + dm_sm_destroy(pmd->data_sm); + dm_sm_destroy(pmd->metadata_sm); + dm_tm_destroy(pmd->nb_tm); + dm_tm_destroy(pmd->tm); + dm_block_manager_destroy(pmd->bm); +} + +static int __begin_transaction(struct dm_pool_metadata *pmd) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We re-read the superblock every time. Shouldn't need to do this + * really. 
+ */ + r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + pmd->time = le32_to_cpu(disk_super->time); + pmd->root = le64_to_cpu(disk_super->data_mapping_root); + pmd->details_root = le64_to_cpu(disk_super->device_details_root); + pmd->trans_id = le64_to_cpu(disk_super->trans_id); + pmd->flags = le32_to_cpu(disk_super->flags); + pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); + + dm_bm_unlock(sblock); + return 0; +} + +static int __write_changed_details(struct dm_pool_metadata *pmd) +{ + int r; + struct dm_thin_device *td, *tmp; + struct disk_device_details details; + uint64_t key; + + list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { + if (!td->changed) + continue; + + key = td->id; + + details.mapped_blocks = cpu_to_le64(td->mapped_blocks); + details.transaction_id = cpu_to_le64(td->transaction_id); + details.creation_time = cpu_to_le32(td->creation_time); + details.snapshotted_time = cpu_to_le32(td->snapshotted_time); + __dm_bless_for_disk(&details); + + r = dm_btree_insert(&pmd->details_info, pmd->details_root, + &key, &details, &pmd->details_root); + if (r) + return r; + + if (td->open_count) + td->changed = 0; + else { + list_del(&td->list); + kfree(td); + } + } + + return 0; +} + +static int __commit_transaction(struct dm_pool_metadata *pmd) +{ + int r; + size_t metadata_len, data_len; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + /* + * We need to know if the thin_disk_superblock exceeds a 512-byte sector. + */ + BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); + + r = __write_changed_details(pmd); + if (r < 0) + return r; + + r = dm_sm_commit(pmd->data_sm); + if (r < 0) + return r; + + r = dm_tm_pre_commit(pmd->tm); + if (r < 0) + return r; + + r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); + if (r < 0) + return r; + + r = dm_sm_root_size(pmd->data_sm, &data_len); + if (r < 0) + return r; + + r = save_sm_roots(pmd); + if (r < 0) + return r; + + r = superblock_lock(pmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + disk_super->time = cpu_to_le32(pmd->time); + disk_super->data_mapping_root = cpu_to_le64(pmd->root); + disk_super->device_details_root = cpu_to_le64(pmd->details_root); + disk_super->trans_id = cpu_to_le64(pmd->trans_id); + disk_super->flags = cpu_to_le32(pmd->flags); + + copy_sm_roots(pmd, disk_super); + + return dm_tm_commit(pmd->tm, sblock); +} + +struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool format_device) +{ + int r; + struct dm_pool_metadata *pmd; + + pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); + if (!pmd) { + DMERR("could not allocate metadata struct"); + return ERR_PTR(-ENOMEM); + } + + init_rwsem(&pmd->root_lock); + pmd->time = 0; + INIT_LIST_HEAD(&pmd->thin_devices); + pmd->read_only = false; + pmd->fail_io = false; + pmd->bdev = bdev; + pmd->data_block_size = data_block_size; + + r = __create_persistent_data_objects(pmd, format_device); + if (r) { + kfree(pmd); + return ERR_PTR(r); + } + + r = __begin_transaction(pmd); + if (r < 0) { + if (dm_pool_metadata_close(pmd) < 0) + DMWARN("%s: dm_pool_metadata_close() failed.", __func__); + return ERR_PTR(r); + } + + return pmd; +} + +int dm_pool_metadata_close(struct dm_pool_metadata *pmd) +{ + int r; + unsigned open_devices = 0; + struct dm_thin_device *td, *tmp; + + down_read(&pmd->root_lock); + list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { + if 
(td->open_count) + open_devices++; + else { + list_del(&td->list); + kfree(td); + } + } + up_read(&pmd->root_lock); + + if (open_devices) { + DMERR("attempt to close pmd when %u device(s) are still open", + open_devices); + return -EBUSY; + } + + if (!pmd->read_only && !pmd->fail_io) { + r = __commit_transaction(pmd); + if (r < 0) + DMWARN("%s: __commit_transaction() failed, error = %d", + __func__, r); + } + + if (!pmd->fail_io) + __destroy_persistent_data_objects(pmd); + + kfree(pmd); + return 0; +} + +/* + * __open_device: Returns @td corresponding to device with id @dev, + * creating it if @create is set and incrementing @td->open_count. + * On failure, @td is undefined. + */ +static int __open_device(struct dm_pool_metadata *pmd, + dm_thin_id dev, int create, + struct dm_thin_device **td) +{ + int r, changed = 0; + struct dm_thin_device *td2; + uint64_t key = dev; + struct disk_device_details details_le; + + /* + * If the device is already open, return it. + */ + list_for_each_entry(td2, &pmd->thin_devices, list) + if (td2->id == dev) { + /* + * May not create an already-open device. + */ + if (create) + return -EEXIST; + + td2->open_count++; + *td = td2; + return 0; + } + + /* + * Check the device exists. + */ + r = dm_btree_lookup(&pmd->details_info, pmd->details_root, + &key, &details_le); + if (r) { + if (r != -ENODATA || !create) + return r; + + /* + * Create new device. + */ + changed = 1; + details_le.mapped_blocks = 0; + details_le.transaction_id = cpu_to_le64(pmd->trans_id); + details_le.creation_time = cpu_to_le32(pmd->time); + details_le.snapshotted_time = cpu_to_le32(pmd->time); + } + + *td = kmalloc(sizeof(**td), GFP_NOIO); + if (!*td) + return -ENOMEM; + + (*td)->pmd = pmd; + (*td)->id = dev; + (*td)->open_count = 1; + (*td)->changed = changed; + (*td)->aborted_with_changes = false; + (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); + (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); + (*td)->creation_time = le32_to_cpu(details_le.creation_time); + (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time); + + list_add(&(*td)->list, &pmd->thin_devices); + + return 0; +} + +static void __close_device(struct dm_thin_device *td) +{ + --td->open_count; +} + +static int __create_thin(struct dm_pool_metadata *pmd, + dm_thin_id dev) +{ + int r; + dm_block_t dev_root; + uint64_t key = dev; + struct disk_device_details details_le; + struct dm_thin_device *td; + __le64 value; + + r = dm_btree_lookup(&pmd->details_info, pmd->details_root, + &key, &details_le); + if (!r) + return -EEXIST; + + /* + * Create an empty btree for the mappings. + */ + r = dm_btree_empty(&pmd->bl_info, &dev_root); + if (r) + return r; + + /* + * Insert it into the main mapping tree. 
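+	 * The top level maps thin dev id -> root of that device's mapping
+	 * btree, so this single insert publishes the new (empty) device.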
+ */ + value = cpu_to_le64(dev_root); + __dm_bless_for_disk(&value); + r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); + if (r) { + dm_btree_del(&pmd->bl_info, dev_root); + return r; + } + + r = __open_device(pmd, dev, 1, &td); + if (r) { + dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); + dm_btree_del(&pmd->bl_info, dev_root); + return r; + } + __close_device(td); + + return r; +} + +int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = __create_thin(pmd, dev); + up_write(&pmd->root_lock); + + return r; +} + +static int __set_snapshot_details(struct dm_pool_metadata *pmd, + struct dm_thin_device *snap, + dm_thin_id origin, uint32_t time) +{ + int r; + struct dm_thin_device *td; + + r = __open_device(pmd, origin, 0, &td); + if (r) + return r; + + td->changed = 1; + td->snapshotted_time = time; + + snap->mapped_blocks = td->mapped_blocks; + snap->snapshotted_time = time; + __close_device(td); + + return 0; +} + +static int __create_snap(struct dm_pool_metadata *pmd, + dm_thin_id dev, dm_thin_id origin) +{ + int r; + dm_block_t origin_root; + uint64_t key = origin, dev_key = dev; + struct dm_thin_device *td; + struct disk_device_details details_le; + __le64 value; + + /* check this device is unused */ + r = dm_btree_lookup(&pmd->details_info, pmd->details_root, + &dev_key, &details_le); + if (!r) + return -EEXIST; + + /* find the mapping tree for the origin */ + r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value); + if (r) + return r; + origin_root = le64_to_cpu(value); + + /* clone the origin, an inc will do */ + dm_tm_inc(pmd->tm, origin_root); + + /* insert into the main mapping tree */ + value = cpu_to_le64(origin_root); + __dm_bless_for_disk(&value); + key = dev; + r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); + if (r) { + dm_tm_dec(pmd->tm, origin_root); + return r; + } + + pmd->time++; + + r = __open_device(pmd, dev, 1, &td); + if (r) + goto bad; + + r = __set_snapshot_details(pmd, td, origin, pmd->time); + __close_device(td); + + if (r) + goto bad; + + return 0; + +bad: + dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); + dm_btree_remove(&pmd->details_info, pmd->details_root, + &key, &pmd->details_root); + return r; +} + +int dm_pool_create_snap(struct dm_pool_metadata *pmd, + dm_thin_id dev, + dm_thin_id origin) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = __create_snap(pmd, dev, origin); + up_write(&pmd->root_lock); + + return r; +} + +static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev) +{ + int r; + uint64_t key = dev; + struct dm_thin_device *td; + + /* TODO: failure should mark the transaction invalid */ + r = __open_device(pmd, dev, 0, &td); + if (r) + return r; + + if (td->open_count > 1) { + __close_device(td); + return -EBUSY; + } + + list_del(&td->list); + kfree(td); + r = dm_btree_remove(&pmd->details_info, pmd->details_root, + &key, &pmd->details_root); + if (r) + return r; + + r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); + if (r) + return r; + + return 0; +} + +int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, + dm_thin_id dev) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = __delete_device(pmd, dev); + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t current_id, + uint64_t new_id) +{ + int r 
= -EINVAL; + + down_write(&pmd->root_lock); + + if (pmd->fail_io) + goto out; + + if (pmd->trans_id != current_id) { + DMERR("mismatched transaction id"); + goto out; + } + + pmd->trans_id = new_id; + r = 0; + +out: + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) { + *result = pmd->trans_id; + r = 0; + } + up_read(&pmd->root_lock); + + return r; +} + +static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r, inc; + struct thin_disk_superblock *disk_super; + struct dm_block *copy, *sblock; + dm_block_t held_root; + + /* + * Copy the superblock. + */ + dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION); + r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, ©, &inc); + if (r) + return r; + + BUG_ON(!inc); + + held_root = dm_block_location(copy); + disk_super = dm_block_data(copy); + + if (le64_to_cpu(disk_super->held_root)) { + DMWARN("Pool metadata snapshot already exists: release this before taking another."); + + dm_tm_dec(pmd->tm, held_root); + dm_tm_unlock(pmd->tm, copy); + return -EBUSY; + } + + /* + * Wipe the spacemap since we're not publishing this. + */ + memset(&disk_super->data_space_map_root, 0, + sizeof(disk_super->data_space_map_root)); + memset(&disk_super->metadata_space_map_root, 0, + sizeof(disk_super->metadata_space_map_root)); + + /* + * Increment the data structures that need to be preserved. + */ + dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root)); + dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root)); + dm_tm_unlock(pmd->tm, copy); + + /* + * Write the held root into the superblock. + */ + r = superblock_lock(pmd, &sblock); + if (r) { + dm_tm_dec(pmd->tm, held_root); + return r; + } + + disk_super = dm_block_data(sblock); + disk_super->held_root = cpu_to_le64(held_root); + dm_bm_unlock(sblock); + return 0; +} + +int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = __reserve_metadata_snap(pmd); + up_write(&pmd->root_lock); + + return r; +} + +static int __release_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock, *copy; + dm_block_t held_root; + + r = superblock_lock(pmd, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + held_root = le64_to_cpu(disk_super->held_root); + disk_super->held_root = cpu_to_le64(0); + + dm_bm_unlock(sblock); + + if (!held_root) { + DMWARN("No pool metadata snapshot found: nothing to release."); + return -EINVAL; + } + + r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, ©); + if (r) + return r; + + disk_super = dm_block_data(copy); + dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root)); + dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root)); + dm_sm_dec_block(pmd->metadata_sm, held_root); + + return dm_tm_unlock(pmd->tm, copy); +} + +int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = __release_metadata_snap(pmd); + up_write(&pmd->root_lock); + + return r; +} + +static int __get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + r = dm_bm_read_lock(pmd->bm, 
THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + *result = le64_to_cpu(disk_super->held_root); + + return dm_bm_unlock(sblock); +} + +int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = __get_metadata_snap(pmd, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, + struct dm_thin_device **td) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = __open_device(pmd, dev, 0, td); + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_close_thin_device(struct dm_thin_device *td) +{ + down_write(&td->pmd->root_lock); + __close_device(td); + up_write(&td->pmd->root_lock); + + return 0; +} + +dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) +{ + return td->id; +} + +/* + * Check whether @time (of block creation) is older than @td's last snapshot. + * If so then the associated block is shared with the last snapshot device. + * Any block on a device created *after* the device last got snapshotted is + * necessarily not shared. + */ +static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) +{ + return td->snapshotted_time > time; +} + +int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, + int can_block, struct dm_thin_lookup_result *result) +{ + int r = -EINVAL; + uint64_t block_time = 0; + __le64 value; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[2] = { td->id, block }; + struct dm_btree_info *info; + + if (can_block) { + down_read(&pmd->root_lock); + info = &pmd->info; + } else if (down_read_trylock(&pmd->root_lock)) + info = &pmd->nb_info; + else + return -EWOULDBLOCK; + + if (pmd->fail_io) + goto out; + + r = dm_btree_lookup(info, pmd->root, keys, &value); + if (!r) + block_time = le64_to_cpu(value); + +out: + up_read(&pmd->root_lock); + + if (!r) { + dm_block_t exception_block; + uint32_t exception_time; + unpack_block_time(block_time, &exception_block, + &exception_time); + result->block = exception_block; + result->shared = __snapshotted_since(td, exception_time); + } + + return r; +} + +static int __insert(struct dm_thin_device *td, dm_block_t block, + dm_block_t data_block) +{ + int r, inserted; + __le64 value; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[2] = { td->id, block }; + + value = cpu_to_le64(pack_block_time(data_block, pmd->time)); + __dm_bless_for_disk(&value); + + r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value, + &pmd->root, &inserted); + if (r) + return r; + + td->changed = 1; + if (inserted) + td->mapped_blocks++; + + return 0; +} + +int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, + dm_block_t data_block) +{ + int r = -EINVAL; + + down_write(&td->pmd->root_lock); + if (!td->pmd->fail_io) + r = __insert(td, block, data_block); + up_write(&td->pmd->root_lock); + + return r; +} + +static int __remove(struct dm_thin_device *td, dm_block_t block) +{ + int r; + struct dm_pool_metadata *pmd = td->pmd; + dm_block_t keys[2] = { td->id, block }; + + r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root); + if (r) + return r; + + td->mapped_blocks--; + td->changed = 1; + + return 0; +} + +int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) +{ + int r = -EINVAL; + + down_write(&td->pmd->root_lock); + if (!td->pmd->fail_io) + r = __remove(td, block); + 
up_write(&td->pmd->root_lock); + + return r; +} + +int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result) +{ + int r; + uint32_t ref_count; + + down_read(&pmd->root_lock); + r = dm_sm_get_count(pmd->data_sm, b, &ref_count); + if (!r) + *result = (ref_count != 0); + up_read(&pmd->root_lock); + + return r; +} + +bool dm_thin_changed_this_transaction(struct dm_thin_device *td) +{ + int r; + + down_read(&td->pmd->root_lock); + r = td->changed; + up_read(&td->pmd->root_lock); + + return r; +} + +bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd) +{ + bool r = false; + struct dm_thin_device *td, *tmp; + + down_read(&pmd->root_lock); + list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { + if (td->changed) { + r = td->changed; + break; + } + } + up_read(&pmd->root_lock); + + return r; +} + +bool dm_thin_aborted_changes(struct dm_thin_device *td) +{ + bool r; + + down_read(&td->pmd->root_lock); + r = td->aborted_with_changes; + up_read(&td->pmd->root_lock); + + return r; +} + +int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = dm_sm_new_block(pmd->data_sm, result); + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (pmd->fail_io) + goto out; + + r = __commit_transaction(pmd); + if (r <= 0) + goto out; + + /* + * Open the next transaction. + */ + r = __begin_transaction(pmd); +out: + up_write(&pmd->root_lock); + return r; +} + +static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd) +{ + struct dm_thin_device *td; + + list_for_each_entry(td, &pmd->thin_devices, list) + td->aborted_with_changes = td->changed; +} + +int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (pmd->fail_io) + goto out; + + __set_abort_with_changes_flags(pmd); + __destroy_persistent_data_objects(pmd); + r = __create_persistent_data_objects(pmd, false); + if (r) + pmd->fail_io = true; + +out: + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = dm_sm_get_nr_free(pmd->data_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = dm_sm_get_nr_free(pmd->metadata_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result) +{ + down_read(&pmd->root_lock); + *result = pmd->data_block_size; + up_read(&pmd->root_lock); + + return 0; +} + +int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) +{ + int r = -EINVAL; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = dm_sm_get_nr_blocks(pmd->data_sm, result); + up_read(&pmd->root_lock); + + return r; +} + +int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) +{ + int r = -EINVAL; + struct dm_pool_metadata 
*pmd = td->pmd; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) { + *result = td->mapped_blocks; + r = 0; + } + up_read(&pmd->root_lock); + + return r; +} + +static int __highest_block(struct dm_thin_device *td, dm_block_t *result) +{ + int r; + __le64 value_le; + dm_block_t thin_root; + struct dm_pool_metadata *pmd = td->pmd; + + r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le); + if (r) + return r; + + thin_root = le64_to_cpu(value_le); + + return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result); +} + +int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, + dm_block_t *result) +{ + int r = -EINVAL; + struct dm_pool_metadata *pmd = td->pmd; + + down_read(&pmd->root_lock); + if (!pmd->fail_io) + r = __highest_block(td, result); + up_read(&pmd->root_lock); + + return r; +} + +static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count) +{ + int r; + dm_block_t old_count; + + r = dm_sm_get_nr_blocks(sm, &old_count); + if (r) + return r; + + if (new_count == old_count) + return 0; + + if (new_count < old_count) { + DMERR("cannot reduce size of space map"); + return -EINVAL; + } + + return dm_sm_extend(sm, new_count - old_count); +} + +int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = __resize_space_map(pmd->data_sm, new_count); + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) +{ + int r = -EINVAL; + + down_write(&pmd->root_lock); + if (!pmd->fail_io) + r = __resize_space_map(pmd->metadata_sm, new_count); + up_write(&pmd->root_lock); + + return r; +} + +void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) +{ + down_write(&pmd->root_lock); + pmd->read_only = true; + dm_bm_set_read_only(pmd->bm); + up_write(&pmd->root_lock); +} + +void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd) +{ + down_write(&pmd->root_lock); + pmd->read_only = false; + dm_bm_set_read_write(pmd->bm); + up_write(&pmd->root_lock); +} + +int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + int r; + + down_write(&pmd->root_lock); + r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context); + up_write(&pmd->root_lock); + + return r; +} + +int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) +{ + int r; + struct dm_block *sblock; + struct thin_disk_superblock *disk_super; + + down_write(&pmd->root_lock); + pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; + + r = superblock_lock(pmd, &sblock); + if (r) { + DMERR("couldn't read superblock"); + goto out; + } + + disk_super = dm_block_data(sblock); + disk_super->flags = cpu_to_le32(pmd->flags); + + dm_bm_unlock(sblock); +out: + up_write(&pmd->root_lock); + return r; +} + +bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) +{ + bool needs_check; + + down_read(&pmd->root_lock); + needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG; + up_read(&pmd->root_lock); + + return needs_check; +} diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h new file mode 100644 index 00000000000..e3c857db195 --- /dev/null +++ b/drivers/md/dm-thin-metadata.h @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2010-2011 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#ifndef DM_THIN_METADATA_H +#define DM_THIN_METADATA_H + +#include "persistent-data/dm-block-manager.h" +#include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-space-map-metadata.h" + +#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE + +/* + * The metadata device is currently limited in size. + */ +#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS + +/* + * A metadata device larger than 16GB triggers a warning. + */ +#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) + +/*----------------------------------------------------------------*/ + +/* + * Thin metadata superblock flags. + */ +#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0) + +struct dm_pool_metadata; +struct dm_thin_device; + +/* + * Device identifier + */ +typedef uint64_t dm_thin_id; + +/* + * Reopens or creates a new, empty metadata volume. + */ +struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool format_device); + +int dm_pool_metadata_close(struct dm_pool_metadata *pmd); + +/* + * Compat feature flags. Any incompat flags beyond the ones + * specified below will prevent use of the thin metadata. + */ +#define THIN_FEATURE_COMPAT_SUPP 0UL +#define THIN_FEATURE_COMPAT_RO_SUPP 0UL +#define THIN_FEATURE_INCOMPAT_SUPP 0UL + +/* + * Device creation/deletion. + */ +int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev); + +/* + * An internal snapshot. + * + * You can only snapshot a quiesced origin i.e. one that is either + * suspended or not instanced at all. + */ +int dm_pool_create_snap(struct dm_pool_metadata *pmd, dm_thin_id dev, + dm_thin_id origin); + +/* + * Deletes a virtual device from the metadata. It _is_ safe to call this + * when that device is open. Operations on that device will just start + * failing. You still need to call close() on the device. + */ +int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, + dm_thin_id dev); + +/* + * Commits _all_ metadata changes: device creation, deletion, mapping + * updates. + */ +int dm_pool_commit_metadata(struct dm_pool_metadata *pmd); + +/* + * Discards all uncommitted changes. Rereads the superblock, rolling back + * to the last good transaction. Thin devices remain open. + * dm_thin_aborted_changes() tells you if they had uncommitted changes. + * + * If this call fails it's only useful to call dm_pool_metadata_close(). + * All other methods will fail with -EINVAL. + */ +int dm_pool_abort_metadata(struct dm_pool_metadata *pmd); + +/* + * Set/get userspace transaction id. + */ +int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t current_id, + uint64_t new_id); + +int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, + uint64_t *result); + +/* + * Hold/get root for userspace transaction. + * + * The metadata snapshot is a copy of the current superblock (minus the + * space maps). Userland can access the data structures for READ + * operations only. A small performance hit is incurred by providing this + * copy of the metadata to userland due to extra copy-on-write operations + * on the metadata nodes. Release this as soon as you finish with it. + */ +int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd); +int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd); + +int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, + dm_block_t *result); + +/* + * Actions on a single virtual device. 
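+ * (open/close, block lookup/insert/remove and per-device queries).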
+ */ + +/* + * Opening the same device more than once will fail with -EBUSY. + */ +int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, + struct dm_thin_device **td); + +int dm_pool_close_thin_device(struct dm_thin_device *td); + +dm_thin_id dm_thin_dev_id(struct dm_thin_device *td); + +struct dm_thin_lookup_result { + dm_block_t block; + bool shared:1; +}; + +/* + * Returns: + * -EWOULDBLOCK iff @can_block is set and would block. + * -ENODATA iff that mapping is not present. + * 0 success + */ +int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, + int can_block, struct dm_thin_lookup_result *result); + +/* + * Obtain an unused block. + */ +int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result); + +/* + * Insert or remove block. + */ +int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, + dm_block_t data_block); + +int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block); + +/* + * Queries. + */ +bool dm_thin_changed_this_transaction(struct dm_thin_device *td); + +bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd); + +bool dm_thin_aborted_changes(struct dm_thin_device *td); + +int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, + dm_block_t *highest_mapped); + +int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result); + +int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, + dm_block_t *result); + +int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, + dm_block_t *result); + +int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, + dm_block_t *result); + +int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result); + +int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); + +int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result); + +/* + * Returns -ENOSPC if the new size is too small and already allocated + * blocks would be lost. + */ +int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); +int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_size); + +/* + * Flicks the underlying block manager into read only mode, so you know + * that nothing is changing. + */ +void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd); +void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd); + +int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context); + +/* + * Updates the superblock immediately. + */ +int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); +bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c new file mode 100644 index 00000000000..fc9c848a60c --- /dev/null +++ b/drivers/md/dm-thin.c @@ -0,0 +1,3546 @@ +/* + * Copyright (C) 2011-2012 Red Hat UK. + * + * This file is released under the GPL. 
+ */ + +#include "dm-thin-metadata.h" +#include "dm-bio-prison.h" +#include "dm.h" + +#include <linux/device-mapper.h> +#include <linux/dm-io.h> +#include <linux/dm-kcopyd.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/rbtree.h> + +#define DM_MSG_PREFIX "thin" + +/* + * Tunable constants + */ +#define ENDIO_HOOK_POOL_SIZE 1024 +#define MAPPING_POOL_SIZE 1024 +#define PRISON_CELLS 1024 +#define COMMIT_PERIOD HZ +#define NO_SPACE_TIMEOUT_SECS 60 + +static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS; + +DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, + "A percentage of time allocated for copy on write"); + +/* + * The block size of the device holding pool data must be + * between 64KB and 1GB. + */ +#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT) +#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) + +/* + * Device id is restricted to 24 bits. + */ +#define MAX_DEV_ID ((1 << 24) - 1) + +/* + * How do we handle breaking sharing of data blocks? + * ================================================= + * + * We use a standard copy-on-write btree to store the mappings for the + * devices (note I'm talking about copy-on-write of the metadata here, not + * the data). When you take an internal snapshot you clone the root node + * of the origin btree. After this there is no concept of an origin or a + * snapshot. They are just two device trees that happen to point to the + * same data blocks. + * + * When we get a write in we decide if it's to a shared data block using + * some timestamp magic. If it is, we have to break sharing. + * + * Let's say we write to a shared block in what was the origin. The + * steps are: + * + * i) plug io further to this physical block. (see bio_prison code). + * + * ii) quiesce any read io to that shared data block. Obviously + * including all devices that share this block. (see dm_deferred_set code) + * + * iii) copy the data block to a newly allocate block. This step can be + * missed out if the io covers the block. (schedule_copy). + * + * iv) insert the new mapping into the origin's btree + * (process_prepared_mapping). This act of inserting breaks some + * sharing of btree nodes between the two devices. Breaking sharing only + * effects the btree of that specific device. Btrees for the other + * devices that share the block never change. The btree for the origin + * device as it was after the last commit is untouched, ie. we're using + * persistent data structures in the functional programming sense. + * + * v) unplug io to this physical block, including the io that triggered + * the breaking of sharing. + * + * Steps (ii) and (iii) occur in parallel. + * + * The metadata _doesn't_ need to be committed before the io continues. We + * get away with this because the io is always written to a _new_ block. + * If there's a crash, then: + * + * - The origin mapping will point to the old origin block (the shared + * one). This will contain the data as it was before the io that triggered + * the breaking of sharing came in. + * + * - The snap mapping still points to the old block. As it would after + * the commit. + * + * The downside of this scheme is the timestamp magic isn't perfect, and + * will continue to think that data block in the snapshot device is shared + * even after the write to the origin has broken sharing. 
I suspect data + * blocks will typically be shared by many different devices, so we're + * breaking sharing n + 1 times, rather than n, where n is the number of + * devices that reference this data block. At the moment I think the + * benefits far, far outweigh the disadvantages. + */ + +/*----------------------------------------------------------------*/ + +/* + * Key building. + */ +static void build_data_key(struct dm_thin_device *td, + dm_block_t b, struct dm_cell_key *key) +{ + key->virtual = 0; + key->dev = dm_thin_dev_id(td); + key->block = b; +} + +static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, + struct dm_cell_key *key) +{ + key->virtual = 1; + key->dev = dm_thin_dev_id(td); + key->block = b; +} + +/*----------------------------------------------------------------*/ + +/* + * A pool device ties together a metadata device and a data device. It + * also provides the interface for creating and destroying internal + * devices. + */ +struct dm_thin_new_mapping; + +/* + * The pool runs in 4 modes. Ordered in degraded order for comparisons. + */ +enum pool_mode { + PM_WRITE, /* metadata may be changed */ + PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */ + PM_READ_ONLY, /* metadata may not be changed */ + PM_FAIL, /* all I/O fails */ +}; + +struct pool_features { + enum pool_mode mode; + + bool zero_new_blocks:1; + bool discard_enabled:1; + bool discard_passdown:1; + bool error_if_no_space:1; +}; + +struct thin_c; +typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); +typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); + +struct pool { + struct list_head list; + struct dm_target *ti; /* Only set if a pool target is bound */ + + struct mapped_device *pool_md; + struct block_device *md_dev; + struct dm_pool_metadata *pmd; + + dm_block_t low_water_blocks; + uint32_t sectors_per_block; + int sectors_per_block_shift; + + struct pool_features pf; + bool low_water_triggered:1; /* A dm event has been sent */ + + struct dm_bio_prison *prison; + struct dm_kcopyd_client *copier; + + struct workqueue_struct *wq; + struct work_struct worker; + struct delayed_work waker; + struct delayed_work no_space_timeout; + + unsigned long last_commit_jiffies; + unsigned ref_count; + + spinlock_t lock; + struct bio_list deferred_flush_bios; + struct list_head prepared_mappings; + struct list_head prepared_discards; + struct list_head active_thins; + + struct dm_deferred_set *shared_read_ds; + struct dm_deferred_set *all_io_ds; + + struct dm_thin_new_mapping *next_mapping; + mempool_t *mapping_pool; + + process_bio_fn process_bio; + process_bio_fn process_discard; + + process_mapping_fn process_prepared_mapping; + process_mapping_fn process_prepared_discard; +}; + +static enum pool_mode get_pool_mode(struct pool *pool); +static void metadata_operation_failed(struct pool *pool, const char *op, int r); + +/* + * Target context for a pool. + */ +struct pool_c { + struct dm_target *ti; + struct pool *pool; + struct dm_dev *data_dev; + struct dm_dev *metadata_dev; + struct dm_target_callbacks callbacks; + + dm_block_t low_water_blocks; + struct pool_features requested_pf; /* Features requested during table load */ + struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */ +}; + +/* + * Target context for a thin. 
+ */ +struct thin_c { + struct list_head list; + struct dm_dev *pool_dev; + struct dm_dev *origin_dev; + dm_thin_id dev_id; + + struct pool *pool; + struct dm_thin_device *td; + bool requeue_mode:1; + spinlock_t lock; + struct bio_list deferred_bio_list; + struct bio_list retry_on_resume_list; + struct rb_root sort_bio_list; /* sorted list of deferred bios */ + + /* + * Ensures the thin is not destroyed until the worker has finished + * iterating the active_thins list. + */ + atomic_t refcount; + struct completion can_destroy; +}; + +/*----------------------------------------------------------------*/ + +/* + * wake_worker() is used when new work is queued and when pool_resume is + * ready to continue deferred IO processing. + */ +static void wake_worker(struct pool *pool) +{ + queue_work(pool->wq, &pool->worker); +} + +/*----------------------------------------------------------------*/ + +static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio, + struct dm_bio_prison_cell **cell_result) +{ + int r; + struct dm_bio_prison_cell *cell_prealloc; + + /* + * Allocate a cell from the prison's mempool. + * This might block but it can't fail. + */ + cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO); + + r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result); + if (r) + /* + * We reused an old cell; we can get rid of + * the new one. + */ + dm_bio_prison_free_cell(pool->prison, cell_prealloc); + + return r; +} + +static void cell_release(struct pool *pool, + struct dm_bio_prison_cell *cell, + struct bio_list *bios) +{ + dm_cell_release(pool->prison, cell, bios); + dm_bio_prison_free_cell(pool->prison, cell); +} + +static void cell_release_no_holder(struct pool *pool, + struct dm_bio_prison_cell *cell, + struct bio_list *bios) +{ + dm_cell_release_no_holder(pool->prison, cell, bios); + dm_bio_prison_free_cell(pool->prison, cell); +} + +static void cell_defer_no_holder_no_free(struct thin_c *tc, + struct dm_bio_prison_cell *cell) +{ + struct pool *pool = tc->pool; + unsigned long flags; + + spin_lock_irqsave(&tc->lock, flags); + dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list); + spin_unlock_irqrestore(&tc->lock, flags); + + wake_worker(pool); +} + +static void cell_error_with_code(struct pool *pool, + struct dm_bio_prison_cell *cell, int error_code) +{ + dm_cell_error(pool->prison, cell, error_code); + dm_bio_prison_free_cell(pool->prison, cell); +} + +static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, -EIO); +} + +/*----------------------------------------------------------------*/ + +/* + * A global list of pools that uses a struct mapped_device as a key. 
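+ *
+ * All insertions and lookups must be made with dm_thin_pool_table.mutex
+ * held; see the BUG_ON()s in __pool_table_insert() and
+ * __pool_table_lookup() below.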
+ */ +static struct dm_thin_pool_table { + struct mutex mutex; + struct list_head pools; +} dm_thin_pool_table; + +static void pool_table_init(void) +{ + mutex_init(&dm_thin_pool_table.mutex); + INIT_LIST_HEAD(&dm_thin_pool_table.pools); +} + +static void __pool_table_insert(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + list_add(&pool->list, &dm_thin_pool_table.pools); +} + +static void __pool_table_remove(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + list_del(&pool->list); +} + +static struct pool *__pool_table_lookup(struct mapped_device *md) +{ + struct pool *pool = NULL, *tmp; + + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + + list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { + if (tmp->pool_md == md) { + pool = tmp; + break; + } + } + + return pool; +} + +static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) +{ + struct pool *pool = NULL, *tmp; + + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + + list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { + if (tmp->md_dev == md_dev) { + pool = tmp; + break; + } + } + + return pool; +} + +/*----------------------------------------------------------------*/ + +struct dm_thin_endio_hook { + struct thin_c *tc; + struct dm_deferred_entry *shared_read_entry; + struct dm_deferred_entry *all_io_entry; + struct dm_thin_new_mapping *overwrite_mapping; + struct rb_node rb_node; +}; + +static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) +{ + struct bio *bio; + struct bio_list bios; + unsigned long flags; + + bio_list_init(&bios); + + spin_lock_irqsave(&tc->lock, flags); + bio_list_merge(&bios, master); + bio_list_init(master); + spin_unlock_irqrestore(&tc->lock, flags); + + while ((bio = bio_list_pop(&bios))) + bio_endio(bio, DM_ENDIO_REQUEUE); +} + +static void requeue_io(struct thin_c *tc) +{ + requeue_bio_list(tc, &tc->deferred_bio_list); + requeue_bio_list(tc, &tc->retry_on_resume_list); +} + +static void error_thin_retry_list(struct thin_c *tc) +{ + struct bio *bio; + unsigned long flags; + struct bio_list bios; + + bio_list_init(&bios); + + spin_lock_irqsave(&tc->lock, flags); + bio_list_merge(&bios, &tc->retry_on_resume_list); + bio_list_init(&tc->retry_on_resume_list); + spin_unlock_irqrestore(&tc->lock, flags); + + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); +} + +static void error_retry_list(struct pool *pool) +{ + struct thin_c *tc; + + rcu_read_lock(); + list_for_each_entry_rcu(tc, &pool->active_thins, list) + error_thin_retry_list(tc); + rcu_read_unlock(); +} + +/* + * This section of code contains the logic for processing a thin device's IO. + * Much of the code depends on pool object resources (lists, workqueues, etc) + * but most is exclusively called from the thin target rather than the thin-pool + * target. 
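+ *
+ * A worked example of the block arithmetic below (numbers purely
+ * illustrative): with 64KB blocks, sectors_per_block is 128 and
+ * sectors_per_block_shift is 7, so a bio at sector 1000 belongs to
+ * virtual block 1000 >> 7 = 7, and remap() redirects it to sector
+ * (data_block << 7) | (1000 & 127), i.e. data_block * 128 + 104, on the
+ * pool device.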
+ */ + +static bool block_size_is_power_of_two(struct pool *pool) +{ + return pool->sectors_per_block_shift >= 0; +} + +static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + sector_t block_nr = bio->bi_iter.bi_sector; + + if (block_size_is_power_of_two(pool)) + block_nr >>= pool->sectors_per_block_shift; + else + (void) sector_div(block_nr, pool->sectors_per_block); + + return block_nr; +} + +static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) +{ + struct pool *pool = tc->pool; + sector_t bi_sector = bio->bi_iter.bi_sector; + + bio->bi_bdev = tc->pool_dev->bdev; + if (block_size_is_power_of_two(pool)) + bio->bi_iter.bi_sector = + (block << pool->sectors_per_block_shift) | + (bi_sector & (pool->sectors_per_block - 1)); + else + bio->bi_iter.bi_sector = (block * pool->sectors_per_block) + + sector_div(bi_sector, pool->sectors_per_block); +} + +static void remap_to_origin(struct thin_c *tc, struct bio *bio) +{ + bio->bi_bdev = tc->origin_dev->bdev; +} + +static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) +{ + return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && + dm_thin_changed_this_transaction(tc->td); +} + +static void inc_all_io_entry(struct pool *pool, struct bio *bio) +{ + struct dm_thin_endio_hook *h; + + if (bio->bi_rw & REQ_DISCARD) + return; + + h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds); +} + +static void issue(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + unsigned long flags; + + if (!bio_triggers_commit(tc, bio)) { + generic_make_request(bio); + return; + } + + /* + * Complete bio with an error if earlier I/O caused changes to + * the metadata that can't be committed e.g, due to I/O errors + * on the metadata device. + */ + if (dm_thin_aborted_changes(tc->td)) { + bio_io_error(bio); + return; + } + + /* + * Batch together any bios that trigger commits and then issue a + * single commit for them in process_deferred_bios(). + */ + spin_lock_irqsave(&pool->lock, flags); + bio_list_add(&pool->deferred_flush_bios, bio); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) +{ + remap_to_origin(tc, bio); + issue(tc, bio); +} + +static void remap_and_issue(struct thin_c *tc, struct bio *bio, + dm_block_t block) +{ + remap(tc, bio, block); + issue(tc, bio); +} + +/*----------------------------------------------------------------*/ + +/* + * Bio endio functions. + */ +struct dm_thin_new_mapping { + struct list_head list; + + bool quiesced:1; + bool prepared:1; + bool pass_discard:1; + bool definitely_not_shared:1; + + int err; + struct thin_c *tc; + dm_block_t virt_block; + dm_block_t data_block; + struct dm_bio_prison_cell *cell, *cell2; + + /* + * If the bio covers the whole area of a block then we can avoid + * zeroing or copying. Instead this bio is hooked. The bio will + * still be in the cell, so care has to be taken to avoid issuing + * the bio twice. 
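+ *
+ * io_overwrites_block() decides whether this shortcut applies;
+ * overwrite_endio() then marks the mapping prepared once the hooked bio
+ * completes.  See schedule_copy() and schedule_zero() below.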
+ */ + struct bio *bio; + bio_end_io_t *saved_bi_end_io; +}; + +static void __maybe_add_mapping(struct dm_thin_new_mapping *m) +{ + struct pool *pool = m->tc->pool; + + if (m->quiesced && m->prepared) { + list_add_tail(&m->list, &pool->prepared_mappings); + wake_worker(pool); + } +} + +static void copy_complete(int read_err, unsigned long write_err, void *context) +{ + unsigned long flags; + struct dm_thin_new_mapping *m = context; + struct pool *pool = m->tc->pool; + + m->err = read_err || write_err ? -EIO : 0; + + spin_lock_irqsave(&pool->lock, flags); + m->prepared = true; + __maybe_add_mapping(m); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static void overwrite_endio(struct bio *bio, int err) +{ + unsigned long flags; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + struct dm_thin_new_mapping *m = h->overwrite_mapping; + struct pool *pool = m->tc->pool; + + m->err = err; + + spin_lock_irqsave(&pool->lock, flags); + m->prepared = true; + __maybe_add_mapping(m); + spin_unlock_irqrestore(&pool->lock, flags); +} + +/*----------------------------------------------------------------*/ + +/* + * Workqueue. + */ + +/* + * Prepared mapping jobs. + */ + +/* + * This sends the bios in the cell back to the deferred_bios list. + */ +static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + struct pool *pool = tc->pool; + unsigned long flags; + + spin_lock_irqsave(&tc->lock, flags); + cell_release(pool, cell, &tc->deferred_bio_list); + spin_unlock_irqrestore(&tc->lock, flags); + + wake_worker(pool); +} + +/* + * Same as cell_defer above, except it omits the original holder of the cell. + */ +static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + struct pool *pool = tc->pool; + unsigned long flags; + + spin_lock_irqsave(&tc->lock, flags); + cell_release_no_holder(pool, cell, &tc->deferred_bio_list); + spin_unlock_irqrestore(&tc->lock, flags); + + wake_worker(pool); +} + +static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) +{ + if (m->bio) { + m->bio->bi_end_io = m->saved_bi_end_io; + atomic_inc(&m->bio->bi_remaining); + } + cell_error(m->tc->pool, m->cell); + list_del(&m->list); + mempool_free(m, m->tc->pool->mapping_pool); +} + +static void process_prepared_mapping(struct dm_thin_new_mapping *m) +{ + struct thin_c *tc = m->tc; + struct pool *pool = tc->pool; + struct bio *bio; + int r; + + bio = m->bio; + if (bio) { + bio->bi_end_io = m->saved_bi_end_io; + atomic_inc(&bio->bi_remaining); + } + + if (m->err) { + cell_error(pool, m->cell); + goto out; + } + + /* + * Commit the prepared block into the mapping btree. + * Any I/O for this block arriving after this point will get + * remapped to it directly. + */ + r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); + if (r) { + metadata_operation_failed(pool, "dm_thin_insert_block", r); + cell_error(pool, m->cell); + goto out; + } + + /* + * Release any bios held while the block was being provisioned. + * If we are processing a write bio that completely covers the block, + * we already processed it so can ignore it now when processing + * the bios in the cell. 
+ */ + if (bio) { + cell_defer_no_holder(tc, m->cell); + bio_endio(bio, 0); + } else + cell_defer(tc, m->cell); + +out: + list_del(&m->list); + mempool_free(m, pool->mapping_pool); +} + +static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) +{ + struct thin_c *tc = m->tc; + + bio_io_error(m->bio); + cell_defer_no_holder(tc, m->cell); + cell_defer_no_holder(tc, m->cell2); + mempool_free(m, tc->pool->mapping_pool); +} + +static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) +{ + struct thin_c *tc = m->tc; + + inc_all_io_entry(tc->pool, m->bio); + cell_defer_no_holder(tc, m->cell); + cell_defer_no_holder(tc, m->cell2); + + if (m->pass_discard) + if (m->definitely_not_shared) + remap_and_issue(tc, m->bio, m->data_block); + else { + bool used = false; + if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used) + bio_endio(m->bio, 0); + else + remap_and_issue(tc, m->bio, m->data_block); + } + else + bio_endio(m->bio, 0); + + mempool_free(m, tc->pool->mapping_pool); +} + +static void process_prepared_discard(struct dm_thin_new_mapping *m) +{ + int r; + struct thin_c *tc = m->tc; + + r = dm_thin_remove_block(tc->td, m->virt_block); + if (r) + DMERR_LIMIT("dm_thin_remove_block() failed"); + + process_prepared_discard_passdown(m); +} + +static void process_prepared(struct pool *pool, struct list_head *head, + process_mapping_fn *fn) +{ + unsigned long flags; + struct list_head maps; + struct dm_thin_new_mapping *m, *tmp; + + INIT_LIST_HEAD(&maps); + spin_lock_irqsave(&pool->lock, flags); + list_splice_init(head, &maps); + spin_unlock_irqrestore(&pool->lock, flags); + + list_for_each_entry_safe(m, tmp, &maps, list) + (*fn)(m); +} + +/* + * Deferred bio jobs. + */ +static int io_overlaps_block(struct pool *pool, struct bio *bio) +{ + return bio->bi_iter.bi_size == + (pool->sectors_per_block << SECTOR_SHIFT); +} + +static int io_overwrites_block(struct pool *pool, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + io_overlaps_block(pool, bio); +} + +static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, + bio_end_io_t *fn) +{ + *save = bio->bi_end_io; + bio->bi_end_io = fn; +} + +static int ensure_next_mapping(struct pool *pool) +{ + if (pool->next_mapping) + return 0; + + pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC); + + return pool->next_mapping ? 0 : -ENOMEM; +} + +static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) +{ + struct dm_thin_new_mapping *m = pool->next_mapping; + + BUG_ON(!pool->next_mapping); + + memset(m, 0, sizeof(struct dm_thin_new_mapping)); + INIT_LIST_HEAD(&m->list); + m->bio = NULL; + + pool->next_mapping = NULL; + + return m; +} + +static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, + struct dm_dev *origin, dm_block_t data_origin, + dm_block_t data_dest, + struct dm_bio_prison_cell *cell, struct bio *bio) +{ + int r; + struct pool *pool = tc->pool; + struct dm_thin_new_mapping *m = get_next_mapping(pool); + + m->tc = tc; + m->virt_block = virt_block; + m->data_block = data_dest; + m->cell = cell; + + if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) + m->quiesced = true; + + /* + * IO to pool_dev remaps to the pool target's data_dev. + * + * If the whole block of data is being overwritten, we can issue the + * bio immediately. Otherwise we use kcopyd to clone the data first. 
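+ *
+ * In the kcopyd case, copy_complete() runs when the clone finishes,
+ * marks the mapping prepared and queues it on the pool's
+ * prepared_mappings list; the worker then inserts the new mapping via
+ * process_prepared_mapping().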
+ */ + if (io_overwrites_block(pool, bio)) { + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->overwrite_mapping = m; + m->bio = bio; + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); + inc_all_io_entry(pool, bio); + remap_and_issue(tc, bio, data_dest); + } else { + struct dm_io_region from, to; + + from.bdev = origin->bdev; + from.sector = data_origin * pool->sectors_per_block; + from.count = pool->sectors_per_block; + + to.bdev = tc->pool_dev->bdev; + to.sector = data_dest * pool->sectors_per_block; + to.count = pool->sectors_per_block; + + r = dm_kcopyd_copy(pool->copier, &from, 1, &to, + 0, copy_complete, m); + if (r < 0) { + mempool_free(m, pool->mapping_pool); + DMERR_LIMIT("dm_kcopyd_copy() failed"); + cell_error(pool, cell); + } + } +} + +static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_origin, dm_block_t data_dest, + struct dm_bio_prison_cell *cell, struct bio *bio) +{ + schedule_copy(tc, virt_block, tc->pool_dev, + data_origin, data_dest, cell, bio); +} + +static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_dest, + struct dm_bio_prison_cell *cell, struct bio *bio) +{ + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio); +} + +static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_block, struct dm_bio_prison_cell *cell, + struct bio *bio) +{ + struct pool *pool = tc->pool; + struct dm_thin_new_mapping *m = get_next_mapping(pool); + + m->quiesced = true; + m->prepared = false; + m->tc = tc; + m->virt_block = virt_block; + m->data_block = data_block; + m->cell = cell; + + /* + * If the whole block of data is being overwritten or we are not + * zeroing pre-existing data, we can issue the bio immediately. + * Otherwise we use kcopyd to zero the data first. + */ + if (!pool->pf.zero_new_blocks) + process_prepared_mapping(m); + + else if (io_overwrites_block(pool, bio)) { + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->overwrite_mapping = m; + m->bio = bio; + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); + inc_all_io_entry(pool, bio); + remap_and_issue(tc, bio, data_block); + } else { + int r; + struct dm_io_region to; + + to.bdev = tc->pool_dev->bdev; + to.sector = data_block * pool->sectors_per_block; + to.count = pool->sectors_per_block; + + r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); + if (r < 0) { + mempool_free(m, pool->mapping_pool); + DMERR_LIMIT("dm_kcopyd_zero() failed"); + cell_error(pool, cell); + } + } +} + +/* + * A non-zero return indicates read_only or fail_io mode. + * Many callers don't care about the return value. 
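+ *
+ * Callers that do care (e.g. alloc_data_block()) propagate the error;
+ * commit() itself reports metadata failures through
+ * metadata_operation_failed(), which degrades the pool mode.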
+ */ +static int commit(struct pool *pool) +{ + int r; + + if (get_pool_mode(pool) >= PM_READ_ONLY) + return -EINVAL; + + r = dm_pool_commit_metadata(pool->pmd); + if (r) + metadata_operation_failed(pool, "dm_pool_commit_metadata", r); + + return r; +} + +static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) +{ + unsigned long flags; + + if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { + DMWARN("%s: reached low water mark for data device: sending event.", + dm_device_name(pool->pool_md)); + spin_lock_irqsave(&pool->lock, flags); + pool->low_water_triggered = true; + spin_unlock_irqrestore(&pool->lock, flags); + dm_table_event(pool->ti->table); + } +} + +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode); + +static int alloc_data_block(struct thin_c *tc, dm_block_t *result) +{ + int r; + dm_block_t free_blocks; + struct pool *pool = tc->pool; + + if (WARN_ON(get_pool_mode(pool) != PM_WRITE)) + return -EINVAL; + + r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); + if (r) { + metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); + return r; + } + + check_low_water_mark(pool, free_blocks); + + if (!free_blocks) { + /* + * Try to commit to see if that will free up some + * more space. + */ + r = commit(pool); + if (r) + return r; + + r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); + if (r) { + metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); + return r; + } + + if (!free_blocks) { + set_pool_mode(pool, PM_OUT_OF_DATA_SPACE); + return -ENOSPC; + } + } + + r = dm_pool_alloc_data_block(pool->pmd, result); + if (r) { + metadata_operation_failed(pool, "dm_pool_alloc_data_block", r); + return r; + } + + return 0; +} + +/* + * If we have run out of space, queue bios until the device is + * resumed, presumably after having been reloaded with more space. + */ +static void retry_on_resume(struct bio *bio) +{ + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + struct thin_c *tc = h->tc; + unsigned long flags; + + spin_lock_irqsave(&tc->lock, flags); + bio_list_add(&tc->retry_on_resume_list, bio); + spin_unlock_irqrestore(&tc->lock, flags); +} + +static int should_error_unserviceable_bio(struct pool *pool) +{ + enum pool_mode m = get_pool_mode(pool); + + switch (m) { + case PM_WRITE: + /* Shouldn't get here */ + DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); + return -EIO; + + case PM_OUT_OF_DATA_SPACE: + return pool->pf.error_if_no_space ? 
-ENOSPC : 0; + + case PM_READ_ONLY: + case PM_FAIL: + return -EIO; + default: + /* Shouldn't get here */ + DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); + return -EIO; + } +} + +static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) +{ + int error = should_error_unserviceable_bio(pool); + + if (error) + bio_endio(bio, error); + else + retry_on_resume(bio); +} + +static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + struct bio *bio; + struct bio_list bios; + int error; + + error = should_error_unserviceable_bio(pool); + if (error) { + cell_error_with_code(pool, cell, error); + return; + } + + bio_list_init(&bios); + cell_release(pool, cell, &bios); + + error = should_error_unserviceable_bio(pool); + if (error) + while ((bio = bio_list_pop(&bios))) + bio_endio(bio, error); + else + while ((bio = bio_list_pop(&bios))) + retry_on_resume(bio); +} + +static void process_discard(struct thin_c *tc, struct bio *bio) +{ + int r; + unsigned long flags; + struct pool *pool = tc->pool; + struct dm_bio_prison_cell *cell, *cell2; + struct dm_cell_key key, key2; + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_lookup_result lookup_result; + struct dm_thin_new_mapping *m; + + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &cell)) + return; + + r = dm_thin_find_block(tc->td, block, 1, &lookup_result); + switch (r) { + case 0: + /* + * Check nobody is fiddling with this pool block. This can + * happen if someone's in the process of breaking sharing + * on this block. + */ + build_data_key(tc->td, lookup_result.block, &key2); + if (bio_detain(tc->pool, &key2, bio, &cell2)) { + cell_defer_no_holder(tc, cell); + break; + } + + if (io_overlaps_block(pool, bio)) { + /* + * IO may still be going to the destination block. We must + * quiesce before we can do the removal. + */ + m = get_next_mapping(pool); + m->tc = tc; + m->pass_discard = pool->pf.discard_passdown; + m->definitely_not_shared = !lookup_result.shared; + m->virt_block = block; + m->data_block = lookup_result.block; + m->cell = cell; + m->cell2 = cell2; + m->bio = bio; + + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { + spin_lock_irqsave(&pool->lock, flags); + list_add_tail(&m->list, &pool->prepared_discards); + spin_unlock_irqrestore(&pool->lock, flags); + wake_worker(pool); + } + } else { + inc_all_io_entry(pool, bio); + cell_defer_no_holder(tc, cell); + cell_defer_no_holder(tc, cell2); + + /* + * The DM core makes sure that the discard doesn't span + * a block boundary. So we submit the discard of a + * partial block appropriately. + */ + if ((!lookup_result.shared) && pool->pf.discard_passdown) + remap_and_issue(tc, bio, lookup_result.block); + else + bio_endio(bio, 0); + } + break; + + case -ENODATA: + /* + * It isn't provisioned, just forget it. 
+ */ + cell_defer_no_holder(tc, cell); + bio_endio(bio, 0); + break; + + default: + DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", + __func__, r); + cell_defer_no_holder(tc, cell); + bio_io_error(bio); + break; + } +} + +static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, + struct dm_cell_key *key, + struct dm_thin_lookup_result *lookup_result, + struct dm_bio_prison_cell *cell) +{ + int r; + dm_block_t data_block; + struct pool *pool = tc->pool; + + r = alloc_data_block(tc, &data_block); + switch (r) { + case 0: + schedule_internal_copy(tc, block, lookup_result->block, + data_block, cell, bio); + break; + + case -ENOSPC: + retry_bios_on_resume(pool, cell); + break; + + default: + DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", + __func__, r); + cell_error(pool, cell); + break; + } +} + +static void process_shared_bio(struct thin_c *tc, struct bio *bio, + dm_block_t block, + struct dm_thin_lookup_result *lookup_result) +{ + struct dm_bio_prison_cell *cell; + struct pool *pool = tc->pool; + struct dm_cell_key key; + + /* + * If cell is already occupied, then sharing is already in the process + * of being broken so we have nothing further to do here. + */ + build_data_key(tc->td, lookup_result->block, &key); + if (bio_detain(pool, &key, bio, &cell)) + return; + + if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) + break_sharing(tc, bio, block, &key, lookup_result, cell); + else { + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); + inc_all_io_entry(pool, bio); + cell_defer_no_holder(tc, cell); + + remap_and_issue(tc, bio, lookup_result->block); + } +} + +static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, + struct dm_bio_prison_cell *cell) +{ + int r; + dm_block_t data_block; + struct pool *pool = tc->pool; + + /* + * Remap empty bios (flushes) immediately, without provisioning. + */ + if (!bio->bi_iter.bi_size) { + inc_all_io_entry(pool, bio); + cell_defer_no_holder(tc, cell); + + remap_and_issue(tc, bio, 0); + return; + } + + /* + * Fill read bios with zeroes and complete them immediately. + */ + if (bio_data_dir(bio) == READ) { + zero_fill_bio(bio); + cell_defer_no_holder(tc, cell); + bio_endio(bio, 0); + return; + } + + r = alloc_data_block(tc, &data_block); + switch (r) { + case 0: + if (tc->origin_dev) + schedule_external_copy(tc, block, data_block, cell, bio); + else + schedule_zero(tc, block, data_block, cell, bio); + break; + + case -ENOSPC: + retry_bios_on_resume(pool, cell); + break; + + default: + DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", + __func__, r); + cell_error(pool, cell); + break; + } +} + +static void process_bio(struct thin_c *tc, struct bio *bio) +{ + int r; + struct pool *pool = tc->pool; + dm_block_t block = get_bio_block(tc, bio); + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + struct dm_thin_lookup_result lookup_result; + + /* + * If cell is already occupied, then the block is already + * being provisioned so we have nothing further to do here. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(pool, &key, bio, &cell)) + return; + + r = dm_thin_find_block(tc->td, block, 1, &lookup_result); + switch (r) { + case 0: + if (lookup_result.shared) { + process_shared_bio(tc, bio, block, &lookup_result); + cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? 
*/ + } else { + inc_all_io_entry(pool, bio); + cell_defer_no_holder(tc, cell); + + remap_and_issue(tc, bio, lookup_result.block); + } + break; + + case -ENODATA: + if (bio_data_dir(bio) == READ && tc->origin_dev) { + inc_all_io_entry(pool, bio); + cell_defer_no_holder(tc, cell); + + remap_to_origin_and_issue(tc, bio); + } else + provision_block(tc, bio, block, cell); + break; + + default: + DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", + __func__, r); + cell_defer_no_holder(tc, cell); + bio_io_error(bio); + break; + } +} + +static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +{ + int r; + int rw = bio_data_dir(bio); + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_lookup_result lookup_result; + + r = dm_thin_find_block(tc->td, block, 1, &lookup_result); + switch (r) { + case 0: + if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) + handle_unserviceable_bio(tc->pool, bio); + else { + inc_all_io_entry(tc->pool, bio); + remap_and_issue(tc, bio, lookup_result.block); + } + break; + + case -ENODATA: + if (rw != READ) { + handle_unserviceable_bio(tc->pool, bio); + break; + } + + if (tc->origin_dev) { + inc_all_io_entry(tc->pool, bio); + remap_to_origin_and_issue(tc, bio); + break; + } + + zero_fill_bio(bio); + bio_endio(bio, 0); + break; + + default: + DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", + __func__, r); + bio_io_error(bio); + break; + } +} + +static void process_bio_success(struct thin_c *tc, struct bio *bio) +{ + bio_endio(bio, 0); +} + +static void process_bio_fail(struct thin_c *tc, struct bio *bio) +{ + bio_io_error(bio); +} + +/* + * FIXME: should we also commit due to size of transaction, measured in + * metadata blocks? + */ +static int need_commit_due_to_time(struct pool *pool) +{ + return jiffies < pool->last_commit_jiffies || + jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; +} + +#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node) +#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook)) + +static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio) +{ + struct rb_node **rbp, *parent; + struct dm_thin_endio_hook *pbd; + sector_t bi_sector = bio->bi_iter.bi_sector; + + rbp = &tc->sort_bio_list.rb_node; + parent = NULL; + while (*rbp) { + parent = *rbp; + pbd = thin_pbd(parent); + + if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector) + rbp = &(*rbp)->rb_left; + else + rbp = &(*rbp)->rb_right; + } + + pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + rb_link_node(&pbd->rb_node, parent, rbp); + rb_insert_color(&pbd->rb_node, &tc->sort_bio_list); +} + +static void __extract_sorted_bios(struct thin_c *tc) +{ + struct rb_node *node; + struct dm_thin_endio_hook *pbd; + struct bio *bio; + + for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) { + pbd = thin_pbd(node); + bio = thin_bio(pbd); + + bio_list_add(&tc->deferred_bio_list, bio); + rb_erase(&pbd->rb_node, &tc->sort_bio_list); + } + + WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list)); +} + +static void __sort_thin_deferred_bios(struct thin_c *tc) +{ + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + bio_list_merge(&bios, &tc->deferred_bio_list); + bio_list_init(&tc->deferred_bio_list); + + /* Sort deferred_bio_list using rb-tree */ + while ((bio = bio_list_pop(&bios))) + __thin_bio_rb_add(tc, bio); + + /* + * Transfer the sorted bios in sort_bio_list back to + * deferred_bio_list to allow lockless submission of + * all bios. 
+ */ + __extract_sorted_bios(tc); +} + +static void process_thin_deferred_bios(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + unsigned long flags; + struct bio *bio; + struct bio_list bios; + struct blk_plug plug; + + if (tc->requeue_mode) { + requeue_bio_list(tc, &tc->deferred_bio_list); + return; + } + + bio_list_init(&bios); + + spin_lock_irqsave(&tc->lock, flags); + + if (bio_list_empty(&tc->deferred_bio_list)) { + spin_unlock_irqrestore(&tc->lock, flags); + return; + } + + __sort_thin_deferred_bios(tc); + + bio_list_merge(&bios, &tc->deferred_bio_list); + bio_list_init(&tc->deferred_bio_list); + + spin_unlock_irqrestore(&tc->lock, flags); + + blk_start_plug(&plug); + while ((bio = bio_list_pop(&bios))) { + /* + * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. + */ + if (ensure_next_mapping(pool)) { + spin_lock_irqsave(&tc->lock, flags); + bio_list_add(&tc->deferred_bio_list, bio); + bio_list_merge(&tc->deferred_bio_list, &bios); + spin_unlock_irqrestore(&tc->lock, flags); + break; + } + + if (bio->bi_rw & REQ_DISCARD) + pool->process_discard(tc, bio); + else + pool->process_bio(tc, bio); + } + blk_finish_plug(&plug); +} + +static void thin_get(struct thin_c *tc); +static void thin_put(struct thin_c *tc); + +/* + * We can't hold rcu_read_lock() around code that can block. So we + * find a thin with the rcu lock held; bump a refcount; then drop + * the lock. + */ +static struct thin_c *get_first_thin(struct pool *pool) +{ + struct thin_c *tc = NULL; + + rcu_read_lock(); + if (!list_empty(&pool->active_thins)) { + tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list); + thin_get(tc); + } + rcu_read_unlock(); + + return tc; +} + +static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc) +{ + struct thin_c *old_tc = tc; + + rcu_read_lock(); + list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) { + thin_get(tc); + thin_put(old_tc); + rcu_read_unlock(); + return tc; + } + thin_put(old_tc); + rcu_read_unlock(); + + return NULL; +} + +static void process_deferred_bios(struct pool *pool) +{ + unsigned long flags; + struct bio *bio; + struct bio_list bios; + struct thin_c *tc; + + tc = get_first_thin(pool); + while (tc) { + process_thin_deferred_bios(tc); + tc = get_next_thin(pool, tc); + } + + /* + * If there are any deferred flush bios, we must commit + * the metadata before issuing them. + */ + bio_list_init(&bios); + spin_lock_irqsave(&pool->lock, flags); + bio_list_merge(&bios, &pool->deferred_flush_bios); + bio_list_init(&pool->deferred_flush_bios); + spin_unlock_irqrestore(&pool->lock, flags); + + if (bio_list_empty(&bios) && + !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) + return; + + if (commit(pool)) { + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); + return; + } + pool->last_commit_jiffies = jiffies; + + while ((bio = bio_list_pop(&bios))) + generic_make_request(bio); +} + +static void do_worker(struct work_struct *ws) +{ + struct pool *pool = container_of(ws, struct pool, worker); + + process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); + process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); + process_deferred_bios(pool); +} + +/* + * We want to commit periodically so that not too much + * unwritten data builds up. 
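+ *
+ * do_waker() requeues itself every COMMIT_PERIOD (one HZ, i.e. roughly
+ * once a second).  process_deferred_bios() commits when there are
+ * deferred flush bios, or when need_commit_due_to_time() reports the
+ * period has elapsed and the transaction has uncommitted changes.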
+ */ +static void do_waker(struct work_struct *ws) +{ + struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); + wake_worker(pool); + queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); +} + +/* + * We're holding onto IO to allow userland time to react. After the + * timeout either the pool will have been resized (and thus back in + * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO. + */ +static void do_no_space_timeout(struct work_struct *ws) +{ + struct pool *pool = container_of(to_delayed_work(ws), struct pool, + no_space_timeout); + + if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) + set_pool_mode(pool, PM_READ_ONLY); +} + +/*----------------------------------------------------------------*/ + +struct pool_work { + struct work_struct worker; + struct completion complete; +}; + +static struct pool_work *to_pool_work(struct work_struct *ws) +{ + return container_of(ws, struct pool_work, worker); +} + +static void pool_work_complete(struct pool_work *pw) +{ + complete(&pw->complete); +} + +static void pool_work_wait(struct pool_work *pw, struct pool *pool, + void (*fn)(struct work_struct *)) +{ + INIT_WORK_ONSTACK(&pw->worker, fn); + init_completion(&pw->complete); + queue_work(pool->wq, &pw->worker); + wait_for_completion(&pw->complete); +} + +/*----------------------------------------------------------------*/ + +struct noflush_work { + struct pool_work pw; + struct thin_c *tc; +}; + +static struct noflush_work *to_noflush(struct work_struct *ws) +{ + return container_of(to_pool_work(ws), struct noflush_work, pw); +} + +static void do_noflush_start(struct work_struct *ws) +{ + struct noflush_work *w = to_noflush(ws); + w->tc->requeue_mode = true; + requeue_io(w->tc); + pool_work_complete(&w->pw); +} + +static void do_noflush_stop(struct work_struct *ws) +{ + struct noflush_work *w = to_noflush(ws); + w->tc->requeue_mode = false; + pool_work_complete(&w->pw); +} + +static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) +{ + struct noflush_work w; + + w.tc = tc; + pool_work_wait(&w.pw, tc->pool, fn); +} + +/*----------------------------------------------------------------*/ + +static enum pool_mode get_pool_mode(struct pool *pool) +{ + return pool->pf.mode; +} + +static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode) +{ + dm_table_event(pool->ti->table); + DMINFO("%s: switching pool to %s mode", + dm_device_name(pool->pool_md), new_mode); +} + +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) +{ + struct pool_c *pt = pool->ti->private; + bool needs_check = dm_pool_metadata_needs_check(pool->pmd); + enum pool_mode old_mode = get_pool_mode(pool); + unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ; + + /* + * Never allow the pool to transition to PM_WRITE mode if user + * intervention is required to verify metadata and data consistency. + */ + if (new_mode == PM_WRITE && needs_check) { + DMERR("%s: unable to switch pool to write mode until repaired.", + dm_device_name(pool->pool_md)); + if (old_mode != new_mode) + new_mode = old_mode; + else + new_mode = PM_READ_ONLY; + } + /* + * If we were in PM_FAIL mode, rollback of metadata failed. We're + * not going to recover without a thin_repair. So we never let the + * pool move out of the old mode. 
+ */ + if (old_mode == PM_FAIL) + new_mode = old_mode; + + switch (new_mode) { + case PM_FAIL: + if (old_mode != new_mode) + notify_of_pool_mode_change(pool, "failure"); + dm_pool_metadata_read_only(pool->pmd); + pool->process_bio = process_bio_fail; + pool->process_discard = process_bio_fail; + pool->process_prepared_mapping = process_prepared_mapping_fail; + pool->process_prepared_discard = process_prepared_discard_fail; + + error_retry_list(pool); + break; + + case PM_READ_ONLY: + if (old_mode != new_mode) + notify_of_pool_mode_change(pool, "read-only"); + dm_pool_metadata_read_only(pool->pmd); + pool->process_bio = process_bio_read_only; + pool->process_discard = process_bio_success; + pool->process_prepared_mapping = process_prepared_mapping_fail; + pool->process_prepared_discard = process_prepared_discard_passdown; + + error_retry_list(pool); + break; + + case PM_OUT_OF_DATA_SPACE: + /* + * Ideally we'd never hit this state; the low water mark + * would trigger userland to extend the pool before we + * completely run out of data space. However, many small + * IOs to unprovisioned space can consume data space at an + * alarming rate. Adjust your low water mark if you're + * frequently seeing this mode. + */ + if (old_mode != new_mode) + notify_of_pool_mode_change(pool, "out-of-data-space"); + pool->process_bio = process_bio_read_only; + pool->process_discard = process_discard; + pool->process_prepared_mapping = process_prepared_mapping; + pool->process_prepared_discard = process_prepared_discard_passdown; + + if (!pool->pf.error_if_no_space && no_space_timeout) + queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout); + break; + + case PM_WRITE: + if (old_mode != new_mode) + notify_of_pool_mode_change(pool, "write"); + dm_pool_metadata_read_write(pool->pmd); + pool->process_bio = process_bio; + pool->process_discard = process_discard; + pool->process_prepared_mapping = process_prepared_mapping; + pool->process_prepared_discard = process_prepared_discard; + break; + } + + pool->pf.mode = new_mode; + /* + * The pool mode may have changed, sync it so bind_control_target() + * doesn't cause an unexpected mode transition on resume. + */ + pt->adjusted_pf.mode = new_mode; +} + +static void abort_transaction(struct pool *pool) +{ + const char *dev_name = dm_device_name(pool->pool_md); + + DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); + if (dm_pool_abort_metadata(pool->pmd)) { + DMERR("%s: failed to abort metadata transaction", dev_name); + set_pool_mode(pool, PM_FAIL); + } + + if (dm_pool_metadata_set_needs_check(pool->pmd)) { + DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); + set_pool_mode(pool, PM_FAIL); + } +} + +static void metadata_operation_failed(struct pool *pool, const char *op, int r) +{ + DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", + dm_device_name(pool->pool_md), op, r); + + abort_transaction(pool); + set_pool_mode(pool, PM_READ_ONLY); +} + +/*----------------------------------------------------------------*/ + +/* + * Mapping functions. + */ + +/* + * Called only while mapping a thin bio to hand it over to the workqueue. 
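+ *
+ * The bio is added to the thin device's deferred_bio_list and the pool
+ * worker is woken; do_worker() later services it through
+ * process_thin_deferred_bios().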
+ */ +static void thin_defer_bio(struct thin_c *tc, struct bio *bio) +{ + unsigned long flags; + struct pool *pool = tc->pool; + + spin_lock_irqsave(&tc->lock, flags); + bio_list_add(&tc->deferred_bio_list, bio); + spin_unlock_irqrestore(&tc->lock, flags); + + wake_worker(pool); +} + +static void thin_hook_bio(struct thin_c *tc, struct bio *bio) +{ + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->tc = tc; + h->shared_read_entry = NULL; + h->all_io_entry = NULL; + h->overwrite_mapping = NULL; +} + +/* + * Non-blocking function called from the thin target's map function. + */ +static int thin_bio_map(struct dm_target *ti, struct bio *bio) +{ + int r; + struct thin_c *tc = ti->private; + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_device *td = tc->td; + struct dm_thin_lookup_result result; + struct dm_bio_prison_cell cell1, cell2; + struct dm_bio_prison_cell *cell_result; + struct dm_cell_key key; + + thin_hook_bio(tc, bio); + + if (tc->requeue_mode) { + bio_endio(bio, DM_ENDIO_REQUEUE); + return DM_MAPIO_SUBMITTED; + } + + if (get_pool_mode(tc->pool) == PM_FAIL) { + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } + + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { + thin_defer_bio(tc, bio); + return DM_MAPIO_SUBMITTED; + } + + r = dm_thin_find_block(td, block, 0, &result); + + /* + * Note that we defer readahead too. + */ + switch (r) { + case 0: + if (unlikely(result.shared)) { + /* + * We have a race condition here between the + * result.shared value returned by the lookup and + * snapshot creation, which may cause new + * sharing. + * + * To avoid this always quiesce the origin before + * taking the snap. You want to do this anyway to + * ensure a consistent application view + * (i.e. lockfs). + * + * More distant ancestors are irrelevant. The + * shared flag will be set in their case. + */ + thin_defer_bio(tc, bio); + return DM_MAPIO_SUBMITTED; + } + + build_virtual_key(tc->td, block, &key); + if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) + return DM_MAPIO_SUBMITTED; + + build_data_key(tc->td, result.block, &key); + if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { + cell_defer_no_holder_no_free(tc, &cell1); + return DM_MAPIO_SUBMITTED; + } + + inc_all_io_entry(tc->pool, bio); + cell_defer_no_holder_no_free(tc, &cell2); + cell_defer_no_holder_no_free(tc, &cell1); + + remap(tc, bio, result.block); + return DM_MAPIO_REMAPPED; + + case -ENODATA: + if (get_pool_mode(tc->pool) == PM_READ_ONLY) { + /* + * This block isn't provisioned, and we have no way + * of doing so. + */ + handle_unserviceable_bio(tc->pool, bio); + return DM_MAPIO_SUBMITTED; + } + /* fall through */ + + case -EWOULDBLOCK: + /* + * In future, the failed dm_thin_find_block above could + * provide the hint to load the metadata into cache. + */ + thin_defer_bio(tc, bio); + return DM_MAPIO_SUBMITTED; + + default: + /* + * Must always call bio_io_error on failure. + * dm_thin_find_block can fail with -EINVAL if the + * pool is switched to fail-io mode. 
+ */ + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } +} + +static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) +{ + struct pool_c *pt = container_of(cb, struct pool_c, callbacks); + struct request_queue *q; + + if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE) + return 1; + + q = bdev_get_queue(pt->data_dev->bdev); + return bdi_congested(&q->backing_dev_info, bdi_bits); +} + +static void requeue_bios(struct pool *pool) +{ + unsigned long flags; + struct thin_c *tc; + + rcu_read_lock(); + list_for_each_entry_rcu(tc, &pool->active_thins, list) { + spin_lock_irqsave(&tc->lock, flags); + bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list); + bio_list_init(&tc->retry_on_resume_list); + spin_unlock_irqrestore(&tc->lock, flags); + } + rcu_read_unlock(); +} + +/*---------------------------------------------------------------- + * Binding of control targets to a pool object + *--------------------------------------------------------------*/ +static bool data_dev_supports_discard(struct pool_c *pt) +{ + struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); + + return q && blk_queue_discard(q); +} + +static bool is_factor(sector_t block_size, uint32_t n) +{ + return !sector_div(block_size, n); +} + +/* + * If discard_passdown was enabled verify that the data device + * supports discards. Disable discard_passdown if not. + */ +static void disable_passdown_if_not_supported(struct pool_c *pt) +{ + struct pool *pool = pt->pool; + struct block_device *data_bdev = pt->data_dev->bdev; + struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; + sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT; + const char *reason = NULL; + char buf[BDEVNAME_SIZE]; + + if (!pt->adjusted_pf.discard_passdown) + return; + + if (!data_dev_supports_discard(pt)) + reason = "discard unsupported"; + + else if (data_limits->max_discard_sectors < pool->sectors_per_block) + reason = "max discard sectors smaller than a block"; + + else if (data_limits->discard_granularity > block_size) + reason = "discard granularity larger than a block"; + + else if (!is_factor(block_size, data_limits->discard_granularity)) + reason = "discard granularity not a factor of block size"; + + if (reason) { + DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason); + pt->adjusted_pf.discard_passdown = false; + } +} + +static int bind_control_target(struct pool *pool, struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + + /* + * We want to make sure that a pool in PM_FAIL mode is never upgraded. + */ + enum pool_mode old_mode = get_pool_mode(pool); + enum pool_mode new_mode = pt->adjusted_pf.mode; + + /* + * Don't change the pool's mode until set_pool_mode() below. + * Otherwise the pool's process_* function pointers may + * not match the desired pool mode. + */ + pt->adjusted_pf.mode = old_mode; + + pool->ti = ti; + pool->pf = pt->adjusted_pf; + pool->low_water_blocks = pt->low_water_blocks; + + set_pool_mode(pool, new_mode); + + return 0; +} + +static void unbind_control_target(struct pool *pool, struct dm_target *ti) +{ + if (pool->ti == ti) + pool->ti = NULL; +} + +/*---------------------------------------------------------------- + * Pool creation + *--------------------------------------------------------------*/ +/* Initialize pool features. 
*/ +static void pool_features_init(struct pool_features *pf) +{ + pf->mode = PM_WRITE; + pf->zero_new_blocks = true; + pf->discard_enabled = true; + pf->discard_passdown = true; + pf->error_if_no_space = false; +} + +static void __pool_destroy(struct pool *pool) +{ + __pool_table_remove(pool); + + if (dm_pool_metadata_close(pool->pmd) < 0) + DMWARN("%s: dm_pool_metadata_close() failed.", __func__); + + dm_bio_prison_destroy(pool->prison); + dm_kcopyd_client_destroy(pool->copier); + + if (pool->wq) + destroy_workqueue(pool->wq); + + if (pool->next_mapping) + mempool_free(pool->next_mapping, pool->mapping_pool); + mempool_destroy(pool->mapping_pool); + dm_deferred_set_destroy(pool->shared_read_ds); + dm_deferred_set_destroy(pool->all_io_ds); + kfree(pool); +} + +static struct kmem_cache *_new_mapping_cache; + +static struct pool *pool_create(struct mapped_device *pool_md, + struct block_device *metadata_dev, + unsigned long block_size, + int read_only, char **error) +{ + int r; + void *err_p; + struct pool *pool; + struct dm_pool_metadata *pmd; + bool format_device = read_only ? false : true; + + pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device); + if (IS_ERR(pmd)) { + *error = "Error creating metadata object"; + return (struct pool *)pmd; + } + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) { + *error = "Error allocating memory for pool"; + err_p = ERR_PTR(-ENOMEM); + goto bad_pool; + } + + pool->pmd = pmd; + pool->sectors_per_block = block_size; + if (block_size & (block_size - 1)) + pool->sectors_per_block_shift = -1; + else + pool->sectors_per_block_shift = __ffs(block_size); + pool->low_water_blocks = 0; + pool_features_init(&pool->pf); + pool->prison = dm_bio_prison_create(PRISON_CELLS); + if (!pool->prison) { + *error = "Error creating pool's bio prison"; + err_p = ERR_PTR(-ENOMEM); + goto bad_prison; + } + + pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); + if (IS_ERR(pool->copier)) { + r = PTR_ERR(pool->copier); + *error = "Error creating pool's kcopyd client"; + err_p = ERR_PTR(r); + goto bad_kcopyd_client; + } + + /* + * Create singlethreaded workqueue that will service all devices + * that use this metadata. 
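+ *
+ * WQ_MEM_RECLAIM lets the worker make forward progress under memory
+ * pressure, and an ordered workqueue means work items are executed one
+ * at a time, in the order they were queued.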
+ */ + pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + if (!pool->wq) { + *error = "Error creating pool's workqueue"; + err_p = ERR_PTR(-ENOMEM); + goto bad_wq; + } + + INIT_WORK(&pool->worker, do_worker); + INIT_DELAYED_WORK(&pool->waker, do_waker); + INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); + spin_lock_init(&pool->lock); + bio_list_init(&pool->deferred_flush_bios); + INIT_LIST_HEAD(&pool->prepared_mappings); + INIT_LIST_HEAD(&pool->prepared_discards); + INIT_LIST_HEAD(&pool->active_thins); + pool->low_water_triggered = false; + + pool->shared_read_ds = dm_deferred_set_create(); + if (!pool->shared_read_ds) { + *error = "Error creating pool's shared read deferred set"; + err_p = ERR_PTR(-ENOMEM); + goto bad_shared_read_ds; + } + + pool->all_io_ds = dm_deferred_set_create(); + if (!pool->all_io_ds) { + *error = "Error creating pool's all io deferred set"; + err_p = ERR_PTR(-ENOMEM); + goto bad_all_io_ds; + } + + pool->next_mapping = NULL; + pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, + _new_mapping_cache); + if (!pool->mapping_pool) { + *error = "Error creating pool's mapping mempool"; + err_p = ERR_PTR(-ENOMEM); + goto bad_mapping_pool; + } + + pool->ref_count = 1; + pool->last_commit_jiffies = jiffies; + pool->pool_md = pool_md; + pool->md_dev = metadata_dev; + __pool_table_insert(pool); + + return pool; + +bad_mapping_pool: + dm_deferred_set_destroy(pool->all_io_ds); +bad_all_io_ds: + dm_deferred_set_destroy(pool->shared_read_ds); +bad_shared_read_ds: + destroy_workqueue(pool->wq); +bad_wq: + dm_kcopyd_client_destroy(pool->copier); +bad_kcopyd_client: + dm_bio_prison_destroy(pool->prison); +bad_prison: + kfree(pool); +bad_pool: + if (dm_pool_metadata_close(pmd)) + DMWARN("%s: dm_pool_metadata_close() failed.", __func__); + + return err_p; +} + +static void __pool_inc(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + pool->ref_count++; +} + +static void __pool_dec(struct pool *pool) +{ + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + BUG_ON(!pool->ref_count); + if (!--pool->ref_count) + __pool_destroy(pool); +} + +static struct pool *__pool_find(struct mapped_device *pool_md, + struct block_device *metadata_dev, + unsigned long block_size, int read_only, + char **error, int *created) +{ + struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); + + if (pool) { + if (pool->pool_md != pool_md) { + *error = "metadata device already in use by a pool"; + return ERR_PTR(-EBUSY); + } + __pool_inc(pool); + + } else { + pool = __pool_table_lookup(pool_md); + if (pool) { + if (pool->md_dev != metadata_dev) { + *error = "different pool cannot replace a pool"; + return ERR_PTR(-EINVAL); + } + __pool_inc(pool); + + } else { + pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); + *created = 1; + } + } + + return pool; +} + +/*---------------------------------------------------------------- + * Pool target methods + *--------------------------------------------------------------*/ +static void pool_dtr(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + + mutex_lock(&dm_thin_pool_table.mutex); + + unbind_control_target(pt->pool, ti); + __pool_dec(pt->pool); + dm_put_device(ti, pt->metadata_dev); + dm_put_device(ti, pt->data_dev); + kfree(pt); + + mutex_unlock(&dm_thin_pool_table.mutex); +} + +static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, + struct dm_target *ti) +{ + int r; + unsigned argc; + const char *arg_name; + + 
static struct dm_arg _args[] = { + {0, 4, "Invalid number of pool feature arguments"}, + }; + + /* + * No feature arguments supplied. + */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + while (argc && !r) { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, "skip_block_zeroing")) + pf->zero_new_blocks = false; + + else if (!strcasecmp(arg_name, "ignore_discard")) + pf->discard_enabled = false; + + else if (!strcasecmp(arg_name, "no_discard_passdown")) + pf->discard_passdown = false; + + else if (!strcasecmp(arg_name, "read_only")) + pf->mode = PM_READ_ONLY; + + else if (!strcasecmp(arg_name, "error_if_no_space")) + pf->error_if_no_space = true; + + else { + ti->error = "Unrecognised pool feature requested"; + r = -EINVAL; + break; + } + } + + return r; +} + +static void metadata_low_callback(void *context) +{ + struct pool *pool = context; + + DMWARN("%s: reached low water mark for metadata device: sending event.", + dm_device_name(pool->pool_md)); + + dm_table_event(pool->ti->table); +} + +static sector_t get_dev_size(struct block_device *bdev) +{ + return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; +} + +static void warn_if_metadata_device_too_big(struct block_device *bdev) +{ + sector_t metadata_dev_size = get_dev_size(bdev); + char buffer[BDEVNAME_SIZE]; + + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) + DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", + bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); +} + +static sector_t get_metadata_dev_size(struct block_device *bdev) +{ + sector_t metadata_dev_size = get_dev_size(bdev); + + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS) + metadata_dev_size = THIN_METADATA_MAX_SECTORS; + + return metadata_dev_size; +} + +static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev) +{ + sector_t metadata_dev_size = get_metadata_dev_size(bdev); + + sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE); + + return metadata_dev_size; +} + +/* + * When a metadata threshold is crossed a dm event is triggered, and + * userland should respond by growing the metadata device. We could let + * userland set the threshold, like we do with the data threshold, but I'm + * not sure they know enough to do this well. + */ +static dm_block_t calc_metadata_threshold(struct pool_c *pt) +{ + /* + * 4M is ample for all ops with the possible exception of thin + * device deletion which is harmless if it fails (just retry the + * delete after you've grown the device). + */ + dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4; + return min((dm_block_t)1024ULL /* 4M */, quarter); +} + +/* + * thin-pool <metadata dev> <data dev> + * <data block size (sectors)> + * <low water mark (blocks)> + * [<#feature args> [<arg>]*] + * + * Optional feature arguments are: + * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. + * ignore_discard: disable discard + * no_discard_passdown: don't pass discards down to the data device + * read_only: Don't allow any changes to be made to the pool metadata. + * error_if_no_space: error IOs, instead of queueing, if no space. 
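+ *
+ * Illustrative example (the device paths, sizes and low water mark here
+ * are hypothetical):
+ *
+ *   dmsetup create pool --table \
+ *     "0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768 1 skip_block_zeroing"
+ *
+ * creates a 10 GiB pool with 64 KiB (128-sector) blocks, a low water mark
+ * of 32768 blocks and block zeroing disabled.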
+ */ +static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int r, pool_created = 0; + struct pool_c *pt; + struct pool *pool; + struct pool_features pf; + struct dm_arg_set as; + struct dm_dev *data_dev; + unsigned long block_size; + dm_block_t low_water_blocks; + struct dm_dev *metadata_dev; + fmode_t metadata_mode; + + /* + * FIXME Remove validation from scope of lock. + */ + mutex_lock(&dm_thin_pool_table.mutex); + + if (argc < 4) { + ti->error = "Invalid argument count"; + r = -EINVAL; + goto out_unlock; + } + + as.argc = argc; + as.argv = argv; + + /* + * Set default pool features. + */ + pool_features_init(&pf); + + dm_consume_args(&as, 4); + r = parse_pool_features(&as, &pf, ti); + if (r) + goto out_unlock; + + metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE); + r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev); + if (r) { + ti->error = "Error opening metadata block device"; + goto out_unlock; + } + warn_if_metadata_device_too_big(metadata_dev->bdev); + + r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); + if (r) { + ti->error = "Error getting data device"; + goto out_metadata; + } + + if (kstrtoul(argv[2], 10, &block_size) || !block_size || + block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || + block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || + block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { + ti->error = "Invalid block size"; + r = -EINVAL; + goto out; + } + + if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { + ti->error = "Invalid low water mark"; + r = -EINVAL; + goto out; + } + + pt = kzalloc(sizeof(*pt), GFP_KERNEL); + if (!pt) { + r = -ENOMEM; + goto out; + } + + pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, + block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); + if (IS_ERR(pool)) { + r = PTR_ERR(pool); + goto out_free_pt; + } + + /* + * 'pool_created' reflects whether this is the first table load. + * Top level discard support is not allowed to be changed after + * initial load. This would require a pool reload to trigger thin + * device changes. + */ + if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { + ti->error = "Discard support cannot be disabled once enabled"; + r = -EINVAL; + goto out_flags_changed; + } + + pt->pool = pool; + pt->ti = ti; + pt->metadata_dev = metadata_dev; + pt->data_dev = data_dev; + pt->low_water_blocks = low_water_blocks; + pt->adjusted_pf = pt->requested_pf = pf; + ti->num_flush_bios = 1; + + /* + * Only need to enable discards if the pool should pass + * them down to the data device. The thin device's discard + * processing will cause mappings to be removed from the btree. + */ + ti->discard_zeroes_data_unsupported = true; + if (pf.discard_enabled && pf.discard_passdown) { + ti->num_discard_bios = 1; + + /* + * Setting 'discards_supported' circumvents the normal + * stacking of discard limits (this keeps the pool and + * thin devices' discard limits consistent). 
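+ * (pool_io_hints() and set_discard_limits() below supply the pool's own
+ * discard limits instead.)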
+ */ + ti->discards_supported = true; + } + ti->private = pt; + + r = dm_pool_register_metadata_threshold(pt->pool->pmd, + calc_metadata_threshold(pt), + metadata_low_callback, + pool); + if (r) + goto out_free_pt; + + pt->callbacks.congested_fn = pool_is_congested; + dm_table_add_target_callbacks(ti->table, &pt->callbacks); + + mutex_unlock(&dm_thin_pool_table.mutex); + + return 0; + +out_flags_changed: + __pool_dec(pool); +out_free_pt: + kfree(pt); +out: + dm_put_device(ti, data_dev); +out_metadata: + dm_put_device(ti, metadata_dev); +out_unlock: + mutex_unlock(&dm_thin_pool_table.mutex); + + return r; +} + +static int pool_map(struct dm_target *ti, struct bio *bio) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + unsigned long flags; + + /* + * As this is a singleton target, ti->begin is always zero. + */ + spin_lock_irqsave(&pool->lock, flags); + bio->bi_bdev = pt->data_dev->bdev; + r = DM_MAPIO_REMAPPED; + spin_unlock_irqrestore(&pool->lock, flags); + + return r; +} + +static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + sector_t data_size = ti->len; + dm_block_t sb_data_size; + + *need_commit = false; + + (void) sector_div(data_size, pool->sectors_per_block); + + r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); + if (r) { + DMERR("%s: failed to retrieve data device size", + dm_device_name(pool->pool_md)); + return r; + } + + if (data_size < sb_data_size) { + DMERR("%s: pool target (%llu blocks) too small: expected %llu", + dm_device_name(pool->pool_md), + (unsigned long long)data_size, sb_data_size); + return -EINVAL; + + } else if (data_size > sb_data_size) { + if (dm_pool_metadata_needs_check(pool->pmd)) { + DMERR("%s: unable to grow the data device until repaired.", + dm_device_name(pool->pool_md)); + return 0; + } + + if (sb_data_size) + DMINFO("%s: growing the data device from %llu to %llu blocks", + dm_device_name(pool->pool_md), + sb_data_size, (unsigned long long)data_size); + r = dm_pool_resize_data_dev(pool->pmd, data_size); + if (r) { + metadata_operation_failed(pool, "dm_pool_resize_data_dev", r); + return r; + } + + *need_commit = true; + } + + return 0; +} + +static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) +{ + int r; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + dm_block_t metadata_dev_size, sb_metadata_dev_size; + + *need_commit = false; + + metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev); + + r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); + if (r) { + DMERR("%s: failed to retrieve metadata device size", + dm_device_name(pool->pool_md)); + return r; + } + + if (metadata_dev_size < sb_metadata_dev_size) { + DMERR("%s: metadata device (%llu blocks) too small: expected %llu", + dm_device_name(pool->pool_md), + metadata_dev_size, sb_metadata_dev_size); + return -EINVAL; + + } else if (metadata_dev_size > sb_metadata_dev_size) { + if (dm_pool_metadata_needs_check(pool->pmd)) { + DMERR("%s: unable to grow the metadata device until repaired.", + dm_device_name(pool->pool_md)); + return 0; + } + + warn_if_metadata_device_too_big(pool->md_dev); + DMINFO("%s: growing the metadata device from %llu to %llu blocks", + dm_device_name(pool->pool_md), + sb_metadata_dev_size, metadata_dev_size); + r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); + if (r) { + metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); + return r; + } 
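+ /*
+ * The grown size only becomes persistent once the caller commits the
+ * metadata (see pool_preresume()).
+ */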
+ + *need_commit = true; + } + + return 0; +} + +/* + * Retrieves the number of blocks of the data device from + * the superblock and compares it to the actual device size, + * thus resizing the data device in case it has grown. + * + * This both copes with opening preallocated data devices in the ctr + * being followed by a resume + * -and- + * calling the resume method individually after userspace has + * grown the data device in reaction to a table event. + */ +static int pool_preresume(struct dm_target *ti) +{ + int r; + bool need_commit1, need_commit2; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + /* + * Take control of the pool object. + */ + r = bind_control_target(pool, ti); + if (r) + return r; + + r = maybe_resize_data_dev(ti, &need_commit1); + if (r) + return r; + + r = maybe_resize_metadata_dev(ti, &need_commit2); + if (r) + return r; + + if (need_commit1 || need_commit2) + (void) commit(pool); + + return 0; +} + +static void pool_resume(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + pool->low_water_triggered = false; + spin_unlock_irqrestore(&pool->lock, flags); + requeue_bios(pool); + + do_waker(&pool->waker.work); +} + +static void pool_postsuspend(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + cancel_delayed_work(&pool->waker); + cancel_delayed_work(&pool->no_space_timeout); + flush_workqueue(pool->wq); + (void) commit(pool); +} + +static int check_arg_count(unsigned argc, unsigned args_required) +{ + if (argc != args_required) { + DMWARN("Message received with %u arguments instead of %u.", + argc, args_required); + return -EINVAL; + } + + return 0; +} + +static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) +{ + if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && + *dev_id <= MAX_DEV_ID) + return 0; + + if (warning) + DMWARN("Message received with invalid device id: %s", arg); + + return -EINVAL; +} + +static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool) +{ + dm_thin_id dev_id; + int r; + + r = check_arg_count(argc, 2); + if (r) + return r; + + r = read_dev_id(argv[1], &dev_id, 1); + if (r) + return r; + + r = dm_pool_create_thin(pool->pmd, dev_id); + if (r) { + DMWARN("Creation of new thinly-provisioned device with id %s failed.", + argv[1]); + return r; + } + + return 0; +} + +static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool) +{ + dm_thin_id dev_id; + dm_thin_id origin_dev_id; + int r; + + r = check_arg_count(argc, 3); + if (r) + return r; + + r = read_dev_id(argv[1], &dev_id, 1); + if (r) + return r; + + r = read_dev_id(argv[2], &origin_dev_id, 1); + if (r) + return r; + + r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); + if (r) { + DMWARN("Creation of new snapshot %s of device %s failed.", + argv[1], argv[2]); + return r; + } + + return 0; +} + +static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool) +{ + dm_thin_id dev_id; + int r; + + r = check_arg_count(argc, 2); + if (r) + return r; + + r = read_dev_id(argv[1], &dev_id, 1); + if (r) + return r; + + r = dm_pool_delete_thin_device(pool->pmd, dev_id); + if (r) + DMWARN("Deletion of thin device %s failed.", argv[1]); + + return r; +} + +static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool) +{ + dm_thin_id old_id, new_id; + int r; + + r = check_arg_count(argc, 3); + if (r) + return r; + + 
if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) { + DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); + return -EINVAL; + } + + if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) { + DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); + return -EINVAL; + } + + r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); + if (r) { + DMWARN("Failed to change transaction id from %s to %s.", + argv[1], argv[2]); + return r; + } + + return 0; +} + +static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) +{ + int r; + + r = check_arg_count(argc, 1); + if (r) + return r; + + (void) commit(pool); + + r = dm_pool_reserve_metadata_snap(pool->pmd); + if (r) + DMWARN("reserve_metadata_snap message failed."); + + return r; +} + +static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) +{ + int r; + + r = check_arg_count(argc, 1); + if (r) + return r; + + r = dm_pool_release_metadata_snap(pool->pmd); + if (r) + DMWARN("release_metadata_snap message failed."); + + return r; +} + +/* + * Messages supported: + * create_thin <dev_id> + * create_snap <dev_id> <origin_id> + * delete <dev_id> + * trim <dev_id> <new_size_in_sectors> + * set_transaction_id <current_trans_id> <new_trans_id> + * reserve_metadata_snap + * release_metadata_snap + */ +static int pool_message(struct dm_target *ti, unsigned argc, char **argv) +{ + int r = -EINVAL; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + if (!strcasecmp(argv[0], "create_thin")) + r = process_create_thin_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "create_snap")) + r = process_create_snap_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "delete")) + r = process_delete_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "set_transaction_id")) + r = process_set_transaction_id_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "reserve_metadata_snap")) + r = process_reserve_metadata_snap_mesg(argc, argv, pool); + + else if (!strcasecmp(argv[0], "release_metadata_snap")) + r = process_release_metadata_snap_mesg(argc, argv, pool); + + else + DMWARN("Unrecognised thin pool target message received: %s", argv[0]); + + if (!r) + (void) commit(pool); + + return r; +} + +static void emit_flags(struct pool_features *pf, char *result, + unsigned sz, unsigned maxlen) +{ + unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + + !pf->discard_passdown + (pf->mode == PM_READ_ONLY) + + pf->error_if_no_space; + DMEMIT("%u ", count); + + if (!pf->zero_new_blocks) + DMEMIT("skip_block_zeroing "); + + if (!pf->discard_enabled) + DMEMIT("ignore_discard "); + + if (!pf->discard_passdown) + DMEMIT("no_discard_passdown "); + + if (pf->mode == PM_READ_ONLY) + DMEMIT("read_only "); + + if (pf->error_if_no_space) + DMEMIT("error_if_no_space "); +} + +/* + * Status line is: + * <transaction id> <used metadata sectors>/<total metadata sectors> + * <used data sectors>/<total data sectors> <held metadata root> + */ +static void pool_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + int r; + unsigned sz = 0; + uint64_t transaction_id; + dm_block_t nr_free_blocks_data; + dm_block_t nr_free_blocks_metadata; + dm_block_t nr_blocks_data; + dm_block_t nr_blocks_metadata; + dm_block_t held_root; + char buf[BDEVNAME_SIZE]; + char buf2[BDEVNAME_SIZE]; + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + switch (type) { + case 
STATUSTYPE_INFO: + if (get_pool_mode(pool) == PM_FAIL) { + DMEMIT("Fail"); + break; + } + + /* Commit to ensure statistics aren't out-of-date */ + if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) + (void) commit(pool); + + r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); + if (r) { + DMERR("%s: dm_pool_get_metadata_transaction_id returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata); + if (r) { + DMERR("%s: dm_pool_get_free_metadata_block_count returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); + if (r) { + DMERR("%s: dm_pool_get_metadata_dev_size returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data); + if (r) { + DMERR("%s: dm_pool_get_free_block_count returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); + if (r) { + DMERR("%s: dm_pool_get_data_dev_size returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + r = dm_pool_get_metadata_snap(pool->pmd, &held_root); + if (r) { + DMERR("%s: dm_pool_get_metadata_snap returned %d", + dm_device_name(pool->pool_md), r); + goto err; + } + + DMEMIT("%llu %llu/%llu %llu/%llu ", + (unsigned long long)transaction_id, + (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), + (unsigned long long)nr_blocks_metadata, + (unsigned long long)(nr_blocks_data - nr_free_blocks_data), + (unsigned long long)nr_blocks_data); + + if (held_root) + DMEMIT("%llu ", held_root); + else + DMEMIT("- "); + + if (pool->pf.mode == PM_OUT_OF_DATA_SPACE) + DMEMIT("out_of_data_space "); + else if (pool->pf.mode == PM_READ_ONLY) + DMEMIT("ro "); + else + DMEMIT("rw "); + + if (!pool->pf.discard_enabled) + DMEMIT("ignore_discard "); + else if (pool->pf.discard_passdown) + DMEMIT("discard_passdown "); + else + DMEMIT("no_discard_passdown "); + + if (pool->pf.error_if_no_space) + DMEMIT("error_if_no_space "); + else + DMEMIT("queue_if_no_space "); + + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %s %lu %llu ", + format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), + format_dev_t(buf2, pt->data_dev->bdev->bd_dev), + (unsigned long)pool->sectors_per_block, + (unsigned long long)pt->low_water_blocks); + emit_flags(&pt->requested_pf, result, sz, maxlen); + break; + } + return; + +err: + DMEMIT("Error"); +} + +static int pool_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct pool_c *pt = ti->private; + + return fn(ti, pt->data_dev, 0, ti->len, data); +} + +static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct pool_c *pt = ti->private; + struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = pt->data_dev->bdev; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) +{ + struct pool *pool = pt->pool; + struct queue_limits *data_limits; + + limits->max_discard_sectors = pool->sectors_per_block; + + /* + * discard_granularity is just a hint, and not enforced. 
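+ * For example, with 128-sector (64 KiB) pool blocks and a data device
+ * advertising a 1 MiB discard granularity, passdown would report 1 MiB;
+ * without passdown the pool block size (64 KiB) is reported.  (The sizes
+ * here are only illustrative.)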
+ */ + if (pt->adjusted_pf.discard_passdown) { + data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; + limits->discard_granularity = max(data_limits->discard_granularity, + pool->sectors_per_block << SECTOR_SHIFT); + } else + limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; +} + +static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * If the system-determined stacked limits are compatible with the + * pool's blocksize (io_opt is a factor) do not override them. + */ + if (io_opt_sectors < pool->sectors_per_block || + do_div(io_opt_sectors, pool->sectors_per_block)) { + blk_limits_io_min(limits, 0); + blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); + } + + /* + * pt->adjusted_pf is a staging area for the actual features to use. + * They get transferred to the live pool in bind_control_target() + * called from pool_preresume(). + */ + if (!pt->adjusted_pf.discard_enabled) { + /* + * Must explicitly disallow stacking discard limits otherwise the + * block layer will stack them if pool's data device has support. + * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the + * user to see that, so make sure to set all discard limits to 0. + */ + limits->discard_granularity = 0; + return; + } + + disable_passdown_if_not_supported(pt); + + set_discard_limits(pt, limits); +} + +static struct target_type pool_target = { + .name = "thin-pool", + .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | + DM_TARGET_IMMUTABLE, + .version = {1, 12, 0}, + .module = THIS_MODULE, + .ctr = pool_ctr, + .dtr = pool_dtr, + .map = pool_map, + .postsuspend = pool_postsuspend, + .preresume = pool_preresume, + .resume = pool_resume, + .message = pool_message, + .status = pool_status, + .merge = pool_merge, + .iterate_devices = pool_iterate_devices, + .io_hints = pool_io_hints, +}; + +/*---------------------------------------------------------------- + * Thin target methods + *--------------------------------------------------------------*/ +static void thin_get(struct thin_c *tc) +{ + atomic_inc(&tc->refcount); +} + +static void thin_put(struct thin_c *tc) +{ + if (atomic_dec_and_test(&tc->refcount)) + complete(&tc->can_destroy); +} + +static void thin_dtr(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + unsigned long flags; + + thin_put(tc); + wait_for_completion(&tc->can_destroy); + + spin_lock_irqsave(&tc->pool->lock, flags); + list_del_rcu(&tc->list); + spin_unlock_irqrestore(&tc->pool->lock, flags); + synchronize_rcu(); + + mutex_lock(&dm_thin_pool_table.mutex); + + __pool_dec(tc->pool); + dm_pool_close_thin_device(tc->td); + dm_put_device(ti, tc->pool_dev); + if (tc->origin_dev) + dm_put_device(ti, tc->origin_dev); + kfree(tc); + + mutex_unlock(&dm_thin_pool_table.mutex); +} + +/* + * Thin target parameters: + * + * <pool_dev> <dev_id> [origin_dev] + * + * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) + * dev_id: the internal device identifier + * origin_dev: a device external to the pool that should act as the origin + * + * If the pool device has discards disabled, they get disabled for the thin + * device as well. 
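+ *
+ * Illustrative example (the pool path, device id and size are hypothetical):
+ *
+ *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
+ *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
+ *
+ * creates thin device 0 in the pool and activates it as a 1 GiB volume.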
+ */ +static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int r; + struct thin_c *tc; + struct dm_dev *pool_dev, *origin_dev; + struct mapped_device *pool_md; + unsigned long flags; + + mutex_lock(&dm_thin_pool_table.mutex); + + if (argc != 2 && argc != 3) { + ti->error = "Invalid argument count"; + r = -EINVAL; + goto out_unlock; + } + + tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); + if (!tc) { + ti->error = "Out of memory"; + r = -ENOMEM; + goto out_unlock; + } + spin_lock_init(&tc->lock); + bio_list_init(&tc->deferred_bio_list); + bio_list_init(&tc->retry_on_resume_list); + tc->sort_bio_list = RB_ROOT; + + if (argc == 3) { + r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); + if (r) { + ti->error = "Error opening origin device"; + goto bad_origin_dev; + } + tc->origin_dev = origin_dev; + } + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); + if (r) { + ti->error = "Error opening pool device"; + goto bad_pool_dev; + } + tc->pool_dev = pool_dev; + + if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { + ti->error = "Invalid device id"; + r = -EINVAL; + goto bad_common; + } + + pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); + if (!pool_md) { + ti->error = "Couldn't get pool mapped device"; + r = -EINVAL; + goto bad_common; + } + + tc->pool = __pool_table_lookup(pool_md); + if (!tc->pool) { + ti->error = "Couldn't find pool object"; + r = -EINVAL; + goto bad_pool_lookup; + } + __pool_inc(tc->pool); + + if (get_pool_mode(tc->pool) == PM_FAIL) { + ti->error = "Couldn't open thin device, Pool is in fail mode"; + r = -EINVAL; + goto bad_thin_open; + } + + r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); + if (r) { + ti->error = "Couldn't open thin internal device"; + goto bad_thin_open; + } + + r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); + if (r) + goto bad_target_max_io_len; + + ti->num_flush_bios = 1; + ti->flush_supported = true; + ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); + + /* In case the pool supports discards, pass them on. */ + ti->discard_zeroes_data_unsupported = true; + if (tc->pool->pf.discard_enabled) { + ti->discards_supported = true; + ti->num_discard_bios = 1; + /* Discard bios must be split on a block boundary */ + ti->split_discard_bios = true; + } + + dm_put(pool_md); + + mutex_unlock(&dm_thin_pool_table.mutex); + + atomic_set(&tc->refcount, 1); + init_completion(&tc->can_destroy); + + spin_lock_irqsave(&tc->pool->lock, flags); + list_add_tail_rcu(&tc->list, &tc->pool->active_thins); + spin_unlock_irqrestore(&tc->pool->lock, flags); + /* + * This synchronize_rcu() call is needed here otherwise we risk a + * wake_worker() call finding no bios to process (because the newly + * added tc isn't yet visible). So this reduces latency since we + * aren't then dependent on the periodic commit to wake_worker(). 
+ */ + synchronize_rcu(); + + return 0; + +bad_target_max_io_len: + dm_pool_close_thin_device(tc->td); +bad_thin_open: + __pool_dec(tc->pool); +bad_pool_lookup: + dm_put(pool_md); +bad_common: + dm_put_device(ti, tc->pool_dev); +bad_pool_dev: + if (tc->origin_dev) + dm_put_device(ti, tc->origin_dev); +bad_origin_dev: + kfree(tc); +out_unlock: + mutex_unlock(&dm_thin_pool_table.mutex); + + return r; +} + +static int thin_map(struct dm_target *ti, struct bio *bio) +{ + bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); + + return thin_bio_map(ti, bio); +} + +static int thin_endio(struct dm_target *ti, struct bio *bio, int err) +{ + unsigned long flags; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + struct list_head work; + struct dm_thin_new_mapping *m, *tmp; + struct pool *pool = h->tc->pool; + + if (h->shared_read_entry) { + INIT_LIST_HEAD(&work); + dm_deferred_entry_dec(h->shared_read_entry, &work); + + spin_lock_irqsave(&pool->lock, flags); + list_for_each_entry_safe(m, tmp, &work, list) { + list_del(&m->list); + m->quiesced = true; + __maybe_add_mapping(m); + } + spin_unlock_irqrestore(&pool->lock, flags); + } + + if (h->all_io_entry) { + INIT_LIST_HEAD(&work); + dm_deferred_entry_dec(h->all_io_entry, &work); + if (!list_empty(&work)) { + spin_lock_irqsave(&pool->lock, flags); + list_for_each_entry_safe(m, tmp, &work, list) + list_add_tail(&m->list, &pool->prepared_discards); + spin_unlock_irqrestore(&pool->lock, flags); + wake_worker(pool); + } + } + + return 0; +} + +static void thin_presuspend(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + if (dm_noflush_suspending(ti)) + noflush_work(tc, do_noflush_start); +} + +static void thin_postsuspend(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + /* + * The dm_noflush_suspending flag has been cleared by now, so + * unfortunately we must always run this. + */ + noflush_work(tc, do_noflush_stop); +} + +/* + * <nr mapped sectors> <highest mapped sector> + */ +static void thin_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + int r; + ssize_t sz = 0; + dm_block_t mapped, highest; + char buf[BDEVNAME_SIZE]; + struct thin_c *tc = ti->private; + + if (get_pool_mode(tc->pool) == PM_FAIL) { + DMEMIT("Fail"); + return; + } + + if (!tc->td) + DMEMIT("-"); + else { + switch (type) { + case STATUSTYPE_INFO: + r = dm_thin_get_mapped_count(tc->td, &mapped); + if (r) { + DMERR("dm_thin_get_mapped_count returned %d", r); + goto err; + } + + r = dm_thin_get_highest_mapped_block(tc->td, &highest); + if (r < 0) { + DMERR("dm_thin_get_highest_mapped_block returned %d", r); + goto err; + } + + DMEMIT("%llu ", mapped * tc->pool->sectors_per_block); + if (r) + DMEMIT("%llu", ((highest + 1) * + tc->pool->sectors_per_block) - 1); + else + DMEMIT("-"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %lu", + format_dev_t(buf, tc->pool_dev->bdev->bd_dev), + (unsigned long) tc->dev_id); + if (tc->origin_dev) + DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); + break; + } + } + + return; + +err: + DMEMIT("Error"); +} + +static int thin_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + sector_t blocks; + struct thin_c *tc = ti->private; + struct pool *pool = tc->pool; + + /* + * We can't call dm_pool_get_data_dev_size() since that blocks. So + * we follow a more convoluted path through to the pool's target. 
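+ * (pool->ti->len is the pool target's length in sectors; it is rounded
+ * down to whole pool blocks before being reported to the callout.)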
+ */ + if (!pool->ti) + return 0; /* nothing is bound */ + + blocks = pool->ti->len; + (void) sector_div(blocks, pool->sectors_per_block); + if (blocks) + return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data); + + return 0; +} + +static struct target_type thin_target = { + .name = "thin", + .version = {1, 12, 0}, + .module = THIS_MODULE, + .ctr = thin_ctr, + .dtr = thin_dtr, + .map = thin_map, + .end_io = thin_endio, + .presuspend = thin_presuspend, + .postsuspend = thin_postsuspend, + .status = thin_status, + .iterate_devices = thin_iterate_devices, +}; + +/*----------------------------------------------------------------*/ + +static int __init dm_thin_init(void) +{ + int r; + + pool_table_init(); + + r = dm_register_target(&thin_target); + if (r) + return r; + + r = dm_register_target(&pool_target); + if (r) + goto bad_pool_target; + + r = -ENOMEM; + + _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0); + if (!_new_mapping_cache) + goto bad_new_mapping_cache; + + return 0; + +bad_new_mapping_cache: + dm_unregister_target(&pool_target); +bad_pool_target: + dm_unregister_target(&thin_target); + + return r; +} + +static void dm_thin_exit(void) +{ + dm_unregister_target(&thin_target); + dm_unregister_target(&pool_target); + + kmem_cache_destroy(_new_mapping_cache); +} + +module_init(dm_thin_init); +module_exit(dm_thin_exit); + +module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds"); + +MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c index 6f65883aef1..8efe033bab5 100644 --- a/drivers/md/dm-uevent.c +++ b/drivers/md/dm-uevent.c @@ -22,6 +22,7 @@ #include <linux/slab.h> #include <linux/kobject.h> #include <linux/dm-ioctl.h> +#include <linux/export.h> #include "dm.h" #include "dm-uevent.h" @@ -139,14 +140,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj) list_del_init(&event->elist); /* - * Need to call dm_copy_name_and_uuid from here for now. - * Context of previous var adds and locking used for - * hash_cell not compatable. + * When a device is being removed this copy fails and we + * discard these unsent events. */ if (dm_copy_name_and_uuid(event->md, event->name, event->uuid)) { - DMERR("%s: dm_copy_name_and_uuid() failed", - __func__); + DMINFO("%s: skipping sending uevent for lost device", + __func__); goto uevent_free; } @@ -188,7 +188,7 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { DMERR("%s: Invalid event_type %d", __func__, event_type); - goto out; + return; } event = dm_build_path_uevent(md, ti, @@ -196,12 +196,9 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, _dm_uevent_type_names[event_type].name, path, nr_valid_paths); if (IS_ERR(event)) - goto out; + return; dm_uevent_add(md, &event->elist); - -out: - dm_put(md); } EXPORT_SYMBOL_GPL(dm_path_uevent); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c new file mode 100644 index 00000000000..7a7bab8947a --- /dev/null +++ b/drivers/md/dm-verity.c @@ -0,0 +1,903 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * Author: Mikulas Patocka <mpatocka@redhat.com> + * + * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors + * + * This file is released under the GPLv2. 
+ * + * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set + * default prefetch value. Data are read in "prefetch_cluster" chunks from the + * hash device. Setting this greatly improves performance when data and hash + * are on the same disk on different partitions on devices with poor random + * access behavior. + */ + +#include "dm-bufio.h" + +#include <linux/module.h> +#include <linux/device-mapper.h> +#include <crypto/hash.h> + +#define DM_MSG_PREFIX "verity" + +#define DM_VERITY_IO_VEC_INLINE 16 +#define DM_VERITY_MEMPOOL_SIZE 4 +#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 + +#define DM_VERITY_MAX_LEVELS 63 + +static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; + +module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); + +struct dm_verity { + struct dm_dev *data_dev; + struct dm_dev *hash_dev; + struct dm_target *ti; + struct dm_bufio_client *bufio; + char *alg_name; + struct crypto_shash *tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ + unsigned salt_size; + sector_t data_start; /* data offset in 512-byte sectors */ + sector_t hash_start; /* hash start in blocks */ + sector_t data_blocks; /* the number of data blocks */ + sector_t hash_blocks; /* the number of hash blocks */ + unsigned char data_dev_block_bits; /* log2(data blocksize) */ + unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ + unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ + unsigned char levels; /* the number of tree levels */ + unsigned char version; + unsigned digest_size; /* digest size for the current hash algorithm */ + unsigned shash_descsize;/* the size of temporary space for crypto */ + int hash_failed; /* set to 1 if hash of any block failed */ + + mempool_t *vec_mempool; /* mempool of bio vector */ + + struct workqueue_struct *verify_wq; + + /* starting blocks for each tree level. 0 is the lowest level. */ + sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; +}; + +struct dm_verity_io { + struct dm_verity *v; + + /* original values of bio->bi_end_io and bio->bi_private */ + bio_end_io_t *orig_bi_end_io; + void *orig_bi_private; + + sector_t block; + unsigned n_blocks; + + struct bvec_iter iter; + + struct work_struct work; + + /* + * Three variably-size fields follow this struct: + * + * u8 hash_desc[v->shash_descsize]; + * u8 real_digest[v->digest_size]; + * u8 want_digest[v->digest_size]; + * + * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). + */ +}; + +struct dm_verity_prefetch_work { + struct work_struct work; + struct dm_verity *v; + sector_t block; + unsigned n_blocks; +}; + +static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) +{ + return (struct shash_desc *)(io + 1); +} + +static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize; +} + +static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) +{ + return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; +} + +/* + * Auxiliary structure appended to each dm-bufio buffer. If the value + * hash_verified is nonzero, hash of the block has been verified. + * + * The variable hash_verified is set to 0 when allocating the buffer, then + * it can be changed to 1 and it is never reset to 0 again. 
+ * + * There is no lock around this value, a race condition can at worst cause + * that multiple processes verify the hash of the same buffer simultaneously + * and write 1 to hash_verified simultaneously. + * This condition is harmless, so we don't need locking. + */ +struct buffer_aux { + int hash_verified; +}; + +/* + * Initialize struct buffer_aux for a freshly created buffer. + */ +static void dm_bufio_alloc_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + + aux->hash_verified = 0; +} + +/* + * Translate input sector number to the sector number on the target device. + */ +static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector) +{ + return v->data_start + dm_target_offset(v->ti, bi_sector); +} + +/* + * Return hash position of a specified block at a specified tree level + * (0 is the lowest level). + * The lowest "hash_per_block_bits"-bits of the result denote hash position + * inside a hash block. The remaining bits denote location of the hash block. + */ +static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, + int level) +{ + return block >> (level * v->hash_per_block_bits); +} + +static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, + sector_t *hash_block, unsigned *offset) +{ + sector_t position = verity_position_at_level(v, block, level); + unsigned idx; + + *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits); + + if (!offset) + return; + + idx = position & ((1 << v->hash_per_block_bits) - 1); + if (!v->version) + *offset = idx * v->digest_size; + else + *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits); +} + +/* + * Verify hash of a metadata block pertaining to the specified data block + * ("block" argument) at a specified level ("level" argument). + * + * On successful return, io_want_digest(v, io) contains the hash value for + * a lower tree level or for the data block (if we're at the lowest leve). + * + * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. + * If "skip_unverified" is false, unverified buffer is hashed and verified + * against current value of io_want_digest(v, io). 
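+ *
+ * As an illustration (assuming 4096-byte hash blocks and a 32-byte digest,
+ * i.e. hash_per_block_bits == 7): the level-0 hash for data block 1000 is
+ * looked up in hash block hash_level_block[0] + (1000 >> 7), i.e. the
+ * eighth block of that level, at byte offset (1000 & 127) * 32 = 3328 for
+ * format version 0.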
+ */ +static int verity_verify_level(struct dm_verity_io *io, sector_t block, + int level, bool skip_unverified) +{ + struct dm_verity *v = io->v; + struct dm_buffer *buf; + struct buffer_aux *aux; + u8 *data; + int r; + sector_t hash_block; + unsigned offset; + + verity_hash_at_level(v, block, level, &hash_block, &offset); + + data = dm_bufio_read(v->bufio, hash_block, &buf); + if (unlikely(IS_ERR(data))) + return PTR_ERR(data); + + aux = dm_bufio_get_aux_data(buf); + + if (!aux->hash_verified) { + struct shash_desc *desc; + u8 *result; + + if (skip_unverified) { + r = 1; + goto release_ret_r; + } + + desc = io_hash_desc(v, io); + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + r = crypto_shash_init(desc); + if (r < 0) { + DMERR("crypto_shash_init failed: %d", r); + goto release_ret_r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + goto release_ret_r; + } + } + + r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + goto release_ret_r; + } + + if (!v->version) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + goto release_ret_r; + } + } + + result = io_real_digest(v, io); + r = crypto_shash_final(desc, result); + if (r < 0) { + DMERR("crypto_shash_final failed: %d", r); + goto release_ret_r; + } + if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { + DMERR_LIMIT("metadata block %llu is corrupted", + (unsigned long long)hash_block); + v->hash_failed = 1; + r = -EIO; + goto release_ret_r; + } else + aux->hash_verified = 1; + } + + data += offset; + + memcpy(io_want_digest(v, io), data, v->digest_size); + + dm_bufio_release(buf); + return 0; + +release_ret_r: + dm_bufio_release(buf); + + return r; +} + +/* + * Verify one "dm_verity_io" structure. + */ +static int verity_verify_io(struct dm_verity_io *io) +{ + struct dm_verity *v = io->v; + struct bio *bio = dm_bio_from_per_bio_data(io, + v->ti->per_bio_data_size); + unsigned b; + int i; + + for (b = 0; b < io->n_blocks; b++) { + struct shash_desc *desc; + u8 *result; + int r; + unsigned todo; + + if (likely(v->levels)) { + /* + * First, we try to get the requested hash for + * the current block. If the hash block itself is + * verified, zero is returned. If it isn't, this + * function returns 0 and we fall back to whole + * chain verification. 
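+ * (Only a return of 0 means the hash was taken from an already-verified
+ * buffer; the not-yet-verified case is reported as 1 and negative values
+ * are errors, as implemented in verity_verify_level() above.)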
+ */ + int r = verity_verify_level(io, io->block + b, 0, true); + if (likely(!r)) + goto test_block_hash; + if (r < 0) + return r; + } + + memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); + + for (i = v->levels - 1; i >= 0; i--) { + int r = verity_verify_level(io, io->block + b, i, false); + if (unlikely(r)) + return r; + } + +test_block_hash: + desc = io_hash_desc(v, io); + desc->tfm = v->tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + r = crypto_shash_init(desc); + if (r < 0) { + DMERR("crypto_shash_init failed: %d", r); + return r; + } + + if (likely(v->version >= 1)) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + todo = 1 << v->data_dev_block_bits; + do { + u8 *page; + unsigned len; + struct bio_vec bv = bio_iter_iovec(bio, io->iter); + + page = kmap_atomic(bv.bv_page); + len = bv.bv_len; + if (likely(len >= todo)) + len = todo; + r = crypto_shash_update(desc, page + bv.bv_offset, len); + kunmap_atomic(page); + + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + + bio_advance_iter(bio, &io->iter, len); + todo -= len; + } while (todo); + + if (!v->version) { + r = crypto_shash_update(desc, v->salt, v->salt_size); + if (r < 0) { + DMERR("crypto_shash_update failed: %d", r); + return r; + } + } + + result = io_real_digest(v, io); + r = crypto_shash_final(desc, result); + if (r < 0) { + DMERR("crypto_shash_final failed: %d", r); + return r; + } + if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { + DMERR_LIMIT("data block %llu is corrupted", + (unsigned long long)(io->block + b)); + v->hash_failed = 1; + return -EIO; + } + } + + return 0; +} + +/* + * End one "io" structure with a given error. + */ +static void verity_finish_io(struct dm_verity_io *io, int error) +{ + struct dm_verity *v = io->v; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + + bio->bi_end_io = io->orig_bi_end_io; + bio->bi_private = io->orig_bi_private; + + bio_endio_nodec(bio, error); +} + +static void verity_work(struct work_struct *w) +{ + struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); + + verity_finish_io(io, verity_verify_io(io)); +} + +static void verity_end_io(struct bio *bio, int error) +{ + struct dm_verity_io *io = bio->bi_private; + + if (error) { + verity_finish_io(io, error); + return; + } + + INIT_WORK(&io->work, verity_work); + queue_work(io->v->verify_wq, &io->work); +} + +/* + * Prefetch buffers for the specified io. + * The root buffer is not prefetched, it is assumed that it will be cached + * all the time. 
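+ * With the default 256 KiB prefetch_cluster and, say, 4 KiB data blocks,
+ * the level-0 hash blocks are prefetched in aligned runs of 64 blocks
+ * (clamped to the end of the hash area).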
+ */ +static void verity_prefetch_io(struct work_struct *work) +{ + struct dm_verity_prefetch_work *pw = + container_of(work, struct dm_verity_prefetch_work, work); + struct dm_verity *v = pw->v; + int i; + + for (i = v->levels - 2; i >= 0; i--) { + sector_t hash_block_start; + sector_t hash_block_end; + verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL); + verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL); + if (!i) { + unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster); + + cluster >>= v->data_dev_block_bits; + if (unlikely(!cluster)) + goto no_prefetch_cluster; + + if (unlikely(cluster & (cluster - 1))) + cluster = 1 << __fls(cluster); + + hash_block_start &= ~(sector_t)(cluster - 1); + hash_block_end |= cluster - 1; + if (unlikely(hash_block_end >= v->hash_blocks)) + hash_block_end = v->hash_blocks - 1; + } +no_prefetch_cluster: + dm_bufio_prefetch(v->bufio, hash_block_start, + hash_block_end - hash_block_start + 1); + } + + kfree(pw); +} + +static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) +{ + struct dm_verity_prefetch_work *pw; + + pw = kmalloc(sizeof(struct dm_verity_prefetch_work), + GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + + if (!pw) + return; + + INIT_WORK(&pw->work, verity_prefetch_io); + pw->v = v; + pw->block = io->block; + pw->n_blocks = io->n_blocks; + queue_work(v->verify_wq, &pw->work); +} + +/* + * Bio map function. It allocates dm_verity_io structure and bio vector and + * fills them. Then it issues prefetches and the I/O. + */ +static int verity_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_verity *v = ti->private; + struct dm_verity_io *io; + + bio->bi_bdev = v->data_dev->bdev; + bio->bi_iter.bi_sector = verity_map_sector(v, bio->bi_iter.bi_sector); + + if (((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & + ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { + DMERR_LIMIT("unaligned io"); + return -EIO; + } + + if (bio_end_sector(bio) >> + (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { + DMERR_LIMIT("io out of range"); + return -EIO; + } + + if (bio_data_dir(bio) == WRITE) + return -EIO; + + io = dm_per_bio_data(bio, ti->per_bio_data_size); + io->v = v; + io->orig_bi_end_io = bio->bi_end_io; + io->orig_bi_private = bio->bi_private; + io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); + io->n_blocks = bio->bi_iter.bi_size >> v->data_dev_block_bits; + + bio->bi_end_io = verity_end_io; + bio->bi_private = io; + io->iter = bio->bi_iter; + + verity_submit_prefetch(v, io); + + generic_make_request(bio); + + return DM_MAPIO_SUBMITTED; +} + +/* + * Status: V (valid) or C (corruption found) + */ +static void verity_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct dm_verity *v = ti->private; + unsigned sz = 0; + unsigned x; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%c", v->hash_failed ? 
'C' : 'V'); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u %s %s %u %u %llu %llu %s ", + v->version, + v->data_dev->name, + v->hash_dev->name, + 1 << v->data_dev_block_bits, + 1 << v->hash_dev_block_bits, + (unsigned long long)v->data_blocks, + (unsigned long long)v->hash_start, + v->alg_name + ); + for (x = 0; x < v->digest_size; x++) + DMEMIT("%02x", v->root_digest[x]); + DMEMIT(" "); + if (!v->salt_size) + DMEMIT("-"); + else + for (x = 0; x < v->salt_size; x++) + DMEMIT("%02x", v->salt[x]); + break; + } +} + +static int verity_ioctl(struct dm_target *ti, unsigned cmd, + unsigned long arg) +{ + struct dm_verity *v = ti->private; + int r = 0; + + if (v->data_start || + ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); + + return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode, + cmd, arg); +} + +static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct dm_verity *v = ti->private; + struct request_queue *q = bdev_get_queue(v->data_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = v->data_dev->bdev; + bvm->bi_sector = verity_map_sector(v, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int verity_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct dm_verity *v = ti->private; + + return fn(ti, v->data_dev, v->data_start, ti->len, data); +} + +static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_verity *v = ti->private; + + if (limits->logical_block_size < 1 << v->data_dev_block_bits) + limits->logical_block_size = 1 << v->data_dev_block_bits; + + if (limits->physical_block_size < 1 << v->data_dev_block_bits) + limits->physical_block_size = 1 << v->data_dev_block_bits; + + blk_limits_io_min(limits, limits->logical_block_size); +} + +static void verity_dtr(struct dm_target *ti) +{ + struct dm_verity *v = ti->private; + + if (v->verify_wq) + destroy_workqueue(v->verify_wq); + + if (v->vec_mempool) + mempool_destroy(v->vec_mempool); + + if (v->bufio) + dm_bufio_client_destroy(v->bufio); + + kfree(v->salt); + kfree(v->root_digest); + + if (v->tfm) + crypto_free_shash(v->tfm); + + kfree(v->alg_name); + + if (v->hash_dev) + dm_put_device(ti, v->hash_dev); + + if (v->data_dev) + dm_put_device(ti, v->data_dev); + + kfree(v); +} + +/* + * Target parameters: + * <version> The current format is version 1. + * Vsn 0 is compatible with original Chromium OS releases. + * <data device> + * <hash device> + * <data block size> + * <hash block size> + * <the number of data blocks> + * <hash start block> + * <algorithm> + * <digest> + * <salt> Hex string or "-" if no salt. 
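+ *
+ * Illustrative example (devices and block counts are placeholders; the
+ * real digest and salt come from whatever tool built the hash tree, for
+ * example veritysetup):
+ *
+ *   dmsetup create vroot --readonly --table \
+ *     "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 <root digest> <salt>"
+ *
+ * maps 262144 4 KiB data blocks (1 GiB) with the hash tree starting at
+ * hash block 1.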
+ */ +static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + struct dm_verity *v; + unsigned num; + unsigned long long num_ll; + int r; + int i; + sector_t hash_position; + char dummy; + + v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); + if (!v) { + ti->error = "Cannot allocate verity structure"; + return -ENOMEM; + } + ti->private = v; + v->ti = ti; + + if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { + ti->error = "Device must be readonly"; + r = -EINVAL; + goto bad; + } + + if (argc != 10) { + ti->error = "Invalid argument count: exactly 10 arguments required"; + r = -EINVAL; + goto bad; + } + + if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 || + num > 1) { + ti->error = "Invalid version"; + r = -EINVAL; + goto bad; + } + v->version = num; + + r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev); + if (r) { + ti->error = "Data device lookup failed"; + goto bad; + } + + r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); + if (r) { + ti->error = "Data device lookup failed"; + goto bad; + } + + if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 || + !num || (num & (num - 1)) || + num < bdev_logical_block_size(v->data_dev->bdev) || + num > PAGE_SIZE) { + ti->error = "Invalid data device block size"; + r = -EINVAL; + goto bad; + } + v->data_dev_block_bits = __ffs(num); + + if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 || + !num || (num & (num - 1)) || + num < bdev_logical_block_size(v->hash_dev->bdev) || + num > INT_MAX) { + ti->error = "Invalid hash device block size"; + r = -EINVAL; + goto bad; + } + v->hash_dev_block_bits = __ffs(num); + + if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || + (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) + >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll) { + ti->error = "Invalid data blocks"; + r = -EINVAL; + goto bad; + } + v->data_blocks = num_ll; + + if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) { + ti->error = "Data device is too small"; + r = -EINVAL; + goto bad; + } + + if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 || + (sector_t)(num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) + >> (v->hash_dev_block_bits - SECTOR_SHIFT) != num_ll) { + ti->error = "Invalid hash start"; + r = -EINVAL; + goto bad; + } + v->hash_start = num_ll; + + v->alg_name = kstrdup(argv[7], GFP_KERNEL); + if (!v->alg_name) { + ti->error = "Cannot allocate algorithm name"; + r = -ENOMEM; + goto bad; + } + + v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); + if (IS_ERR(v->tfm)) { + ti->error = "Cannot initialize hash function"; + r = PTR_ERR(v->tfm); + v->tfm = NULL; + goto bad; + } + v->digest_size = crypto_shash_digestsize(v->tfm); + if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { + ti->error = "Digest size too big"; + r = -EINVAL; + goto bad; + } + v->shash_descsize = + sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); + + v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); + if (!v->root_digest) { + ti->error = "Cannot allocate root digest"; + r = -ENOMEM; + goto bad; + } + if (strlen(argv[8]) != v->digest_size * 2 || + hex2bin(v->root_digest, argv[8], v->digest_size)) { + ti->error = "Invalid root digest"; + r = -EINVAL; + goto bad; + } + + if (strcmp(argv[9], "-")) { + v->salt_size = strlen(argv[9]) / 2; + v->salt = kmalloc(v->salt_size, GFP_KERNEL); + if (!v->salt) { + ti->error = "Cannot allocate salt"; + r = -ENOMEM; + goto bad; + } + if (strlen(argv[9]) != v->salt_size * 2 || + hex2bin(v->salt, argv[9], v->salt_size)) { + ti->error = "Invalid salt"; + r = 
-EINVAL; + goto bad; + } + } + + v->hash_per_block_bits = + __fls((1 << v->hash_dev_block_bits) / v->digest_size); + + v->levels = 0; + if (v->data_blocks) + while (v->hash_per_block_bits * v->levels < 64 && + (unsigned long long)(v->data_blocks - 1) >> + (v->hash_per_block_bits * v->levels)) + v->levels++; + + if (v->levels > DM_VERITY_MAX_LEVELS) { + ti->error = "Too many tree levels"; + r = -E2BIG; + goto bad; + } + + hash_position = v->hash_start; + for (i = v->levels - 1; i >= 0; i--) { + sector_t s; + v->hash_level_block[i] = hash_position; + s = (v->data_blocks + ((sector_t)1 << ((i + 1) * v->hash_per_block_bits)) - 1) + >> ((i + 1) * v->hash_per_block_bits); + if (hash_position + s < hash_position) { + ti->error = "Hash device offset overflow"; + r = -E2BIG; + goto bad; + } + hash_position += s; + } + v->hash_blocks = hash_position; + + v->bufio = dm_bufio_client_create(v->hash_dev->bdev, + 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), + dm_bufio_alloc_callback, NULL); + if (IS_ERR(v->bufio)) { + ti->error = "Cannot initialize dm-bufio"; + r = PTR_ERR(v->bufio); + v->bufio = NULL; + goto bad; + } + + if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { + ti->error = "Hash device is too small"; + r = -E2BIG; + goto bad; + } + + ti->per_bio_data_size = roundup(sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2, __alignof__(struct dm_verity_io)); + + v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, + BIO_MAX_PAGES * sizeof(struct bio_vec)); + if (!v->vec_mempool) { + ti->error = "Cannot allocate vector mempool"; + r = -ENOMEM; + goto bad; + } + + /* WQ_UNBOUND greatly improves performance when running on ramdisk */ + v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); + if (!v->verify_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + + return 0; + +bad: + verity_dtr(ti); + + return r; +} + +static struct target_type verity_target = { + .name = "verity", + .version = {1, 2, 0}, + .module = THIS_MODULE, + .ctr = verity_ctr, + .dtr = verity_dtr, + .map = verity_map, + .status = verity_status, + .ioctl = verity_ioctl, + .merge = verity_merge, + .iterate_devices = verity_iterate_devices, + .io_hints = verity_io_hints, +}; + +static int __init dm_verity_init(void) +{ + int r; + + r = dm_register_target(&verity_target); + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_verity_exit(void) +{ + dm_unregister_target(&verity_target); +} + +module_init(dm_verity_init); +module_exit(dm_verity_exit); + +MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); +MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>"); +MODULE_AUTHOR("Will Drewry <wad@chromium.org>"); +MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index bdec206c404..b9a64bbce30 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -1,10 +1,10 @@ /* - * Copyright (C) 2003 Christophe Saout <christophe@saout.de> + * Copyright (C) 2003 Jana Saout <jana@saout.de> * * This file is released under the GPL. */ -#include "dm.h" +#include <linux/device-mapper.h> #include <linux/module.h> #include <linux/init.h> @@ -22,14 +22,18 @@ static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } + /* + * Silently drop discards, avoiding -EOPNOTSUPP. 
+ */ + ti->num_discard_bios = 1; + return 0; } /* * Return zeros only on reads */ -static int zero_map(struct dm_target *ti, struct bio *bio, - union map_info *map_context) +static int zero_map(struct dm_target *ti, struct bio *bio) { switch(bio_rw(bio)) { case READ: @@ -51,7 +55,7 @@ static int zero_map(struct dm_target *ti, struct bio *bio, static struct target_type zero_target = { .name = "zero", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = zero_ctr, .map = zero_map, @@ -69,15 +73,12 @@ static int __init dm_zero_init(void) static void __exit dm_zero_exit(void) { - int r = dm_unregister_target(&zero_target); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_target(&zero_target); } module_init(dm_zero_init) module_exit(dm_zero_exit) -MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); +MODULE_AUTHOR("Jana Saout <jana@saout.de>"); MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index bca448e1187..32b958dbc49 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1,12 +1,11 @@ /* * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ #include "dm.h" -#include "dm-bio-list.h" #include "dm-uevent.h" #include <linux/init.h> @@ -15,23 +14,50 @@ #include <linux/moduleparam.h> #include <linux/blkpg.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/mempool.h> #include <linux/slab.h> #include <linux/idr.h> #include <linux/hdreg.h> -#include <linux/blktrace_api.h> -#include <linux/smp_lock.h> +#include <linux/delay.h> + +#include <trace/events/block.h> #define DM_MSG_PREFIX "core" +#ifdef CONFIG_PRINTK +/* + * ratelimit state to be used in DMXXX_LIMIT(). + */ +DEFINE_RATELIMIT_STATE(dm_ratelimit_state, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); +EXPORT_SYMBOL(dm_ratelimit_state); +#endif + +/* + * Cookies are numeric values sent with CHANGE and REMOVE + * uevents while resuming, removing or renaming the device. + */ +#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" +#define DM_COOKIE_LENGTH 24 + static const char *_name = DM_NAME; static unsigned int major = 0; static unsigned int _major = 0; +static DEFINE_IDR(_minor_idr); + static DEFINE_SPINLOCK(_minor_lock); + +static void do_deferred_remove(struct work_struct *w); + +static DECLARE_WORK(deferred_remove_work, do_deferred_remove); + +static struct workqueue_struct *deferred_remove_workqueue; + /* + * For bio-based dm. * One of these is allocated per bio. */ struct dm_io { @@ -40,61 +66,91 @@ struct dm_io { atomic_t io_count; struct bio *bio; unsigned long start_time; + spinlock_t endio_lock; + struct dm_stats_aux stats_aux; }; /* - * One of these is allocated per target within a bio. Hopefully - * this will be simplified out one day. + * For request-based dm. + * One of these is allocated per request. */ -struct dm_target_io { - struct dm_io *io; +struct dm_rq_target_io { + struct mapped_device *md; struct dm_target *ti; + struct request *orig, clone; + int error; union map_info info; }; -union map_info *dm_get_mapinfo(struct bio *bio) +/* + * For request-based dm - the bio clones we allocate are embedded in these + * structs. + * + * We allocate these with bio_alloc_bioset, using the front_pad parameter when + * the bioset is created - this means the bio has to come at the end of the + * struct. 
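+ * + * end_clone_bio() recovers the enclosing dm_rq_clone_bio_info from the embedded clone with container_of().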
+ */ +struct dm_rq_clone_bio_info { + struct bio *orig; + struct dm_rq_target_io *tio; + struct bio clone; +}; + +union map_info *dm_get_rq_mapinfo(struct request *rq) { - if (bio && bio->bi_private) - return &((struct dm_target_io *)bio->bi_private)->info; + if (rq && rq->end_io_data) + return &((struct dm_rq_target_io *)rq->end_io_data)->info; return NULL; } +EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define MINOR_ALLOCED ((void *)-1) /* * Bits for the md->flags field. */ -#define DMF_BLOCK_IO 0 +#define DMF_BLOCK_IO_FOR_SUSPEND 0 #define DMF_SUSPENDED 1 #define DMF_FROZEN 2 #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 +#define DMF_MERGE_IS_OPTIONAL 6 +#define DMF_DEFERRED_REMOVE 7 /* - * Work processed by per-device workqueue. + * A dummy definition to make RCU happy. + * struct dm_table should never be dereferenced in this file. */ -struct dm_wq_req { - enum { - DM_WQ_FLUSH_ALL, - DM_WQ_FLUSH_DEFERRED, - } type; - struct work_struct work; - struct mapped_device *md; - void *context; +struct dm_table { + int undefined__; }; +/* + * Work processed by per-device workqueue. + */ struct mapped_device { - struct rw_semaphore io_lock; + struct srcu_struct io_barrier; struct mutex suspend_lock; - spinlock_t pushback_lock; - rwlock_t map_lock; atomic_t holders; atomic_t open_count; + /* + * The current mapping. + * Use dm_get_live_table{_fast} or take suspend_lock for + * dereference. + */ + struct dm_table *map; + unsigned long flags; struct request_queue *queue; + unsigned type; + /* Protect queue and type against concurrent access. */ + struct mutex type_lock; + + struct target_type *immutable_target_type; + struct gendisk *disk; char name[16]; @@ -103,26 +159,21 @@ struct mapped_device { /* * A list of ios that arrived while we were suspended. */ - atomic_t pending; + atomic_t pending[2]; wait_queue_head_t wait; + struct work_struct work; struct bio_list deferred; - struct bio_list pushback; + spinlock_t deferred_lock; /* - * Processing queue (flush/barriers) + * Processing queue (flush) */ struct workqueue_struct *wq; /* - * The current mapping. - */ - struct dm_table *map; - - /* * io objects are allocated from here. */ mempool_t *io_pool; - mempool_t *tio_pool; struct bio_set *bs; @@ -139,57 +190,128 @@ struct mapped_device { * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; - struct block_device *suspended_bdev; + struct block_device *bdev; /* forced geometry settings */ struct hd_geometry geometry; + + /* kobject and completion */ + struct dm_kobject_holder kobj_holder; + + /* zero-length flush that will be cloned and submitted to targets */ + struct bio flush_bio; + + struct dm_stats stats; }; -#define MIN_IOS 256 +/* + * For mempools pre-allocation at the table loading time. + */ +struct dm_md_mempools { + mempool_t *io_pool; + struct bio_set *bs; +}; + +#define RESERVED_BIO_BASED_IOS 16 +#define RESERVED_REQUEST_BASED_IOS 256 +#define RESERVED_MAX_IOS 1024 static struct kmem_cache *_io_cache; -static struct kmem_cache *_tio_cache; +static struct kmem_cache *_rq_tio_cache; + +/* + * Bio-based DM's mempools' reserved IOs set by the user. + */ +static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; + +/* + * Request-based DM's mempools' reserved IOs set by the user. 
+ */ +static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; + +static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, + unsigned def, unsigned max) +{ + unsigned ios = ACCESS_ONCE(*reserved_ios); + unsigned modified_ios = 0; + + if (!ios) + modified_ios = def; + else if (ios > max) + modified_ios = max; + + if (modified_ios) { + (void)cmpxchg(reserved_ios, ios, modified_ios); + ios = modified_ios; + } + + return ios; +} + +unsigned dm_get_reserved_bio_based_ios(void) +{ + return __dm_get_reserved_ios(&reserved_bio_based_ios, + RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); +} +EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); + +unsigned dm_get_reserved_rq_based_ios(void) +{ + return __dm_get_reserved_ios(&reserved_rq_based_ios, + RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); +} +EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); static int __init local_init(void) { - int r; + int r = -ENOMEM; /* allocate a slab for the dm_ios */ _io_cache = KMEM_CACHE(dm_io, 0); if (!_io_cache) - return -ENOMEM; + return r; - /* allocate a slab for the target ios */ - _tio_cache = KMEM_CACHE(dm_target_io, 0); - if (!_tio_cache) { - kmem_cache_destroy(_io_cache); - return -ENOMEM; - } + _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); + if (!_rq_tio_cache) + goto out_free_io_cache; r = dm_uevent_init(); - if (r) { - kmem_cache_destroy(_tio_cache); - kmem_cache_destroy(_io_cache); - return r; + if (r) + goto out_free_rq_tio_cache; + + deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); + if (!deferred_remove_workqueue) { + r = -ENOMEM; + goto out_uevent_exit; } _major = major; r = register_blkdev(_major, _name); - if (r < 0) { - kmem_cache_destroy(_tio_cache); - kmem_cache_destroy(_io_cache); - dm_uevent_exit(); - return r; - } + if (r < 0) + goto out_free_workqueue; if (!_major) _major = r; return 0; + +out_free_workqueue: + destroy_workqueue(deferred_remove_workqueue); +out_uevent_exit: + dm_uevent_exit(); +out_free_rq_tio_cache: + kmem_cache_destroy(_rq_tio_cache); +out_free_io_cache: + kmem_cache_destroy(_io_cache); + + return r; } static void local_exit(void) { - kmem_cache_destroy(_tio_cache); + flush_scheduled_work(); + destroy_workqueue(deferred_remove_workqueue); + + kmem_cache_destroy(_rq_tio_cache); kmem_cache_destroy(_io_cache); unregister_blkdev(_major, _name); dm_uevent_exit(); @@ -204,8 +326,10 @@ static int (*_inits[])(void) __initdata = { dm_target_init, dm_linear_init, dm_stripe_init, + dm_io_init, dm_kcopyd_init, dm_interface_init, + dm_statistics_init, }; static void (*_exits[])(void) = { @@ -213,8 +337,10 @@ static void (*_exits[])(void) = { dm_target_exit, dm_linear_exit, dm_stripe_exit, + dm_io_exit, dm_kcopyd_exit, dm_interface_exit, + dm_statistics_exit, }; static int __init dm_init(void) @@ -244,23 +370,33 @@ static void __exit dm_exit(void) while (i--) _exits[i](); + + /* + * Should be empty by this point. + */ + idr_destroy(&_minor_idr); } /* * Block device functions */ -static int dm_blk_open(struct inode *inode, struct file *file) +int dm_deleting_md(struct mapped_device *md) +{ + return test_bit(DMF_DELETING, &md->flags); +} + +static int dm_blk_open(struct block_device *bdev, fmode_t mode) { struct mapped_device *md; spin_lock(&_minor_lock); - md = inode->i_bdev->bd_disk->private_data; + md = bdev->bd_disk->private_data; if (!md) goto out; if (test_bit(DMF_FREEING, &md->flags) || - test_bit(DMF_DELETING, &md->flags)) { + dm_deleting_md(md)) { md = NULL; goto out; } @@ -274,14 +410,19 @@ out: return md ? 
0 : -ENXIO; } -static int dm_blk_close(struct inode *inode, struct file *file) +static void dm_blk_close(struct gendisk *disk, fmode_t mode) { - struct mapped_device *md; + struct mapped_device *md = disk->private_data; + + spin_lock(&_minor_lock); + + if (atomic_dec_and_test(&md->open_count) && + (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) + queue_work(deferred_remove_workqueue, &deferred_remove_work); - md = inode->i_bdev->bd_disk->private_data; - atomic_dec(&md->open_count); dm_put(md); - return 0; + + spin_unlock(&_minor_lock); } int dm_open_count(struct mapped_device *md) @@ -292,14 +433,18 @@ int dm_open_count(struct mapped_device *md) /* * Guarantees nothing is using the device before it's deleted. */ -int dm_lock_for_deletion(struct mapped_device *md) +int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) { int r = 0; spin_lock(&_minor_lock); - if (dm_open_count(md)) + if (dm_open_count(md)) { r = -EBUSY; + if (mark_deferred) + set_bit(DMF_DEFERRED_REMOVE, &md->flags); + } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) + r = -EEXIST; else set_bit(DMF_DELETING, &md->flags); @@ -308,6 +453,42 @@ int dm_lock_for_deletion(struct mapped_device *md) return r; } +int dm_cancel_deferred_remove(struct mapped_device *md) +{ + int r = 0; + + spin_lock(&_minor_lock); + + if (test_bit(DMF_DELETING, &md->flags)) + r = -EBUSY; + else + clear_bit(DMF_DEFERRED_REMOVE, &md->flags); + + spin_unlock(&_minor_lock); + + return r; +} + +static void do_deferred_remove(struct work_struct *w) +{ + dm_deferred_remove(); +} + +sector_t dm_get_size(struct mapped_device *md) +{ + return get_capacity(md->disk); +} + +struct request_queue *dm_get_md_queue(struct mapped_device *md) +{ + return md->queue; +} + +struct dm_stats *dm_get_stats(struct mapped_device *md) +{ + return &md->stats; +} + static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct mapped_device *md = bdev->bd_disk->private_data; @@ -315,20 +496,17 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -static int dm_blk_ioctl(struct inode *inode, struct file *file, +static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct mapped_device *md; + struct mapped_device *md = bdev->bd_disk->private_data; + int srcu_idx; struct dm_table *map; struct dm_target *tgt; int r = -ENOTTY; - /* We don't really need this lock, but we do need 'inode'. 
*/ - unlock_kernel(); - - md = inode->i_bdev->bd_disk->private_data; - - map = dm_get_table(md); +retry: + map = dm_get_live_table(md, &srcu_idx); if (!map || !dm_table_get_size(map)) goto out; @@ -339,18 +517,22 @@ static int dm_blk_ioctl(struct inode *inode, struct file *file, tgt = dm_table_get_target(map, 0); - if (dm_suspended(md)) { + if (dm_suspended_md(md)) { r = -EAGAIN; goto out; } if (tgt->type->ioctl) - r = tgt->type->ioctl(tgt, inode, file, cmd, arg); + r = tgt->type->ioctl(tgt, cmd, arg); out: - dm_table_put(map); + dm_put_live_table(md, srcu_idx); + + if (r == -ENOTCONN) { + msleep(10); + goto retry; + } - lock_kernel(); return r; } @@ -364,80 +546,127 @@ static void free_io(struct mapped_device *md, struct dm_io *io) mempool_free(io, md->io_pool); } -static struct dm_target_io *alloc_tio(struct mapped_device *md) +static void free_tio(struct mapped_device *md, struct dm_target_io *tio) { - return mempool_alloc(md->tio_pool, GFP_NOIO); + bio_put(&tio->clone); } -static void free_tio(struct mapped_device *md, struct dm_target_io *tio) +static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, + gfp_t gfp_mask) +{ + return mempool_alloc(md->io_pool, gfp_mask); +} + +static void free_rq_tio(struct dm_rq_target_io *tio) +{ + mempool_free(tio, tio->md->io_pool); +} + +static int md_in_flight(struct mapped_device *md) { - mempool_free(tio, md->tio_pool); + return atomic_read(&md->pending[READ]) + + atomic_read(&md->pending[WRITE]); } static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; + struct bio *bio = io->bio; + int cpu; + int rw = bio_data_dir(bio); io->start_time = jiffies; - preempt_disable(); - disk_round_stats(dm_disk(md)); - preempt_enable(); - dm_disk(md)->in_flight = atomic_inc_return(&md->pending); + cpu = part_stat_lock(); + part_round_stats(cpu, &dm_disk(md)->part0); + part_stat_unlock(); + atomic_set(&dm_disk(md)->part0.in_flight[rw], + atomic_inc_return(&md->pending[rw])); + + if (unlikely(dm_stats_used(&md->stats))) + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, + bio_sectors(bio), false, 0, &io->stats_aux); } -static int end_io_acct(struct dm_io *io) +static void end_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; struct bio *bio = io->bio; unsigned long duration = jiffies - io->start_time; - int pending; + int pending, cpu; int rw = bio_data_dir(bio); - preempt_disable(); - disk_round_stats(dm_disk(md)); - preempt_enable(); - dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending); + cpu = part_stat_lock(); + part_round_stats(cpu, &dm_disk(md)->part0); + part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); + part_stat_unlock(); - disk_stat_add(dm_disk(md), ticks[rw], duration); + if (unlikely(dm_stats_used(&md->stats))) + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, + bio_sectors(bio), true, duration, &io->stats_aux); + + /* + * After this is decremented the bio must not be touched if it is + * a flush. + */ + pending = atomic_dec_return(&md->pending[rw]); + atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); + pending += atomic_read(&md->pending[rw^0x1]); - return !pending; + /* nudge anyone waiting on suspend queue */ + if (!pending) + wake_up(&md->wait); } /* * Add the bio to the list of deferred io. 
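 * The list is drained by dm_wq_work(), which runs as md->work on the per-device workqueue.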
*/ -static int queue_io(struct mapped_device *md, struct bio *bio) +static void queue_io(struct mapped_device *md, struct bio *bio) { - down_write(&md->io_lock); - - if (!test_bit(DMF_BLOCK_IO, &md->flags)) { - up_write(&md->io_lock); - return 1; - } + unsigned long flags; + spin_lock_irqsave(&md->deferred_lock, flags); bio_list_add(&md->deferred, bio); - - up_write(&md->io_lock); - return 0; /* deferred successfully */ + spin_unlock_irqrestore(&md->deferred_lock, flags); + queue_work(md->wq, &md->work); } /* * Everyone (including functions in this file), should use this * function to access the md->map field, and make sure they call - * dm_table_put() when finished. + * dm_put_live_table() when finished. */ -struct dm_table *dm_get_table(struct mapped_device *md) +struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) { - struct dm_table *t; + *srcu_idx = srcu_read_lock(&md->io_barrier); - read_lock(&md->map_lock); - t = md->map; - if (t) - dm_table_get(t); - read_unlock(&md->map_lock); + return srcu_dereference(md->map, &md->io_barrier); +} - return t; +void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) +{ + srcu_read_unlock(&md->io_barrier, srcu_idx); +} + +void dm_sync_table(struct mapped_device *md) +{ + synchronize_srcu(&md->io_barrier); + synchronize_rcu_expedited(); +} + +/* + * A fast alternative to dm_get_live_table/dm_put_live_table. + * The caller must not block between these two functions. + */ +static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) +{ + rcu_read_lock(); + return rcu_dereference(md->map); +} + +static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) +{ + rcu_read_unlock(); } /* @@ -488,46 +717,68 @@ static int __noflush_suspending(struct mapped_device *md) static void dec_pending(struct dm_io *io, int error) { unsigned long flags; + int io_error; + struct bio *bio; + struct mapped_device *md = io->md; /* Push-back supersedes any I/O errors */ - if (error && !(io->error > 0 && __noflush_suspending(io->md))) - io->error = error; + if (unlikely(error)) { + spin_lock_irqsave(&io->endio_lock, flags); + if (!(io->error > 0 && __noflush_suspending(md))) + io->error = error; + spin_unlock_irqrestore(&io->endio_lock, flags); + } if (atomic_dec_and_test(&io->io_count)) { if (io->error == DM_ENDIO_REQUEUE) { /* * Target requested pushing back the I/O. - * This must be handled before the sleeper on - * suspend queue merges the pushback list. */ - spin_lock_irqsave(&io->md->pushback_lock, flags); - if (__noflush_suspending(io->md)) - bio_list_add(&io->md->pushback, io->bio); + spin_lock_irqsave(&md->deferred_lock, flags); + if (__noflush_suspending(md)) + bio_list_add_head(&md->deferred, io->bio); else /* noflush suspend was interrupted. */ io->error = -EIO; - spin_unlock_irqrestore(&io->md->pushback_lock, flags); + spin_unlock_irqrestore(&md->deferred_lock, flags); } - if (end_io_acct(io)) - /* nudge anyone waiting on suspend queue */ - wake_up(&io->md->wait); + io_error = io->error; + bio = io->bio; + end_io_acct(io); + free_io(md, io); - if (io->error != DM_ENDIO_REQUEUE) { - blk_add_trace_bio(io->md->queue, io->bio, - BLK_TA_COMPLETE); + if (io_error == DM_ENDIO_REQUEUE) + return; - bio_endio(io->bio, io->error); + if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { + /* + * Preflush done for flush with data, reissue + * without REQ_FLUSH. 
+ */ + bio->bi_rw &= ~REQ_FLUSH; + queue_io(md, bio); + } else { + /* done with normal IO or empty flush */ + trace_block_bio_complete(md->queue, bio, io_error); + bio_endio(bio, io_error); } - - free_io(io->md, io); } } +static void disable_write_same(struct mapped_device *md) +{ + struct queue_limits *limits = dm_get_queue_limits(md); + + /* device doesn't really support WRITE SAME, disable it */ + limits->max_write_same_sectors = 0; +} + static void clone_endio(struct bio *bio, int error) { int r = 0; - struct dm_target_io *tio = bio->bi_private; + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; @@ -535,7 +786,7 @@ static void clone_endio(struct bio *bio, int error) error = -EIO; if (endio) { - r = endio(tio->ti, bio, error, &tio->info); + r = endio(tio->ti, bio, error); if (r < 0 || r == DM_ENDIO_REQUEUE) /* * error and requeue request are handled @@ -551,51 +802,390 @@ static void clone_endio(struct bio *bio, int error) } } - dec_pending(tio->io, error); + if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) + disable_write_same(md); + + free_tio(md, tio); + dec_pending(io, error); +} + +/* + * Partial completion handling for request-based dm + */ +static void end_clone_bio(struct bio *clone, int error) +{ + struct dm_rq_clone_bio_info *info = + container_of(clone, struct dm_rq_clone_bio_info, clone); + struct dm_rq_target_io *tio = info->tio; + struct bio *bio = info->orig; + unsigned int nr_bytes = info->orig->bi_iter.bi_size; + + bio_put(clone); + + if (tio->error) + /* + * An error has already been detected on the request. + * Once an error has occurred, just let clone->end_io() handle + * the remainder. + */ + return; + else if (error) { + /* + * Don't notify the upper layer of the error yet. + * The error handling decision is made by the target driver + * when the request is completed. + */ + tio->error = error; + return; + } /* - * Store md for cleanup instead of tio which is about to get freed. + * I/O for the bio successfully completed. + * Notify the upper layer of the data completion. */ - bio->bi_private = md->bs; - bio_put(bio); - free_tio(md, tio); + /* + * bios are processed from the head of the list. + * So the completing bio should always be rq->bio. + * If it's not, something is wrong. + */ + if (tio->orig->bio != bio) + DMERR("bio completion is going in the middle of the request"); + + /* + * Update the original request. + * Do not use blk_end_request() here, because it may complete + * the original request before the clone, and break the ordering. + */ + blk_update_request(tio->orig, 0, nr_bytes); +} + +/* + * Don't touch any member of the md after calling this function because + * the md may be freed in dm_put() at the end of this function. + * Or do dm_get() before calling this function and dm_put() later. + */ +static void rq_completed(struct mapped_device *md, int rw, int run_queue) +{ + atomic_dec(&md->pending[rw]); + + /* nudge anyone waiting on suspend queue */ + if (!md_in_flight(md)) + wake_up(&md->wait); + + /* + * Run this off this callpath, as drivers could invoke end_io while + * inside their request_fn (and holding the queue lock). Calling + * back into ->request_fn() could deadlock attempting to grab the + * queue lock again. 
+ */ + if (run_queue) + blk_run_queue_async(md->queue); + + /* + * dm_put() must be at the end of this function. See the comment above + */ + dm_put(md); +} + +static void free_rq_clone(struct request *clone) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + + blk_rq_unprep_clone(clone); + free_rq_tio(tio); +} + +/* + * Complete the clone and the original request. + * Must be called without queue lock. + */ +static void dm_end_request(struct request *clone, int error) +{ + int rw = rq_data_dir(clone); + struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { + rq->errors = clone->errors; + rq->resid_len = clone->resid_len; + + if (rq->sense) + /* + * We are using the sense buffer of the original + * request. + * So setting the length of the sense data is enough. + */ + rq->sense_len = clone->sense_len; + } + + free_rq_clone(clone); + blk_end_request_all(rq, error); + rq_completed(md, rw, true); +} + +static void dm_unprep_request(struct request *rq) +{ + struct request *clone = rq->special; + + rq->special = NULL; + rq->cmd_flags &= ~REQ_DONTPREP; + + free_rq_clone(clone); +} + +/* + * Requeue the original request of a clone. + */ +void dm_requeue_unmapped_request(struct request *clone) +{ + int rw = rq_data_dir(clone); + struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; + struct request_queue *q = rq->q; + unsigned long flags; + + dm_unprep_request(rq); + + spin_lock_irqsave(q->queue_lock, flags); + blk_requeue_request(q, rq); + spin_unlock_irqrestore(q->queue_lock, flags); + + rq_completed(md, rw, 0); +} +EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); + +static void __stop_queue(struct request_queue *q) +{ + blk_stop_queue(q); +} + +static void stop_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __stop_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void __start_queue(struct request_queue *q) +{ + if (blk_queue_stopped(q)) + blk_start_queue(q); +} + +static void start_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __start_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void dm_done(struct request *clone, int error, bool mapped) +{ + int r = error; + struct dm_rq_target_io *tio = clone->end_io_data; + dm_request_endio_fn rq_end_io = NULL; + + if (tio->ti) { + rq_end_io = tio->ti->type->rq_end_io; + + if (mapped && rq_end_io) + r = rq_end_io(tio->ti, clone, error, &tio->info); + } + + if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && + !clone->q->limits.max_write_same_sectors)) + disable_write_same(tio->md); + + if (r <= 0) + /* The target wants to complete the I/O */ + dm_end_request(clone, r); + else if (r == DM_ENDIO_INCOMPLETE) + /* The target will handle the I/O */ + return; + else if (r == DM_ENDIO_REQUEUE) + /* The target wants to requeue the I/O */ + dm_requeue_unmapped_request(clone); + else { + DMWARN("unimplemented target endio return value: %d", r); + BUG(); + } +} + +/* + * Request completion handler for request-based dm + */ +static void dm_softirq_done(struct request *rq) +{ + bool mapped = true; + struct request *clone = rq->completion_data; + struct dm_rq_target_io *tio = clone->end_io_data; + + if (rq->cmd_flags & REQ_FAILED) + mapped = false; + + dm_done(clone, tio->error, mapped); +} + +/* + * Complete 
the clone and the original request with the error status + * through softirq context. + */ +static void dm_complete_request(struct request *clone, int error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct request *rq = tio->orig; + + tio->error = error; + rq->completion_data = clone; + blk_complete_request(rq); +} + +/* + * Complete the not-mapped clone and the original request with the error status + * through softirq context. + * Target's rq_end_io() function isn't called. + * This may be used when the target's map_rq() function fails. + */ +void dm_kill_unmapped_request(struct request *clone, int error) +{ + struct dm_rq_target_io *tio = clone->end_io_data; + struct request *rq = tio->orig; + + rq->cmd_flags |= REQ_FAILED; + dm_complete_request(clone, error); +} +EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); + +/* + * Called with the queue lock held + */ +static void end_clone_request(struct request *clone, int error) +{ + /* + * For just cleaning up the information of the queue in which + * the clone was dispatched. + * The clone is *NOT* freed actually here because it is alloced from + * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. + */ + __blk_put_request(clone->q, clone); + + /* + * Actual request completion is done in a softirq context which doesn't + * hold the queue lock. Otherwise, deadlock could occur because: + * - another request may be submitted by the upper level driver + * of the stacking during the completion + * - the submission which requires queue lock may be done + * against this queue + */ + dm_complete_request(clone, error); +} + +/* + * Return maximum size of I/O possible at the supplied sector up to the current + * target boundary. + */ +static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) +{ + sector_t target_offset = dm_target_offset(ti, sector); + + return ti->len - target_offset; } -static sector_t max_io_len(struct mapped_device *md, - sector_t sector, struct dm_target *ti) +static sector_t max_io_len(sector_t sector, struct dm_target *ti) { - sector_t offset = sector - ti->begin; - sector_t len = ti->len - offset; + sector_t len = max_io_len_target_boundary(sector, ti); + sector_t offset, max_len; /* - * Does the target need to split even further ? + * Does the target need to split even further? */ - if (ti->split_io) { - sector_t boundary; - boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) - - offset; - if (len > boundary) - len = boundary; + if (ti->max_io_len) { + offset = dm_target_offset(ti, sector); + if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) + max_len = sector_div(offset, ti->max_io_len); + else + max_len = offset & (ti->max_io_len - 1); + max_len = ti->max_io_len - max_len; + + if (len > max_len) + len = max_len; } return len; } -static void __map_bio(struct dm_target *ti, struct bio *clone, - struct dm_target_io *tio) +int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) +{ + if (len > UINT_MAX) { + DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", + (unsigned long long)len, UINT_MAX); + ti->error = "Maximum size of target IO is too large"; + return -EINVAL; + } + + ti->max_io_len = (uint32_t) len; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); + +/* + * A target may call dm_accept_partial_bio only from the map routine. It is + * allowed for all bio types except REQ_FLUSH. 
+ * + * dm_accept_partial_bio informs the dm that the target only wants to process + * additional n_sectors sectors of the bio and the rest of the data should be + * sent in a next bio. + * + * A diagram that explains the arithmetics: + * +--------------------+---------------+-------+ + * | 1 | 2 | 3 | + * +--------------------+---------------+-------+ + * + * <-------------- *tio->len_ptr ---------------> + * <------- bi_size -------> + * <-- n_sectors --> + * + * Region 1 was already iterated over with bio_advance or similar function. + * (it may be empty if the target doesn't use bio_advance) + * Region 2 is the remaining bio size that the target wants to process. + * (it may be empty if region 1 is non-empty, although there is no reason + * to make it empty) + * The target requires that region 3 is to be sent in the next bio. + * + * If the target wants to receive multiple copies of the bio (via num_*bios, etc), + * the partially processed part (the sum of regions 1+2) must be the same for all + * copies of the bio. + */ +void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) +{ + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + BUG_ON(bio->bi_rw & REQ_FLUSH); + BUG_ON(bi_size > *tio->len_ptr); + BUG_ON(n_sectors > bi_size); + *tio->len_ptr -= bi_size - n_sectors; + bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; +} +EXPORT_SYMBOL_GPL(dm_accept_partial_bio); + +static void __map_bio(struct dm_target_io *tio) { int r; sector_t sector; struct mapped_device *md; - - /* - * Sanity checks. - */ - BUG_ON(!clone->bi_size); + struct bio *clone = &tio->clone; + struct dm_target *ti = tio->ti; clone->bi_end_io = clone_endio; - clone->bi_private = tio; /* * Map the clone. If r == 0 we don't need to do @@ -603,25 +1193,19 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, * this io. */ atomic_inc(&tio->io->io_count); - sector = clone->bi_sector; - r = ti->type->map(ti, clone, &tio->info); + sector = clone->bi_iter.bi_sector; + r = ti->type->map(ti, clone); if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ - blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, - clone->bi_sector, sector); + trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, + tio->io->bio->bi_bdev->bd_dev, sector); generic_make_request(clone); } else if (r < 0 || r == DM_MAPIO_REQUEUE) { /* error the io and bail out, or requeue it if needed */ md = tio->io->md; dec_pending(tio->io, r); - /* - * Store bio_set for cleanup. - */ - clone->bi_private = md->bs; - bio_put(clone); free_tio(md, tio); } else if (r) { DMWARN("unimplemented target map return value: %d", r); @@ -635,195 +1219,250 @@ struct clone_info { struct bio *bio; struct dm_io *io; sector_t sector; - sector_t sector_count; - unsigned short idx; + unsigned sector_count; }; -static void dm_bio_destructor(struct bio *bio) +static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) { - struct bio_set *bs = bio->bi_private; - - bio_free(bio, bs); + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = to_bytes(len); } /* - * Creates a little bio that is just does part of a bvec. + * Creates a bio that consists of range of complete bvecs. 
*/ -static struct bio *split_bvec(struct bio *bio, sector_t sector, - unsigned short idx, unsigned int offset, - unsigned int len, struct bio_set *bs) +static void clone_bio(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned len) { - struct bio *clone; - struct bio_vec *bv = bio->bi_io_vec + idx; + struct bio *clone = &tio->clone; - clone = bio_alloc_bioset(GFP_NOIO, 1, bs); - clone->bi_destructor = dm_bio_destructor; - *clone->bi_io_vec = *bv; + __bio_clone_fast(clone, bio); - clone->bi_sector = sector; - clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw; - clone->bi_vcnt = 1; - clone->bi_size = to_bytes(len); - clone->bi_io_vec->bv_offset = offset; - clone->bi_io_vec->bv_len = clone->bi_size; + if (bio_integrity(bio)) + bio_integrity_clone(clone, bio, GFP_NOIO); - return clone; + bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); + clone->bi_iter.bi_size = to_bytes(len); + + if (bio_integrity(bio)) + bio_integrity_trim(clone, 0, len); } -/* - * Creates a bio that consists of range of complete bvecs. - */ -static struct bio *clone_bio(struct bio *bio, sector_t sector, - unsigned short idx, unsigned short bv_count, - unsigned int len, struct bio_set *bs) +static struct dm_target_io *alloc_tio(struct clone_info *ci, + struct dm_target *ti, int nr_iovecs, + unsigned target_bio_nr) { + struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); - __bio_clone(clone, bio); - clone->bi_destructor = dm_bio_destructor; - clone->bi_sector = sector; - clone->bi_idx = idx; - clone->bi_vcnt = idx + bv_count; - clone->bi_size = to_bytes(len); - clone->bi_flags &= ~(1 << BIO_SEG_VALID); + clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs); + tio = container_of(clone, struct dm_target_io, clone); - return clone; + tio->io = ci->io; + tio->ti = ti; + tio->target_bio_nr = target_bio_nr; + + return tio; } -static int __clone_and_map(struct clone_info *ci) +static void __clone_and_map_simple_bio(struct clone_info *ci, + struct dm_target *ti, + unsigned target_bio_nr, unsigned *len) { - struct bio *clone, *bio = ci->bio; + struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); + struct bio *clone = &tio->clone; + + tio->len_ptr = len; + + /* + * Discard requests require the bio's inline iovecs be initialized. + * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush + * and discard, so no need for concern about wasted bvec allocations. 
+ */ + __bio_clone_fast(clone, ci->bio); + if (len) + bio_setup_sector(clone, ci->sector, *len); + + __map_bio(tio); +} + +static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, + unsigned num_bios, unsigned *len) +{ + unsigned target_bio_nr; + + for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) + __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); +} + +static int __send_empty_flush(struct clone_info *ci) +{ + unsigned target_nr = 0; struct dm_target *ti; - sector_t len = 0, max; - struct dm_target_io *tio; - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; + BUG_ON(bio_has_data(ci->bio)); + while ((ti = dm_table_get_target(ci->map, target_nr++))) + __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + + return 0; +} - max = max_io_len(ci->md, ci->sector, ti); +static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, + sector_t sector, unsigned *len) +{ + struct bio *bio = ci->bio; + struct dm_target_io *tio; + unsigned target_bio_nr; + unsigned num_target_bios = 1; /* - * Allocate a target io object. + * Does the target want to receive duplicate copies of the bio? */ - tio = alloc_tio(ci->md); - tio->io = ci->io; - tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); + if (bio_data_dir(bio) == WRITE && ti->num_write_bios) + num_target_bios = ti->num_write_bios(ti, bio); + + for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { + tio = alloc_tio(ci, ti, 0, target_bio_nr); + tio->len_ptr = len; + clone_bio(tio, bio, sector, *len); + __map_bio(tio); + } +} - if (ci->sector_count <= max) { - /* - * Optimise for the simple case where we can do all of - * the remaining io with a single clone. - */ - clone = clone_bio(bio, ci->sector, ci->idx, - bio->bi_vcnt - ci->idx, ci->sector_count, - ci->md->bs); - __map_bio(ti, clone, tio); - ci->sector_count = 0; +typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); - } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { - /* - * There are some bvecs that don't span targets. - * Do as many of these as possible. - */ - int i; - sector_t remaining = max; - sector_t bv_len; +static unsigned get_num_discard_bios(struct dm_target *ti) +{ + return ti->num_discard_bios; +} - for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { - bv_len = to_sector(bio->bi_io_vec[i].bv_len); +static unsigned get_num_write_same_bios(struct dm_target *ti) +{ + return ti->num_write_same_bios; +} - if (bv_len > remaining) - break; +typedef bool (*is_split_required_fn)(struct dm_target *ti); - remaining -= bv_len; - len += bv_len; - } +static bool is_split_required_for_discard(struct dm_target *ti) +{ + return ti->split_discard_bios; +} - clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, - ci->md->bs); - __map_bio(ti, clone, tio); +static int __send_changing_extent_only(struct clone_info *ci, + get_num_bios_fn get_num_bios, + is_split_required_fn is_split_required) +{ + struct dm_target *ti; + unsigned len; + unsigned num_bios; - ci->sector += len; - ci->sector_count -= len; - ci->idx = i; + do { + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; - } else { /* - * Handle a bvec that must be split between two or more targets. + * Even though the device advertised support for this type of + * request, that does not mean every target supports it, and + * reconfiguration might also have changed that since the + * check was performed. 
*/ - struct bio_vec *bv = bio->bi_io_vec + ci->idx; - sector_t remaining = to_sector(bv->bv_len); - unsigned int offset = 0; + num_bios = get_num_bios ? get_num_bios(ti) : 0; + if (!num_bios) + return -EOPNOTSUPP; + + if (is_split_required && !is_split_required(ti)) + len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + else + len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); + + __send_duplicate_bios(ci, ti, num_bios, &len); + + ci->sector += len; + } while (ci->sector_count -= len); - do { - if (offset) { - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; + return 0; +} - max = max_io_len(ci->md, ci->sector, ti); +static int __send_discard(struct clone_info *ci) +{ + return __send_changing_extent_only(ci, get_num_discard_bios, + is_split_required_for_discard); +} - tio = alloc_tio(ci->md); - tio->io = ci->io; - tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); - } +static int __send_write_same(struct clone_info *ci) +{ + return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); +} - len = min(remaining, max); +/* + * Select the correct strategy for processing a non-flush bio. + */ +static int __split_and_process_non_flush(struct clone_info *ci) +{ + struct bio *bio = ci->bio; + struct dm_target *ti; + unsigned len; - clone = split_bvec(bio, ci->sector, ci->idx, - bv->bv_offset + offset, len, - ci->md->bs); + if (unlikely(bio->bi_rw & REQ_DISCARD)) + return __send_discard(ci); + else if (unlikely(bio->bi_rw & REQ_WRITE_SAME)) + return __send_write_same(ci); - __map_bio(ti, clone, tio); + ti = dm_table_find_target(ci->map, ci->sector); + if (!dm_target_is_valid(ti)) + return -EIO; - ci->sector += len; - ci->sector_count -= len; - offset += to_bytes(len); - } while (remaining -= len); + len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - ci->idx++; - } + __clone_and_map_data_bio(ci, ti, ci->sector, &len); + + ci->sector += len; + ci->sector_count -= len; return 0; } /* - * Split the bio into several clones. + * Entry point to split a bio into clones and submit them to the targets. 
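+ * A flush with data is handled in two stages: only empty flush clones of md->flush_bio are issued here, and dec_pending() re-queues the data portion without REQ_FLUSH once the preflush completes.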
*/ -static int __split_bio(struct mapped_device *md, struct bio *bio) +static void __split_and_process_bio(struct mapped_device *md, + struct dm_table *map, struct bio *bio) { struct clone_info ci; int error = 0; - ci.map = dm_get_table(md); - if (unlikely(!ci.map)) - return -EIO; + if (unlikely(!map)) { + bio_io_error(bio); + return; + } + ci.map = map; ci.md = md; - ci.bio = bio; ci.io = alloc_io(md); ci.io->error = 0; atomic_set(&ci.io->io_count, 1); ci.io->bio = bio; ci.io->md = md; - ci.sector = bio->bi_sector; - ci.sector_count = bio_sectors(bio); - ci.idx = bio->bi_idx; + spin_lock_init(&ci.io->endio_lock); + ci.sector = bio->bi_iter.bi_sector; start_io_acct(ci.io); - while (ci.sector_count && !error) - error = __clone_and_map(&ci); + + if (bio->bi_rw & REQ_FLUSH) { + ci.bio = &ci.md->flush_bio; + ci.sector_count = 0; + error = __send_empty_flush(&ci); + /* dec_pending submits any data associated with flush */ + } else { + ci.bio = bio; + ci.sector_count = bio_sectors(bio); + while (ci.sector_count && !error) + error = __split_and_process_non_flush(&ci); + } /* drop the extra reference count */ dec_pending(ci.io, error); - dm_table_put(ci.map); - - return 0; } /*----------------------------------------------------------------- * CRUD END @@ -834,20 +1473,22 @@ static int dm_merge_bvec(struct request_queue *q, struct bio_vec *biovec) { struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_table(md); + struct dm_table *map = dm_get_live_table_fast(md); struct dm_target *ti; sector_t max_sectors; - int max_size; + int max_size = 0; if (unlikely(!map)) - return 0; + goto out; ti = dm_table_find_target(map, bvm->bi_sector); + if (!dm_target_is_valid(ti)) + goto out; /* * Find maximum amount of I/O that won't need splitting */ - max_sectors = min(max_io_len(md, bvm->bi_sector, ti), + max_sectors = min(max_io_len(bvm->bi_sector, ti), (sector_t) BIO_MAX_SECTORS); max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; if (max_size < 0) @@ -860,15 +1501,24 @@ static int dm_merge_bvec(struct request_queue *q, */ if (max_size && ti->type->merge) max_size = ti->type->merge(ti, bvm, biovec, max_size); + /* + * If the target doesn't support merge method and some of the devices + * provided their merge_bvec method (we know this by looking at + * queue_max_hw_sectors), then we can't allow bios with multiple vector + * entries. So always set max_size to 0, and the code below allows + * just one page. + */ + else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) + max_size = 0; +out: + dm_put_live_table_fast(md); /* * Always allow an entire first page */ if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) max_size = biovec->bv_len; - dm_table_put(map); - return max_size; } @@ -876,87 +1526,326 @@ static int dm_merge_bvec(struct request_queue *q, * The request function that just remaps the bio built up by * dm_merge_bvec. 
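 * While the device is suspended, incoming bios are deferred via queue_io(); READA bios are failed instead of being deferred.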
*/ -static int dm_request(struct request_queue *q, struct bio *bio) +static void _dm_request(struct request_queue *q, struct bio *bio) { - int r = -EIO; int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; + int cpu; + int srcu_idx; + struct dm_table *map; + + map = dm_get_live_table(md, &srcu_idx); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); + part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); + part_stat_unlock(); + + /* if we're suspended, we have to queue this io for later */ + if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { + dm_put_live_table(md, srcu_idx); + + if (bio_rw(bio) != READA) + queue_io(md, bio); + else + bio_io_error(bio); + return; + } + + __split_and_process_bio(md, map, bio); + dm_put_live_table(md, srcu_idx); + return; +} + +int dm_request_based(struct mapped_device *md) +{ + return blk_queue_stackable(md->queue); +} + +static void dm_request(struct request_queue *q, struct bio *bio) +{ + struct mapped_device *md = q->queuedata; + + if (dm_request_based(md)) + blk_queue_bio(q, bio); + else + _dm_request(q, bio); +} + +void dm_dispatch_request(struct request *rq) +{ + int r; + + if (blk_queue_io_stat(rq->q)) + rq->cmd_flags |= REQ_IO_STAT; + + rq->start_time = jiffies; + r = blk_insert_cloned_request(rq->q, rq); + if (r) + dm_complete_request(rq, r); +} +EXPORT_SYMBOL_GPL(dm_dispatch_request); + +static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, + void *data) +{ + struct dm_rq_target_io *tio = data; + struct dm_rq_clone_bio_info *info = + container_of(bio, struct dm_rq_clone_bio_info, clone); + + info->orig = bio_orig; + info->tio = tio; + bio->bi_end_io = end_clone_bio; + + return 0; +} + +static int setup_clone(struct request *clone, struct request *rq, + struct dm_rq_target_io *tio) +{ + int r; + + r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + dm_rq_bio_constructor, tio); + if (r) + return r; + + clone->cmd = rq->cmd; + clone->cmd_len = rq->cmd_len; + clone->sense = rq->sense; + clone->end_io = end_clone_request; + clone->end_io_data = tio; + + return 0; +} + +static struct request *clone_rq(struct request *rq, struct mapped_device *md, + gfp_t gfp_mask) +{ + struct request *clone; + struct dm_rq_target_io *tio; + + tio = alloc_rq_tio(md, gfp_mask); + if (!tio) + return NULL; + + tio->md = md; + tio->ti = NULL; + tio->orig = rq; + tio->error = 0; + memset(&tio->info, 0, sizeof(tio->info)); + + clone = &tio->clone; + if (setup_clone(clone, rq, tio)) { + /* -ENOMEM */ + free_rq_tio(tio); + return NULL; + } + + return clone; +} + +/* + * Called with the queue lock held. 
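+ * Allocates a clone of the original request, stores it in rq->special and sets REQ_DONTPREP so the request is not prepared a second time.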
+ */ +static int dm_prep_fn(struct request_queue *q, struct request *rq) +{ + struct mapped_device *md = q->queuedata; + struct request *clone; + + if (unlikely(rq->special)) { + DMWARN("Already has something in rq->special."); + return BLKPREP_KILL; + } + + clone = clone_rq(rq, md, GFP_ATOMIC); + if (!clone) + return BLKPREP_DEFER; + + rq->special = clone; + rq->cmd_flags |= REQ_DONTPREP; + + return BLKPREP_OK; +} + +/* + * Returns: + * 0 : the request has been processed (not requeued) + * !0 : the request has been requeued + */ +static int map_request(struct dm_target *ti, struct request *clone, + struct mapped_device *md) +{ + int r, requeued = 0; + struct dm_rq_target_io *tio = clone->end_io_data; + + tio->ti = ti; + r = ti->type->map_rq(ti, clone, &tio->info); + switch (r) { + case DM_MAPIO_SUBMITTED: + /* The target has taken the I/O to submit by itself later */ + break; + case DM_MAPIO_REMAPPED: + /* The target has remapped the I/O so dispatch it */ + trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), + blk_rq_pos(tio->orig)); + dm_dispatch_request(clone); + break; + case DM_MAPIO_REQUEUE: + /* The target wants to requeue the I/O */ + dm_requeue_unmapped_request(clone); + requeued = 1; + break; + default: + if (r > 0) { + DMWARN("unimplemented target map return value: %d", r); + BUG(); + } + + /* The target wants to complete the I/O */ + dm_kill_unmapped_request(clone, r); + break; + } + + return requeued; +} + +static struct request *dm_start_request(struct mapped_device *md, struct request *orig) +{ + struct request *clone; + + blk_start_request(orig); + clone = orig->special; + atomic_inc(&md->pending[rq_data_dir(clone)]); /* - * There is no use in forwarding any barrier request since we can't - * guarantee it is (or can be) handled by the targets correctly. + * Hold the md reference here for the in-flight I/O. + * We can't rely on the reference count by device opener, + * because the device may be closed during the request completion + * when all bios are completed. + * See the comment in rq_completed() too. */ - if (unlikely(bio_barrier(bio))) { - bio_endio(bio, -EOPNOTSUPP); - return 0; - } + dm_get(md); - down_read(&md->io_lock); + return clone; +} - disk_stat_inc(dm_disk(md), ios[rw]); - disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio)); +/* + * q->request_fn for request-based dm. + * Called with the queue lock held. + */ +static void dm_request_fn(struct request_queue *q) +{ + struct mapped_device *md = q->queuedata; + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); + struct dm_target *ti; + struct request *rq, *clone; + sector_t pos; /* - * If we're suspended we have to queue - * this io for later. + * For suspend, check blk_queue_stopped() and increment + * ->pending within a single queue_lock not to increment the + * number of in-flight I/Os after the queue is stopped in + * dm_suspend(). 
*/ - while (test_bit(DMF_BLOCK_IO, &md->flags)) { - up_read(&md->io_lock); + while (!blk_queue_stopped(q)) { + rq = blk_peek_request(q); + if (!rq) + goto delay_and_out; + + /* always use block 0 to find the target for flushes for now */ + pos = 0; + if (!(rq->cmd_flags & REQ_FLUSH)) + pos = blk_rq_pos(rq); + + ti = dm_table_find_target(map, pos); + if (!dm_target_is_valid(ti)) { + /* + * Must perform setup, that dm_done() requires, + * before calling dm_kill_unmapped_request + */ + DMERR_LIMIT("request attempted access beyond the end of device"); + clone = dm_start_request(md, rq); + dm_kill_unmapped_request(clone, -EIO); + continue; + } - if (bio_rw(bio) != READA) - r = queue_io(md, bio); + if (ti->type->busy && ti->type->busy(ti)) + goto delay_and_out; - if (r <= 0) - goto out_req; + clone = dm_start_request(md, rq); - /* - * We're in a while loop, because someone could suspend - * before we get to the following read lock. - */ - down_read(&md->io_lock); + spin_unlock(q->queue_lock); + if (map_request(ti, clone, md)) + goto requeued; + + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); } - r = __split_bio(md, bio); - up_read(&md->io_lock); + goto out; -out_req: - if (r < 0) - bio_io_error(bio); +requeued: + BUG_ON(!irqs_disabled()); + spin_lock(q->queue_lock); - return 0; +delay_and_out: + blk_delay_queue(q, HZ / 10); +out: + dm_put_live_table(md, srcu_idx); +} + +int dm_underlying_device_busy(struct request_queue *q) +{ + return blk_lld_busy(q); } +EXPORT_SYMBOL_GPL(dm_underlying_device_busy); -static void dm_unplug_all(struct request_queue *q) +static int dm_lld_busy(struct request_queue *q) { + int r; struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_table(md); + struct dm_table *map = dm_get_live_table_fast(md); - if (map) { - dm_table_unplug_all(map); - dm_table_put(map); - } + if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) + r = 1; + else + r = dm_table_any_busy_target(map); + + dm_put_live_table_fast(md); + + return r; } static int dm_any_congested(void *congested_data, int bdi_bits) { - int r; - struct mapped_device *md = (struct mapped_device *) congested_data; - struct dm_table *map = dm_get_table(md); + int r = bdi_bits; + struct mapped_device *md = congested_data; + struct dm_table *map; - if (!map || test_bit(DMF_BLOCK_IO, &md->flags)) - r = bdi_bits; - else - r = dm_table_any_congested(map, bdi_bits); + if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { + map = dm_get_live_table_fast(md); + if (map) { + /* + * Request-based dm cares about only own queue for + * the query about congestion status of request_queue + */ + if (dm_request_based(md)) + r = md->queue->backing_dev_info.state & + bdi_bits; + else + r = dm_table_any_congested(map, bdi_bits); + } + dm_put_live_table_fast(md); + } - dm_table_put(map); return r; } /*----------------------------------------------------------------- * An IDR is used to keep track of allocated minor numbers. 
*---------------------------------------------------------------*/ -static DEFINE_IDR(_minor_idr); - static void free_minor(int minor) { spin_lock(&_minor_lock); @@ -969,65 +1858,64 @@ static void free_minor(int minor) */ static int specific_minor(int minor) { - int r, m; + int r; if (minor >= (1 << MINORBITS)) return -EINVAL; - r = idr_pre_get(&_minor_idr, GFP_KERNEL); - if (!r) - return -ENOMEM; - + idr_preload(GFP_KERNEL); spin_lock(&_minor_lock); - if (idr_find(&_minor_idr, minor)) { - r = -EBUSY; - goto out; - } - - r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); - if (r) - goto out; - - if (m != minor) { - idr_remove(&_minor_idr, m); - r = -EBUSY; - goto out; - } + r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT); -out: spin_unlock(&_minor_lock); - return r; + idr_preload_end(); + if (r < 0) + return r == -ENOSPC ? -EBUSY : r; + return 0; } static int next_free_minor(int *minor) { - int r, m; - - r = idr_pre_get(&_minor_idr, GFP_KERNEL); - if (!r) - return -ENOMEM; + int r; + idr_preload(GFP_KERNEL); spin_lock(&_minor_lock); - r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); - if (r) - goto out; + r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT); - if (m >= (1 << MINORBITS)) { - idr_remove(&_minor_idr, m); - r = -ENOSPC; - goto out; - } - - *minor = m; - -out: spin_unlock(&_minor_lock); - return r; + idr_preload_end(); + if (r < 0) + return r; + *minor = r; + return 0; } -static struct block_device_operations dm_blk_dops; +static const struct block_device_operations dm_blk_dops; + +static void dm_wq_work(struct work_struct *work); + +static void dm_init_md_queue(struct mapped_device *md) +{ + /* + * Request-based dm devices cannot be stacked on top of bio-based dm + * devices. The type of this dm device has not been decided yet. + * The type is decided at the first table loading time. + * To prevent problematic device stacking, clear the queue flag + * for request stacking support until then. + * + * This queue is new, so no concurrency on the queue_flags. + */ + queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); + + md->queue->queuedata = md; + md->queue->backing_dev_info.congested_fn = dm_any_congested; + md->queue->backing_dev_info.congested_data = md; + blk_queue_make_request(md->queue, dm_request); + blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); + blk_queue_merge_bvec(md->queue, dm_merge_bvec); +} /* * Allocate and initialise a blank device with a given minor. 
@@ -1054,10 +1942,14 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_minor; - init_rwsem(&md->io_lock); + r = init_srcu_struct(&md->io_barrier); + if (r < 0) + goto bad_io_barrier; + + md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); - spin_lock_init(&md->pushback_lock); - rwlock_init(&md->map_lock); + mutex_init(&md->type_lock); + spin_lock_init(&md->deferred_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); atomic_set(&md->event_nr, 0); @@ -1069,33 +1961,18 @@ static struct mapped_device *alloc_dev(int minor) if (!md->queue) goto bad_queue; - md->queue->queuedata = md; - md->queue->backing_dev_info.congested_fn = dm_any_congested; - md->queue->backing_dev_info.congested_data = md; - blk_queue_make_request(md->queue, dm_request); - blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); - md->queue->unplug_fn = dm_unplug_all; - blk_queue_merge_bvec(md->queue, dm_merge_bvec); - - md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); - if (!md->io_pool) - goto bad_io_pool; - - md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); - if (!md->tio_pool) - goto bad_tio_pool; - - md->bs = bioset_create(16, 16); - if (!md->bs) - goto bad_no_bioset; + dm_init_md_queue(md); md->disk = alloc_disk(1); if (!md->disk) goto bad_disk; - atomic_set(&md->pending, 0); + atomic_set(&md->pending[0], 0); + atomic_set(&md->pending[1], 0); init_waitqueue_head(&md->wait); + INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); + init_completion(&md->kobj_holder.completion); md->disk->major = _major; md->disk->first_minor = minor; @@ -1106,10 +1983,20 @@ static struct mapped_device *alloc_dev(int minor) add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = create_singlethread_workqueue("kdmflush"); + md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); if (!md->wq) goto bad_thread; + md->bdev = bdget_disk(md->disk, 0); + if (!md->bdev) + goto bad_bdev; + + bio_init(&md->flush_bio); + md->flush_bio.bi_bdev = md->bdev; + md->flush_bio.bi_rw = WRITE_FLUSH; + + dm_stats_init(&md->stats); + /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); @@ -1119,17 +2006,16 @@ static struct mapped_device *alloc_dev(int minor) return md; +bad_bdev: + destroy_workqueue(md->wq); bad_thread: + del_gendisk(md->disk); put_disk(md->disk); bad_disk: - bioset_free(md->bs); -bad_no_bioset: - mempool_destroy(md->tio_pool); -bad_tio_pool: - mempool_destroy(md->io_pool); -bad_io_pool: blk_cleanup_queue(md->queue); bad_queue: + cleanup_srcu_struct(&md->io_barrier); +bad_io_barrier: free_minor(minor); bad_minor: module_put(THIS_MODULE); @@ -1142,17 +2028,18 @@ static void unlock_fs(struct mapped_device *md); static void free_dev(struct mapped_device *md) { - int minor = md->disk->first_minor; + int minor = MINOR(disk_devt(md->disk)); - if (md->suspended_bdev) { - unlock_fs(md); - bdput(md->suspended_bdev); - } + unlock_fs(md); + bdput(md->bdev); destroy_workqueue(md->wq); - mempool_destroy(md->tio_pool); - mempool_destroy(md->io_pool); - bioset_free(md->bs); + if (md->io_pool) + mempool_destroy(md->io_pool); + if (md->bs) + bioset_free(md->bs); + blk_integrity_unregister(md->disk); del_gendisk(md->disk); + cleanup_srcu_struct(&md->io_barrier); free_minor(minor); spin_lock(&_minor_lock); @@ -1161,10 +2048,50 @@ static void free_dev(struct mapped_device *md) put_disk(md->disk); blk_cleanup_queue(md->queue); + dm_stats_cleanup(&md->stats); module_put(THIS_MODULE); kfree(md); } 
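+/* + * Take over the mempools that were pre-allocated for this table (see + * dm_table_get_md_mempools()) and attach them to the mapped_device. When a + * bio-based table replaces an existing one, the bioset is reloaded because + * its front_pad may have changed with the new table. + */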
+static void __bind_mempools(struct mapped_device *md, struct dm_table *t) +{ + struct dm_md_mempools *p = dm_table_get_md_mempools(t); + + if (md->io_pool && md->bs) { + /* The md already has necessary mempools. */ + if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { + /* + * Reload bioset because front_pad may have changed + * because a different table was loaded. + */ + bioset_free(md->bs); + md->bs = p->bs; + p->bs = NULL; + } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) { + /* + * There's no need to reload with request-based dm + * because the size of front_pad doesn't change. + * Note for future: If you are to reload bioset, + * prep-ed requests in the queue may refer + * to bio from the old bioset, so you must walk + * through the queue to unprep. + */ + } + goto out; + } + + BUG_ON(!p || md->io_pool || md->bs); + + md->io_pool = p->io_pool; + p->io_pool = NULL; + md->bs = p->bs; + p->bs = NULL; + +out: + /* mempool bind completed, now no need any mempools in the table */ + dm_table_free_md_mempools(t); +} + /* * Bind a table to the device. */ @@ -1178,62 +2105,141 @@ static void event_callback(void *context) list_splice_init(&md->uevent_list, &uevents); spin_unlock_irqrestore(&md->uevent_lock, flags); - dm_send_uevents(&uevents, &md->disk->dev.kobj); + dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); atomic_inc(&md->event_nr); wake_up(&md->eventq); } +/* + * Protected by md->suspend_lock obtained by dm_swap_table(). + */ static void __set_size(struct mapped_device *md, sector_t size) { set_capacity(md->disk, size); - mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); - i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); - mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); + i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); +} + +/* + * Return 1 if the queue has a compulsory merge_bvec_fn function. + * + * If this function returns 0, then the device is either a non-dm + * device without a merge_bvec_fn, or it is a dm device that is + * able to split any bios it receives that are too big. + */ +int dm_queue_merge_is_compulsory(struct request_queue *q) +{ + struct mapped_device *dev_md; + + if (!q->merge_bvec_fn) + return 0; + + if (q->make_request_fn == dm_request) { + dev_md = q->queuedata; + if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) + return 0; + } + + return 1; +} + +static int dm_device_merge_is_compulsory(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); + + return dm_queue_merge_is_compulsory(q); } -static int __bind(struct mapped_device *md, struct dm_table *t) +/* + * Return 1 if it is acceptable to ignore merge_bvec_fn based + * on the properties of the underlying devices. + */ +static int dm_table_merge_is_optional(struct dm_table *table) { + unsigned i = 0; + struct dm_target *ti; + + while (i < dm_table_get_num_targets(table)) { + ti = dm_table_get_target(table, i++); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) + return 0; + } + + return 1; +} + +/* + * Returns old map, which caller must destroy. + */ +static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, + struct queue_limits *limits) +{ + struct dm_table *old_map; struct request_queue *q = md->queue; sector_t size; + int merge_is_optional; size = dm_table_get_size(t); /* * Wipe any geometry if the size of the table changed. 
*/ - if (size != get_capacity(md->disk)) + if (size != dm_get_size(md)) memset(&md->geometry, 0, sizeof(md->geometry)); - if (md->suspended_bdev) - __set_size(md, size); - if (size == 0) - return 0; + __set_size(md, size); - dm_table_get(t); dm_table_event_callback(t, event_callback, md); - write_lock(&md->map_lock); - md->map = t; - dm_table_set_restrictions(t, q); - write_unlock(&md->map_lock); + /* + * The queue hasn't been stopped yet, if the old table type wasn't + * for request-based during suspension. So stop it to prevent + * I/O mapping before resume. + * This must be done before setting the queue restrictions, + * because request-based dm may be run just after the setting. + */ + if (dm_table_request_based(t) && !blk_queue_stopped(q)) + stop_queue(q); - return 0; + __bind_mempools(md, t); + + merge_is_optional = dm_table_merge_is_optional(t); + + old_map = md->map; + rcu_assign_pointer(md->map, t); + md->immutable_target_type = dm_table_get_immutable_target_type(t); + + dm_table_set_restrictions(t, q, limits); + if (merge_is_optional) + set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); + else + clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); + dm_sync_table(md); + + return old_map; } -static void __unbind(struct mapped_device *md) +/* + * Returns unbound table for the caller to free. + */ +static struct dm_table *__unbind(struct mapped_device *md) { struct dm_table *map = md->map; if (!map) - return; + return NULL; dm_table_event_callback(map, NULL, NULL); - write_lock(&md->map_lock); - md->map = NULL; - write_unlock(&md->map_lock); - dm_table_put(map); + RCU_INIT_POINTER(md->map, NULL); + dm_sync_table(md); + + return map; } /* @@ -1247,10 +2253,94 @@ int dm_create(int minor, struct mapped_device **result) if (!md) return -ENXIO; + dm_sysfs_init(md); + *result = md; return 0; } +/* + * Functions to manage md->type. + * All are required to hold md->type_lock. + */ +void dm_lock_md_type(struct mapped_device *md) +{ + mutex_lock(&md->type_lock); +} + +void dm_unlock_md_type(struct mapped_device *md) +{ + mutex_unlock(&md->type_lock); +} + +void dm_set_md_type(struct mapped_device *md, unsigned type) +{ + BUG_ON(!mutex_is_locked(&md->type_lock)); + md->type = type; +} + +unsigned dm_get_md_type(struct mapped_device *md) +{ + BUG_ON(!mutex_is_locked(&md->type_lock)); + return md->type; +} + +struct target_type *dm_get_immutable_target_type(struct mapped_device *md) +{ + return md->immutable_target_type; +} + +/* + * The queue_limits are only valid as long as you have a reference + * count on 'md'. + */ +struct queue_limits *dm_get_queue_limits(struct mapped_device *md) +{ + BUG_ON(!atomic_read(&md->holders)); + return &md->queue->limits; +} +EXPORT_SYMBOL_GPL(dm_get_queue_limits); + +/* + * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
+ */ +static int dm_init_request_based_queue(struct mapped_device *md) +{ + struct request_queue *q = NULL; + + if (md->queue->elevator) + return 1; + + /* Fully initialize the queue */ + q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); + if (!q) + return 0; + + md->queue = q; + dm_init_md_queue(md); + blk_queue_softirq_done(md->queue, dm_softirq_done); + blk_queue_prep_rq(md->queue, dm_prep_fn); + blk_queue_lld_busy(md->queue, dm_lld_busy); + + elv_register_queue(md->queue); + + return 1; +} + +/* + * Setup the DM device's queue based on md's type + */ +int dm_setup_md_queue(struct mapped_device *md) +{ + if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && + !dm_init_request_based_queue(md)) { + DMWARN("Cannot initialize queue for request-based mapped device"); + return -EINVAL; + } + + return 0; +} + static struct mapped_device *dm_find_md(dev_t dev) { struct mapped_device *md; @@ -1263,7 +2353,8 @@ static struct mapped_device *dm_find_md(dev_t dev) md = idr_find(&_minor_idr, minor); if (md && (md == MINOR_ALLOCED || - (dm_disk(md)->first_minor != minor) || + (MINOR(disk_devt(dm_disk(md))) != minor) || + dm_deleting_md(md) || test_bit(DMF_FREEING, &md->flags))) { md = NULL; goto out; @@ -1284,6 +2375,7 @@ struct mapped_device *dm_get_md(dev_t dev) return md; } +EXPORT_SYMBOL_GPL(dm_get_md); void *dm_get_mdptr(struct mapped_device *md) { @@ -1298,6 +2390,7 @@ void dm_set_mdptr(struct mapped_device *md, void *ptr) void dm_get(struct mapped_device *md) { atomic_inc(&md->holders); + BUG_ON(test_bit(DMF_FREEING, &md->flags)); } const char *dm_device_name(struct mapped_device *md) @@ -1306,40 +2399,76 @@ const char *dm_device_name(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_device_name); -void dm_put(struct mapped_device *md) +static void __dm_destroy(struct mapped_device *md, bool wait) { struct dm_table *map; + int srcu_idx; - BUG_ON(test_bit(DMF_FREEING, &md->flags)); + might_sleep(); - if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { - map = dm_get_table(md); - idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor); - set_bit(DMF_FREEING, &md->flags); - spin_unlock(&_minor_lock); - if (!dm_suspended(md)) { - dm_table_presuspend_targets(map); - dm_table_postsuspend_targets(map); - } - __unbind(md); - dm_table_put(map); - free_dev(md); + spin_lock(&_minor_lock); + map = dm_get_live_table(md, &srcu_idx); + idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); + set_bit(DMF_FREEING, &md->flags); + spin_unlock(&_minor_lock); + + if (!dm_suspended_md(md)) { + dm_table_presuspend_targets(map); + dm_table_postsuspend_targets(map); } + + /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ + dm_put_live_table(md, srcu_idx); + + /* + * Rare, but there may be I/O requests still going to complete, + * for example. Wait for all references to disappear. + * No one should increment the reference count of the mapped_device, + * after the mapped_device state becomes DMF_FREEING. + */ + if (wait) + while (atomic_read(&md->holders)) + msleep(1); + else if (atomic_read(&md->holders)) + DMWARN("%s: Forcibly removing mapped_device still in use! 
(%d users)", + dm_device_name(md), atomic_read(&md->holders)); + + dm_sysfs_exit(md); + dm_table_destroy(__unbind(md)); + free_dev(md); +} + +void dm_destroy(struct mapped_device *md) +{ + __dm_destroy(md, true); +} + +void dm_destroy_immediate(struct mapped_device *md) +{ + __dm_destroy(md, false); +} + +void dm_put(struct mapped_device *md) +{ + atomic_dec(&md->holders); } EXPORT_SYMBOL_GPL(dm_put); -static int dm_wait_for_completion(struct mapped_device *md) +static int dm_wait_for_completion(struct mapped_device *md, int interruptible) { int r = 0; + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&md->wait, &wait); while (1) { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(interruptible); - smp_mb(); - if (!atomic_read(&md->pending)) + if (!md_in_flight(md)) break; - if (signal_pending(current)) { + if (interruptible == TASK_INTERRUPTIBLE && + signal_pending(current)) { r = -EINTR; break; } @@ -1348,97 +2477,89 @@ static int dm_wait_for_completion(struct mapped_device *md) } set_current_state(TASK_RUNNING); + remove_wait_queue(&md->wait, &wait); + return r; } /* * Process the deferred bios */ -static void __flush_deferred_io(struct mapped_device *md) +static void dm_wq_work(struct work_struct *work) { + struct mapped_device *md = container_of(work, struct mapped_device, + work); struct bio *c; + int srcu_idx; + struct dm_table *map; - while ((c = bio_list_pop(&md->deferred))) { - if (__split_bio(md, c)) - bio_io_error(c); - } - - clear_bit(DMF_BLOCK_IO, &md->flags); -} + map = dm_get_live_table(md, &srcu_idx); -static void __merge_pushback_list(struct mapped_device *md) -{ - unsigned long flags; - - spin_lock_irqsave(&md->pushback_lock, flags); - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - bio_list_merge_head(&md->deferred, &md->pushback); - bio_list_init(&md->pushback); - spin_unlock_irqrestore(&md->pushback_lock, flags); -} + while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { + spin_lock_irq(&md->deferred_lock); + c = bio_list_pop(&md->deferred); + spin_unlock_irq(&md->deferred_lock); -static void dm_wq_work(struct work_struct *work) -{ - struct dm_wq_req *req = container_of(work, struct dm_wq_req, work); - struct mapped_device *md = req->md; + if (!c) + break; - down_write(&md->io_lock); - switch (req->type) { - case DM_WQ_FLUSH_ALL: - __merge_pushback_list(md); - /* pass through */ - case DM_WQ_FLUSH_DEFERRED: - __flush_deferred_io(md); - break; - default: - DMERR("dm_wq_work: unrecognised work type %d", req->type); - BUG(); + if (dm_request_based(md)) + generic_make_request(c); + else + __split_and_process_bio(md, map, c); } - up_write(&md->io_lock); -} -static void dm_wq_queue(struct mapped_device *md, int type, void *context, - struct dm_wq_req *req) -{ - req->type = type; - req->md = md; - req->context = context; - INIT_WORK(&req->work, dm_wq_work); - queue_work(md->wq, &req->work); + dm_put_live_table(md, srcu_idx); } -static void dm_queue_flush(struct mapped_device *md, int type, void *context) +static void dm_queue_flush(struct mapped_device *md) { - struct dm_wq_req req; - - dm_wq_queue(md, type, context, &req); - flush_workqueue(md->wq); + clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + smp_mb__after_atomic(); + queue_work(md->wq, &md->work); } /* - * Swap in a new table (destroying old one). + * Swap in a new table, returning the old one for the caller to destroy. 
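__bind()/__unbind() publish md->map with rcu_assign_pointer(), and readers such as __dm_destroy() above pin the live table with dm_get_live_table(), which takes an index on md->io_barrier (the SRCU struct initialised in alloc_dev()). A condensed sketch of that reader/writer pattern, assuming dm_sync_table() amounts to an SRCU grace period; the names io_barrier/live_map are placeholders, not dm's fields, and init_srcu_struct() is assumed to have run once at setup:

    #include <linux/srcu.h>
    #include <linux/rcupdate.h>

    static struct srcu_struct io_barrier;       /* like md->io_barrier */
    static struct dm_table __rcu *live_map;     /* like md->map        */

    /* Reader: the I/O path pins the table for the duration of one bio. */
    static void reader(void)
    {
        int idx = srcu_read_lock(&io_barrier);
        struct dm_table *map = srcu_dereference(live_map, &io_barrier);

        /* ... map and submit the bio using 'map' ... */
        srcu_read_unlock(&io_barrier, idx);
    }

    /* Writer: swap in a new table, then wait until every reader of the
     * old one has finished. The caller saved the old pointer beforehand
     * (under the equivalent of md->suspend_lock) and may free it once
     * this returns. */
    static void publish(struct dm_table *new_map)
    {
        rcu_assign_pointer(live_map, new_map);
        synchronize_srcu(&io_barrier);
    }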
*/ -int dm_swap_table(struct mapped_device *md, struct dm_table *table) +struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) { - int r = -EINVAL; + struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); + struct queue_limits limits; + int r; mutex_lock(&md->suspend_lock); /* device must be suspended */ - if (!dm_suspended(md)) + if (!dm_suspended_md(md)) goto out; - /* without bdev, the device size cannot be changed */ - if (!md->suspended_bdev) - if (get_capacity(md->disk) != dm_table_get_size(table)) + /* + * If the new table has no data devices, retain the existing limits. + * This helps multipath with queue_if_no_path if all paths disappear, + * then new I/O is queued based on these limits, and then some paths + * reappear. + */ + if (dm_table_has_no_data_devices(table)) { + live_map = dm_get_live_table_fast(md); + if (live_map) + limits = md->queue->limits; + dm_put_live_table_fast(md); + } + + if (!live_map) { + r = dm_calculate_queue_limits(table, &limits); + if (r) { + map = ERR_PTR(r); goto out; + } + } - __unbind(md); - r = __bind(md, table); + map = __bind(md, table, &limits); out: mutex_unlock(&md->suspend_lock); - return r; + return map; } /* @@ -1451,7 +2572,7 @@ static int lock_fs(struct mapped_device *md) WARN_ON(md->frozen_sb); - md->frozen_sb = freeze_bdev(md->suspended_bdev); + md->frozen_sb = freeze_bdev(md->bdev); if (IS_ERR(md->frozen_sb)) { r = PTR_ERR(md->frozen_sb); md->frozen_sb = NULL; @@ -1460,9 +2581,6 @@ static int lock_fs(struct mapped_device *md) set_bit(DMF_FROZEN, &md->flags); - /* don't bdput right now, we don't want the bdev - * to go away while it is locked. - */ return 0; } @@ -1471,7 +2589,7 @@ static void unlock_fs(struct mapped_device *md) if (!test_bit(DMF_FROZEN, &md->flags)) return; - thaw_bdev(md->suspended_bdev, md->frozen_sb); + thaw_bdev(md->bdev, md->frozen_sb); md->frozen_sb = NULL; clear_bit(DMF_FROZEN, &md->flags); } @@ -1483,22 +2601,30 @@ static void unlock_fs(struct mapped_device *md) * dm_bind_table, dm_suspend must be called to flush any in * flight bios and ensure that any further io gets deferred. */ +/* + * Suspend mechanism in request-based dm. + * + * 1. Flush all I/Os by lock_fs() if needed. + * 2. Stop dispatching any I/O by stopping the request_queue. + * 3. Wait for all in-flight I/Os to be completed or requeued. + * + * To abort suspend, start the request_queue. + */ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) { struct dm_table *map = NULL; - DECLARE_WAITQUEUE(wait, current); int r = 0; int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; mutex_lock(&md->suspend_lock); - if (dm_suspended(md)) { + if (dm_suspended_md(md)) { r = -EINVAL; goto out_unlock; } - map = dm_get_table(md); + map = md->map; /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -1510,78 +2636,73 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) /* This does not get reverted if there's an error later. */ dm_table_presuspend_targets(map); - /* bdget() can stall if the pending I/Os are not flushed */ - if (!noflush) { - md->suspended_bdev = bdget_disk(md->disk, 0); - if (!md->suspended_bdev) { - DMWARN("bdget failed in dm_suspend"); - r = -ENOMEM; - goto flush_and_out; - } - - /* - * Flush I/O to the device. noflush supersedes do_lockfs, - * because lock_fs() needs to flush I/Os. - */ - if (do_lockfs) { - r = lock_fs(md); - if (r) - goto out; - } + /* + * Flush I/O to the device. 
+ * Any I/O submitted after lock_fs() may not be flushed. + * noflush takes precedence over do_lockfs. + * (lock_fs() flushes I/Os and waits for them to complete.) + */ + if (!noflush && do_lockfs) { + r = lock_fs(md); + if (r) + goto out_unlock; } /* - * First we set the BLOCK_IO flag so no more ios will be mapped. + * Here we must make sure that no processes are submitting requests + * to target drivers i.e. no one may be executing + * __split_and_process_bio. This is called from dm_request and + * dm_wq_work. + * + * To get all processes out of __split_and_process_bio in dm_request, + * we take the write lock. To prevent any process from reentering + * __split_and_process_bio from dm_request and quiesce the thread + * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call + * flush_workqueue(md->wq). */ - down_write(&md->io_lock); - set_bit(DMF_BLOCK_IO, &md->flags); + set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + synchronize_srcu(&md->io_barrier); - add_wait_queue(&md->wait, &wait); - up_write(&md->io_lock); + /* + * Stop md->queue before flushing md->wq in case request-based + * dm defers requests to md->wq from md->queue. + */ + if (dm_request_based(md)) + stop_queue(md->queue); - /* unplug */ - if (map) - dm_table_unplug_all(map); + flush_workqueue(md->wq); /* - * Wait for the already-mapped ios to complete. + * At this point no more requests are entering target request routines. + * We call dm_wait_for_completion to wait for all existing requests + * to finish. */ - r = dm_wait_for_completion(md); - - down_write(&md->io_lock); - remove_wait_queue(&md->wait, &wait); + r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); if (noflush) - __merge_pushback_list(md); - up_write(&md->io_lock); + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); + synchronize_srcu(&md->io_barrier); /* were we interrupted ? */ if (r < 0) { - dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); + dm_queue_flush(md); + + if (dm_request_based(md)) + start_queue(md->queue); unlock_fs(md); - goto out; /* pushback list is already flushed, so skip flush */ + goto out_unlock; /* pushback list is already flushed, so skip flush */ } - dm_table_postsuspend_targets(map); + /* + * If dm_wait_for_completion returned 0, the device is completely + * quiescent now. There is no request-processing activity. All new + * requests are being added to md->deferred list. + */ set_bit(DMF_SUSPENDED, &md->flags); -flush_and_out: - if (r && noflush) - /* - * Because there may be already I/Os in the pushback list, - * flush them before return. - */ - dm_queue_flush(md, DM_WQ_FLUSH_ALL, NULL); - -out: - if (r && md->suspended_bdev) { - bdput(md->suspended_bdev); - md->suspended_bdev = NULL; - } - - dm_table_put(map); + dm_table_postsuspend_targets(map); out_unlock: mutex_unlock(&md->suspend_lock); @@ -1594,10 +2715,10 @@ int dm_resume(struct mapped_device *md) struct dm_table *map = NULL; mutex_lock(&md->suspend_lock); - if (!dm_suspended(md)) + if (!dm_suspended_md(md)) goto out; - map = dm_get_table(md); + map = md->map; if (!map || !dm_table_get_size(map)) goto out; @@ -1605,36 +2726,76 @@ int dm_resume(struct mapped_device *md) if (r) goto out; - dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL); + dm_queue_flush(md); - unlock_fs(md); + /* + * Flushing deferred I/Os must be done after targets are resumed + * so that mapping of targets can work correctly. + * Request-based dm is queueing the deferred I/Os in its request_queue. 
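Stripped of the abort handling, the bio-based portion of the suspend path described above reduces to a short sequence. This is only a restatement of the code above, using the same helpers, as a reading aid:

    /* 1. Optionally freeze the filesystem so dirty data is flushed first. */
    if (!noflush && do_lockfs)
            r = lock_fs(md);

    /* 2. Stop new I/O from reaching the targets and wait until every CPU
     *    has left __split_and_process_bio().                              */
    set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
    synchronize_srcu(&md->io_barrier);

    /* 3. For request-based dm also stop the request_queue, then quiesce
     *    the deferred-bio worker.                                         */
    if (dm_request_based(md))
            stop_queue(md->queue);
    flush_workqueue(md->wq);

    /* 4. Drain whatever is still in flight.                               */
    r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);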
+ */ + if (dm_request_based(md)) + start_queue(md->queue); - if (md->suspended_bdev) { - bdput(md->suspended_bdev); - md->suspended_bdev = NULL; - } + unlock_fs(md); clear_bit(DMF_SUSPENDED, &md->flags); - dm_table_unplug_all(map); - - dm_kobject_uevent(md); - r = 0; - out: - dm_table_put(map); mutex_unlock(&md->suspend_lock); return r; } +/* + * Internal suspend/resume works like userspace-driven suspend. It waits + * until all bios finish and prevents issuing new bios to the target drivers. + * It may be used only from the kernel. + * + * Internal suspend holds md->suspend_lock, which prevents interaction with + * userspace-driven suspend. + */ + +void dm_internal_suspend(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + if (dm_suspended_md(md)) + return; + + set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + synchronize_srcu(&md->io_barrier); + flush_workqueue(md->wq); + dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); +} + +void dm_internal_resume(struct mapped_device *md) +{ + if (dm_suspended_md(md)) + goto done; + + dm_queue_flush(md); + +done: + mutex_unlock(&md->suspend_lock); +} + /*----------------------------------------------------------------- * Event notification. *---------------------------------------------------------------*/ -void dm_kobject_uevent(struct mapped_device *md) -{ - kobject_uevent(&md->disk->dev.kobj, KOBJ_CHANGE); +int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie) +{ + char udev_cookie[DM_COOKIE_LENGTH]; + char *envp[] = { udev_cookie, NULL }; + + if (!cookie) + return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); + else { + snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", + DM_COOKIE_ENV_VAR_NAME, cookie); + return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, + action, envp); + } } uint32_t dm_next_uevent_seq(struct mapped_device *md) @@ -1671,23 +2832,104 @@ struct gendisk *dm_disk(struct mapped_device *md) return md->disk; } -int dm_suspended(struct mapped_device *md) +struct kobject *dm_kobject(struct mapped_device *md) +{ + return &md->kobj_holder.kobj; +} + +struct mapped_device *dm_get_from_kobject(struct kobject *kobj) +{ + struct mapped_device *md; + + md = container_of(kobj, struct mapped_device, kobj_holder.kobj); + + if (test_bit(DMF_FREEING, &md->flags) || + dm_deleting_md(md)) + return NULL; + + dm_get(md); + return md; +} + +int dm_suspended_md(struct mapped_device *md) { return test_bit(DMF_SUSPENDED, &md->flags); } -int dm_noflush_suspending(struct dm_target *ti) +int dm_test_deferred_remove_flag(struct mapped_device *md) { - struct mapped_device *md = dm_table_get_md(ti->table); - int r = __noflush_suspending(md); + return test_bit(DMF_DEFERRED_REMOVE, &md->flags); +} - dm_put(md); +int dm_suspended(struct dm_target *ti) +{ + return dm_suspended_md(dm_table_get_md(ti->table)); +} +EXPORT_SYMBOL_GPL(dm_suspended); - return r; +int dm_noflush_suspending(struct dm_target *ti) +{ + return __noflush_suspending(dm_table_get_md(ti->table)); } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -static struct block_device_operations dm_blk_dops = { +struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) +{ + struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); + struct kmem_cache *cachep; + unsigned int pool_size; + unsigned int front_pad; + + if (!pools) + return NULL; + + if (type == DM_TYPE_BIO_BASED) { + cachep = _io_cache; + pool_size = dm_get_reserved_bio_based_ios(); + front_pad = roundup(per_bio_data_size, 
__alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); + } else if (type == DM_TYPE_REQUEST_BASED) { + cachep = _rq_tio_cache; + pool_size = dm_get_reserved_rq_based_ios(); + front_pad = offsetof(struct dm_rq_clone_bio_info, clone); + /* per_bio_data_size is not used. See __bind_mempools(). */ + WARN_ON(per_bio_data_size != 0); + } else + goto out; + + pools->io_pool = mempool_create_slab_pool(pool_size, cachep); + if (!pools->io_pool) + goto out; + + pools->bs = bioset_create(pool_size, front_pad); + if (!pools->bs) + goto out; + + if (integrity && bioset_integrity_create(pools->bs, pool_size)) + goto out; + + return pools; + +out: + dm_free_md_mempools(pools); + + return NULL; +} + +void dm_free_md_mempools(struct dm_md_mempools *pools) +{ + if (!pools) + return; + + if (pools->io_pool) + mempool_destroy(pools->io_pool); + + if (pools->bs) + bioset_free(pools->bs); + + kfree(pools); +} + +static const struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, @@ -1695,8 +2937,6 @@ static struct block_device_operations dm_blk_dops = { .owner = THIS_MODULE }; -EXPORT_SYMBOL(dm_get_mapinfo); - /* * module hooks */ @@ -1705,6 +2945,13 @@ module_exit(dm_exit); module_param(major, uint, 0); MODULE_PARM_DESC(major, "The major number of the device mapper"); + +module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); + +module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1e59a0b0a78..ed76126aac5 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -15,6 +15,10 @@ #include <linux/list.h> #include <linux/blkdev.h> #include <linux/hdreg.h> +#include <linux/completion.h> +#include <linux/kobject.h> + +#include "dm-stats.h" /* * Suspend feature flags @@ -23,61 +27,119 @@ #define DM_SUSPEND_NOFLUSH_FLAG (1 << 1) /* + * Status feature flags + */ +#define DM_STATUS_NOFLUSH_FLAG (1 << 0) + +/* + * Type of table and mapped_device's mempool + */ +#define DM_TYPE_NONE 0 +#define DM_TYPE_BIO_BASED 1 +#define DM_TYPE_REQUEST_BASED 2 + +/* * List of devices that a metadevice uses and should open/close. */ -struct dm_dev { +struct dm_dev_internal { struct list_head list; - atomic_t count; - int mode; - struct block_device *bdev; - char name[16]; + struct dm_dev dm_dev; }; struct dm_table; +struct dm_md_mempools; /*----------------------------------------------------------------- * Internal table functions. 
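The front_pad computation in dm_alloc_md_mempools() above reserves room, ahead of every bio allocated from the bioset, for the target's per-bio data plus the dm_target_io header up to its embedded clone bio. A small stand-alone illustration of that arithmetic with made-up sizes; fake_target_io only mimics the shape of dm_target_io and is not the real layout:

    #include <stdio.h>
    #include <stddef.h>

    struct fake_bio { char opaque[104]; };          /* pretend bio size */
    struct fake_target_io {
        void *io;                                   /* bookkeeping fields ... */
        void *ti;
        unsigned target_bio_nr;
        struct fake_bio clone;                      /* embedded clone bio, last member */
    };

    /* Same rounding as the kernel's roundup() macro. */
    #define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

    int main(void)
    {
        size_t per_bio_data_size = 24;              /* hypothetical target request */
        size_t front_pad =
            ROUNDUP(per_bio_data_size, __alignof__(struct fake_target_io)) +
            offsetof(struct fake_target_io, clone);

        /* Every bio from the bioset then carries the per-bio data and the
         * target_io header in front of the bio itself.                    */
        printf("front_pad = %zu bytes\n", front_pad);
        return 0;
    }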
*---------------------------------------------------------------*/ +void dm_table_destroy(struct dm_table *t); void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); -void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q); +bool dm_table_has_no_data_devices(struct dm_table *table); +int dm_calculate_queue_limits(struct dm_table *table, + struct queue_limits *limits); +void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *limits); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); -void dm_table_unplug_all(struct dm_table *t); +int dm_table_any_busy_target(struct dm_table *t); +unsigned dm_table_get_type(struct dm_table *t); +struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); +bool dm_table_request_based(struct dm_table *t); +bool dm_table_supports_discards(struct dm_table *t); +void dm_table_free_md_mempools(struct dm_table *t); +struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); + +int dm_queue_merge_is_compulsory(struct request_queue *q); + +void dm_lock_md_type(struct mapped_device *md); +void dm_unlock_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, unsigned type); +unsigned dm_get_md_type(struct mapped_device *md); +struct target_type *dm_get_immutable_target_type(struct mapped_device *md); + +int dm_setup_md_queue(struct mapped_device *md); /* * To check the return value from dm_table_find_target(). */ #define dm_target_is_valid(t) ((t)->table) +/* + * To check whether the target type is bio-based or not (request-based). + */ +#define dm_target_bio_based(t) ((t)->type->map != NULL) + +/* + * To check whether the target type is request-based or not (bio-based). + */ +#define dm_target_request_based(t) ((t)->type->map_rq != NULL) + +/* + * To check whether the target type is a hybrid (capable of being + * either request-based or bio-based). + */ +#define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t)) + /*----------------------------------------------------------------- * A registry of target types. *---------------------------------------------------------------*/ int dm_target_init(void); void dm_target_exit(void); struct target_type *dm_get_target_type(const char *name); -void dm_put_target_type(struct target_type *t); +void dm_put_target_type(struct target_type *tt); int dm_target_iterate(void (*iter_func)(struct target_type *tt, void *param), void *param); -/*----------------------------------------------------------------- - * Useful inlines. - *---------------------------------------------------------------*/ -static inline int array_too_big(unsigned long fixed, unsigned long obj, - unsigned long num) -{ - return (num > (ULONG_MAX - fixed) / obj); -} - int dm_split_args(int *argc, char ***argvp, char *input); /* + * Is this mapped_device being deleted? + */ +int dm_deleting_md(struct mapped_device *md); + +/* + * Is this mapped_device suspended? + */ +int dm_suspended_md(struct mapped_device *md); + +/* + * Test if the device is scheduled for deferred remove. 
+ */ +int dm_test_deferred_remove_flag(struct mapped_device *md); + +/* + * Try to remove devices marked for deferred removal. + */ +void dm_deferred_remove(void); + +/* * The device-mapper can be driven through one of two interfaces; * ioctl or filesystem, depending which patch you have applied. */ @@ -85,6 +147,29 @@ int dm_interface_init(void); void dm_interface_exit(void); /* + * sysfs interface + */ +struct dm_kobject_holder { + struct kobject kobj; + struct completion completion; +}; + +static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) +{ + return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; +} + +int dm_sysfs_init(struct mapped_device *md); +void dm_sysfs_exit(struct mapped_device *md); +struct kobject *dm_kobject(struct mapped_device *md); +struct mapped_device *dm_get_from_kobject(struct kobject *kobj); + +/* + * The kobject helper + */ +void dm_kobject_release(struct kobject *kobj); + +/* * Targets for linear and striped mappings */ int dm_linear_init(void); @@ -93,14 +178,46 @@ void dm_linear_exit(void); int dm_stripe_init(void); void dm_stripe_exit(void); -void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); -union map_info *dm_get_mapinfo(struct bio *bio); +/* + * mapped_device operations + */ +void dm_destroy(struct mapped_device *md); +void dm_destroy_immediate(struct mapped_device *md); int dm_open_count(struct mapped_device *md); -int dm_lock_for_deletion(struct mapped_device *md); +int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred); +int dm_cancel_deferred_remove(struct mapped_device *md); +int dm_request_based(struct mapped_device *md); +sector_t dm_get_size(struct mapped_device *md); +struct request_queue *dm_get_md_queue(struct mapped_device *md); +struct dm_stats *dm_get_stats(struct mapped_device *md); -void dm_kobject_uevent(struct mapped_device *md); +int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, + unsigned cookie); + +void dm_internal_suspend(struct mapped_device *md); +void dm_internal_resume(struct mapped_device *md); + +int dm_io_init(void); +void dm_io_exit(void); int dm_kcopyd_init(void); void dm_kcopyd_exit(void); +/* + * Mempool operations + */ +struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size); +void dm_free_md_mempools(struct dm_md_mempools *pools); + +/* + * Helpers that are used by DM core + */ +unsigned dm_get_reserved_bio_based_ios(void); +unsigned dm_get_reserved_rq_based_ios(void); + +static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen) +{ + return !maxlen || strlen(result) + 1 >= maxlen; +} + #endif diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 268547dbfbd..e8b4574956c 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -30,7 +30,7 @@ * * Different modes can be active at a time, but only * one can be set at array creation. Others can be added later. - * A mode can be one-shot or recurrent with the recurrance being + * A mode can be one-shot or recurrent with the recurrence being * once in every N requests. * The bottom 5 bits of the "layout" indicate the mode. The * remainder indicate a period, or 0 for one-shot. 
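Given ModeShift of 5, the 'layout' value described above packs the fault mode into the low bits and the period into the rest. A tiny stand-alone example of encoding and decoding it; ModeMask is assumed here to be (1 << ModeShift) - 1, consistent with the shift, and the mode/period numbers are purely illustrative:

    #include <stdio.h>

    #define ModeShift 5
    #define ModeMask  ((1 << ModeShift) - 1)    /* assumed from ModeShift */

    int main(void)
    {
        int mode = 3;                           /* hypothetical fault mode   */
        int period = 100;                       /* fail once per 100 requests,
                                                   0 would mean one-shot     */
        int layout = (period << ModeShift) | mode;

        /* reshape() above recovers the two fields the same way: */
        printf("layout=%d -> mode=%d period=%d\n",
               layout, layout & ModeMask, layout >> ModeShift);
        return 0;
    }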
@@ -62,31 +62,36 @@ #define ModeShift 5 #define MaxFault 50 -#include <linux/raid/md.h> +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/raid/md_u.h> +#include <linux/slab.h> +#include "md.h" +#include <linux/seq_file.h> static void faulty_fail(struct bio *bio, int error) { struct bio *b = bio->bi_private; - b->bi_size = bio->bi_size; - b->bi_sector = bio->bi_sector; + b->bi_iter.bi_size = bio->bi_iter.bi_size; + b->bi_iter.bi_sector = bio->bi_iter.bi_sector; bio_put(bio); bio_io_error(b); } -typedef struct faulty_conf { +struct faulty_conf { int period[Modes]; atomic_t counters[Modes]; sector_t faults[MaxFault]; int modes[MaxFault]; int nfaults; - mdk_rdev_t *rdev; -} conf_t; + struct md_rdev *rdev; +}; -static int check_mode(conf_t *conf, int mode) +static int check_mode(struct faulty_conf *conf, int mode) { if (conf->period[mode] == 0 && atomic_read(&conf->counters[mode]) <= 0) @@ -101,7 +106,7 @@ static int check_mode(conf_t *conf, int mode) return 0; } -static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir) +static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir) { /* If we find a ReadFixable sector, we fix it ... */ int i; @@ -125,7 +130,7 @@ static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir) return 0; } -static void add_sector(conf_t *conf, sector_t start, int mode) +static void add_sector(struct faulty_conf *conf, sector_t start, int mode) { int i; int n = conf->nfaults; @@ -165,10 +170,9 @@ static void add_sector(conf_t *conf, sector_t start, int mode) conf->nfaults = n+1; } -static int make_request(struct request_queue *q, struct bio *bio) +static void make_request(struct mddev *mddev, struct bio *bio) { - mddev_t *mddev = q->queuedata; - conf_t *conf = (conf_t*)mddev->private; + struct faulty_conf *conf = mddev->private; int failit = 0; if (bio_data_dir(bio) == WRITE) { @@ -178,50 +182,53 @@ static int make_request(struct request_queue *q, struct bio *bio) * just fail immediately */ bio_endio(bio, -EIO); - return 0; + return; } - if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), - WRITE)) + if (check_sector(conf, bio->bi_iter.bi_sector, + bio_end_sector(bio), WRITE)) failit = 1; if (check_mode(conf, WritePersistent)) { - add_sector(conf, bio->bi_sector, WritePersistent); + add_sector(conf, bio->bi_iter.bi_sector, + WritePersistent); failit = 1; } if (check_mode(conf, WriteTransient)) failit = 1; } else { /* read request */ - if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9), - READ)) + if (check_sector(conf, bio->bi_iter.bi_sector, + bio_end_sector(bio), READ)) failit = 1; if (check_mode(conf, ReadTransient)) failit = 1; if (check_mode(conf, ReadPersistent)) { - add_sector(conf, bio->bi_sector, ReadPersistent); + add_sector(conf, bio->bi_iter.bi_sector, + ReadPersistent); failit = 1; } if (check_mode(conf, ReadFixable)) { - add_sector(conf, bio->bi_sector, ReadFixable); + add_sector(conf, bio->bi_iter.bi_sector, + ReadFixable); failit = 1; } } if (failit) { - struct bio *b = bio_clone(bio, GFP_NOIO); + struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); + b->bi_bdev = conf->rdev->bdev; b->bi_private = bio; b->bi_end_io = faulty_fail; - generic_make_request(b); - return 0; - } else { + bio = b; + } else bio->bi_bdev = conf->rdev->bdev; - return 1; - } + + generic_make_request(bio); } -static void status(struct seq_file *seq, mddev_t *mddev) +static void status(struct seq_file *seq, struct mddev *mddev) { - conf_t *conf = 
(conf_t*)mddev->private; + struct faulty_conf *conf = mddev->private; int n; if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) @@ -252,14 +259,14 @@ static void status(struct seq_file *seq, mddev_t *mddev) } -static int reconfig(mddev_t *mddev, int layout, int chunk_size) +static int reshape(struct mddev *mddev) { - int mode = layout & ModeMask; - int count = layout >> ModeShift; - conf_t *conf = mddev->private; + int mode = mddev->new_layout & ModeMask; + int count = mddev->new_layout >> ModeShift; + struct faulty_conf *conf = mddev->private; - if (chunk_size != -1) - return -EINVAL; + if (mddev->new_layout < 0) + return 0; /* new layout */ if (mode == ClearFaults) @@ -276,17 +283,34 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size) atomic_set(&conf->counters[mode], count); } else return -EINVAL; + mddev->new_layout = -1; mddev->layout = -1; /* makes sure further changes come through */ return 0; } -static int run(mddev_t *mddev) +static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks) { - mdk_rdev_t *rdev; - struct list_head *tmp; + WARN_ONCE(raid_disks, + "%s does not support generic reshape\n", __func__); + + if (sectors == 0) + return mddev->dev_sectors; + + return sectors; +} + +static int run(struct mddev *mddev) +{ + struct md_rdev *rdev; int i; + struct faulty_conf *conf; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; - conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL); + conf = kmalloc(sizeof(*conf), GFP_KERNEL); + if (!conf) + return -ENOMEM; for (i=0; i<Modes; i++) { atomic_set(&conf->counters[i], 0); @@ -294,27 +318,30 @@ static int run(mddev_t *mddev) } conf->nfaults = 0; - rdev_for_each(rdev, tmp, mddev) + rdev_for_each(rdev, mddev) { conf->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + } - mddev->array_sectors = mddev->size * 2; + md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); mddev->private = conf; - reconfig(mddev, mddev->layout, -1); + reshape(mddev); return 0; } -static int stop(mddev_t *mddev) +static int stop(struct mddev *mddev) { - conf_t *conf = (conf_t *)mddev->private; + struct faulty_conf *conf = mddev->private; kfree(conf); mddev->private = NULL; return 0; } -static struct mdk_personality faulty_personality = +static struct md_personality faulty_personality = { .name = "faulty", .level = LEVEL_FAULTY, @@ -323,7 +350,8 @@ static struct mdk_personality faulty_personality = .run = run, .stop = stop, .status = status, - .reconfig = reconfig, + .check_reshape = reshape, + .size = faulty_size, }; static int __init raid_init(void) @@ -339,6 +367,7 @@ static void raid_exit(void) module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Fault injection personality for MD"); MODULE_ALIAS("md-personality-10"); /* faulty */ MODULE_ALIAS("md-faulty"); MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index b1eebf88c20..56f534b4a2d 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -16,35 +16,40 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#include <linux/blkdev.h> +#include <linux/raid/md_u.h> +#include <linux/seq_file.h> #include <linux/module.h> - -#include <linux/raid/md.h> #include <linux/slab.h> -#include <linux/raid/linear.h> - -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER -#define MD_PERSONALITY +#include "md.h" +#include "linear.h" /* * find which device holds a particular offset */ -static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) +static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) { - dev_info_t *hash; - linear_conf_t *conf = mddev_to_conf(mddev); - sector_t block = sector >> 1; + int lo, mid, hi; + struct linear_conf *conf; + + lo = 0; + hi = mddev->raid_disks - 1; + conf = rcu_dereference(mddev->private); /* - * sector_div(a,b) returns the remainer and sets a to a/b + * Binary Search */ - block >>= conf->preshift; - (void)sector_div(block, conf->hash_spacing); - hash = conf->hash_table[block]; - while ((sector>>1) >= (hash->size + hash->offset)) - hash++; - return hash; + while (hi > lo) { + + mid = (hi + lo) / 2; + if (sector < conf->disks[mid].end_sector) + hi = mid; + else + lo = mid + 1; + } + + return conf->disks + lo; } /** @@ -59,13 +64,24 @@ static int linear_mergeable_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - mddev_t *mddev = q->queuedata; - dev_info_t *dev0; + struct mddev *mddev = q->queuedata; + struct dev_info *dev0; unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + int maxbytes = biovec->bv_len; + struct request_queue *subq; + rcu_read_lock(); dev0 = which_dev(mddev, sector); - maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); + maxsectors = dev0->end_sector - sector; + subq = bdev_get_queue(dev0->rdev->bdev); + if (subq->merge_bvec_fn) { + bvm->bi_bdev = dev0->rdev->bdev; + bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors; + maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, + biovec)); + } + rcu_read_unlock(); if (maxsectors < bio_sectors) maxsectors = 0; @@ -73,50 +89,58 @@ static int linear_mergeable_bvec(struct request_queue *q, maxsectors -= bio_sectors; if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) - return biovec->bv_len; - /* The bytes available at this offset could be really big, - * so we cap at 2^31 to avoid overflow */ - if (maxsectors > (1 << (31-9))) - return 1<<31; - return maxsectors << 9; -} - -static void linear_unplug(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; - linear_conf_t *conf = mddev_to_conf(mddev); - int i; + return maxbytes; - for (i=0; i < mddev->raid_disks; i++) { - struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); - blk_unplug(r_queue); - } + if (maxsectors > (maxbytes >> 9)) + return maxbytes; + else + return maxsectors << 9; } static int linear_congested(void *data, int bits) { - mddev_t *mddev = data; - linear_conf_t *conf = mddev_to_conf(mddev); + struct mddev *mddev = data; + struct linear_conf *conf; int i, ret = 0; + if (mddev_congested(mddev, bits)) + return 1; + + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } + + rcu_read_unlock(); return ret; } -static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) +static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + struct linear_conf *conf; + 
sector_t array_sectors; + + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + array_sectors = conf->array_sectors; + rcu_read_unlock(); + + return array_sectors; +} + +static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) { - linear_conf_t *conf; - dev_info_t **table; - mdk_rdev_t *rdev; - int i, nb_zone, cnt; - sector_t min_spacing; - sector_t curr_offset; - struct list_head *tmp; - - conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t), + struct linear_conf *conf; + struct md_rdev *rdev; + int i, cnt; + bool discard_supported = false; + + conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), GFP_KERNEL); if (!conf) return NULL; @@ -124,122 +148,53 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) cnt = 0; conf->array_sectors = 0; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { int j = rdev->raid_disk; - dev_info_t *disk = conf->disks + j; + struct dev_info *disk = conf->disks + j; + sector_t sectors; if (j < 0 || j >= raid_disks || disk->rdev) { - printk("linear: disk numbering problem. Aborting!\n"); + printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", + mdname(mddev)); goto out; } disk->rdev = rdev; + if (mddev->chunk_sectors) { + sectors = rdev->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev->sectors = sectors * mddev->chunk_sectors; + } - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); - - disk->size = rdev->size; - conf->array_sectors += rdev->size * 2; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + conf->array_sectors += rdev->sectors; cnt++; + + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } if (cnt != raid_disks) { - printk("linear: not enough drives present. Aborting!\n"); + printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", + mdname(mddev)); goto out; } - min_spacing = conf->array_sectors / 2; - sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); - - /* min_spacing is the minimum spacing that will fit the hash - * table in one PAGE. This may be much smaller than needed. - * We find the smallest non-terminal set of consecutive devices - * that is larger than min_spacing as use the size of that as - * the actual spacing - */ - conf->hash_spacing = conf->array_sectors / 2; - for (i=0; i < cnt-1 ; i++) { - sector_t sz = 0; - int j; - for (j = i; j < cnt - 1 && sz < min_spacing; j++) - sz += conf->disks[j].size; - if (sz >= min_spacing && sz < conf->hash_spacing) - conf->hash_spacing = sz; - } + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - /* hash_spacing may be too large for sector_div to work with, - * so we might need to pre-shift - */ - conf->preshift = 0; - if (sizeof(sector_t) > sizeof(u32)) { - sector_t space = conf->hash_spacing; - while (space > (sector_t)(~(u32)0)) { - space >>= 1; - conf->preshift++; - } - } /* - * This code was restructured to work around a gcc-2.95.3 internal - * compiler error. Alter it with care. 
+ * Here we calculate the device offsets. */ - { - sector_t sz; - unsigned round; - unsigned long base; - - sz = conf->array_sectors >> (conf->preshift + 1); - sz += 1; /* force round-up */ - base = conf->hash_spacing >> conf->preshift; - round = sector_div(sz, base); - nb_zone = sz + (round ? 1 : 0); - } - BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); + conf->disks[0].end_sector = conf->disks[0].rdev->sectors; - conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, - GFP_KERNEL); - if (!conf->hash_table) - goto out; - - /* - * Here we generate the linear hash table - * First calculate the device offsets. - */ - conf->disks[0].offset = 0; for (i = 1; i < raid_disks; i++) - conf->disks[i].offset = - conf->disks[i-1].offset + - conf->disks[i-1].size; - - table = conf->hash_table; - curr_offset = 0; - i = 0; - for (curr_offset = 0; - curr_offset < conf->array_sectors / 2; - curr_offset += conf->hash_spacing) { - - while (i < raid_disks-1 && - curr_offset >= conf->disks[i+1].offset) - i++; - - *table ++ = conf->disks + i; - } - - if (conf->preshift) { - conf->hash_spacing >>= conf->preshift; - /* round hash_spacing up so that when we divide by it, - * we err on the side of "too-low", which is safest. - */ - conf->hash_spacing++; - } - - BUG_ON(table - conf->hash_table > nb_zone); + conf->disks[i].end_sector = + conf->disks[i-1].end_sector + + conf->disks[i].rdev->sectors; return conf; @@ -248,26 +203,33 @@ out: return NULL; } -static int linear_run (mddev_t *mddev) +static int linear_run (struct mddev *mddev) { - linear_conf_t *conf; + struct linear_conf *conf; + int ret; - mddev->queue->queue_lock = &mddev->queue->__queue_lock; + if (md_check_no_bitmap(mddev)) + return -EINVAL; conf = linear_conf(mddev, mddev->raid_disks); if (!conf) return 1; mddev->private = conf; - mddev->array_sectors = conf->array_sectors; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); - mddev->queue->unplug_fn = linear_unplug; mddev->queue->backing_dev_info.congested_fn = linear_congested; mddev->queue->backing_dev_info.congested_data = mddev; - return 0; + + ret = md_integrity_register(mddev); + if (ret) { + kfree(conf); + mddev->private = NULL; + } + return ret; } -static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) +static int linear_add(struct mddev *mddev, struct md_rdev *rdev) { /* Adding a drive to a linear array allows the array to grow. * It is permitted if the new drive has a matching superblock @@ -277,125 +239,124 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) * The current one is never freed until the array is stopped. * This avoids races. 
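Together, the cumulative end_sector values built just above and the binary search in which_dev() replace the old hash table: member i covers sectors [end_sector[i-1], end_sector[i]), and linear_make_request() later remaps an array sector by subtracting that start and adding the member's data_offset. A stand-alone illustration with three hypothetical members of 100, 50 and 200 sectors:

    #include <stdio.h>

    typedef unsigned long long sector_t;

    /* Cumulative end sectors, as linear_conf() builds them:
     *   end_sector[i] = end_sector[i-1] + member i's usable sectors */
    static const sector_t end_sector[] = { 100, 150, 350 };
    static const int raid_disks = 3;

    /* Same search as which_dev(): smallest i with sector < end_sector[i]. */
    static int which_dev(sector_t sector)
    {
        int lo = 0, hi = raid_disks - 1, mid;

        while (hi > lo) {
            mid = (hi + lo) / 2;
            if (sector < end_sector[mid])
                hi = mid;
            else
                lo = mid + 1;
        }
        return lo;
    }

    int main(void)
    {
        sector_t sector = 120;                      /* lands in member 1 */
        int i = which_dev(sector);
        sector_t start = i ? end_sector[i - 1] : 0; /* == end_sector - rdev->sectors */

        printf("array sector %llu -> member %d, member-relative sector %llu\n",
               sector, i, sector - start);          /* + data_offset in the driver */
        return 0;
    }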
*/ - linear_conf_t *newconf; + struct linear_conf *newconf, *oldconf; if (rdev->saved_raid_disk != mddev->raid_disks) return -EINVAL; rdev->raid_disk = rdev->saved_raid_disk; + rdev->saved_raid_disk = -1; newconf = linear_conf(mddev,mddev->raid_disks+1); if (!newconf) return -ENOMEM; - newconf->prev = mddev_to_conf(mddev); - mddev->private = newconf; + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held( + &mddev->reconfig_mutex)); mddev->raid_disks++; - mddev->array_sectors = newconf->array_sectors; + rcu_assign_pointer(mddev->private, newconf); + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + kfree_rcu(oldconf, rcu); return 0; } -static int linear_stop (mddev_t *mddev) +static int linear_stop (struct mddev *mddev) { - linear_conf_t *conf = mddev_to_conf(mddev); - + struct linear_conf *conf = + rcu_dereference_protected(mddev->private, + lockdep_is_held( + &mddev->reconfig_mutex)); + + /* + * We do not require rcu protection here since + * we hold reconfig_mutex for both linear_add and + * linear_stop, so they cannot race. + * We should make sure any old 'conf's are properly + * freed though. + */ + rcu_barrier(); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - do { - linear_conf_t *t = conf->prev; - kfree(conf->hash_table); - kfree(conf); - conf = t; - } while (conf); + kfree(conf); + mddev->private = NULL; return 0; } -static int linear_make_request (struct request_queue *q, struct bio *bio) +static void linear_make_request(struct mddev *mddev, struct bio *bio) { - const int rw = bio_data_dir(bio); - mddev_t *mddev = q->queuedata; - dev_info_t *tmp_dev; - sector_t block; - - if (unlikely(bio_barrier(bio))) { - bio_endio(bio, -EOPNOTSUPP); - return 0; + char b[BDEVNAME_SIZE]; + struct dev_info *tmp_dev; + struct bio *split; + sector_t start_sector, end_sector, data_offset; + + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); + return; } - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); - - tmp_dev = which_dev(mddev, bio->bi_sector); - block = bio->bi_sector >> 1; - - if (unlikely(block >= (tmp_dev->size + tmp_dev->offset) - || block < tmp_dev->offset)) { - char b[BDEVNAME_SIZE]; - - printk("linear_make_request: Block %llu out of bounds on " - "dev %s size %llu offset %llu\n", - (unsigned long long)block, - bdevname(tmp_dev->rdev->bdev, b), - (unsigned long long)tmp_dev->size, - (unsigned long long)tmp_dev->offset); - bio_io_error(bio); - return 0; - } - if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > - (tmp_dev->offset + tmp_dev->size)<<1)) { - /* This bio crosses a device boundary, so we have to - * split it. 
- */ - struct bio_pair *bp; - bp = bio_split(bio, bio_split_pool, - ((tmp_dev->offset + tmp_dev->size)<<1) - bio->bi_sector); - if (linear_make_request(q, &bp->bio1)) - generic_make_request(&bp->bio1); - if (linear_make_request(q, &bp->bio2)) - generic_make_request(&bp->bio2); - bio_pair_release(bp); - return 0; - } - - bio->bi_bdev = tmp_dev->rdev->bdev; - bio->bi_sector = bio->bi_sector - (tmp_dev->offset << 1) + tmp_dev->rdev->data_offset; + do { + rcu_read_lock(); + + tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); + start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + end_sector = tmp_dev->end_sector; + data_offset = tmp_dev->rdev->data_offset; + bio->bi_bdev = tmp_dev->rdev->bdev; + + rcu_read_unlock(); + + if (unlikely(bio->bi_iter.bi_sector >= end_sector || + bio->bi_iter.bi_sector < start_sector)) + goto out_of_bounds; + + if (unlikely(bio_end_sector(bio) > end_sector)) { + /* This bio crosses a device boundary, so we have to + * split it. + */ + split = bio_split(bio, end_sector - + bio->bi_iter.bi_sector, + GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } - return 1; + split->bi_iter.bi_sector = split->bi_iter.bi_sector - + start_sector + data_offset; + + if (unlikely((split->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { + /* Just ignore it */ + bio_endio(split, 0); + } else + generic_make_request(split); + } while (split != bio); + return; + +out_of_bounds: + printk(KERN_ERR + "md/linear:%s: make_request: Sector %llu out of bounds on " + "dev %s: %llu sectors, offset %llu\n", + mdname(mddev), + (unsigned long long)bio->bi_iter.bi_sector, + bdevname(tmp_dev->rdev->bdev, b), + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); + bio_io_error(bio); } -static void linear_status (struct seq_file *seq, mddev_t *mddev) +static void linear_status (struct seq_file *seq, struct mddev *mddev) { -#undef MD_DEBUG -#ifdef MD_DEBUG - int j; - linear_conf_t *conf = mddev_to_conf(mddev); - sector_t s = 0; - - seq_printf(seq, " "); - for (j = 0; j < mddev->raid_disks; j++) - { - char b[BDEVNAME_SIZE]; - s += conf->smallest_size; - seq_printf(seq, "[%s", - bdevname(conf->hash_table[j][0].rdev->bdev,b)); - - while (s > conf->hash_table[j][0].offset + - conf->hash_table[j][0].size) - seq_printf(seq, "/%s] ", - bdevname(conf->hash_table[j][1].rdev->bdev,b)); - else - seq_printf(seq, "] "); - } - seq_printf(seq, "\n"); -#endif - seq_printf(seq, " %dk rounding", mddev->chunk_size/1024); + seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); } -static struct mdk_personality linear_personality = +static struct md_personality linear_personality = { .name = "linear", .level = LEVEL_LINEAR, @@ -405,6 +366,7 @@ static struct mdk_personality linear_personality = .stop = linear_stop, .status = linear_status, .hot_add_disk = linear_add, + .size = linear_size, }; static int __init linear_init (void) @@ -421,6 +383,7 @@ static void linear_exit (void) module_init(linear_init); module_exit(linear_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Linear device concatenation personality for MD"); MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ MODULE_ALIAS("md-linear"); MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/linear.h b/drivers/md/linear.h new file mode 100644 index 00000000000..b685ddd7d7f --- /dev/null +++ b/drivers/md/linear.h @@ -0,0 +1,15 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +struct dev_info { + struct md_rdev *rdev; + sector_t end_sector; +}; + +struct linear_conf +{ 
+ struct rcu_head rcu; + sector_t array_sectors; + struct dev_info disks[0]; +}; +#endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 4790c83d78d..32fc19c540d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -32,53 +32,58 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/module.h> -#include <linux/kernel.h> #include <linux/kthread.h> -#include <linux/linkage.h> -#include <linux/raid/md.h> -#include <linux/raid/bitmap.h> +#include <linux/blkdev.h> #include <linux/sysctl.h> -#include <linux/buffer_head.h> /* for invalidate_bdev */ +#include <linux/seq_file.h> +#include <linux/fs.h> #include <linux/poll.h> -#include <linux/mutex.h> #include <linux/ctype.h> -#include <linux/freezer.h> - -#include <linux/init.h> - +#include <linux/string.h> +#include <linux/hdreg.h> +#include <linux/proc_fs.h> +#include <linux/random.h> +#include <linux/module.h> +#include <linux/reboot.h> #include <linux/file.h> - -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif - -#include <asm/unaligned.h> - -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER - -/* 63 partitions with the alternate major number (mdp) */ -#define MdpMinorShift 6 - -#define DEBUG 0 -#define dprintk(x...) ((void)(DEBUG && printk(x))) - +#include <linux/compat.h> +#include <linux/delay.h> +#include <linux/raid/md_p.h> +#include <linux/raid/md_u.h> +#include <linux/slab.h> +#include "md.h" +#include "bitmap.h" #ifndef MODULE -static void autostart_arrays (int part); +static void autostart_arrays(int part); #endif +/* pers_list is a list of registered personalities protected + * by pers_lock. + * pers_lock does extra service to protect accesses to + * mddev->thread when the mutex cannot be held. + */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); static void md_print_devices(void); static DECLARE_WAIT_QUEUE_HEAD(resync_wait); +static struct workqueue_struct *md_wq; +static struct workqueue_struct *md_misc_wq; + +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this); #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } /* + * Default number of read corrections we'll attempt on an rdev + * before ejecting it from the array. We divide the read error + * count by 2 for every hour elapsed between read errors. + */ +#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. * Increase it if you want to have more _guaranteed_ speed. Note that @@ -93,13 +98,13 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; -static inline int speed_min(mddev_t *mddev) +static inline int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } -static inline int speed_max(mddev_t *mddev) +static inline int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? 
mddev->sync_speed_max : sysctl_speed_limit_max; @@ -107,52 +112,77 @@ static inline int speed_max(mddev_t *mddev) static struct ctl_table_header *raid_table_header; -static ctl_table raid_table[] = { +static struct ctl_table raid_table[] = { { - .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), .mode = S_IRUGO|S_IWUSR, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { .ctl_name = 0 } + { } }; -static ctl_table raid_dir_table[] = { +static struct ctl_table raid_dir_table[] = { { - .ctl_name = DEV_RAID, .procname = "raid", .maxlen = 0, .mode = S_IRUGO|S_IXUGO, .child = raid_table, }, - { .ctl_name = 0 } + { } }; -static ctl_table raid_root_table[] = { +static struct ctl_table raid_root_table[] = { { - .ctl_name = CTL_DEV, .procname = "dev", .maxlen = 0, .mode = 0555, .child = raid_dir_table, }, - { .ctl_name = 0 } + { } }; -static struct block_device_operations md_fops; +static const struct block_device_operations md_fops; static int start_readonly; +/* bio_clone_mddev + * like bio_clone, but with a local bio set + */ + +struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, + struct mddev *mddev) +{ + struct bio *b; + + if (!mddev || !mddev->bio_set) + return bio_alloc(gfp_mask, nr_iovecs); + + b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); + if (!b) + return NULL; + return b; +} +EXPORT_SYMBOL_GPL(bio_alloc_mddev); + +struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, + struct mddev *mddev) +{ + if (!mddev || !mddev->bio_set) + return bio_clone(bio, gfp_mask); + + return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); +} +EXPORT_SYMBOL_GPL(bio_clone_mddev); + /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -165,7 +195,7 @@ static int start_readonly; */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; -void md_new_event(mddev_t *mddev) +void md_new_event(struct mddev *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); @@ -175,7 +205,7 @@ EXPORT_SYMBOL_GPL(md_new_event); /* Alternate version that can be called from interrupts * when calling sysfs_notify isn't needed. */ -static void md_new_event_inintr(mddev_t *mddev) +static void md_new_event_inintr(struct mddev *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); @@ -196,62 +226,321 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * Any code which breaks out of this loop while own * a reference to the current mddev and must mddev_put it. 
*/ -#define for_each_mddev(mddev,tmp) \ +#define for_each_mddev(_mddev,_tmp) \ \ for (({ spin_lock(&all_mddevs_lock); \ - tmp = all_mddevs.next; \ - mddev = NULL;}); \ - ({ if (tmp != &all_mddevs) \ - mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ + _tmp = all_mddevs.next; \ + _mddev = NULL;}); \ + ({ if (_tmp != &all_mddevs) \ + mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ spin_unlock(&all_mddevs_lock); \ - if (mddev) mddev_put(mddev); \ - mddev = list_entry(tmp, mddev_t, all_mddevs); \ - tmp != &all_mddevs;}); \ + if (_mddev) mddev_put(_mddev); \ + _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ + _tmp != &all_mddevs;}); \ ({ spin_lock(&all_mddevs_lock); \ - tmp = tmp->next;}) \ + _tmp = _tmp->next;}) \ ) -static int md_fail_request (struct request_queue *q, struct bio *bio) +/* Rather than calling directly into the personality make_request function, + * IO requests come here first so that we can check if the device is + * being suspended pending a reconfiguration. + * We hold a refcount over the call to ->make_request. By the time that + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. + */ +static void md_make_request(struct request_queue *q, struct bio *bio) { - bio_io_error(bio); - return 0; + const int rw = bio_data_dir(bio); + struct mddev *mddev = q->queuedata; + int cpu; + unsigned int sectors; + + if (mddev == NULL || mddev->pers == NULL + || !mddev->ready) { + bio_io_error(bio); + return; + } + if (mddev->ro == 1 && unlikely(rw == WRITE)) { + bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); + return; + } + smp_rmb(); /* Ensure implications of 'active' are visible */ + rcu_read_lock(); + if (mddev->suspended) { + DEFINE_WAIT(__wait); + for (;;) { + prepare_to_wait(&mddev->sb_wait, &__wait, + TASK_UNINTERRUPTIBLE); + if (!mddev->suspended) + break; + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + } + finish_wait(&mddev->sb_wait, &__wait); + } + atomic_inc(&mddev->active_io); + rcu_read_unlock(); + + /* + * save the sectors now since our bio can + * go away inside make_request + */ + sectors = bio_sectors(bio); + mddev->pers->make_request(mddev, bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); + part_stat_unlock(); + + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); +} + +/* mddev_suspend makes sure no new requests are submitted + * to the device, and that any requests that have been submitted + * are completely handled. + * Once ->stop is called and completes, the module will be completely + * unused. 
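md_make_request() and mddev_suspend() above cooperate through a simple drain protocol: submitters sleep on sb_wait while ->suspended is set and bump ->active_io around each request, while the suspender sets the flag and waits for active_io to reach zero before quiescing the personality. Below is a minimal userspace sketch of that drain pattern, with a mutex and condition variable standing in for the kernel's atomics, RCU and wait queue; all names in it are illustrative, not kernel API.

/* Illustrative only: the kernel uses atomic_t, RCU and wait queues; this
 * sketch swaps them for a mutex + condition variable to show the shape. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int active_io;          /* stands in for mddev->active_io */
static int array_suspended;    /* stands in for mddev->suspended */

static void io_begin(void)     /* cf. the entry of md_make_request() */
{
	pthread_mutex_lock(&lock);
	while (array_suspended)              /* hold new requests while quiescing */
		pthread_cond_wait(&cond, &lock);
	active_io++;
	pthread_mutex_unlock(&lock);
}

static void io_end(void)       /* cf. the exit of md_make_request() */
{
	pthread_mutex_lock(&lock);
	if (--active_io == 0)
		pthread_cond_broadcast(&cond);   /* wake a waiting suspender */
	pthread_mutex_unlock(&lock);
}

static void array_suspend(void)          /* cf. mddev_suspend() */
{
	pthread_mutex_lock(&lock);
	array_suspended = 1;
	while (active_io > 0)                /* drain in-flight requests */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

static void array_resume(void)           /* cf. mddev_resume() */
{
	pthread_mutex_lock(&lock);
	array_suspended = 0;
	pthread_cond_broadcast(&cond);       /* let blocked submitters continue */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	io_begin();
	io_end();          /* the only request has completed ...   */
	array_suspend();   /* ... so the drain returns immediately */
	array_resume();
	printf("drained, suspended and resumed\n");
	return 0;
}

The kernel version reaches the same state without a lock on the submission path by pairing the atomic counter with RCU; the synchronize_rcu() in mddev_suspend() is what guarantees every submitter has either seen the flag or already taken its active_io reference.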
+ */ +void mddev_suspend(struct mddev *mddev) +{ + BUG_ON(mddev->suspended); + mddev->suspended = 1; + synchronize_rcu(); + wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); + mddev->pers->quiesce(mddev, 1); + + del_timer_sync(&mddev->safemode_timer); +} +EXPORT_SYMBOL_GPL(mddev_suspend); + +void mddev_resume(struct mddev *mddev) +{ + mddev->suspended = 0; + wake_up(&mddev->sb_wait); + mddev->pers->quiesce(mddev, 0); + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ +} +EXPORT_SYMBOL_GPL(mddev_resume); + +int mddev_congested(struct mddev *mddev, int bits) +{ + return mddev->suspended; } +EXPORT_SYMBOL(mddev_congested); + +/* + * Generic flush handling for md + */ + +static void md_end_flush(struct bio *bio, int err) +{ + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; + + rdev_dec_pending(rdev, mddev); + + if (atomic_dec_and_test(&mddev->flush_pending)) { + /* The pre-request flush has finished */ + queue_work(md_wq, &mddev->flush_work); + } + bio_put(bio); +} + +static void md_submit_flush_data(struct work_struct *ws); -static inline mddev_t *mddev_get(mddev_t *mddev) +static void submit_flushes(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct md_rdev *rdev; + + INIT_WORK(&mddev->flush_work, md_submit_flush_data); + atomic_set(&mddev->flush_pending, 1); + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) { + /* Take two references, one is dropped + * when request finishes, one after + * we reclaim rcu_read_lock + */ + struct bio *bi; + atomic_inc(&rdev->nr_pending); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); + bi->bi_end_io = md_end_flush; + bi->bi_private = rdev; + bi->bi_bdev = rdev->bdev; + atomic_inc(&mddev->flush_pending); + submit_bio(WRITE_FLUSH, bi); + rcu_read_lock(); + rdev_dec_pending(rdev, mddev); + } + rcu_read_unlock(); + if (atomic_dec_and_test(&mddev->flush_pending)) + queue_work(md_wq, &mddev->flush_work); +} + +static void md_submit_flush_data(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct bio *bio = mddev->flush_bio; + + if (bio->bi_iter.bi_size == 0) + /* an empty barrier - all done */ + bio_endio(bio, 0); + else { + bio->bi_rw &= ~REQ_FLUSH; + mddev->pers->make_request(mddev, bio); + } + + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); +} + +void md_flush_request(struct mddev *mddev, struct bio *bio) +{ + spin_lock_irq(&mddev->write_lock); + wait_event_lock_irq(mddev->sb_wait, + !mddev->flush_bio, + mddev->write_lock); + mddev->flush_bio = bio; + spin_unlock_irq(&mddev->write_lock); + + INIT_WORK(&mddev->flush_work, submit_flushes); + queue_work(md_wq, &mddev->flush_work); +} +EXPORT_SYMBOL(md_flush_request); + +void md_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct mddev *mddev = cb->data; + md_wakeup_thread(mddev->thread); + kfree(cb); +} +EXPORT_SYMBOL(md_unplug); + +static inline struct mddev *mddev_get(struct mddev *mddev) { atomic_inc(&mddev->active); return mddev; } -static void mddev_put(mddev_t *mddev) +static void mddev_delayed_delete(struct work_struct *ws); + +static void mddev_put(struct mddev *mddev) { + struct bio_set *bs = NULL; + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; - if (!mddev->raid_disks && list_empty(&mddev->disks)) 
{ - list_del(&mddev->all_mddevs); - spin_unlock(&all_mddevs_lock); - blk_cleanup_queue(mddev->queue); - kobject_put(&mddev->kobj); - } else - spin_unlock(&all_mddevs_lock); + if (!mddev->raid_disks && list_empty(&mddev->disks) && + mddev->ctime == 0 && !mddev->hold_active) { + /* Array is not configured at all, and not held active, + * so destroy it */ + list_del_init(&mddev->all_mddevs); + bs = mddev->bio_set; + mddev->bio_set = NULL; + if (mddev->gendisk) { + /* We did a probe so need to clean up. Call + * queue_work inside the spinlock so that + * flush_workqueue() after mddev_find will + * succeed in waiting for the work to be done. + */ + INIT_WORK(&mddev->del_work, mddev_delayed_delete); + queue_work(md_misc_wq, &mddev->del_work); + } else + kfree(mddev); + } + spin_unlock(&all_mddevs_lock); + if (bs) + bioset_free(bs); +} + +void mddev_init(struct mddev *mddev) +{ + mutex_init(&mddev->open_mutex); + mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->bitmap_info.mutex); + INIT_LIST_HEAD(&mddev->disks); + INIT_LIST_HEAD(&mddev->all_mddevs); + init_timer(&mddev->safemode_timer); + atomic_set(&mddev->active, 1); + atomic_set(&mddev->openers, 0); + atomic_set(&mddev->active_io, 0); + spin_lock_init(&mddev->write_lock); + atomic_set(&mddev->flush_pending, 0); + init_waitqueue_head(&mddev->sb_wait); + init_waitqueue_head(&mddev->recovery_wait); + mddev->reshape_position = MaxSector; + mddev->reshape_backwards = 0; + mddev->last_sync_action = "none"; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->level = LEVEL_NONE; } +EXPORT_SYMBOL_GPL(mddev_init); -static mddev_t * mddev_find(dev_t unit) +static struct mddev * mddev_find(dev_t unit) { - mddev_t *mddev, *new = NULL; + struct mddev *mddev, *new = NULL; + + if (unit && MAJOR(unit) != MD_MAJOR) + unit &= ~((1<<MdpMinorShift)-1); retry: spin_lock(&all_mddevs_lock); - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == unit) { - mddev_get(mddev); + + if (unit) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) { + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + kfree(new); + return mddev; + } + + if (new) { + list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); - kfree(new); - return mddev; + new->hold_active = UNTIL_IOCTL; + return new; } - - if (new) { + } else if (new) { + /* find an unused unit number */ + static int next_minor = 512; + int start = next_minor; + int is_free = 0; + int dev = 0; + while (!is_free) { + dev = MKDEV(MD_MAJOR, next_minor); + next_minor++; + if (next_minor > MINORMASK) + next_minor = 0; + if (next_minor == start) { + /* Oh dear, all in use. 
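When mddev_find() is called without a fixed unit, the hunk above allocates a floating minor by scanning from a remembered position, wrapping at MINORMASK and giving up only after a full lap. A standalone sketch of that wrap-around search follows; minor_in_use(), the starting value and the mask are stand-ins for the all_mddevs walk and the kernel constants.

/* Illustrative sketch of the "scan from the last position, wrap, stop
 * after one full lap" search used for floating md minors above. */
#include <stdbool.h>
#include <stdio.h>

#define MINOR_LIMIT 255                  /* stand-in for MINORMASK */

static bool minor_in_use(int minor)      /* stand-in for the all_mddevs walk */
{
	return minor == 2 || minor == 3;     /* pretend two arrays already exist */
}

static int next_minor = 2;               /* cf. "static int next_minor = 512" */

static int find_free_minor(void)
{
	int start = next_minor;

	do {
		int candidate = next_minor++;
		if (next_minor > MINOR_LIMIT)
			next_minor = 0;
		if (!minor_in_use(candidate))
			return candidate;
	} while (next_minor != start);       /* a full lap: everything is in use */

	return -1;
}

int main(void)
{
	printf("allocated minor %d\n", find_free_minor());
	return 0;
}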
*/ + spin_unlock(&all_mddevs_lock); + kfree(new); + return NULL; + } + + is_free = 1; + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == dev) { + is_free = 0; + break; + } + } + new->unit = dev; + new->md_minor = MINOR(dev); + new->hold_active = UNTIL_STOP; list_add(&new->all_mddevs, &all_mddevs); spin_unlock(&all_mddevs_lock); return new; @@ -268,77 +557,126 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; - mutex_init(&new->reconfig_mutex); - INIT_LIST_HEAD(&new->disks); - INIT_LIST_HEAD(&new->all_mddevs); - init_timer(&new->safemode_timer); - atomic_set(&new->active, 1); - atomic_set(&new->openers, 0); - spin_lock_init(&new->write_lock); - init_waitqueue_head(&new->sb_wait); - init_waitqueue_head(&new->recovery_wait); - new->reshape_position = MaxSector; - new->resync_min = 0; - new->resync_max = MaxSector; - new->level = LEVEL_NONE; - - new->queue = blk_alloc_queue(GFP_KERNEL); - if (!new->queue) { - kfree(new); - return NULL; - } - /* Can be unlocked because the queue is new: no concurrency */ - queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue); - - blk_queue_make_request(new->queue, md_fail_request); + mddev_init(new); goto retry; } -static inline int mddev_lock(mddev_t * mddev) +static inline int __must_check mddev_lock(struct mddev * mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); } -static inline int mddev_trylock(mddev_t * mddev) +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev * mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + +static inline int mddev_is_locked(struct mddev *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); +} + +static inline int mddev_trylock(struct mddev * mddev) { return mutex_trylock(&mddev->reconfig_mutex); } -static inline void mddev_unlock(mddev_t * mddev) +static struct attribute_group md_redundancy_group; + +static void mddev_unlock(struct mddev * mddev) { - mutex_unlock(&mddev->reconfig_mutex); + if (mddev->to_remove) { + /* These cannot be removed under reconfig_mutex as + * an access to the files will try to take reconfig_mutex + * while holding the file unremovable, which leads to + * a deadlock. + * So hold set sysfs_active while the remove in happeing, + * and anything else which might set ->to_remove or my + * otherwise change the sysfs namespace will fail with + * -EBUSY if sysfs_active is still set. + * We set sysfs_active under reconfig_mutex and elsewhere + * test it under the same mutex to ensure its correct value + * is seen. 
+ */ + struct attribute_group *to_remove = mddev->to_remove; + mddev->to_remove = NULL; + mddev->sysfs_active = 1; + mutex_unlock(&mddev->reconfig_mutex); + + if (mddev->kobj.sd) { + if (to_remove != &md_redundancy_group) + sysfs_remove_group(&mddev->kobj, to_remove); + if (mddev->pers == NULL || + mddev->pers->sync_request == NULL) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + } + } + mddev->sysfs_active = 0; + } else + mutex_unlock(&mddev->reconfig_mutex); + /* As we've dropped the mutex we need a spinlock to + * make sure the thread doesn't disappear + */ + spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); + spin_unlock(&pers_lock); } -static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) { - mdk_rdev_t * rdev; - struct list_head *tmp; + struct md_rdev *rdev; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) if (rdev->desc_nr == nr) return rdev; - } + return NULL; } -static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) +static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) { - struct list_head *tmp; - mdk_rdev_t *rdev; + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->desc_nr == nr) + return rdev; + + return NULL; +} + +static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) if (rdev->bdev->bd_dev == dev) return rdev; - } + return NULL; } -static struct mdk_personality *find_pers(int level, char *clevel) +static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) { - struct mdk_personality *pers; + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} + +static struct md_personality *find_pers(int level, char *clevel) +{ + struct md_personality *pers; list_for_each_entry(pers, &pers_list, list) { if (level != LEVEL_NONE && pers->level == level) return pers; @@ -349,22 +687,13 @@ static struct mdk_personality *find_pers(int level, char *clevel) } /* return the offset of the super block in 512byte sectors */ -static inline sector_t calc_dev_sboffset(struct block_device *bdev) +static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) { - sector_t num_sectors = bdev->bd_inode->i_size / 512; + sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; return MD_NEW_SIZE_SECTORS(num_sectors); } -static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) -{ - sector_t num_sectors = rdev->sb_start; - - if (chunk_size) - num_sectors &= ~((sector_t)chunk_size/512 - 1); - return num_sectors; -} - -static int alloc_disk_sb(mdk_rdev_t * rdev) +static int alloc_disk_sb(struct md_rdev * rdev) { if (rdev->sb_page) MD_BUG(); @@ -378,22 +707,28 @@ static int alloc_disk_sb(mdk_rdev_t * rdev) return 0; } -static void free_disk_sb(mdk_rdev_t * rdev) +void md_rdev_clear(struct md_rdev *rdev) { if (rdev->sb_page) { put_page(rdev->sb_page); rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_start = 0; - rdev->size = 0; + rdev->sectors = 0; + } + if (rdev->bb_page) { + put_page(rdev->bb_page); + rdev->bb_page = NULL; } + kfree(rdev->badblocks.page); + rdev->badblocks.page = NULL; } - +EXPORT_SYMBOL_GPL(md_rdev_clear); static void super_written(struct bio *bio, int error) { - mdk_rdev_t *rdev = bio->bi_private; - mddev_t *mddev = rdev->mddev; + struct md_rdev *rdev = 
bio->bi_private; + struct mddev *mddev = rdev->mddev; if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { printk("md: super_written gets error=%d, uptodate=%d\n", @@ -407,32 +742,7 @@ static void super_written(struct bio *bio, int error) bio_put(bio); } -static void super_written_barrier(struct bio *bio, int error) -{ - struct bio *bio2 = bio->bi_private; - mdk_rdev_t *rdev = bio2->bi_private; - mddev_t *mddev = rdev->mddev; - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && - error == -EOPNOTSUPP) { - unsigned long flags; - /* barriers don't appear to be supported :-( */ - set_bit(BarriersNotsupp, &rdev->flags); - mddev->barriers_work = 0; - spin_lock_irqsave(&mddev->write_lock, flags); - bio2->bi_next = mddev->biolist; - mddev->biolist = bio2; - spin_unlock_irqrestore(&mddev->write_lock, flags); - wake_up(&mddev->sb_wait); - bio_put(bio); - } else { - bio_put(bio2); - bio->bi_private = rdev; - super_written(bio, error); - } -} - -void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, +void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page) { /* write first size bytes of page to sector of rdev @@ -440,78 +750,50 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error - * - * As we might need to resubmit the request if BIO_RW_BARRIER - * causes ENOTSUPP, we allocate a spare bio... */ - struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); + struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); - bio->bi_bdev = rdev->bdev; - bio->bi_sector = sector; + bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; + bio->bi_iter.bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; - bio->bi_rw = rw; atomic_inc(&mddev->pending_writes); - if (!test_bit(BarriersNotsupp, &rdev->flags)) { - struct bio *rbio; - rw |= (1<<BIO_RW_BARRIER); - rbio = bio_clone(bio, GFP_NOIO); - rbio->bi_private = bio; - rbio->bi_end_io = super_written_barrier; - submit_bio(rw, rbio); - } else - submit_bio(rw, bio); + submit_bio(WRITE_FLUSH_FUA, bio); } -void md_super_wait(mddev_t *mddev) +void md_super_wait(struct mddev *mddev) { - /* wait for all superblock writes that were scheduled to complete. - * if any had to be retried (due to BARRIER problems), retry them - */ + /* wait for all superblock writes that were scheduled to complete */ DEFINE_WAIT(wq); for(;;) { prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); if (atomic_read(&mddev->pending_writes)==0) break; - while (mddev->biolist) { - struct bio *bio; - spin_lock_irq(&mddev->write_lock); - bio = mddev->biolist; - mddev->biolist = bio->bi_next ; - bio->bi_next = NULL; - spin_unlock_irq(&mddev->write_lock); - submit_bio(bio->bi_rw, bio); - } schedule(); } finish_wait(&mddev->sb_wait, &wq); } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio->bi_private); -} - -int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw) +int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, + struct page *page, int rw, bool metadata_op) { - struct bio *bio = bio_alloc(GFP_NOIO, 1); - struct completion event; + struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); int ret; - rw |= (1 << BIO_RW_SYNC); - - bio->bi_bdev = bdev; - bio->bi_sector = sector; + bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 
+ rdev->meta_bdev : rdev->bdev; + if (metadata_op) + bio->bi_iter.bi_sector = sector + rdev->sb_start; + else if (rdev->mddev->reshape_position != MaxSector && + (rdev->mddev->reshape_backwards == + (sector >= rdev->mddev->reshape_position))) + bio->bi_iter.bi_sector = sector + rdev->new_data_offset; + else + bio->bi_iter.bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); + submit_bio_wait(rw, bio); ret = test_bit(BIO_UPTODATE, &bio->bi_flags); bio_put(bio); @@ -519,7 +801,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, } EXPORT_SYMBOL_GPL(sync_page_io); -static int read_disk_sb(mdk_rdev_t * rdev, int size) +static int read_disk_sb(struct md_rdev * rdev, int size) { char b[BDEVNAME_SIZE]; if (!rdev->sb_page) { @@ -530,7 +812,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) return 0; - if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) + if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) goto fail; rdev->sb_loaded = 1; return 0; @@ -626,7 +908,7 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) * We rely on user-space to write the initial superblock, and support * reading and updating of superblocks. * Interface methods are: - * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) + * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) * loads and validates a superblock on dev. * if refdev != NULL, compare superblocks on both devices * Return: @@ -636,13 +918,13 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) * -EINVAL superblock incompatible or invalid * -othererror e.g. -EIO * - * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) + * int validate_super(struct mddev *mddev, struct md_rdev *dev) * Verify that dev is acceptable into mddev. * The first time, mddev->raid_disks will be 0, and data from * dev should be merged in. Subsequent calls check that dev * is new enough. Return 0 or -EINVAL * - * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) + * void sync_super(struct mddev *mddev, struct md_rdev *dev) * Update the superblock for rdev with data in mddev * This does not write to disc. * @@ -651,18 +933,41 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) struct super_type { char *name; struct module *owner; - int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, + int (*load_super)(struct md_rdev *rdev, + struct md_rdev *refdev, int minor_version); - int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); - void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); - unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, + int (*validate_super)(struct mddev *mddev, + struct md_rdev *rdev); + void (*sync_super)(struct mddev *mddev, + struct md_rdev *rdev); + unsigned long long (*rdev_size_change)(struct md_rdev *rdev, sector_t num_sectors); + int (*allow_new_offset)(struct md_rdev *rdev, + unsigned long long new_offset); }; /* + * Check that the given mddev has no bitmap. + * + * This function is called from the run method of all personalities that do not + * support bitmaps. It prints an error message and returns non-zero if mddev + * has a bitmap. Otherwise, it returns 0. 
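struct super_type above turns the metadata version into a method table: super_types[], defined further below, has one entry per on-disk format, and callers dispatch through load_super/validate_super/sync_super and friends instead of switching on the version at every call site. A compact illustration of that dispatch shape, with entry names and stub handlers invented for the example:

/* A version-dispatch table in the style of super_types[]: the metadata
 * major version indexes an entry of callbacks.  Handlers are stubs. */
#include <stdio.h>

struct super_ops {
	const char *name;
	int (*load)(int fd);          /* cf. load_super     */
	int (*validate)(int fd);      /* cf. validate_super */
};

static int load_v090(int fd)     { (void)fd; return 0; }
static int validate_v090(int fd) { (void)fd; return 0; }
static int load_v1(int fd)       { (void)fd; return 0; }
static int validate_v1(int fd)   { (void)fd; return 0; }

static const struct super_ops super_ops_table[] = {
	[0] = { .name = "0.90.0", .load = load_v090, .validate = validate_v090 },
	[1] = { .name = "md-1",   .load = load_v1,   .validate = validate_v1   },
};

int main(void)
{
	int major_version = 1;       /* would come from the on-disk superblock */
	const struct super_ops *ops = &super_ops_table[major_version];

	printf("using %s handlers: load=%d validate=%d\n",
	       ops->name, ops->load(-1), ops->validate(-1));
	return 0;
}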
+ * + */ +int md_check_no_bitmap(struct mddev *mddev) +{ + if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) + return 0; + printk(KERN_ERR "%s: bitmaps are not supported for %s\n", + mdname(mddev), mddev->pers->name); + return 1; +} +EXPORT_SYMBOL(md_check_no_bitmap); + +/* * load_super for 0.90.0 */ -static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; @@ -674,7 +979,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version * * It also happens to be a multiple of 4Kb. */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; @@ -682,7 +987,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = -EINVAL; bdevname(rdev->bdev, b); - sb = (mdp_super_t*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { printk(KERN_ERR "md: invalid raid superblock magic on %s\n", @@ -710,18 +1015,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; + rdev->new_data_offset = 0; rdev->sb_size = MD_SB_BYTES; - - if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { - if (sb->level != 1 && sb->level != 4 - && sb->level != 5 && sb->level != 6 - && sb->level != 10) { - /* FIXME use a better test */ - printk(KERN_WARNING - "md: bitmaps not supported for this level.\n"); - goto abort; - } - } + rdev->badblocks.shift = -1; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; @@ -732,7 +1028,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version ret = 1; } else { __u64 ev1, ev2; - mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); + mdp_super_t *refsb = page_address(refdev->sb_page); if (!uuid_equal(refsb, sb)) { printk(KERN_WARNING "md: %s has different UUID to %s\n", b, bdevname(refdev->bdev,b2)); @@ -751,9 +1047,15 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version else ret = 0; } - rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; + rdev->sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that. + * (not needed for Linear and RAID0 as metadata doesn't + * record this size) + */ + if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) + rdev->sectors = (2ULL << 32) - 2; - if (rdev->size < sb->size && sb->level > 1) + if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) /* "this cannot possibly happen" ... 
*/ ret = -EINVAL; @@ -764,47 +1066,53 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version /* * validate_super for 0.90.0 */ -static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) +static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) { mdp_disk_t *desc; - mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + mdp_super_t *sb = page_address(rdev->sb_page); __u64 ev1 = md_event(sb); rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; mddev->patch_version = sb->patch_version; mddev->external = 0; - mddev->chunk_size = sb->chunk_size; + mddev->chunk_sectors = sb->chunk_size >> 9; mddev->ctime = sb->ctime; mddev->utime = sb->utime; mddev->level = sb->level; mddev->clevel[0] = 0; mddev->layout = sb->layout; mddev->raid_disks = sb->raid_disks; - mddev->size = sb->size; + mddev->dev_sectors = ((sector_t)sb->size) * 2; mddev->events = ev1; - mddev->bitmap_offset = 0; - mddev->default_bitmap_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.space = 0; + /* bitmap can use 60 K after the 4K superblocks */ + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); + mddev->reshape_backwards = 0; if (mddev->minor_version >= 91) { mddev->reshape_position = sb->reshape_position; mddev->delta_disks = sb->delta_disks; mddev->new_level = sb->new_level; mddev->new_layout = sb->new_layout; - mddev->new_chunk = sb->new_chunk; + mddev->new_chunk_sectors = sb->new_chunk >> 9; + if (mddev->delta_disks < 0) + mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; } if (sb->state & (1<<MD_SB_CLEAN)) @@ -825,20 +1133,29 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = MD_SB_DISKS; if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && - mddev->bitmap_file == NULL) - mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->bitmap_info.file == NULL) { + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; + mddev->bitmap_info.space = + mddev->bitmap_info.default_space; + } } else if (mddev->pers == NULL) { - /* Insist on good event counter while assembling */ + /* Insist on good event counter while assembling, except + * for spares (which don't need an event count) */ ++ev1; - if (ev1 < mddev->events) - return -EINVAL; + if (sb->disks[rdev->desc_nr].state & ( + (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) + if (ev1 < mddev->events) + return -EINVAL; } else if (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an * older device ... but not too old. 
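The conversions in super_90_load() and super_90_validate() above lean on md's unit conventions: the 0.90 superblock records the per-device size in KiB and the chunk size in bytes, while the in-core fields (rdev->sectors, dev_sectors, chunk_sectors) are all 512-byte sectors, and the format can only describe about 4TB per device. A few worked conversions matching the shifts in the diff; the input values are made up.

/* Unit conversions used when loading 0.90 metadata (input values invented). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sb_size_kb  = 976762584ULL;   /* sb->size: device size in KiB */
	uint32_t chunk_bytes = 65536;          /* sb->chunk_size: bytes        */

	uint64_t dev_sectors   = sb_size_kb * 2;    /* KiB   -> 512-byte sectors */
	uint32_t chunk_sectors = chunk_bytes >> 9;  /* bytes -> 512-byte sectors */

	/* The cap applied in super_90_load(): (2 << 32) sectors of 512 bytes
	 * is 2^42 bytes, i.e. the "4TB" limit mentioned in the comments. */
	uint64_t cap_sectors = (2ULL << 32) - 2;

	printf("dev_sectors=%llu chunk_sectors=%u cap=%llu bytes\n",
	       (unsigned long long)dev_sectors,
	       (unsigned)chunk_sectors,
	       (unsigned long long)(cap_sectors * 512));
	return 0;
}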
*/ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@ -854,6 +1171,15 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) desc->raid_disk < mddev->raid_disks */) { set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; + rdev->saved_raid_disk = desc->raid_disk; + } else if (desc->state & (1<<MD_DISK_ACTIVE)) { + /* active but not in sync implies recovery up to + * reshape position. We don't know exactly where + * that is, so set to zero for now */ + if (mddev->minor_version >= 91) { + rdev->recovery_offset = 0; + rdev->raid_disk = desc->raid_disk; + } } if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); @@ -865,11 +1191,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) /* * sync_super for 0.90.0 */ -static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) +static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) { mdp_super_t *sb; - struct list_head *tmp; - mdk_rdev_t *rdev2; + struct md_rdev *rdev2; int next_spare = mddev->raid_disks; @@ -888,7 +1213,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) rdev->sb_size = MD_SB_BYTES; - sb = (mdp_super_t*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); memset(sb, 0, sizeof(*sb)); @@ -903,7 +1228,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->ctime = mddev->ctime; sb->level = mddev->level; - sb->size = mddev->size; + sb->size = mddev->dev_sectors / 2; sb->raid_disks = mddev->raid_disks; sb->md_minor = mddev->md_minor; sb->not_persistent = 0; @@ -920,7 +1245,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_level = mddev->new_level; sb->delta_disks = mddev->delta_disks; sb->new_layout = mddev->new_layout; - sb->new_chunk = mddev->new_chunk; + sb->new_chunk = mddev->new_chunk_sectors << 9; } mddev->minor_version = sb->minor_version; if (mddev->in_sync) @@ -934,17 +1259,28 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->recovery_cp = 0; sb->layout = mddev->layout; - sb->chunk_size = mddev->chunk_size; + sb->chunk_size = mddev->chunk_sectors << 9; - if (mddev->bitmap && mddev->bitmap_file == NULL) + if (mddev->bitmap && mddev->bitmap_info.file == NULL) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); - rdev_for_each(rdev2, tmp, mddev) { + rdev_for_each(rdev2, mddev) { mdp_disk_t *d; int desc_nr; - if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) - && !test_bit(Faulty, &rdev2->flags)) + int is_active = test_bit(In_sync, &rdev2->flags); + + if (rdev2->raid_disk >= 0 && + sb->minor_version >= 91) + /* we have nowhere to store the recovery_offset, + * but if it is not below the reshape_position, + * we can piggy-back on that. 
+ */ + is_active = 1; + if (rdev2->raid_disk < 0 || + test_bit(Faulty, &rdev2->flags)) + is_active = 0; + if (is_active) desc_nr = rdev2->raid_disk; else desc_nr = next_spare++; @@ -954,16 +1290,16 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) d->number = rdev2->desc_nr; d->major = MAJOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev); - if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) - && !test_bit(Faulty, &rdev2->flags)) + if (is_active) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ if (test_bit(Faulty, &rdev2->flags)) d->state = (1<<MD_DISK_FAULTY); - else if (test_bit(In_sync, &rdev2->flags)) { + else if (is_active) { d->state = (1<<MD_DISK_ACTIVE); - d->state |= (1<<MD_DISK_SYNC); + if (test_bit(In_sync, &rdev2->flags)) + d->state |= (1<<MD_DISK_SYNC); active++; working++; } else { @@ -999,21 +1335,32 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) * rdev_size_change for 0.90.0 */ static unsigned long long -super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) +super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ - if (rdev->mddev->bitmap_offset) + if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); if (!num_sectors || num_sectors > rdev->sb_start) num_sectors = rdev->sb_start; + /* Limit to 4TB as metadata cannot record more than that. + * 4TB == 2^32 KB, or 2*2^32 sectors. + */ + if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) + num_sectors = (2ULL << 32) - 2; md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); md_super_wait(rdev->mddev); - return num_sectors / 2; /* kB for sysfs */ + return num_sectors; } +static int +super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) +{ + /* non-zero offset changes not possible with v0.90 */ + return new_offset == 0; +} /* * version 1 superblock @@ -1026,12 +1373,11 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) unsigned long long newcsum; int size = 256 + le32_to_cpu(sb->max_dev)*2; __le32 *isuper = (__le32*)sb; - int i; disk_csum = sb->sb_csum; sb->sb_csum = 0; newcsum = 0; - for (i=0; size>=4; size -= 4 ) + for (; size >= 4; size -= 4) newcsum += le32_to_cpu(*isuper++); if (size == 2) @@ -1042,11 +1388,14 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) return cpu_to_le32(csum); } -static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, + int acknowledged); +static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { struct mdp_superblock_1 *sb; int ret; sector_t sb_start; + sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; @@ -1060,7 +1409,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) */ switch(minor_version) { case 0: - sb_start = rdev->bdev->bd_inode->i_size >> 9; + sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; sb_start -= 8*2; sb_start &= ~(sector_t)(4*2-1); break; @@ -1082,7 +1431,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) if (ret) return ret; - sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + sb 
= page_address(rdev->sb_page); if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || sb->major_version != cpu_to_le32(1) || @@ -1101,42 +1450,83 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) bdevname(rdev->bdev,b)); return -EINVAL; } - if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { - if (sb->level != cpu_to_le32(1) && - sb->level != cpu_to_le32(4) && - sb->level != cpu_to_le32(5) && - sb->level != cpu_to_le32(6) && - sb->level != cpu_to_le32(10)) { - printk(KERN_WARNING - "md: bitmaps not supported for this level.\n"); - return -EINVAL; - } - } + if (sb->pad0 || + sb->pad3[0] || + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) + /* Some padding is non-zero, might be a new feature */ + return -EINVAL; rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->new_data_offset = rdev->data_offset; + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) + rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; - bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; if (minor_version && rdev->data_offset < sb_start + (rdev->sb_size/512)) return -EINVAL; + if (minor_version + && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) + return -EINVAL; if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) rdev->desc_nr = -1; else rdev->desc_nr = le32_to_cpu(sb->dev_number); + if (!rdev->bb_page) { + rdev->bb_page = alloc_page(GFP_KERNEL); + if (!rdev->bb_page) + return -ENOMEM; + } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && + rdev->badblocks.count == 0) { + /* need to load the bad block list. + * Currently we limit it to one page. 
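The bad-block log decoded in the next hunk stores one 64-bit little-endian word per entry: the low 10 bits carry the length and the remaining bits the start offset, both in units of 1 << bblog_shift sectors. Here is a self-contained round-trip of that layout; the helper names are mine, only the bit layout comes from the decode loop below and the matching encode path in super_1_sync().

/* Round-trip of the on-disk bad-block entry: length in the low 10 bits,
 * start offset in the remaining bits, both in 1 << bblog_shift sectors. */
#include <stdio.h>
#include <stdint.h>

static uint64_t bb_pack(uint64_t start, unsigned len, unsigned shift)
{
	return ((start >> shift) << 10) | ((len >> shift) & 0x3ff);
}

static void bb_unpack(uint64_t bb, unsigned shift,
		      uint64_t *start, unsigned *len)
{
	*len   = (unsigned)(bb & 0x3ff) << shift;   /* cf. count = bb & (0x3ff) */
	*start = (bb >> 10) << shift;               /* cf. sector = bb >> 10    */
}

int main(void)
{
	unsigned shift = 0;                  /* sb->bblog_shift */
	uint64_t start = 123456, got_start;
	unsigned len = 8, got_len;

	uint64_t bb = bb_pack(start, len, shift);
	bb_unpack(bb, shift, &got_start, &got_len);

	printf("packed=0x%llx -> start=%llu len=%u\n",
	       (unsigned long long)bb,
	       (unsigned long long)got_start, got_len);
	return 0;
}

Unused slots in the on-disk page are left as all-ones (super_1_sync() memsets the page to 0xff before filling it), which the loader treats as a terminator via its "bb + 1 == 0" check.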
+ */ + s32 offset; + sector_t bb_sector; + u64 *bbp; + int i; + int sectors = le16_to_cpu(sb->bblog_size); + if (sectors > (PAGE_SIZE / 512)) + return -EINVAL; + offset = le32_to_cpu(sb->bblog_offset); + if (offset == 0) + return -EINVAL; + bb_sector = (long long)offset; + if (!sync_page_io(rdev, bb_sector, sectors << 9, + rdev->bb_page, READ, true)) + return -EIO; + bbp = (u64 *)page_address(rdev->bb_page); + rdev->badblocks.shift = sb->bblog_shift; + for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { + u64 bb = le64_to_cpu(*bbp); + int count = bb & (0x3ff); + u64 sector = bb >> 10; + sector <<= sb->bblog_shift; + count <<= sb->bblog_shift; + if (bb + 1 == 0) + break; + if (md_set_badblocks(&rdev->badblocks, + sector, count, 1) == 0) + return -EINVAL; + } + } else if (sb->bblog_offset != 0) + rdev->badblocks.shift = 0; + if (!refdev) { ret = 1; } else { __u64 ev1, ev2; - struct mdp_superblock_1 *refsb = - (struct mdp_superblock_1*)page_address(refdev->sb_page); + struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || sb->level != refsb->level || @@ -1156,82 +1546,110 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) else ret = 0; } - if (minor_version) - rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; - else - rdev->size = rdev->sb_start / 2; - if (rdev->size < le64_to_cpu(sb->data_size)/2) - return -EINVAL; - rdev->size = le64_to_cpu(sb->data_size)/2; - if (le32_to_cpu(sb->chunksize)) - rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); - - if (le64_to_cpu(sb->size) > rdev->size*2) + if (minor_version) { + sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); + sectors -= rdev->data_offset; + } else + sectors = rdev->sb_start; + if (sectors < le64_to_cpu(sb->data_size)) return -EINVAL; + rdev->sectors = le64_to_cpu(sb->data_size); return ret; } -static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) +static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) { - struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); __u64 ev1 = le64_to_cpu(sb->events); rdev->raid_disk = -1; clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; mddev->external = 0; - mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; + mddev->chunk_sectors = le32_to_cpu(sb->chunksize); mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); mddev->level = le32_to_cpu(sb->level); mddev->clevel[0] = 0; mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); - mddev->size = le64_to_cpu(sb->size)/2; + mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; - mddev->bitmap_offset = 0; - mddev->default_bitmap_offset = 1024 >> 9; - + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.space = 0; + /* Default location for bitmap is 1K after superblock + * using 3K - total of 4K + */ + mddev->bitmap_info.default_offset = 1024 >> 9; + mddev->bitmap_info.default_space = (4096-1024) >> 9; + mddev->reshape_backwards = 0; + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; if 
((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && - mddev->bitmap_file == NULL ) - mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); + mddev->bitmap_info.file == NULL) { + mddev->bitmap_info.offset = + (__s32)le32_to_cpu(sb->bitmap_offset); + /* Metadata doesn't record how much space is available. + * For 1.0, we assume we can use up to the superblock + * if before, else to 4K beyond superblock. + * For others, assume no change is possible. + */ + if (mddev->minor_version > 0) + mddev->bitmap_info.space = 0; + else if (mddev->bitmap_info.offset > 0) + mddev->bitmap_info.space = + 8 - mddev->bitmap_info.offset; + else + mddev->bitmap_info.space = + -mddev->bitmap_info.offset; + } if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { mddev->reshape_position = le64_to_cpu(sb->reshape_position); mddev->delta_disks = le32_to_cpu(sb->delta_disks); mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); - mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; + mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); + if (mddev->delta_disks < 0 || + (mddev->delta_disks == 0 && + (le32_to_cpu(sb->feature_map) + & MD_FEATURE_RESHAPE_BACKWARDS))) + mddev->reshape_backwards = 1; } else { mddev->reshape_position = MaxSector; mddev->delta_disks = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; } } else if (mddev->pers == NULL) { - /* Insist of good event counter while assembling */ + /* Insist of good event counter while assembling, except for + * spares (which don't need an event count) */ ++ev1; - if (ev1 < mddev->events) - return -EINVAL; + if (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + if (ev1 < mddev->events) + return -EINVAL; } else if (mddev->bitmap) { /* If adding to array with a bitmap, then we can accept an * older device, but not too old. 
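super_1_validate() reads the device's slot from sb->dev_roles[]; as the switch just below shows, 0xffff marks a spare, 0xfffe a faulty device, and any other value an active or recovering slot. A tiny decoder for those role values, using an invented sample table:

/* Decoding v1.x dev_roles[] entries, mirroring the switch further below:
 * 0xffff = spare, 0xfffe = faulty, anything else = a slot in the array. */
#include <stdio.h>
#include <stdint.h>

static const char *role_name(uint16_t role)
{
	switch (role) {
	case 0xffff: return "spare";
	case 0xfffe: return "faulty";
	default:     return "active/recovering slot";
	}
}

int main(void)
{
	uint16_t roles[] = { 0, 1, 0xfffe, 0xffff };   /* invented sample table */
	unsigned i;

	for (i = 0; i < sizeof(roles) / sizeof(roles[0]); i++)
		printf("desc_nr %u: role 0x%04x -> %s\n",
		       i, roles[i], role_name(roles[i]));
	return 0;
}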
*/ if (ev1 < mddev->bitmap->events_cleared) return 0; + if (ev1 < mddev->events) + set_bit(Bitmap_sync, &rdev->flags); } else { if (ev1 < mddev->events) /* just a hot-add of a new device, leave raid_disk at -1 */ @@ -1239,7 +1657,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } if (mddev->level != LEVEL_MULTIPATH) { int role; - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = 0xffff; + rdev->desc_nr = -1; + } else + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case 0xffff: /* spare */ break; @@ -1247,37 +1670,40 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(Faulty, &rdev->flags); break; default: + rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & - MD_FEATURE_RECOVERY_OFFSET)) + MD_FEATURE_RECOVERY_OFFSET)) { rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); - else + if (!(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RECOVERY_BITMAP)) + rdev->saved_raid_disk = -1; + } else set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; break; } if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) + set_bit(Replacement, &rdev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); return 0; } -static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) +static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb; - struct list_head *tmp; - mdk_rdev_t *rdev2; + struct md_rdev *rdev2; int max_dev, i; /* make rdev->sb match mddev and rdev data. */ - sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); sb->feature_map = 0; sb->pad0 = 0; sb->recovery_offset = cpu_to_le64(0); - memset(sb->pad1, 0, sizeof(sb->pad1)); - memset(sb->pad2, 0, sizeof(sb->pad2)); memset(sb->pad3, 0, sizeof(sb->pad3)); sb->utime = cpu_to_le64((__u64)mddev->utime); @@ -1290,19 +1716,36 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); sb->raid_disks = cpu_to_le32(mddev->raid_disks); - sb->size = cpu_to_le64(mddev->size<<1); + sb->size = cpu_to_le64(mddev->dev_sectors); + sb->chunksize = cpu_to_le32(mddev->chunk_sectors); + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); + + if (test_bit(WriteMostly, &rdev->flags)) + sb->devflags |= WriteMostly1; + else + sb->devflags &= ~WriteMostly1; + sb->data_offset = cpu_to_le64(rdev->data_offset); + sb->data_size = cpu_to_le64(rdev->sectors); - if (mddev->bitmap && mddev->bitmap_file == NULL) { - sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); + if (mddev->bitmap && mddev->bitmap_info.file == NULL) { + sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset > 0) { - sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); - sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); - } + !test_bit(In_sync, &rdev->flags)) { + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = + cpu_to_le64(rdev->recovery_offset); + if (rdev->saved_raid_disk >= 0 && mddev->bitmap) + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); + } + if (test_bit(Replacement, &rdev->flags)) + sb->feature_map |= + 
cpu_to_le32(MD_FEATURE_REPLACEMENT); if (mddev->reshape_position != MaxSector) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); @@ -1310,26 +1753,78 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->new_layout = cpu_to_le32(mddev->new_layout); sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->new_level = cpu_to_le32(mddev->new_level); - sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); + sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); + if (mddev->delta_disks == 0 && + mddev->reshape_backwards) + sb->feature_map + |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); + if (rdev->new_data_offset != rdev->data_offset) { + sb->feature_map + |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); + sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset + - rdev->data_offset)); + } + } + + if (rdev->badblocks.count == 0) + /* Nothing to do for bad blocks*/ ; + else if (sb->bblog_offset == 0) + /* Cannot record bad blocks on this device */ + md_error(mddev, rdev); + else { + struct badblocks *bb = &rdev->badblocks; + u64 *bbp = (u64 *)page_address(rdev->bb_page); + u64 *p = bb->page; + sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); + if (bb->changed) { + unsigned seq; + +retry: + seq = read_seqbegin(&bb->lock); + + memset(bbp, 0xff, PAGE_SIZE); + + for (i = 0 ; i < bb->count ; i++) { + u64 internal_bb = p[i]; + u64 store_bb = ((BB_OFFSET(internal_bb) << 10) + | BB_LEN(internal_bb)); + bbp[i] = cpu_to_le64(store_bb); + } + bb->changed = 0; + if (read_seqretry(&bb->lock, seq)) + goto retry; + + bb->sector = (rdev->sb_start + + (int)le32_to_cpu(sb->bblog_offset)); + bb->size = le16_to_cpu(sb->bblog_size); + } } max_dev = 0; - rdev_for_each(rdev2, tmp, mddev) + rdev_for_each(rdev2, mddev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; - if (max_dev > le32_to_cpu(sb->max_dev)) + if (max_dev > le32_to_cpu(sb->max_dev)) { + int bmask; sb->max_dev = cpu_to_le32(max_dev); + rdev->sb_size = max_dev * 2 + 256; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev->sb_size = (rdev->sb_size | bmask) + 1; + } else + max_dev = le32_to_cpu(sb->max_dev); + for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); - rdev_for_each(rdev2, tmp, mddev) { + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); - else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) + else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(0xffff); @@ -1339,39 +1834,75 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) } static unsigned long long -super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) +super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { struct mdp_superblock_1 *sb; sector_t max_sectors; - if (num_sectors && num_sectors < rdev->mddev->size * 2) + if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ + if (rdev->data_offset != rdev->new_data_offset) + return 0; /* too confusing */ if (rdev->sb_start < rdev->data_offset) { /* minor versions 1 and 2; superblock before data */ - max_sectors = rdev->bdev->bd_inode->i_size >> 9; + max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; max_sectors -= rdev->data_offset; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; - } else if 
(rdev->mddev->bitmap_offset) { + } else if (rdev->mddev->bitmap_info.offset) { /* minor version 0 with bitmap we can't move */ return 0; } else { /* minor version 0; superblock after data */ sector_t sb_start; - sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; + sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; + max_sectors = rdev->sectors + sb_start - rdev->sb_start; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; rdev->sb_start = sb_start; } - sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); + sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); sb->super_offset = rdev->sb_start; sb->sb_csum = calc_sb_1_csum(sb); md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); md_super_wait(rdev->mddev); - return num_sectors / 2; /* kB for sysfs */ + return num_sectors; + +} + +static int +super_1_allow_new_offset(struct md_rdev *rdev, + unsigned long long new_offset) +{ + /* All necessary checks on new >= old have been done */ + struct bitmap *bitmap; + if (new_offset >= rdev->data_offset) + return 1; + + /* with 1.0 metadata, there is no metadata to tread on + * so we can always move back */ + if (rdev->mddev->minor_version == 0) + return 1; + + /* otherwise we must be sure not to step on + * any metadata, so stay: + * 36K beyond start of superblock + * beyond end of badblocks + * beyond write-intent bitmap + */ + if (rdev->sb_start + (32+4)*2 > new_offset) + return 0; + bitmap = rdev->mddev->bitmap; + if (bitmap && !rdev->mddev->bitmap_info.file && + rdev->sb_start + rdev->mddev->bitmap_info.offset + + bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) + return 0; + if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) + return 0; + + return 1; } static struct super_type super_types[] = { @@ -1382,6 +1913,7 @@ static struct super_type super_types[] = { .validate_super = super_90_validate, .sync_super = super_90_sync, .rdev_size_change = super_90_rdev_size_change, + .allow_new_offset = super_90_allow_new_offset, }, [1] = { .name = "md-1", @@ -1390,12 +1922,25 @@ static struct super_type super_types[] = { .validate_super = super_1_validate, .sync_super = super_1_sync, .rdev_size_change = super_1_rdev_size_change, + .allow_new_offset = super_1_allow_new_offset, }, }; -static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +static void sync_super(struct mddev *mddev, struct md_rdev *rdev) { - mdk_rdev_t *rdev, *rdev2; + if (mddev->sync_super) { + mddev->sync_super(mddev, rdev); + return; + } + + BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); + + super_types[mddev->major_version].sync_super(mddev, rdev); +} + +static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) +{ + struct md_rdev *rdev, *rdev2; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev1) @@ -1411,7 +1956,84 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static LIST_HEAD(pending_raid_disks); -static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) +/* + * Try to register data integrity profile for an mddev + * + * This is called when an array is started and after a disk has been kicked + * from the array. It only succeeds if all working and active component devices + * are integrity capable with matching profiles. 
+ */ +int md_integrity_register(struct mddev *mddev) +{ + struct md_rdev *rdev, *reference = NULL; + + if (list_empty(&mddev->disks)) + return 0; /* nothing to do */ + if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) + return 0; /* shouldn't register, or already is */ + rdev_for_each(rdev, mddev) { + /* skip spares and non-functional disks */ + if (test_bit(Faulty, &rdev->flags)) + continue; + if (rdev->raid_disk < 0) + continue; + if (!reference) { + /* Use the first rdev as the reference */ + reference = rdev; + continue; + } + /* does this rdev's profile match the reference profile? */ + if (blk_integrity_compare(reference->bdev->bd_disk, + rdev->bdev->bd_disk) < 0) + return -EINVAL; + } + if (!reference || !bdev_get_integrity(reference->bdev)) + return 0; + /* + * All component devices are integrity capable and have matching + * profiles, register the common profile for the md device. + */ + if (blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)) != 0) { + printk(KERN_ERR "md: failed to register integrity for %s\n", + mdname(mddev)); + return -EINVAL; + } + printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); + if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { + printk(KERN_ERR "md: failed to create integrity pool for %s\n", + mdname(mddev)); + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(md_integrity_register); + +/* Disable data integrity if non-capable/non-matching disk is being added */ +void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + struct blk_integrity *bi_rdev; + struct blk_integrity *bi_mddev; + + if (!mddev->gendisk) + return; + + bi_rdev = bdev_get_integrity(rdev->bdev); + bi_mddev = blk_get_integrity(mddev->gendisk); + + if (!bi_mddev) /* nothing to do */ + return; + if (rdev->raid_disk < 0) /* skip spares */ + return; + if (bi_rdev && blk_integrity_compare(mddev->gendisk, + rdev->bdev->bd_disk) >= 0) + return; + printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); + blk_integrity_unregister(mddev->gendisk); +} +EXPORT_SYMBOL(md_integrity_add_rdev); + +static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) { char b[BDEVNAME_SIZE]; struct kobject *ko; @@ -1427,8 +2049,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; - /* make sure rdev->size exceeds mddev->size */ - if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { + /* make sure rdev->sectors exceeds mddev->dev_sectors */ + if (rdev->sectors && (mddev->dev_sectors == 0 || + rdev->sectors < mddev->dev_sectors)) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -1437,7 +2060,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (mddev->level > 0) return -ENOSPC; } else - mddev->size = rdev->size; + mddev->dev_sectors = rdev->sectors; } /* Verify rdev->desc_nr is unique. 
@@ -1454,6 +2077,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev_nr(mddev, rdev->desc_nr)) return -EBUSY; } + if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + printk(KERN_WARNING "md: %s: array is limited to %d devices\n", + mdname(mddev), mddev->max_disks); + return -EBUSY; + } bdevname(rdev->bdev,b); while ( (s=strchr(b, '/')) != NULL) *s = '!'; @@ -1464,16 +2092,17 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; - if (rdev->bdev->bd_part) - ko = &rdev->bdev->bd_part->dev.kobj; - else - ko = &rdev->bdev->bd_disk->dev.kobj; - if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { - kobject_del(&rdev->kobj); - goto fail; - } + ko = &part_to_dev(rdev->bdev->bd_part)->kobj; + if (sysfs_create_link(&rdev->kobj, ko, "block")) + /* failure here is OK */; + rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); + list_add_rcu(&rdev->same_set, &mddev->disks); - bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); + bd_link_disk_holder(rdev->bdev, mddev->gendisk); + + /* May as well allow recovery to be retried once */ + mddev->recovery_disabled++; + return 0; fail: @@ -1484,24 +2113,26 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) static void md_delayed_delete(struct work_struct *ws) { - mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); + struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); kobject_del(&rdev->kobj); kobject_put(&rdev->kobj); } -static void unbind_rdev_from_array(mdk_rdev_t * rdev) +static void unbind_rdev_from_array(struct md_rdev * rdev) { char b[BDEVNAME_SIZE]; if (!rdev->mddev) { MD_BUG(); return; } - bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); - + sysfs_put(rdev->sysfs_state); + rdev->sysfs_state = NULL; + rdev->badblocks.count = 0; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". We also need * to delay it due to rcu usage. @@ -1509,7 +2140,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) synchronize_rcu(); INIT_WORK(&rdev->del_work, md_delayed_delete); kobject_get(&rdev->kobj); - schedule_work(&rdev->del_work); + queue_work(md_misc_wq, &rdev->del_work); } /* @@ -1517,51 +2148,42 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) * otherwise reused by a RAID array (or any other kernel * subsystem), by bd_claiming the device. */ -static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) +static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + shared ? (struct md_rdev *)lock_rdev : rdev); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); return PTR_ERR(bdev); } - err = bd_claim(bdev, shared ? 
(mdk_rdev_t *)lock_rdev : rdev); - if (err) { - printk(KERN_ERR "md: could not bd_claim %s.\n", - bdevname(bdev, b)); - blkdev_put(bdev); - return err; - } - if (!shared) - set_bit(AllReserved, &rdev->flags); rdev->bdev = bdev; return err; } -static void unlock_rdev(mdk_rdev_t *rdev) +static void unlock_rdev(struct md_rdev *rdev) { struct block_device *bdev = rdev->bdev; rdev->bdev = NULL; if (!bdev) MD_BUG(); - bd_release(bdev); - blkdev_put(bdev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } void md_autodetect_dev(dev_t dev); -static void export_rdev(mdk_rdev_t * rdev) +static void export_rdev(struct md_rdev * rdev) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); if (rdev->mddev) MD_BUG(); - free_disk_sb(rdev); + md_rdev_clear(rdev); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); @@ -1570,18 +2192,17 @@ static void export_rdev(mdk_rdev_t * rdev) kobject_put(&rdev->kobj); } -static void kick_rdev_from_array(mdk_rdev_t * rdev) +static void kick_rdev_from_array(struct md_rdev * rdev) { unbind_rdev_from_array(rdev); export_rdev(rdev); } -static void export_array(mddev_t *mddev) +static void export_array(struct mddev *mddev) { - struct list_head *tmp; - mdk_rdev_t *rdev; + struct md_rdev *rdev, *tmp; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (!rdev->mddev) { MD_BUG(); continue; @@ -1600,7 +2221,7 @@ static void print_desc(mdp_disk_t *desc) desc->major,desc->minor,desc->raid_disk,desc->state); } -static void print_sb(mdp_super_t *sb) +static void print_sb_90(mdp_super_t *sb) { int i; @@ -1631,28 +2252,76 @@ static void print_sb(mdp_super_t *sb) } printk(KERN_INFO "md: THIS: "); print_desc(&sb->this_disk); - } -static void print_rdev(mdk_rdev_t *rdev) +static void print_sb_1(struct mdp_superblock_1 *sb) +{ + __u8 *uuid; + + uuid = sb->set_uuid; + printk(KERN_INFO + "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" + "md: Name: \"%s\" CT:%llu\n", + le32_to_cpu(sb->major_version), + le32_to_cpu(sb->feature_map), + uuid, + sb->set_name, + (unsigned long long)le64_to_cpu(sb->ctime) + & MD_SUPERBLOCK_1_TIME_SEC_MASK); + + uuid = sb->device_uuid; + printk(KERN_INFO + "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" + " RO:%llu\n" + "md: Dev:%08x UUID: %pU\n" + "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" + "md: (MaxDev:%u) \n", + le32_to_cpu(sb->level), + (unsigned long long)le64_to_cpu(sb->size), + le32_to_cpu(sb->raid_disks), + le32_to_cpu(sb->layout), + le32_to_cpu(sb->chunksize), + (unsigned long long)le64_to_cpu(sb->data_offset), + (unsigned long long)le64_to_cpu(sb->data_size), + (unsigned long long)le64_to_cpu(sb->super_offset), + (unsigned long long)le64_to_cpu(sb->recovery_offset), + le32_to_cpu(sb->dev_number), + uuid, + sb->devflags, + (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, + (unsigned long long)le64_to_cpu(sb->events), + (unsigned long long)le64_to_cpu(sb->resync_offset), + le32_to_cpu(sb->sb_csum), + le32_to_cpu(sb->max_dev) + ); +} + +static void print_rdev(struct md_rdev *rdev, int major_version) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", - bdevname(rdev->bdev,b), (unsigned long long)rdev->size, + printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", + bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), rdev->desc_nr); if (rdev->sb_loaded) { - 
printk(KERN_INFO "md: rdev superblock:\n"); - print_sb((mdp_super_t*)page_address(rdev->sb_page)); + printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); + switch (major_version) { + case 0: + print_sb_90(page_address(rdev->sb_page)); + break; + case 1: + print_sb_1(page_address(rdev->sb_page)); + break; + } } else printk(KERN_INFO "md: no rdev superblock!\n"); } static void md_print_devices(void) { - struct list_head *tmp, *tmp2; - mdk_rdev_t *rdev; - mddev_t *mddev; + struct list_head *tmp; + struct md_rdev *rdev; + struct mddev *mddev; char b[BDEVNAME_SIZE]; printk("\n"); @@ -1665,19 +2334,19 @@ static void md_print_devices(void) bitmap_print_sb(mddev->bitmap); else printk("%s: ", mdname(mddev)); - rdev_for_each(rdev, tmp2, mddev) + rdev_for_each(rdev, mddev) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); - rdev_for_each(rdev, tmp2, mddev) - print_rdev(rdev); + rdev_for_each(rdev, mddev) + print_rdev(rdev, mddev->major_version); } printk("md: **********************************\n"); printk("\n"); } -static void sync_sbs(mddev_t * mddev, int nospares) +static void sync_sbs(struct mddev * mddev, int nospares) { /* Update each superblock (in-memory image), but * if we are allowed to, skip spares which already @@ -1685,38 +2354,67 @@ static void sync_sbs(mddev_t * mddev, int nospares) * (which would mean they aren't being marked as dirty * with the rest of the array) */ - mdk_rdev_t *rdev; - struct list_head *tmp; - - rdev_for_each(rdev, tmp, mddev) { + struct md_rdev *rdev; + rdev_for_each(rdev, mddev) { if (rdev->sb_events == mddev->events || (nospares && rdev->raid_disk < 0 && - (rdev->sb_events&1)==0 && rdev->sb_events+1 == mddev->events)) { /* Don't update this superblock */ rdev->sb_loaded = 2; } else { - super_types[mddev->major_version]. 
- sync_super(mddev, rdev); + sync_super(mddev, rdev); rdev->sb_loaded = 1; } } } -static void md_update_sb(mddev_t * mddev, int force_change) +static void md_update_sb(struct mddev * mddev, int force_change) { - struct list_head *tmp; - mdk_rdev_t *rdev; + struct md_rdev *rdev; int sync_req; int nospares = 0; + int any_badblocks_changed = 0; - if (mddev->external) + if (mddev->ro) { + if (force_change) + set_bit(MD_CHANGE_DEVS, &mddev->flags); return; + } repeat: + /* First make sure individual recovery_offsets are correct */ + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(In_sync, &rdev->flags) && + mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + + } + if (!mddev->persistent) { + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_DEVS, &mddev->flags); + if (!mddev->external) { + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + rdev_for_each(rdev, mddev) { + if (rdev->badblocks.changed) { + rdev->badblocks.changed = 0; + md_ack_all_badblocks(&rdev->badblocks); + md_error(mddev, rdev); + } + clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } + } + wake_up(&mddev->sb_wait); + return; + } + spin_lock_irq(&mddev->write_lock); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + mddev->utime = get_seconds(); + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) @@ -1740,31 +2438,19 @@ repeat: nospares = 0; sync_req = mddev->in_sync; - mddev->utime = get_seconds(); /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares && (mddev->in_sync && mddev->recovery_cp == MaxSector) - && (mddev->events & 1) - && mddev->events != 1) + && mddev->can_decrease_events + && mddev->events != 1) { mddev->events--; - else { + mddev->can_decrease_events = 0; + } else { /* otherwise we have to go forward and ... */ mddev->events ++; - if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ - /* .. 
if the array isn't clean, insist on an odd 'events' */ - if ((mddev->events&1)==0) { - mddev->events++; - nospares = 0; - } - } else { - /* otherwise insist on an even 'events' (for clean states) */ - if ((mddev->events&1)) { - mddev->events++; - nospares = 0; - } - } + mddev->can_decrease_events = nospares; } if (!mddev->events) { @@ -1777,46 +2463,46 @@ repeat: mddev->events --; } - /* - * do not write anything to disk if using - * nonpersistent superblocks - */ - if (!mddev->persistent) { - if (!mddev->external) - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - - spin_unlock_irq(&mddev->write_lock); - wake_up(&mddev->sb_wait); - return; + rdev_for_each(rdev, mddev) { + if (rdev->badblocks.changed) + any_badblocks_changed++; + if (test_bit(Faulty, &rdev->flags)) + set_bit(FaultRecorded, &rdev->flags); } + sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); - dprintk(KERN_INFO - "md: updating %s RAID superblock on device (in sync %d)\n", - mdname(mddev),mddev->in_sync); + pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", + mdname(mddev), mddev->in_sync); bitmap_update_sb(mddev->bitmap); - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; - dprintk(KERN_INFO "md: "); + if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ - if (test_bit(Faulty, &rdev->flags)) - dprintk("(skipping faulty "); - dprintk("%s ", bdevname(rdev->bdev,b)); if (!test_bit(Faulty, &rdev->flags)) { md_super_write(mddev,rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); - dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", - bdevname(rdev->bdev,b), - (unsigned long long)rdev->sb_start); + pr_debug("md: (write) %s's sb offset: %llu\n", + bdevname(rdev->bdev, b), + (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; + if (rdev->badblocks.size) { + md_super_write(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page); + rdev->badblocks.size = 0; + } } else - dprintk(")\n"); + pr_debug("md: %s (skipping faulty)\n", + bdevname(rdev->bdev, b)); + if (mddev->level == LEVEL_MULTIPATH) /* only need to write one superblock... */ break; @@ -1834,7 +2520,18 @@ repeat: clear_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + + rdev_for_each(rdev, mddev) { + if (test_and_clear_bit(FaultRecorded, &rdev->flags)) + clear_bit(Blocked, &rdev->flags); + if (any_badblocks_changed) + md_ack_all_badblocks(&rdev->badblocks); + clear_bit(BlockedBadBlocks, &rdev->flags); + wake_up(&rdev->blocked_wait); + } } /* words written to sysfs files may, or may not, be \n terminated. 
@@ -1859,17 +2556,18 @@ static int cmd_match(const char *cmd, const char *str) struct rdev_sysfs_entry { struct attribute attr; - ssize_t (*show)(mdk_rdev_t *, char *); - ssize_t (*store)(mdk_rdev_t *, const char *, size_t); + ssize_t (*show)(struct md_rdev *, char *); + ssize_t (*store)(struct md_rdev *, const char *, size_t); }; static ssize_t -state_show(mdk_rdev_t *rdev, char *page) +state_show(struct md_rdev *rdev, char *page) { char *sep = ""; size_t len = 0; - if (test_bit(Faulty, &rdev->flags)) { + if (test_bit(Faulty, &rdev->flags) || + rdev->badblocks.unacked_exist) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } @@ -1881,7 +2579,9 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } - if (test_bit(Blocked, &rdev->flags)) { + if (test_bit(Blocked, &rdev->flags) || + (rdev->badblocks.unacked_exist + && !test_bit(Faulty, &rdev->flags))) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } @@ -1890,29 +2590,50 @@ state_show(mdk_rdev_t *rdev, char *page) len += sprintf(page+len, "%sspare", sep); sep = ","; } + if (test_bit(WriteErrorSeen, &rdev->flags)) { + len += sprintf(page+len, "%swrite_error", sep); + sep = ","; + } + if (test_bit(WantReplacement, &rdev->flags)) { + len += sprintf(page+len, "%swant_replacement", sep); + sep = ","; + } + if (test_bit(Replacement, &rdev->flags)) { + len += sprintf(page+len, "%sreplacement", sep); + sep = ","; + } + return len+sprintf(page+len, "\n"); } static ssize_t -state_store(mdk_rdev_t *rdev, const char *buf, size_t len) +state_store(struct md_rdev *rdev, const char *buf, size_t len) { /* can write - * faulty - simulates and error + * faulty - simulates an error * remove - disconnects the device * writemostly - sets write_mostly * -writemostly - clears write_mostly - * blocked - sets the Blocked flag - * -blocked - clears the Blocked flag + * blocked - sets the Blocked flags + * -blocked - clears the Blocked and possibly simulates an error + * insync - sets Insync providing device isn't active + * -insync - clear Insync for a device with a slot assigned, + * so that it gets rebuilt based on bitmap + * write_error - sets WriteErrorSeen + * -write_error - clears WriteErrorSeen */ int err = -EINVAL; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { md_error(rdev->mddev, rdev); - err = 0; + if (test_bit(Faulty, &rdev->flags)) + err = 0; + else + err = -EBUSY; } else if (cmd_match(buf, "remove")) { if (rdev->raid_disk >= 0) err = -EBUSY; else { - mddev_t *mddev = rdev->mddev; + struct mddev *mddev = rdev->mddev; kick_rdev_from_array(rdev); if (mddev->pers) md_update_sb(mddev, 1); @@ -1929,28 +2650,86 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) set_bit(Blocked, &rdev->flags); err = 0; } else if (cmd_match(buf, "-blocked")) { + if (!test_bit(Faulty, &rdev->flags) && + rdev->badblocks.unacked_exist) { + /* metadata handler doesn't understand badblocks, + * so we need to fail the device + */ + md_error(rdev->mddev, rdev); + } clear_bit(Blocked, &rdev->flags); + clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); err = 0; + } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { + set_bit(In_sync, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + err = 0; + } else if (cmd_match(buf, "write_error")) { + 
set_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-write_error")) { + clear_bit(WriteErrorSeen, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "want_replacement")) { + /* Any non-spare device that is not a replacement can + * become want_replacement at any time, but we then need to + * check if recovery is needed. + */ + if (rdev->raid_disk >= 0 && + !test_bit(Replacement, &rdev->flags)) + set_bit(WantReplacement, &rdev->flags); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); + err = 0; + } else if (cmd_match(buf, "-want_replacement")) { + /* Clearing 'want_replacement' is always allowed. + * Once replacements starts it is too late though. + */ + err = 0; + clear_bit(WantReplacement, &rdev->flags); + } else if (cmd_match(buf, "replacement")) { + /* Can only set a device as a replacement when array has not + * yet been started. Once running, replacement is automatic + * from spares, or by assigning 'slot'. + */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + set_bit(Replacement, &rdev->flags); + err = 0; + } + } else if (cmd_match(buf, "-replacement")) { + /* Similarly, can only clear Replacement before start */ + if (rdev->mddev->pers) + err = -EBUSY; + else { + clear_bit(Replacement, &rdev->flags); + err = 0; + } } if (!err) - sysfs_notify(&rdev->kobj, NULL, "state"); + sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? err : len; } static struct rdev_sysfs_entry rdev_state = __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); static ssize_t -errors_show(mdk_rdev_t *rdev, char *page) +errors_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); } static ssize_t -errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) +errors_store(struct md_rdev *rdev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); @@ -1964,7 +2743,7 @@ static struct rdev_sysfs_entry rdev_errors = __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t -slot_show(mdk_rdev_t *rdev, char *page) +slot_show(struct md_rdev *rdev, char *page) { if (rdev->raid_disk < 0) return sprintf(page, "none\n"); @@ -1973,11 +2752,10 @@ slot_show(mdk_rdev_t *rdev, char *page) } static ssize_t -slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) +slot_store(struct md_rdev *rdev, const char *buf, size_t len) { char *e; int err; - char nm[20]; int slot = simple_strtoul(buf, &e, 10); if (strncmp(buf, "none", 4)==0) slot = -1; @@ -1994,62 +2772,59 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) if (rdev->raid_disk == -1) return -EEXIST; /* personality does all needed checks */ - if (rdev->mddev->pers->hot_add_disk == NULL) + if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; - err = rdev->mddev->pers-> - hot_remove_disk(rdev->mddev, rdev->raid_disk); - if (err) - return err; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&rdev->mddev->kobj, nm); + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk >= 0) + return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { - mdk_rdev_t *rdev2; - struct list_head *tmp; /* Activating a spare .. or possibly reactivating - * if we every get bitmaps working here. + * if we ever get bitmaps working here. 
*/ if (rdev->raid_disk != -1) return -EBUSY; + if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) + return -EBUSY; + if (rdev->mddev->pers->hot_add_disk == NULL) return -EINVAL; - rdev_for_each(rdev2, tmp, rdev->mddev) - if (rdev2->raid_disk == slot) - return -EEXIST; + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) + return -ENOSPC; rdev->raid_disk = slot; if (test_bit(In_sync, &rdev->flags)) rdev->saved_raid_disk = slot; else rdev->saved_raid_disk = -1; + clear_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); err = rdev->mddev->pers-> hot_add_disk(rdev->mddev, rdev); if (err) { rdev->raid_disk = -1; return err; } else - sysfs_notify(&rdev->kobj, NULL, "state"); - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) - printk(KERN_WARNING - "md: cannot register " - "%s for %s\n", - nm, mdname(rdev->mddev)); - + sysfs_notify_dirent_safe(rdev->sysfs_state); + if (sysfs_link_rdev(rdev->mddev, rdev)) + /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. */ } else { - if (slot >= rdev->mddev->raid_disks) + if (slot >= rdev->mddev->raid_disks && + slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) return -ENOSPC; rdev->raid_disk = slot; /* assume it is working */ clear_bit(Faulty, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); set_bit(In_sync, &rdev->flags); - sysfs_notify(&rdev->kobj, NULL, "state"); + sysfs_notify_dirent_safe(rdev->sysfs_state); } return len; } @@ -2059,35 +2834,92 @@ static struct rdev_sysfs_entry rdev_slot = __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); static ssize_t -offset_show(mdk_rdev_t *rdev, char *page) +offset_show(struct md_rdev *rdev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); } static ssize_t -offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) +offset_store(struct md_rdev *rdev, const char *buf, size_t len) { - char *e; - unsigned long long offset = simple_strtoull(buf, &e, 10); - if (e==buf || (*e && *e != '\n')) + unsigned long long offset; + if (kstrtoull(buf, 10, &offset) < 0) return -EINVAL; if (rdev->mddev->pers && rdev->raid_disk >= 0) return -EBUSY; - if (rdev->size && rdev->mddev->external) + if (rdev->sectors && rdev->mddev->external) /* Must set offset before size, so overlap checks * can be sane */ return -EBUSY; rdev->data_offset = offset; + rdev->new_data_offset = offset; return len; } static struct rdev_sysfs_entry rdev_offset = __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); +static ssize_t new_offset_show(struct md_rdev *rdev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)rdev->new_data_offset); +} + +static ssize_t new_offset_store(struct md_rdev *rdev, + const char *buf, size_t len) +{ + unsigned long long new_offset; + struct mddev *mddev = rdev->mddev; + + if (kstrtoull(buf, 10, &new_offset) < 0) + return -EINVAL; + + if (mddev->sync_thread) + return -EBUSY; + if (new_offset == rdev->data_offset) + /* reset is always permitted */ + ; + else if (new_offset > rdev->data_offset) { + /* must not push array size beyond rdev_sectors */ + if (new_offset - rdev->data_offset + + mddev->dev_sectors > rdev->sectors) + return -E2BIG; + } + /* Metadata worries about other space details. */ + + /* decreasing the offset is inconsistent with a backwards + * reshape. 
+ */ + if (new_offset < rdev->data_offset && + mddev->reshape_backwards) + return -EINVAL; + /* Increasing offset is inconsistent with forwards + * reshape. reshape_direction should be set to + * 'backwards' first. + */ + if (new_offset > rdev->data_offset && + !mddev->reshape_backwards) + return -EINVAL; + + if (mddev->pers && mddev->persistent && + !super_types[mddev->major_version] + .allow_new_offset(rdev, new_offset)) + return -E2BIG; + rdev->new_data_offset = new_offset; + if (new_offset > rdev->data_offset) + mddev->reshape_backwards = 1; + else if (new_offset < rdev->data_offset) + mddev->reshape_backwards = 0; + + return len; +} +static struct rdev_sysfs_entry rdev_new_offset = +__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); + static ssize_t -rdev_size_show(mdk_rdev_t *rdev, char *page) +rdev_size_show(struct md_rdev *rdev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)rdev->size); + return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) @@ -2100,54 +2932,74 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) return 1; } -static ssize_t -rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) +static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) { - unsigned long long size; - unsigned long long oldsize = rdev->size; - mddev_t *my_mddev = rdev->mddev; + unsigned long long blocks; + sector_t new; - if (strict_strtoull(buf, 10, &size) < 0) + if (kstrtoull(buf, 10, &blocks) < 0) return -EINVAL; - if (size < my_mddev->size) + + if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) + return -EINVAL; /* sector conversion overflow */ + + new = blocks * 2; + if (new != blocks * 2) + return -EINVAL; /* unsigned long long to sector_t overflow */ + + *sectors = new; + return 0; +} + +static ssize_t +rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + struct mddev *my_mddev = rdev->mddev; + sector_t oldsectors = rdev->sectors; + sector_t sectors; + + if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; + if (rdev->data_offset != rdev->new_data_offset) + return -EINVAL; /* too confusing */ if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->persistent) { - size = super_types[my_mddev->major_version]. - rdev_size_change(rdev, size * 2); - if (!size) + sectors = super_types[my_mddev->major_version]. + rdev_size_change(rdev, sectors); + if (!sectors) return -EBUSY; - } else if (!size) { - size = (rdev->bdev->bd_inode->i_size >> 10); - size -= rdev->data_offset/2; - } - if (size < my_mddev->size) - return -EINVAL; /* component must fit device */ + } else if (!sectors) + sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - + rdev->data_offset; + if (!my_mddev->pers->resize) + /* Cannot change size for RAID0 or Linear etc */ + return -EINVAL; } + if (sectors < my_mddev->dev_sectors) + return -EINVAL; /* component must fit device */ - rdev->size = size; - if (size > oldsize && my_mddev->external) { + rdev->sectors = sectors; + if (sectors > oldsectors && my_mddev->external) { /* need to check that all other rdevs with the same ->bdev * do not overlap. We need to unlock the mddev to avoid - * a deadlock. We have already changed rdev->size, and if + * a deadlock. We have already changed rdev->sectors, and if * we have to change it back, we will have the lock again. 
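As an illustrative aside (not part of the patch itself), rdev_size_store() above leans on two small helpers that the hunk introduces: strict_blocks_to_sectors(), which converts the 1K-block count written to sysfs into 512-byte sectors while rejecting values whose doubling would overflow, and overlaps(), which tests whether two start/length extents intersect so that rdevs sharing the same ->bdev cannot be given overlapping regions. The following standalone userspace sketch reuses the helper names purely for readability:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint64_t sector_t;

/* sysfs passes a size in 1K blocks; convert it to 512-byte sectors,
 * rejecting anything whose doubling would not fit in 64 bits */
static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
{
        unsigned long long blocks;
        char *e;

        errno = 0;
        blocks = strtoull(buf, &e, 10);
        if (errno || e == buf || (*e && *e != '\n'))
                return -EINVAL;
        if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
                return -EINVAL;         /* doubling would overflow */

        *sectors = blocks * 2;
        return 0;
}

/* do the extents [s1, s1+l1) and [s2, s2+l2) intersect? */
static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
{
        if (s1 + l1 <= s2)
                return 0;
        if (s2 + l2 <= s1)
                return 0;
        return 1;
}

int main(void)
{
        sector_t sectors;

        if (strict_blocks_to_sectors("1048576\n", &sectors) == 0)
                printf("1048576 blocks -> %llu sectors\n",
                       (unsigned long long)sectors);    /* 2097152 */

        printf("%d\n", overlaps(0, 100, 50, 100));      /* 1: intersect */
        printf("%d\n", overlaps(0, 100, 100, 100));     /* 0: only touch */
        return 0;
}

Built with any C compiler this prints 2097152 sectors for 1048576 blocks, reports the first pair of extents as overlapping, and treats the second pair, which merely touch, as disjoint.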
*/ - mddev_t *mddev; + struct mddev *mddev; int overlap = 0; - struct list_head *tmp, *tmp2; + struct list_head *tmp; mddev_unlock(my_mddev); for_each_mddev(mddev, tmp) { - mdk_rdev_t *rdev2; - - mddev_lock(mddev); - rdev_for_each(rdev2, tmp2, mddev) - if (test_bit(AllReserved, &rdev2->flags) || - (rdev->bdev == rdev2->bdev && - rdev != rdev2 && - overlaps(rdev->data_offset, rdev->size * 2, - rdev2->data_offset, - rdev2->size * 2))) { + struct md_rdev *rdev2; + + mddev_lock_nointr(mddev); + rdev_for_each(rdev2, mddev) + if (rdev->bdev == rdev2->bdev && + rdev != rdev2 && + overlaps(rdev->data_offset, rdev->sectors, + rdev2->data_offset, + rdev2->sectors)) { overlap = 1; break; } @@ -2157,15 +3009,15 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) break; } } - mddev_lock(my_mddev); + mddev_lock_nointr(my_mddev); if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. - * We put oldsize back because we *know* it is + * We put oldsectors back because we *know* it is * safe, and trust userspace not to race with * itself */ - rdev->size = oldsize; + rdev->sectors = oldsectors; return -EBUSY; } } @@ -2175,20 +3027,93 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) static struct rdev_sysfs_entry rdev_size = __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); + +static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) +{ + unsigned long long recovery_start = rdev->recovery_offset; + + if (test_bit(In_sync, &rdev->flags) || + recovery_start == MaxSector) + return sprintf(page, "none\n"); + + return sprintf(page, "%llu\n", recovery_start); +} + +static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) +{ + unsigned long long recovery_start; + + if (cmd_match(buf, "none")) + recovery_start = MaxSector; + else if (kstrtoull(buf, 10, &recovery_start)) + return -EINVAL; + + if (rdev->mddev->pers && + rdev->raid_disk >= 0) + return -EBUSY; + + rdev->recovery_offset = recovery_start; + if (recovery_start == MaxSector) + set_bit(In_sync, &rdev->flags); + else + clear_bit(In_sync, &rdev->flags); + return len; +} + +static struct rdev_sysfs_entry rdev_recovery_start = +__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); + + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack); +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); + +static ssize_t bb_show(struct md_rdev *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 0); +} +static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) +{ + int rv = badblocks_store(&rdev->badblocks, page, len, 0); + /* Maybe that ack was all we needed */ + if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) + wake_up(&rdev->blocked_wait); + return rv; +} +static struct rdev_sysfs_entry rdev_bad_blocks = +__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); + + +static ssize_t ubb_show(struct md_rdev *rdev, char *page) +{ + return badblocks_show(&rdev->badblocks, page, 1); +} +static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) +{ + return badblocks_store(&rdev->badblocks, page, len, 1); +} +static struct rdev_sysfs_entry rdev_unack_bad_blocks = +__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, + &rdev_new_offset.attr, 
&rdev_size.attr, + &rdev_recovery_start.attr, + &rdev_bad_blocks.attr, + &rdev_unack_bad_blocks.attr, NULL, }; static ssize_t rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); - mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); - mddev_t *mddev = rdev->mddev; + struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct mddev *mddev = rdev->mddev; ssize_t rv; if (!entry->show) @@ -2210,9 +3135,9 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); - mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); + struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); ssize_t rv; - mddev_t *mddev = rdev->mddev; + struct mddev *mddev = rdev->mddev; if (!entry->store) return -EIO; @@ -2231,10 +3156,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, static void rdev_free(struct kobject *ko) { - mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); + struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); kfree(rdev); } -static struct sysfs_ops rdev_sysfs_ops = { +static const struct sysfs_ops rdev_sysfs_ops = { .show = rdev_attr_show, .store = rdev_attr_store, }; @@ -2244,6 +3169,40 @@ static struct kobj_type rdev_ktype = { .default_attrs = rdev_default_attrs, }; +int md_rdev_init(struct md_rdev *rdev) +{ + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; + rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->new_data_offset = 0; + rdev->sb_events = 0; + rdev->last_read_error.tv_sec = 0; + rdev->last_read_error.tv_nsec = 0; + rdev->sb_loaded = 0; + rdev->bb_page = NULL; + atomic_set(&rdev->nr_pending, 0); + atomic_set(&rdev->read_errors, 0); + atomic_set(&rdev->corrected_errors, 0); + + INIT_LIST_HEAD(&rdev->same_set); + init_waitqueue_head(&rdev->blocked_wait); + + /* Add space to store bad block list. + * This reserves the space even on arrays where it cannot + * be used - I wonder if that matters + */ + rdev->badblocks.count = 0; + rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ + rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); + seqlock_init(&rdev->badblocks.lock); + if (rdev->badblocks.page == NULL) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_GPL(md_rdev_init); /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * @@ -2254,11 +3213,11 @@ static struct kobj_type rdev_ktype = { * * a faulty rdev _never_ has rdev->sb set. 
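As an aside, the rdev_default_attrs[] table together with rdev_attr_show()/rdev_attr_store() above follows the usual kobject attribute pattern: each sysfs file is described by an rdev_sysfs_entry, sysfs hands back only the embedded struct attribute (and the kobject), and container_of() recovers the enclosing entry and the owning md_rdev to reach the right show/store callback. A minimal userspace sketch of that dispatch, with invented demo_* names standing in for the kernel types, might look like this:

#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct attribute {
        const char *name;
};

struct demo_rdev {
        unsigned long errors;
};

struct demo_sysfs_entry {
        struct attribute attr;
        ssize_t (*show)(struct demo_rdev *, char *);
};

static ssize_t errors_show(struct demo_rdev *rdev, char *page)
{
        return sprintf(page, "%lu\n", rdev->errors);
}

static struct demo_sysfs_entry demo_errors = {
        .attr = { .name = "errors" },
        .show = errors_show,
};

/* the demo counterpart of rdev_attr_show(): only the struct attribute
 * is handed in, so recover the enclosing entry and call its hook */
static ssize_t demo_attr_show(struct demo_rdev *rdev,
                              struct attribute *attr, char *page)
{
        struct demo_sysfs_entry *entry =
                container_of(attr, struct demo_sysfs_entry, attr);

        if (!entry->show)
                return -1;
        return entry->show(rdev, page);
}

int main(void)
{
        struct demo_rdev rdev = { .errors = 3 };
        char page[64];

        if (demo_attr_show(&rdev, &demo_errors.attr, page) > 0)
                fputs(page, stdout);    /* prints "3" */
        return 0;
}

The same shape is what lets md keep one generic show/store pair while each attribute added here (state, errors, slot, offset, new_offset, size, recovery_start, bad_blocks, unacknowledged_bad_blocks) only supplies its own callbacks.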
*/ -static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) +static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { char b[BDEVNAME_SIZE]; int err; - mdk_rdev_t *rdev; + struct md_rdev *rdev; sector_t size; rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); @@ -2267,7 +3226,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi return ERR_PTR(-ENOMEM); } - if ((err = alloc_disk_sb(rdev))) + err = md_rdev_init(rdev); + if (err) + goto abort_free; + err = alloc_disk_sb(rdev); + if (err) goto abort_free; err = lock_rdev(rdev, newdev, super_format == -2); @@ -2276,17 +3239,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj, &rdev_ktype); - rdev->desc_nr = -1; - rdev->saved_raid_disk = -1; - rdev->raid_disk = -1; - rdev->flags = 0; - rdev->data_offset = 0; - rdev->sb_events = 0; - atomic_set(&rdev->nr_pending, 0); - atomic_set(&rdev->read_errors, 0); - atomic_set(&rdev->corrected_errors, 0); - - size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; + size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; if (!size) { printk(KERN_WARNING "md: %s has zero or unknown size, marking faulty!\n", @@ -2314,17 +3267,12 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi } } - INIT_LIST_HEAD(&rdev->same_set); - init_waitqueue_head(&rdev->blocked_wait); - return rdev; abort_free: - if (rdev->sb_page) { - if (rdev->bdev) - unlock_rdev(rdev); - free_disk_sb(rdev); - } + if (rdev->bdev) + unlock_rdev(rdev); + md_rdev_clear(rdev); kfree(rdev); return ERR_PTR(err); } @@ -2334,15 +3282,14 @@ abort_free: */ -static void analyze_sbs(mddev_t * mddev) +static void analyze_sbs(struct mddev * mddev) { int i; - struct list_head *tmp; - mdk_rdev_t *rdev, *freshest; + struct md_rdev *rdev, *freshest, *tmp; char b[BDEVNAME_SIZE]; freshest = NULL; - rdev_for_each(rdev, tmp, mddev) + rdev_for_each_safe(rdev, tmp, mddev) switch (super_types[mddev->major_version]. load_super(rdev, freshest, mddev->minor_version)) { case 1: @@ -2363,7 +3310,17 @@ static void analyze_sbs(mddev_t * mddev) validate_super(mddev, freshest); i = 0; - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { + if (mddev->max_disks && + (rdev->desc_nr >= mddev->max_disks || + i > mddev->max_disks)) { + printk(KERN_WARNING + "md: %s: %s: only %d devices permitted\n", + mdname(mddev), bdevname(rdev->bdev, b), + mddev->max_disks); + kick_rdev_from_array(rdev); + continue; + } if (rdev != freshest) if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { @@ -2377,60 +3334,69 @@ static void analyze_sbs(mddev_t * mddev) rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= mddev->raid_disks) { + } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } } +} - - - if (mddev->recovery_cp != MaxSector && - mddev->level >= 1) - printk(KERN_ERR "md: %s: raid array is not clean" - " -- starting background reconstruction\n", - mdname(mddev)); - +/* Read a fixed-point number. + * Numbers in sysfs attributes should be in "standard" units where + * possible, so time should be in seconds. + * However we internally use a a much smaller unit such as + * milliseconds or jiffies. 
+ * This function takes a decimal number with a possible fractional + * component, and produces an integer which is the result of + * multiplying that number by 10^'scale'. + * all without any floating-point arithmetic. + */ +int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) +{ + unsigned long result = 0; + long decimals = -1; + while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { + if (*cp == '.') + decimals = 0; + else if (decimals < scale) { + unsigned int value; + value = *cp - '0'; + result = result * 10 + value; + if (decimals >= 0) + decimals++; + } + cp++; + } + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + if (decimals < 0) + decimals = 0; + while (decimals < scale) { + result *= 10; + decimals ++; + } + *res = result; + return 0; } + static void md_safemode_timeout(unsigned long data); static ssize_t -safe_delay_show(mddev_t *mddev, char *page) +safe_delay_show(struct mddev *mddev, char *page) { int msec = (mddev->safemode_delay*1000)/HZ; return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); } static ssize_t -safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) +safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { - int scale=1; - int dot=0; - int i; unsigned long msec; - char buf[30]; - char *e; - /* remove a period, and count digits after it */ - if (len >= sizeof(buf)) - return -EINVAL; - strlcpy(buf, cbuf, len); - buf[len] = 0; - for (i=0; i<len; i++) { - if (dot) { - if (isdigit(buf[i])) { - buf[i-1] = buf[i]; - scale *= 10; - } - buf[i] = 0; - } else if (buf[i] == '.') { - dot=1; - buf[i] = 0; - } - } - msec = simple_strtoul(buf, &e, 10); - if (e == buf || (*e && *e != '\n')) + + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) return -EINVAL; - msec = (msec * 1000) / scale; if (msec == 0) mddev->safemode_delay = 0; else { @@ -2438,7 +3404,7 @@ safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) mddev->safemode_delay = (msec*HZ)/1000; if (mddev->safemode_delay == 0) mddev->safemode_delay = 1; - if (mddev->safemode_delay < old_delay) + if (mddev->safemode_delay < old_delay || old_delay == 0) md_safemode_timeout((unsigned long)mddev); } return len; @@ -2447,9 +3413,9 @@ static struct md_sysfs_entry md_safe_delay = __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t -level_show(mddev_t *mddev, char *page) +level_show(struct mddev *mddev, char *page) { - struct mdk_personality *p = mddev->pers; + struct md_personality *p = mddev->pers; if (p) return sprintf(page, "%s\n", p->name); else if (mddev->clevel[0]) @@ -2461,20 +3427,184 @@ level_show(mddev_t *mddev, char *page) } static ssize_t -level_store(mddev_t *mddev, const char *buf, size_t len) +level_store(struct mddev *mddev, const char *buf, size_t len) { + char clevel[16]; ssize_t rv = len; - if (mddev->pers) + struct md_personality *pers; + long level; + void *priv; + struct md_rdev *rdev; + + if (mddev->pers == NULL) { + if (len == 0) + return 0; + if (len >= sizeof(mddev->clevel)) + return -ENOSPC; + strncpy(mddev->clevel, buf, len); + if (mddev->clevel[len-1] == '\n') + len--; + mddev->clevel[len] = 0; + mddev->level = LEVEL_NONE; + return rv; + } + if (mddev->ro) + return -EROFS; + + /* request to change the personality. Need to ensure: + * - array is not engaged in resync/recovery/reshape + * - old personality can be suspended + * - new personality will access other array. 
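As an aside, strict_strtoul_scaled() above is what lets safe_mode_delay accept a value such as "0.200" in seconds and store it internally in milliseconds (scale 3) using only integer arithmetic: digits are accumulated until 'scale' fractional digits have been consumed, extra fractional digits are ignored, and the result is then padded with trailing zeros. A rough userspace equivalent, under the invented name parse_scaled(), is sketched below; it mirrors the helper shown above rather than reproducing kernel code:

#include <ctype.h>
#include <stdio.h>

/* fold "D[.DDD...]" into an integer scaled by 10^scale, e.g. scale 3
 * turns a value in seconds into milliseconds */
static int parse_scaled(const char *cp, unsigned long *res, int scale)
{
        unsigned long result = 0;
        long decimals = -1;     /* -1 until a '.' has been seen */

        while (isdigit((unsigned char)*cp) || (*cp == '.' && decimals < 0)) {
                if (*cp == '.')
                        decimals = 0;
                else if (decimals < scale) {
                        result = result * 10 + (*cp - '0');
                        if (decimals >= 0)
                                decimals++;
                }
                cp++;
        }
        if (*cp == '\n')
                cp++;
        if (*cp)
                return -1;      /* trailing junk */
        if (decimals < 0)
                decimals = 0;
        while (decimals < scale) {
                result *= 10;
                decimals++;
        }
        *res = result;
        return 0;
}

int main(void)
{
        unsigned long msec;

        if (parse_scaled("1.53\n", &msec, 3) == 0)
                printf("1.53 -> %lu\n", msec);  /* 1530 */
        if (parse_scaled("2\n", &msec, 3) == 0)
                printf("2 -> %lu\n", msec);     /* 2000 */
        return 0;
}

safe_delay_store() then converts the millisecond value to jiffies with (msec*HZ)/1000, as shown in the hunk above.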
+ */ + + if (mddev->sync_thread || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) return -EBUSY; - if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') + + if (!mddev->pers->quiesce) { + printk(KERN_WARNING "md: %s: %s does not support online personality change\n", + mdname(mddev), mddev->pers->name); + return -EINVAL; + } + + /* Now find the new personality */ + if (len == 0 || len >= sizeof(clevel)) + return -EINVAL; + strncpy(clevel, buf, len); + if (clevel[len-1] == '\n') len--; - mddev->clevel[len] = 0; - mddev->level = LEVEL_NONE; + clevel[len] = 0; + if (kstrtol(clevel, 10, &level)) + level = LEVEL_NONE; + + if (request_module("md-%s", clevel) != 0) + request_module("md-level-%s", clevel); + spin_lock(&pers_lock); + pers = find_pers(level, clevel); + if (!pers || !try_module_get(pers->owner)) { + spin_unlock(&pers_lock); + printk(KERN_WARNING "md: personality %s not loaded\n", clevel); + return -EINVAL; + } + spin_unlock(&pers_lock); + + if (pers == mddev->pers) { + /* Nothing to do! */ + module_put(pers->owner); + return rv; + } + if (!pers->takeover) { + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", + mdname(mddev), clevel); + return -EINVAL; + } + + rdev_for_each(rdev, mddev) + rdev->new_raid_disk = rdev->raid_disk; + + /* ->takeover must set new_* and/or delta_disks + * if it succeeds, and may set them when it fails. + */ + priv = pers->takeover(mddev); + if (IS_ERR(priv)) { + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks -= mddev->delta_disks; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + module_put(pers->owner); + printk(KERN_WARNING "md: %s: %s would not accept array\n", + mdname(mddev), clevel); + return PTR_ERR(priv); + } + + /* Looks like we have a winner */ + mddev_suspend(mddev); + mddev->pers->stop(mddev); + + if (mddev->pers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + printk(KERN_WARNING + "md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + } + if (mddev->pers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + + if (mddev->pers->sync_request == NULL && + mddev->external) { + /* We are converting from a no-redundancy array + * to a redundancy array and metadata is managed + * externally so we need to be sure that writes + * won't block due to a need to transition + * clean->dirty + * until external management is started. 
+ */ + mddev->in_sync = 0; + mddev->safemode_delay = 0; + mddev->safemode = 0; + } + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk >= mddev->raid_disks) + rdev->new_raid_disk = -1; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + sysfs_unlink_rdev(mddev, rdev); + } + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + if (rdev->new_raid_disk == rdev->raid_disk) + continue; + rdev->raid_disk = rdev->new_raid_disk; + if (rdev->raid_disk < 0) + clear_bit(In_sync, &rdev->flags); + else { + if (sysfs_link_rdev(mddev, rdev)) + printk(KERN_WARNING "md: cannot register rd%d" + " for %s after level change\n", + rdev->raid_disk, mdname(mddev)); + } + } + + module_put(mddev->pers->owner); + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = mddev->new_chunk_sectors; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->degraded = 0; + if (mddev->pers->sync_request == NULL) { + /* this is now an array without redundancy, so + * it must always be in_sync + */ + mddev->in_sync = 1; + del_timer_sync(&mddev->safemode_timer); + } + blk_set_stacking_limits(&mddev->queue->limits); + pers->run(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + mddev_resume(mddev); + if (!mddev->thread) + md_update_sb(mddev, 1); + sysfs_notify(&mddev->kobj, NULL, "level"); + md_new_event(mddev); return rv; } @@ -2483,7 +3613,7 @@ __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); static ssize_t -layout_show(mddev_t *mddev, char *page) +layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ if (mddev->reshape_position != MaxSector && @@ -2494,7 +3624,7 @@ layout_show(mddev_t *mddev, char *page) } static ssize_t -layout_store(mddev_t *mddev, const char *buf, size_t len) +layout_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); @@ -2502,12 +3632,23 @@ layout_store(mddev_t *mddev, const char *buf, size_t len) if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - if (mddev->reshape_position != MaxSector) + if (mddev->pers) { + int err; + if (mddev->pers->check_reshape == NULL) + return -EBUSY; + if (mddev->ro) + return -EROFS; mddev->new_layout = n; - else - mddev->layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) { + mddev->new_layout = mddev->layout; + return err; + } + } else { + mddev->new_layout = n; + if (mddev->reshape_position == MaxSector) + mddev->layout = n; + } return len; } static struct md_sysfs_entry md_layout = @@ -2515,7 +3656,7 @@ __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); static ssize_t -raid_disks_show(mddev_t *mddev, char *page) +raid_disks_show(struct mddev *mddev, char *page) { if (mddev->raid_disks == 0) return 0; @@ -2526,10 +3667,10 @@ raid_disks_show(mddev_t *mddev, char *page) return sprintf(page, "%d\n", mddev->raid_disks); } -static int update_raid_disks(mddev_t *mddev, int raid_disks); +static int update_raid_disks(struct mddev *mddev, int raid_disks); static ssize_t -raid_disks_store(mddev_t *mddev, const char *buf, size_t len) +raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { char *e; int rv = 0; @@ -2541,9 +3682,20 @@ raid_disks_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) rv = update_raid_disks(mddev, n); else if (mddev->reshape_position 
!= MaxSector) { + struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; + + rdev_for_each(rdev, mddev) { + if (olddisks < n && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (olddisks > n && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } mddev->delta_disks = n - olddisks; mddev->raid_disks = n; + mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; return rv ? rv : len; @@ -2552,54 +3704,71 @@ static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t -chunk_size_show(mddev_t *mddev, char *page) +chunk_size_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector && - mddev->chunk_size != mddev->new_chunk) - return sprintf(page, "%d (%d)\n", mddev->new_chunk, - mddev->chunk_size); - return sprintf(page, "%d\n", mddev->chunk_size); + mddev->chunk_sectors != mddev->new_chunk_sectors) + return sprintf(page, "%d (%d)\n", + mddev->new_chunk_sectors << 9, + mddev->chunk_sectors << 9); + return sprintf(page, "%d\n", mddev->chunk_sectors << 9); } static ssize_t -chunk_size_store(mddev_t *mddev, const char *buf, size_t len) +chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { - /* can only set chunk_size if array is not yet active */ char *e; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; - if (mddev->pers) - return -EBUSY; - else if (mddev->reshape_position != MaxSector) - mddev->new_chunk = n; - else - mddev->chunk_size = n; + if (mddev->pers) { + int err; + if (mddev->pers->check_reshape == NULL) + return -EBUSY; + if (mddev->ro) + return -EROFS; + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) { + mddev->new_chunk_sectors = mddev->chunk_sectors; + return err; + } + } else { + mddev->new_chunk_sectors = n >> 9; + if (mddev->reshape_position == MaxSector) + mddev->chunk_sectors = n >> 9; + } return len; } static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t -resync_start_show(mddev_t *mddev, char *page) +resync_start_show(struct mddev *mddev, char *page) { + if (mddev->recovery_cp == MaxSector) + return sprintf(page, "none\n"); return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); } static ssize_t -resync_start_store(mddev_t *mddev, const char *buf, size_t len) +resync_start_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long n = simple_strtoull(buf, &e, 10); - if (mddev->pers) + if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) return -EBUSY; - if (!*buf || (*e && *e != '\n')) + if (cmd_match(buf, "none")) + n = MaxSector; + else if (!*buf || (*e && *e != '\n')) return -EINVAL; mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); return len; } static struct md_sysfs_entry md_resync_start = @@ -2657,7 +3826,7 @@ static int match_word(const char *word, char **list) } static ssize_t -array_state_show(mddev_t *mddev, char *page) +array_state_show(struct mddev *mddev, char *page) { enum array_state st = inactive; @@ -2672,7 +3841,7 @@ array_state_show(mddev_t *mddev, char *page) case 0: if (mddev->in_sync) st = clean; - else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) st = write_pending; else if (mddev->safemode) st = active_idle; @@ -2682,7 +3851,7 @@ 
array_state_show(mddev_t *mddev, char *page) else { if (list_empty(&mddev->disks) && mddev->raid_disks == 0 && - mddev->size == 0) + mddev->dev_sectors == 0) st = clear; else st = inactive; @@ -2690,12 +3859,13 @@ array_state_show(mddev_t *mddev, char *page) return sprintf(page, "%s\n", array_states[st]); } -static int do_md_stop(mddev_t * mddev, int ro, int is_open); -static int do_md_run(mddev_t * mddev); -static int restart_array(mddev_t *mddev); +static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); +static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); +static int do_md_run(struct mddev * mddev); +static int restart_array(struct mddev *mddev); static ssize_t -array_state_store(mddev_t *mddev, const char *buf, size_t len) +array_state_store(struct mddev *mddev, const char *buf, size_t len) { int err = -EINVAL; enum array_state st = match_word(buf, array_states); @@ -2704,24 +3874,20 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; case clear: /* stopping an active array */ - if (atomic_read(&mddev->openers) > 0) - return -EBUSY; - err = do_md_stop(mddev, 0, 0); + err = do_md_stop(mddev, 0, NULL); break; case inactive: /* stopping an active array */ - if (mddev->pers) { - if (atomic_read(&mddev->openers) > 0) - return -EBUSY; - err = do_md_stop(mddev, 2, 0); - } else + if (mddev->pers) + err = do_md_stop(mddev, 2, NULL); + else err = 0; /* already inactive */ break; case suspended: break; /* not supported yet */ case readonly: if (mddev->pers) - err = do_md_stop(mddev, 1, 0); + err = md_set_readonly(mddev, NULL); else { mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); @@ -2730,9 +3896,9 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) break; case read_auto: if (mddev->pers) { - if (mddev->ro != 1) - err = do_md_stop(mddev, 1, 0); - else + if (mddev->ro == 0) + err = md_set_readonly(mddev, NULL); + else if (mddev->ro == 1) err = restart_array(mddev); if (err == 0) { mddev->ro = 2; @@ -2752,25 +3918,19 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) mddev->in_sync = 1; if (mddev->safemode == 1) mddev->safemode = 0; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, - &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } err = 0; } else err = -EBUSY; spin_unlock_irq(&mddev->write_lock); - } else { - mddev->ro = 0; - mddev->recovery_cp = MaxSector; - err = do_md_run(mddev); - } + } else + err = -EINVAL; break; case active: if (mddev->pers) { restart_array(mddev); - if (mddev->external) - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -2787,7 +3947,9 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (err) return err; else { - sysfs_notify(&mddev->kobj, NULL, "array_state"); + if (mddev->hold_active == UNTIL_IOCTL) + mddev->hold_active = 0; + sysfs_notify_dirent_safe(mddev->sysfs_state); return len; } } @@ -2795,13 +3957,36 @@ static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t -null_show(mddev_t *mddev, char *page) +max_corrected_read_errors_show(struct mddev *mddev, char *page) { + return sprintf(page, "%d\n", + atomic_read(&mddev->max_corr_read_errors)); +} + +static ssize_t +max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) { + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + + if (*buf && (*e == 0 || *e == '\n')) { + 
atomic_set(&mddev->max_corr_read_errors, n); + return len; + } return -EINVAL; } +static struct md_sysfs_entry max_corr_read_errors = +__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, + max_corrected_read_errors_store); + static ssize_t -new_dev_store(mddev_t *mddev, const char *buf, size_t len) +null_show(struct mddev *mddev, char *page) +{ + return -EINVAL; +} + +static ssize_t +new_dev_store(struct mddev *mddev, const char *buf, size_t len) { /* buf must be %d:%d\n? giving major and minor numbers */ /* The new device is added to the array. @@ -2814,7 +3999,7 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len) int major = simple_strtoul(buf, &e, 10); int minor; dev_t dev; - mdk_rdev_t *rdev; + struct md_rdev *rdev; int err; if (!*buf || *e != ':' || !e[1] || e[1] == '\n') @@ -2832,8 +4017,9 @@ new_dev_store(mddev_t *mddev, const char *buf, size_t len) rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { - mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, - mdk_rdev_t, same_set); + struct md_rdev *rdev0 + = list_entry(mddev->disks.next, + struct md_rdev, same_set); err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) @@ -2857,7 +4043,7 @@ static struct md_sysfs_entry md_new_device = __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); static ssize_t -bitmap_store(mddev_t *mddev, const char *buf, size_t len) +bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; @@ -2875,8 +4061,7 @@ bitmap_store(mddev_t *mddev, const char *buf, size_t len) } if (*end && !isspace(*end)) break; bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); - buf = end; - while (isspace(*buf)) buf++; + buf = skip_spaces(end); } bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ out: @@ -2887,34 +4072,33 @@ static struct md_sysfs_entry md_bitmap = __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); static ssize_t -size_show(mddev_t *mddev, char *page) +size_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", (unsigned long long)mddev->size); + return sprintf(page, "%llu\n", + (unsigned long long)mddev->dev_sectors / 2); } -static int update_size(mddev_t *mddev, sector_t num_sectors); +static int update_size(struct mddev *mddev, sector_t num_sectors); static ssize_t -size_store(mddev_t *mddev, const char *buf, size_t len) +size_store(struct mddev *mddev, const char *buf, size_t len) { /* If array is inactive, we can reduce the component size, but * not increase it (except from 0). * If array is active, we can try an on-line resize */ - char *e; - int err = 0; - unsigned long long size = simple_strtoull(buf, &e, 10); - if (!*buf || *buf == '\n' || - (*e && *e != '\n')) - return -EINVAL; + sector_t sectors; + int err = strict_blocks_to_sectors(buf, §ors); + if (err < 0) + return err; if (mddev->pers) { - err = update_size(mddev, size * 2); + err = update_size(mddev, sectors); md_update_sb(mddev, 1); } else { - if (mddev->size == 0 || - mddev->size > size) - mddev->size = size; + if (mddev->dev_sectors == 0 || + mddev->dev_sectors > sectors) + mddev->dev_sectors = sectors; else err = -ENOSPC; } @@ -2925,14 +4109,14 @@ static struct md_sysfs_entry md_size = __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); -/* Metdata version. +/* Metadata version. * This is one of * 'none' for arrays with no metadata (good luck...) 
* 'external' for arrays with externally managed metadata, * or N.M for internally known formats */ static ssize_t -metadata_show(mddev_t *mddev, char *page) +metadata_show(struct mddev *mddev, char *page) { if (mddev->persistent) return sprintf(page, "%d.%d\n", @@ -2944,11 +4128,17 @@ metadata_show(mddev_t *mddev, char *page) } static ssize_t -metadata_store(mddev_t *mddev, const char *buf, size_t len) +metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; - if (!list_empty(&mddev->disks)) + /* Changing the details of 'external' metadata is + * always permitted. Otherwise there must be + * no devices attached to the array. + */ + if (mddev->external && strncmp(buf, "external:", 9) == 0) + ; + else if (!list_empty(&mddev->disks)) return -EBUSY; if (cmd_match(buf, "none")) { @@ -2992,10 +4182,12 @@ static struct md_sysfs_entry md_metadata = __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t -action_show(mddev_t *mddev, char *page) +action_show(struct mddev *mddev, char *page) { char *type = "idle"; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + type = "frozen"; + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) type = "reshape"; @@ -3013,17 +4205,20 @@ action_show(mddev_t *mddev, char *page) } static ssize_t -action_store(mddev_t *mddev, const char *page, size_t len) +action_store(struct mddev *mddev, const char *page, size_t len) { if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (cmd_match(page, "idle")) { + if (cmd_match(page, "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - mddev->recovery = 0; + md_reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -3049,34 +4244,49 @@ action_store(mddev_t *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + md_wakeup_thread(mddev->sync_thread); + } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + sysfs_notify_dirent_safe(mddev->sysfs_action); return len; } +static struct md_sysfs_entry md_scan_mode = +__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); + static ssize_t -mismatch_cnt_show(mddev_t *mddev, char *page) +last_sync_action_show(struct mddev *mddev, char *page) { - return sprintf(page, "%llu\n", - (unsigned long long) mddev->resync_mismatches); + return sprintf(page, "%s\n", mddev->last_sync_action); } -static struct md_sysfs_entry md_scan_mode = -__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); +static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); +static ssize_t +mismatch_cnt_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long) + atomic64_read(&mddev->resync_mismatches)); +} static struct md_sysfs_entry 
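The sync_action hunks above add a "frozen" state, reap the sync thread through md_reap_sync_thread() when "idle" or "frozen" is written, and turn resync_mismatches into a 64-bit atomic. A sketch of the userspace side (md0 assumed; the counter is only meaningful once the scrub has finished):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[32];
        ssize_t n;
        int fd = open("/sys/block/md0/md/sync_action", O_WRONLY);

        /* request a scrub; "repair" would also rewrite mismatches */
        if (fd < 0 || write(fd, "check", 5) != 5) {
                perror("sync_action");
                return 1;
        }
        close(fd);

        fd = open("/sys/block/md0/md/mismatch_cnt", O_RDONLY);
        n = fd >= 0 ? read(fd, buf, sizeof(buf) - 1) : -1;
        if (n > 0) {
                buf[n] = '\0';
                printf("mismatch_cnt: %s", buf);
        }
        if (fd >= 0)
                close(fd);
        return 0;
}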
md_mismatches = __ATTR_RO(mismatch_cnt); static ssize_t -sync_min_show(mddev_t *mddev, char *page) +sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), mddev->sync_speed_min ? "local": "system"); } static ssize_t -sync_min_store(mddev_t *mddev, const char *buf, size_t len) +sync_min_store(struct mddev *mddev, const char *buf, size_t len) { int min; char *e; @@ -3095,14 +4305,14 @@ static struct md_sysfs_entry md_sync_min = __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); static ssize_t -sync_max_show(mddev_t *mddev, char *page) +sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), mddev->sync_speed_max ? "local": "system"); } static ssize_t -sync_max_store(mddev_t *mddev, const char *buf, size_t len) +sync_max_store(struct mddev *mddev, const char *buf, size_t len) { int max; char *e; @@ -3121,24 +4331,24 @@ static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t -degraded_show(mddev_t *mddev, char *page) +degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); } static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); static ssize_t -sync_force_parallel_show(mddev_t *mddev, char *page) +sync_force_parallel_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->parallel_resync); } static ssize_t -sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) +sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) { long n; - if (strict_strtol(buf, 10, &n)) + if (kstrtol(buf, 10, &n)) return -EINVAL; if (n != 0 && n != 1) @@ -3158,9 +4368,11 @@ __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, sync_force_parallel_show, sync_force_parallel_store); static ssize_t -sync_speed_show(mddev_t *mddev, char *page) +sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; + if (mddev->curr_resync == 0) + return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; if (!dt) dt++; @@ -3171,32 +4383,40 @@ sync_speed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); static ssize_t -sync_completed_show(mddev_t *mddev, char *page) +sync_completed_show(struct mddev *mddev, char *page) { - unsigned long max_blocks, resync; + unsigned long long max_sectors, resync; + + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return sprintf(page, "none\n"); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_blocks = mddev->resync_max_sectors; + if (mddev->curr_resync == 1 || + mddev->curr_resync == 2) + return sprintf(page, "delayed\n"); + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sectors = mddev->resync_max_sectors; else - max_blocks = mddev->size << 1; + max_sectors = mddev->dev_sectors; - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); - return sprintf(page, "%lu / %lu\n", resync, max_blocks); + resync = mddev->curr_resync_completed; + return sprintf(page, "%llu / %llu\n", resync, max_sectors); } static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); static ssize_t -min_sync_show(mddev_t *mddev, char *page) +min_sync_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_min); } static ssize_t 
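sync_completed_show() above now reports progress in sectors as "done / total", or "none" and "delayed" when no resync is running, and sync_speed likewise returns "none" when idle. A small parsing sketch (md0 assumed, not part of the commit):

#include <stdio.h>

int main(void)
{
        unsigned long long done, total;
        char buf[64];
        FILE *f = fopen("/sys/block/md0/md/sync_completed", "r");

        if (!f) {
                perror("sync_completed");
                return 1;
        }
        if (!fgets(buf, sizeof(buf), f)) {
                fclose(f);
                return 1;
        }
        if (sscanf(buf, "%llu / %llu", &done, &total) == 2 && total)
                printf("resync %.1f%% complete\n", 100.0 * done / total);
        else
                printf("resync: %s", buf);      /* "none" or "delayed" */
        fclose(f);
        return 0;
}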
-min_sync_store(mddev_t *mddev, const char *buf, size_t len) +min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; - if (strict_strtoull(buf, 10, &min)) + if (kstrtoull(buf, 10, &min)) return -EINVAL; if (min > mddev->resync_max) return -EINVAL; @@ -3204,8 +4424,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len) return -EBUSY; /* Must be a multiple of chunk_size */ - if (mddev->chunk_size) { - if (min & (sector_t)((mddev->chunk_size>>9)-1)) + if (mddev->chunk_sectors) { + sector_t temp = min; + if (sector_div(temp, mddev->chunk_sectors)) return -EINVAL; } mddev->resync_min = min; @@ -3217,7 +4438,7 @@ static struct md_sysfs_entry md_min_sync = __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); static ssize_t -max_sync_show(mddev_t *mddev, char *page) +max_sync_show(struct mddev *mddev, char *page) { if (mddev->resync_max == MaxSector) return sprintf(page, "max\n"); @@ -3226,23 +4447,25 @@ max_sync_show(mddev_t *mddev, char *page) (unsigned long long)mddev->resync_max); } static ssize_t -max_sync_store(mddev_t *mddev, const char *buf, size_t len) +max_sync_store(struct mddev *mddev, const char *buf, size_t len) { if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; - if (strict_strtoull(buf, 10, &max)) + if (kstrtoull(buf, 10, &max)) return -EINVAL; if (max < mddev->resync_min) return -EINVAL; if (max < mddev->resync_max && + mddev->ro == 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; /* Must be a multiple of chunk_size */ - if (mddev->chunk_size) { - if (max & (sector_t)((mddev->chunk_size>>9)-1)) + if (mddev->chunk_sectors) { + sector_t temp = max; + if (sector_div(temp, mddev->chunk_sectors)) return -EINVAL; } mddev->resync_max = max; @@ -3255,63 +4478,74 @@ static struct md_sysfs_entry md_max_sync = __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); static ssize_t -suspend_lo_show(mddev_t *mddev, char *page) +suspend_lo_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); } static ssize_t -suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) +suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_lo; - if (mddev->pers->quiesce == NULL) + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if (new >= mddev->suspend_hi || - (new > mddev->suspend_lo && new < mddev->suspend_hi)) { - mddev->suspend_lo = new; + + mddev->suspend_lo = new; + if (new >= old) + /* Shrinking suspended region */ mddev->pers->quiesce(mddev, 2); - return len; - } else - return -EINVAL; + else { + /* Expanding suspended region - need to wait */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + return len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); static ssize_t -suspend_hi_show(mddev_t *mddev, char *page) +suspend_hi_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); } static ssize_t -suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) +suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); + unsigned long long old = mddev->suspend_hi; - if (mddev->pers->quiesce == NULL) 
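Both sync_min and sync_max stores above now validate chunk alignment with sector_div() against chunk_sectors instead of masking against the old byte-based chunk_size. The rule restated in plain C for illustration only (this is not the kernel code):

#include <stdio.h>

static int aligned_to_chunk(unsigned long long sectors,
                            unsigned int chunk_sectors)
{
        /* chunk_sectors == 0 means no chunking, so anything goes */
        return chunk_sectors == 0 || (sectors % chunk_sectors) == 0;
}

int main(void)
{
        /* 1024-sector (512 KiB) chunks: 4096 is accepted, 4100 is -EINVAL */
        printf("%d %d\n", aligned_to_chunk(4096, 1024),
               aligned_to_chunk(4100, 1024));
        return 0;
}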
+ if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; - if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || - (new > mddev->suspend_lo && new > mddev->suspend_hi)) { - mddev->suspend_hi = new; + + mddev->suspend_hi = new; + if (new <= old) + /* Shrinking suspended region */ + mddev->pers->quiesce(mddev, 2); + else { + /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); - return len; - } else - return -EINVAL; + } + return len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); static ssize_t -reshape_position_show(mddev_t *mddev, char *page) +reshape_position_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector) return sprintf(page, "%llu\n", @@ -3321,8 +4555,9 @@ reshape_position_show(mddev_t *mddev, char *page) } static ssize_t -reshape_position_store(mddev_t *mddev, const char *buf, size_t len) +reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { + struct md_rdev *rdev; char *e; unsigned long long new = simple_strtoull(buf, &e, 10); if (mddev->pers) @@ -3331,9 +4566,12 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len) return -EINVAL; mddev->reshape_position = new; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; mddev->new_level = mddev->level; mddev->new_layout = mddev->layout; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; return len; } @@ -3341,6 +4579,84 @@ static struct md_sysfs_entry md_reshape_position = __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, reshape_position_store); +static ssize_t +reshape_direction_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%s\n", + mddev->reshape_backwards ? 
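With the suspend_lo/suspend_hi rework above, any new boundary is accepted: shrinking the suspended region only needs quiesce(mddev, 2), while growing it quiesces and resumes to drain in-flight I/O. A userspace sketch of driving the pair (md0 and the sector values are assumptions; this is typically done by reshape-assist tooling, not by hand):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_attr(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0 || write(fd, val, strlen(val)) < 0) {
                perror(path);
                return -1;
        }
        close(fd);
        return 0;
}

int main(void)
{
        /* suspend I/O to sectors [0, 8192) of md0 */
        if (write_attr("/sys/block/md0/md/suspend_hi", "8192") ||
            write_attr("/sys/block/md0/md/suspend_lo", "0"))
                return 1;
        return 0;
}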
"backwards" : "forwards"); +} + +static ssize_t +reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) +{ + int backwards = 0; + if (cmd_match(buf, "forwards")) + backwards = 0; + else if (cmd_match(buf, "backwards")) + backwards = 1; + else + return -EINVAL; + if (mddev->reshape_backwards == backwards) + return len; + + /* check if we are allowed to change */ + if (mddev->delta_disks) + return -EBUSY; + + if (mddev->persistent && + mddev->major_version == 0) + return -EINVAL; + + mddev->reshape_backwards = backwards; + return len; +} + +static struct md_sysfs_entry md_reshape_direction = +__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, + reshape_direction_store); + +static ssize_t +array_size_show(struct mddev *mddev, char *page) +{ + if (mddev->external_size) + return sprintf(page, "%llu\n", + (unsigned long long)mddev->array_sectors/2); + else + return sprintf(page, "default\n"); +} + +static ssize_t +array_size_store(struct mddev *mddev, const char *buf, size_t len) +{ + sector_t sectors; + + if (strncmp(buf, "default", 7) == 0) { + if (mddev->pers) + sectors = mddev->pers->size(mddev, 0, 0); + else + sectors = mddev->array_sectors; + + mddev->external_size = 0; + } else { + if (strict_blocks_to_sectors(buf, §ors) < 0) + return -EINVAL; + if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + return -E2BIG; + + mddev->external_size = 1; + } + + mddev->array_sectors = sectors; + if (mddev->pers) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } + return len; +} + +static struct md_sysfs_entry md_array_size = +__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, + array_size_store); static struct attribute *md_default_attrs[] = { &md_level.attr, @@ -3354,11 +4670,15 @@ static struct attribute *md_default_attrs[] = { &md_safe_delay.attr, &md_array_state.attr, &md_reshape_position.attr, + &md_reshape_direction.attr, + &md_array_size.attr, + &max_corr_read_errors.attr, NULL, }; static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, + &md_last_scan_mode.attr, &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, @@ -3383,16 +4703,25 @@ static ssize_t md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); - mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->show) return -EIO; + spin_lock(&all_mddevs_lock); + if (list_empty(&mddev->all_mddevs)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + rv = mddev_lock(mddev); if (!rv) { rv = entry->show(mddev, page); mddev_unlock(mddev); } + mddev_put(mddev); return rv; } @@ -3401,28 +4730,49 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); - mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + struct mddev *mddev = container_of(kobj, struct mddev, kobj); ssize_t rv; if (!entry->store) return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + spin_lock(&all_mddevs_lock); + if (list_empty(&mddev->all_mddevs)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + if (entry->store == new_dev_store) + flush_workqueue(md_misc_wq); rv = mddev_lock(mddev); if (!rv) { rv = entry->store(mddev, page, length); 
mddev_unlock(mddev); } + mddev_put(mddev); return rv; } static void md_free(struct kobject *ko) { - mddev_t *mddev = container_of(ko, mddev_t, kobj); + struct mddev *mddev = container_of(ko, struct mddev, kobj); + + if (mddev->sysfs_state) + sysfs_put(mddev->sysfs_state); + + if (mddev->gendisk) { + del_gendisk(mddev->gendisk); + put_disk(mddev->gendisk); + } + if (mddev->queue) + blk_cleanup_queue(mddev->queue); + kfree(mddev); } -static struct sysfs_ops md_sysfs_ops = { +static const struct sysfs_ops md_sysfs_ops = { .show = md_attr_show, .store = md_attr_store, }; @@ -3434,76 +4784,164 @@ static struct kobj_type md_ktype = { int mdp_major = 0; -static struct kobject *md_probe(dev_t dev, int *part, void *data) +static void mddev_delayed_delete(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, del_work); + + sysfs_remove_group(&mddev->kobj, &md_bitmap_group); + kobject_del(&mddev->kobj); + kobject_put(&mddev->kobj); +} + +static int md_alloc(dev_t dev, char *name) { static DEFINE_MUTEX(disks_mutex); - mddev_t *mddev = mddev_find(dev); + struct mddev *mddev = mddev_find(dev); struct gendisk *disk; - int partitioned = (MAJOR(dev) != MD_MAJOR); - int shift = partitioned ? MdpMinorShift : 0; - int unit = MINOR(dev) >> shift; + int partitioned; + int shift; + int unit; int error; if (!mddev) - return NULL; + return -ENODEV; + + partitioned = (MAJOR(mddev->unit) != MD_MAJOR); + shift = partitioned ? MdpMinorShift : 0; + unit = MINOR(mddev->unit) >> shift; + + /* wait for any previous instance of this device to be + * completely removed (mddev_delayed_delete). + */ + flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); - if (mddev->gendisk) { - mutex_unlock(&disks_mutex); - mddev_put(mddev); - return NULL; + error = -EEXIST; + if (mddev->gendisk) + goto abort; + + if (name) { + /* Need to ensure that 'name' is not a duplicate. + */ + struct mddev *mddev2; + spin_lock(&all_mddevs_lock); + + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) + if (mddev2->gendisk && + strcmp(mddev2->gendisk->disk_name, name) == 0) { + spin_unlock(&all_mddevs_lock); + goto abort; + } + spin_unlock(&all_mddevs_lock); } + + error = -ENOMEM; + mddev->queue = blk_alloc_queue(GFP_KERNEL); + if (!mddev->queue) + goto abort; + mddev->queue->queuedata = mddev; + + blk_queue_make_request(mddev->queue, md_make_request); + blk_set_stacking_limits(&mddev->queue->limits); + disk = alloc_disk(1 << shift); if (!disk) { - mutex_unlock(&disks_mutex); - mddev_put(mddev); - return NULL; + blk_cleanup_queue(mddev->queue); + mddev->queue = NULL; + goto abort; } - disk->major = MAJOR(dev); + disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; - if (partitioned) + if (name) + strcpy(disk->disk_name, name); + else if (partitioned) sprintf(disk->disk_name, "md_d%d", unit); else sprintf(disk->disk_name, "md%d", unit); disk->fops = &md_fops; disk->private_data = mddev; disk->queue = mddev->queue; - add_disk(disk); + blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); + /* Allow extended partitions. This makes the + * 'mdp' device redundant, but we can't really + * remove it now. 
+ */ + disk->flags |= GENHD_FL_EXT_DEVT; mddev->gendisk = disk; - error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj, - "%s", "md"); - mutex_unlock(&disks_mutex); - if (error) + /* As soon as we call add_disk(), another thread could get + * through to md_open, so make sure it doesn't get too far + */ + mutex_lock(&mddev->open_mutex); + add_disk(disk); + + error = kobject_init_and_add(&mddev->kobj, &md_ktype, + &disk_to_dev(disk)->kobj, "%s", "md"); + if (error) { + /* This isn't possible, but as kobject_init_and_add is marked + * __must_check, we must do something with the result + */ printk(KERN_WARNING "md: cannot register %s/md - name in use\n", disk->disk_name); - else + error = 0; + } + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_bitmap_group)) + printk(KERN_DEBUG "pointless warning\n"); + mutex_unlock(&mddev->open_mutex); + abort: + mutex_unlock(&disks_mutex); + if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); + mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + } + mddev_put(mddev); + return error; +} + +static struct kobject *md_probe(dev_t dev, int *part, void *data) +{ + md_alloc(dev, NULL); return NULL; } +static int add_named_array(const char *val, struct kernel_param *kp) +{ + /* val must be "md_*" where * is not all digits. + * We allocate an array with a large free minor number, and + * set the name to val. val must not already be an active name. + */ + int len = strlen(val); + char buf[DISK_NAME_LEN]; + + while (len && val[len-1] == '\n') + len--; + if (len >= DISK_NAME_LEN) + return -E2BIG; + strlcpy(buf, val, len+1); + if (strncmp(buf, "md_", 3) != 0) + return -EINVAL; + return md_alloc(0, buf); +} + static void md_safemode_timeout(unsigned long data) { - mddev_t *mddev = (mddev_t *) data; + struct mddev *mddev = (struct mddev *) data; if (!atomic_read(&mddev->writes_pending)) { mddev->safemode = 1; if (mddev->external) - set_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags); + sysfs_notify_dirent_safe(mddev->sysfs_state); } md_wakeup_thread(mddev->thread); } static int start_dirty_degraded; -static int do_md_run(mddev_t * mddev) +int md_run(struct mddev *mddev) { int err; - int chunk_size; - struct list_head *tmp; - mdk_rdev_t *rdev; - struct gendisk *disk; - struct mdk_personality *pers; - char b[BDEVNAME_SIZE]; + struct md_rdev *rdev; + struct md_personality *pers; if (list_empty(&mddev->disks)) /* cannot run an array with no devices.. 
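add_named_array() above backs a writable module parameter, so an array with a free minor can be created under a chosen "md_*" name instead of a numbered node. Assuming md is built as the md_mod module and the parameter is exposed in the usual place under /sys/module, a sketch:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* name must start with "md_" and must not already be in use */
        const char *name = "md_home\n";
        int fd = open("/sys/module/md_mod/parameters/new_array", O_WRONLY);

        if (fd < 0 || write(fd, name, strlen(name)) < 0) {
                perror("new_array");
                return 1;
        }
        close(fd);
        return 0;
}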
*/ @@ -3511,6 +4949,9 @@ static int do_md_run(mddev_t * mddev) if (mddev->pers) return -EBUSY; + /* Cannot run until previous stop completes properly */ + if (mddev->sysfs_active) + return -EBUSY; /* * Analyze all RAID superblock(s) @@ -3521,56 +4962,17 @@ static int do_md_run(mddev_t * mddev) analyze_sbs(mddev); } - chunk_size = mddev->chunk_size; - - if (chunk_size) { - if (chunk_size > MAX_CHUNK_SIZE) { - printk(KERN_ERR "too big chunk_size: %d > %d\n", - chunk_size, MAX_CHUNK_SIZE); - return -EINVAL; - } - /* - * chunk-size has to be a power of 2 and multiples of PAGE_SIZE - */ - if ( (1 << ffz(~chunk_size)) != chunk_size) { - printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); - return -EINVAL; - } - if (chunk_size < PAGE_SIZE) { - printk(KERN_ERR "too small chunk_size: %d < %ld\n", - chunk_size, PAGE_SIZE); - return -EINVAL; - } - - /* devices must have minimum size of one chunk */ - rdev_for_each(rdev, tmp, mddev) { - if (test_bit(Faulty, &rdev->flags)) - continue; - if (rdev->size < chunk_size / 1024) { - printk(KERN_WARNING - "md: Dev %s smaller than chunk_size:" - " %lluk < %dk\n", - bdevname(rdev->bdev,b), - (unsigned long long)rdev->size, - chunk_size / 1024); - return -EINVAL; - } - } - } - -#ifdef CONFIG_KMOD if (mddev->level != LEVEL_NONE) request_module("md-level-%d", mddev->level); else if (mddev->clevel[0]) request_module("md-%s", mddev->clevel); -#endif /* * Drop all container device buffers, from now on * the only valid external interface is through the md * device. */ - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); @@ -3578,11 +4980,13 @@ static int do_md_run(mddev_t * mddev) /* perform some consistency tests on the device. * We don't want the data to overlap the metadata, - * Internal Bitmap issues has handled elsewhere. + * Internal Bitmap issues have been handled elsewhere. */ - if (rdev->data_offset < rdev->sb_start) { - if (mddev->size && - rdev->data_offset + mddev->size*2 + if (rdev->meta_bdev) { + /* Nothing to check */; + } else if (rdev->data_offset < rdev->sb_start) { + if (mddev->dev_sectors && + rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { printk("md: %s: data overlaps metadata\n", mdname(mddev)); @@ -3596,13 +5000,11 @@ static int do_md_run(mddev_t * mddev) return -EINVAL; } } - sysfs_notify(&rdev->kobj, NULL, "state"); + sysfs_notify_dirent_safe(rdev->sysfs_state); } - md_probe(mddev->unit, NULL, NULL); - disk = mddev->gendisk; - if (!disk) - return -ENOMEM; + if (mddev->bio_set == NULL) + mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -3618,7 +5020,10 @@ static int do_md_run(mddev_t * mddev) } mddev->pers = pers; spin_unlock(&pers_lock); - mddev->level = pers->level; + if (mddev->level != pers->level) { + mddev->level = pers->level; + mddev->new_level = pers->level; + } strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && @@ -3634,11 +5039,11 @@ static int do_md_run(mddev_t * mddev) * configuration. 
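md_run() above keeps the sanity check that a member's data area must not overlap its superblock, now skipping rdevs whose metadata lives on a separate meta_bdev. The rule, restated in plain C purely as an illustration (sector units; the sb_sectors parameter is an assumption standing in for the superblock footprint):

#include <stdbool.h>
#include <stdio.h>

static bool layout_ok(unsigned long long data_offset,
                      unsigned long long dev_sectors,
                      unsigned long long sb_start,
                      unsigned long long sb_sectors)
{
        if (data_offset < sb_start)             /* data before superblock */
                return data_offset + dev_sectors <= sb_start;
        return sb_start + sb_sectors <= data_offset;  /* sb before data */
}

int main(void)
{
        printf("%d\n", layout_ok(0, 1000, 1024, 8));    /* fits: 1 */
        printf("%d\n", layout_ok(0, 2048, 1024, 8));    /* overlaps: 0 */
        return 0;
}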
*/ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; - mdk_rdev_t *rdev2; - struct list_head *tmp2; + struct md_rdev *rdev2; int warned = 0; - rdev_for_each(rdev, tmp, mddev) { - rdev_for_each(rdev2, tmp2, mddev) { + + rdev_for_each(rdev, mddev) + rdev_for_each(rdev2, mddev) { if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { @@ -3652,7 +5057,7 @@ static int do_md_run(mddev_t * mddev) warned = 1; } } - } + if (warned) printk(KERN_WARNING "True protection against single-disk" @@ -3660,17 +5065,29 @@ static int do_md_run(mddev_t * mddev) } mddev->recovery = 0; - mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ - mddev->barriers_work = 1; + /* may be over-ridden by personality */ + mddev->resync_max_sectors = mddev->dev_sectors; + mddev->ok_start_degraded = start_dirty_degraded; - if (start_readonly) + if (start_readonly && mddev->ro == 0) mddev->ro = 2; /* read-only, but switch on first write */ err = mddev->pers->run(mddev); if (err) printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->sync_request) { + else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + WARN_ONCE(!mddev->external_size, "%s: default size too small," + " but 'external_size' not in effect?\n", __func__); + printk(KERN_ERR + "md: invalid array_size %llu > default size %llu\n", + (unsigned long long)mddev->array_sectors / 2, + (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + err = -EINVAL; + mddev->pers->stop(mddev); + } + if (err == 0 && mddev->pers->sync_request && + (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { err = bitmap_create(mddev); if (err) { printk(KERN_ERR "%s: failed to create bitmap (%d)\n", @@ -3685,87 +5102,68 @@ static int do_md_run(mddev_t * mddev) return err; } if (mddev->pers->sync_request) { - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_redundancy_group)) printk(KERN_WARNING "md: cannot register extra attributes for %s\n", mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; atomic_set(&mddev->writes_pending,0); + atomic_set(&mddev->max_corr_read_errors, + MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; mddev->safemode_timer.function = md_safemode_timeout; mddev->safemode_timer.data = (unsigned long) mddev; mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; - - rdev_for_each(rdev, tmp, mddev) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk("md: cannot register %s for %s\n", - nm, mdname(mddev)); - } + smp_wmb(); + mddev->ready = 1; + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (mddev->flags) + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - set_capacity(disk, mddev->array_sectors); + md_new_event(mddev); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); + sysfs_notify(&mddev->kobj, NULL, "degraded"); + return 0; +} +EXPORT_SYMBOL_GPL(md_run); - /* If we call blk_queue_make_request here, it will - * re-initialise max_sectors etc which may have been - * refined inside -> run. So just set the bits we need to set. 
- * Most initialisation happended when we called - * blk_queue_make_request(..., md_fail_request) - * earlier. - */ - mddev->queue->queuedata = mddev; - mddev->queue->make_request_fn = mddev->pers->make_request; +static int do_md_run(struct mddev *mddev) +{ + int err; - /* If there is a partially-recovered drive we need to - * start recovery here. If we leave it to md_check_recovery, - * it will remove the drives and not do the right thing - */ - if (mddev->degraded && !mddev->sync_thread) { - struct list_head *rtmp; - int spares = 0; - rdev_for_each(rdev, rtmp, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) - /* complete an interrupted recovery */ - spares++; - if (spares && mddev->pers->sync_request) { - mddev->recovery = 0; - set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "%s_resync"); - if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); - /* leave the spares where they are, it shouldn't hurt */ - mddev->recovery = 0; - } - } + err = md_run(mddev); + if (err) + goto out; + err = bitmap_load(mddev); + if (err) { + bitmap_destroy(mddev); + goto out; } + md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); mddev->changed = 1; - md_new_event(mddev); - sysfs_notify(&mddev->kobj, NULL, "array_state"); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); - sysfs_notify(&mddev->kobj, NULL, "degraded"); - kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE); - return 0; + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); +out: + return err; } -static int restart_array(mddev_t *mddev) +static int restart_array(struct mddev *mddev) { struct gendisk *disk = mddev->gendisk; @@ -3785,182 +5183,252 @@ static int restart_array(mddev_t *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); - sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } -/* similar to deny_write_access, but accounts for our holding a reference - * to the file ourselves */ -static int deny_bitmap_write_access(struct file * file) +static void md_clean(struct mddev *mddev) { - struct inode *inode = file->f_mapping->host; + mddev->array_sectors = 0; + mddev->external_size = 0; + mddev->dev_sectors = 0; + mddev->raid_disks = 0; + mddev->recovery_cp = 0; + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + mddev->reshape_position = MaxSector; + mddev->external = 0; + mddev->persistent = 0; + mddev->level = LEVEL_NONE; + mddev->clevel[0] = 0; + mddev->flags = 0; + mddev->ro = 0; + mddev->metadata_type[0] = 0; + mddev->chunk_sectors = 0; + mddev->ctime = mddev->utime = 0; + mddev->layout = 0; + mddev->max_disks = 0; + mddev->events = 0; + mddev->can_decrease_events = 0; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->new_level = LEVEL_NONE; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + mddev->curr_resync = 0; + atomic64_set(&mddev->resync_mismatches, 0); + mddev->suspend_lo = mddev->suspend_hi = 0; + mddev->sync_speed_min = mddev->sync_speed_max = 0; + mddev->recovery = 0; + mddev->in_sync = 0; + mddev->changed = 0; + mddev->degraded = 0; + mddev->safemode = 0; + mddev->merge_check_needed = 0; + mddev->bitmap_info.offset = 0; + 
mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.default_space = 0; + mddev->bitmap_info.chunksize = 0; + mddev->bitmap_info.daemon_sleep = 0; + mddev->bitmap_info.max_write_behind = 0; +} - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_writecount) > 1) { - spin_unlock(&inode->i_lock); - return -ETXTBSY; +static void __md_stop_writes(struct mddev *mddev) +{ + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); } - atomic_set(&inode->i_writecount, -1); - spin_unlock(&inode->i_lock); - return 0; + del_timer_sync(&mddev->safemode_timer); + + bitmap_flush(mddev); + md_super_wait(mddev); + + if (mddev->ro == 0 && + (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { + /* mark array as shutdown cleanly */ + mddev->in_sync = 1; + md_update_sb(mddev, 1); + } +} + +void md_stop_writes(struct mddev *mddev) +{ + mddev_lock_nointr(mddev); + __md_stop_writes(mddev); + mddev_unlock(mddev); } +EXPORT_SYMBOL_GPL(md_stop_writes); -static void restore_bitmap_write_access(struct file *file) +static void __md_stop(struct mddev *mddev) { - struct inode *inode = file->f_mapping->host; + mddev->ready = 0; + mddev->pers->stop(mddev); + if (mddev->pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(mddev->pers->owner); + mddev->pers = NULL; + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); +} - spin_lock(&inode->i_lock); - atomic_set(&inode->i_writecount, 1); - spin_unlock(&inode->i_lock); +void md_stop(struct mddev *mddev) +{ + /* stop the array and free an attached data structures. + * This is called from dm-raid + */ + __md_stop(mddev); + bitmap_destroy(mddev); + if (mddev->bio_set) + bioset_free(mddev->bio_set); +} + +EXPORT_SYMBOL_GPL(md_stop); + +static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) +{ + int err = 0; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); + + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > !!bdev || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { + printk("md: %s still in use.\n",mdname(mddev)); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + err = -EBUSY; + goto out; + } + if (mddev->pers) { + __md_stop_writes(mddev); + + err = -ENXIO; + if (mddev->ro==1) + goto out; + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_state); + err = 0; + } +out: + mutex_unlock(&mddev->open_mutex); + return err; } /* mode: * 0 - completely stop and dis-assemble array - * 1 - switch to readonly * 2 - stop but do not disassemble array */ -static int do_md_stop(mddev_t * mddev, int mode, int is_open) +static int do_md_stop(struct mddev * mddev, int mode, + struct block_device *bdev) { - int err = 0; struct gendisk *disk = mddev->gendisk; + struct md_rdev *rdev; + int did_freeze = 0; - if (atomic_read(&mddev->openers) > is_open) { + if 
(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); + + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > !!bdev || + mddev->sysfs_active || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); + mutex_unlock(&mddev->open_mutex); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } return -EBUSY; } - if (mddev->pers) { + if (mddev->ro) + set_disk_ro(disk, 0); - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - } - - del_timer_sync(&mddev->safemode_timer); - - switch(mode) { - case 1: /* readonly */ - err = -ENXIO; - if (mddev->ro==1) - goto out; - mddev->ro = 1; - break; - case 0: /* disassemble */ - case 2: /* stop */ - bitmap_flush(mddev); - md_super_wait(mddev); - if (mddev->ro) - set_disk_ro(disk, 0); - blk_queue_make_request(mddev->queue, md_fail_request); - mddev->pers->stop(mddev); - mddev->queue->merge_bvec_fn = NULL; - mddev->queue->unplug_fn = NULL; - mddev->queue->backing_dev_info.congested_fn = NULL; - if (mddev->pers->sync_request) - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + __md_stop_writes(mddev); + __md_stop(mddev); + mddev->queue->merge_bvec_fn = NULL; + mddev->queue->backing_dev_info.congested_fn = NULL; - module_put(mddev->pers->owner); - mddev->pers = NULL; - /* tell userspace to handle 'inactive' */ - sysfs_notify(&mddev->kobj, NULL, "array_state"); + /* tell userspace to handle 'inactive' */ + sysfs_notify_dirent_safe(mddev->sysfs_state); - set_capacity(disk, 0); - mddev->changed = 1; + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + sysfs_unlink_rdev(mddev, rdev); - if (mddev->ro) - mddev->ro = 0; - } - if (!mddev->in_sync || mddev->flags) { - /* mark array as shutdown cleanly */ - mddev->in_sync = 1; - md_update_sb(mddev, 1); - } - if (mode == 1) - set_disk_ro(disk, 1); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } + set_capacity(disk, 0); + mutex_unlock(&mddev->open_mutex); + mddev->changed = 1; + revalidate_disk(disk); + if (mddev->ro) + mddev->ro = 0; + } else + mutex_unlock(&mddev->open_mutex); /* * Free resources if final stop */ if (mode == 0) { - mdk_rdev_t *rdev; - struct list_head *tmp; - printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); - if (mddev->bitmap_file) { - restore_bitmap_write_access(mddev->bitmap_file); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; + if (mddev->bitmap_info.file) { + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; } - mddev->bitmap_offset = 0; - - rdev_for_each(rdev, tmp, mddev) - if (rdev->raid_disk >= 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } - - /* make sure all md_delayed_delete calls have finished */ - flush_scheduled_work(); + mddev->bitmap_info.offset = 0; export_array(mddev); - mddev->array_sectors = 0; - mddev->size = 0; - mddev->raid_disks = 0; - 
mddev->recovery_cp = 0; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; - mddev->reshape_position = MaxSector; - mddev->external = 0; - mddev->persistent = 0; - mddev->level = LEVEL_NONE; - mddev->clevel[0] = 0; - mddev->flags = 0; - mddev->ro = 0; - mddev->metadata_type[0] = 0; - mddev->chunk_size = 0; - mddev->ctime = mddev->utime = 0; - mddev->layout = 0; - mddev->max_disks = 0; - mddev->events = 0; - mddev->delta_disks = 0; - mddev->new_level = LEVEL_NONE; - mddev->new_layout = 0; - mddev->new_chunk = 0; - mddev->curr_resync = 0; - mddev->resync_mismatches = 0; - mddev->suspend_lo = mddev->suspend_hi = 0; - mddev->sync_speed_min = mddev->sync_speed_max = 0; - mddev->recovery = 0; - mddev->in_sync = 0; - mddev->changed = 0; - mddev->degraded = 0; - mddev->barriers_work = 0; - mddev->safemode = 0; - - } else if (mddev->pers) - printk(KERN_INFO "md: %s switched to read-only mode.\n", - mdname(mddev)); - err = 0; + md_clean(mddev); + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); + if (mddev->hold_active == UNTIL_STOP) + mddev->hold_active = 0; + } + blk_integrity_unregister(disk); md_new_event(mddev); - sysfs_notify(&mddev->kobj, NULL, "array_state"); -out: - return err; + sysfs_notify_dirent_safe(mddev->sysfs_state); + return 0; } #ifndef MODULE -static void autorun_array(mddev_t *mddev) +static void autorun_array(struct mddev *mddev) { - mdk_rdev_t *rdev; - struct list_head *tmp; + struct md_rdev *rdev; int err; if (list_empty(&mddev->disks)) @@ -3968,16 +5436,16 @@ static void autorun_array(mddev_t *mddev) printk(KERN_INFO "md: running: "); - rdev_for_each(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; printk("<%s>", bdevname(rdev->bdev,b)); } printk("\n"); - err = do_md_run (mddev); + err = do_md_run(mddev); if (err) { printk(KERN_WARNING "md: do_md_run() returned %d\n", err); - do_md_stop (mddev, 0, 0); + do_md_stop(mddev, 0, NULL); } } @@ -3995,9 +5463,8 @@ static void autorun_array(mddev_t *mddev) */ static void autorun_devices(int part) { - struct list_head *tmp; - mdk_rdev_t *rdev0, *rdev; - mddev_t *mddev; + struct md_rdev *rdev0, *rdev, *tmp; + struct mddev *mddev; char b[BDEVNAME_SIZE]; printk(KERN_INFO "md: autorun ...\n"); @@ -4006,12 +5473,12 @@ static void autorun_devices(int part) dev_t dev; LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, - mdk_rdev_t, same_set); + struct md_rdev, same_set); printk(KERN_INFO "md: considering %s ...\n", bdevname(rdev0->bdev,b)); INIT_LIST_HEAD(&candidates); - rdev_for_each_list(rdev, tmp, pending_raid_disks) + rdev_for_each_list(rdev, tmp, &pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { printk(KERN_INFO "md: adding %s ...\n", bdevname(rdev->bdev,b)); @@ -4057,7 +5524,7 @@ static void autorun_devices(int part) } else { printk(KERN_INFO "md: created %s\n", mdname(mddev)); mddev->persistent = 1; - rdev_for_each_list(rdev, tmp, candidates) { + rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) export_rdev(rdev); @@ -4068,7 +5535,7 @@ static void autorun_devices(int part) /* on success, candidates will be empty, on error * it won't... 
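do_md_stop() above now tolerates exactly one opener when invoked from the ioctl path (the descriptor issuing the request) and otherwise refuses with "still in use". The classic way to reach it from userspace is the STOP_ARRAY ioctl; a sketch (md0 assumed, the array must be unmounted and otherwise unused):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>

int main(void)
{
        /* O_EXCL fails early if the device is still claimed elsewhere */
        int fd = open("/dev/md0", O_RDONLY | O_EXCL);

        if (fd < 0 || ioctl(fd, STOP_ARRAY, NULL) < 0) {
                perror("STOP_ARRAY");
                return 1;
        }
        close(fd);
        return 0;
}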
*/ - rdev_for_each_list(rdev, tmp, candidates) { + rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); export_rdev(rdev); } @@ -4092,34 +5559,35 @@ static int get_version(void __user * arg) return 0; } -static int get_array_info(mddev_t * mddev, void __user * arg) +static int get_array_info(struct mddev * mddev, void __user * arg) { mdu_array_info_t info; - int nr,working,active,failed,spare; - mdk_rdev_t *rdev; - struct list_head *tmp; + int nr,working,insync,failed,spare; + struct md_rdev *rdev; - nr=working=active=failed=spare=0; - rdev_for_each(rdev, tmp, mddev) { + nr = working = insync = failed = spare = 0; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; else { working++; if (test_bit(In_sync, &rdev->flags)) - active++; + insync++; else spare++; } } + rcu_read_unlock(); info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; info.patch_version = MD_PATCHLEVEL_VERSION; info.ctime = mddev->ctime; info.level = mddev->level; - info.size = mddev->size; - if (info.size != mddev->size) /* overflow */ + info.size = mddev->dev_sectors / 2; + if (info.size != mddev->dev_sectors / 2) /* overflow */ info.size = -1; info.nr_disks = nr; info.raid_disks = mddev->raid_disks; @@ -4130,15 +5598,15 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.state = 0; if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); - if (mddev->bitmap && mddev->bitmap_offset) - info.state = (1<<MD_SB_BITMAP_PRESENT); - info.active_disks = active; + if (mddev->bitmap && mddev->bitmap_info.offset) + info.state |= (1<<MD_SB_BITMAP_PRESENT); + info.active_disks = insync; info.working_disks = working; info.failed_disks = failed; info.spare_disks = spare; info.layout = mddev->layout; - info.chunk_size = mddev->chunk_size; + info.chunk_size = mddev->chunk_sectors << 9; if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; @@ -4146,22 +5614,19 @@ static int get_array_info(mddev_t * mddev, void __user * arg) return 0; } -static int get_bitmap_file(mddev_t * mddev, void __user * arg) +static int get_bitmap_file(struct mddev * mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ char *ptr, *buf = NULL; int err = -ENOMEM; - if (md_allow_write(mddev)) - file = kmalloc(sizeof(*file), GFP_NOIO); - else - file = kmalloc(sizeof(*file), GFP_KERNEL); + file = kmalloc(sizeof(*file), GFP_NOIO); if (!file) goto out; /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap || !mddev->bitmap->file) { + if (!mddev->bitmap || !mddev->bitmap->storage.file) { file->pathname[0] = '\0'; goto copy_out; } @@ -4170,7 +5635,8 @@ static int get_bitmap_file(mddev_t * mddev, void __user * arg) if (!buf) goto out; - ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); + ptr = d_path(&mddev->bitmap->storage.file->f_path, + buf, sizeof(file->pathname)); if (IS_ERR(ptr)) goto out; @@ -4186,15 +5652,16 @@ out: return err; } -static int get_disk_info(mddev_t * mddev, void __user * arg) +static int get_disk_info(struct mddev * mddev, void __user * arg) { mdu_disk_info_t info; - mdk_rdev_t *rdev; + struct md_rdev *rdev; if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; - rdev = find_rdev_nr(mddev, info.number); + rcu_read_lock(); + rdev = find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -4213,6 +5680,7 @@ static int get_disk_info(mddev_t * mddev, void __user 
* arg) info.raid_disk = -1; info.state = (1<<MD_DISK_REMOVED); } + rcu_read_unlock(); if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; @@ -4220,10 +5688,10 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) return 0; } -static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) +static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) { char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; - mdk_rdev_t *rdev; + struct md_rdev *rdev; dev_t dev = MKDEV(info->major,info->minor); if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) @@ -4240,9 +5708,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) return PTR_ERR(rdev); } if (!list_empty(&mddev->disks)) { - mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, - mdk_rdev_t, same_set); - int err = super_types[mddev->major_version] + struct md_rdev *rdev0 + = list_entry(mddev->disks.next, + struct md_rdev, same_set); + err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { printk(KERN_WARNING @@ -4283,21 +5752,33 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) PTR_ERR(rdev)); return PTR_ERR(rdev); } - /* set save_raid_disk if appropriate */ + /* set saved_raid_disk if appropriate */ if (!mddev->persistent) { if (info->state & (1<<MD_DISK_SYNC) && - info->raid_disk < mddev->raid_disks) + info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - else + set_bit(In_sync, &rdev->flags); + clear_bit(Bitmap_sync, &rdev->flags); + } else rdev->raid_disk = -1; + rdev->saved_raid_disk = rdev->raid_disk; } else super_types[mddev->major_version]. validate_super(mddev, rdev); - rdev->saved_raid_disk = rdev->raid_disk; + if ((info->state & (1<<MD_DISK_SYNC)) && + rdev->raid_disk != info->raid_disk) { + /* This was a hot-add request, but events doesn't + * match, so reject it. 
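get_array_info() above now walks the rdev list under RCU and counts in-sync members rather than "active" ones. The same data reaches userspace through the GET_ARRAY_INFO ioctl; a sketch (md0 assumed):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>

int main(void)
{
        mdu_array_info_t info;
        int fd = open("/dev/md0", O_RDONLY);

        if (fd < 0 || ioctl(fd, GET_ARRAY_INFO, &info) < 0) {
                perror("GET_ARRAY_INFO");
                return 1;
        }
        printf("level %d, raid_disks %d, active %d, failed %d, spare %d\n",
               info.level, info.raid_disks, info.active_disks,
               info.failed_disks, info.spare_disks);
        close(fd);
        return 0;
}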
+ */ + export_rdev(rdev); + return -EINVAL; + } clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); + else + clear_bit(WriteMostly, &rdev->flags); rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); @@ -4315,12 +5796,14 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (err) export_rdev(rdev); else - sysfs_notify(&rdev->kobj, NULL, "state"); + sysfs_notify_dirent_safe(rdev->sysfs_state); - md_update_sb(mddev, 1); + set_bit(MD_CHANGE_DEVS, &mddev->flags); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (!err) + md_new_event(mddev); md_wakeup_thread(mddev->thread); return err; } @@ -4336,7 +5819,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (!(info->state & (1<<MD_DISK_FAULTY))) { int err; - rdev = md_import_device (dev, -1, 0); + rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", @@ -4358,10 +5841,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (!mddev->persistent) { printk(KERN_INFO "md: nonpersistent superblock ...\n"); - rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; - } else - rdev->sb_start = calc_dev_sboffset(rdev->bdev); - rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; + rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; + } else + rdev->sb_start = calc_dev_sboffset(rdev); + rdev->sectors = rdev->sb_start; err = bind_rdev_to_array(rdev, mddev); if (err) { @@ -4373,15 +5856,18 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) return 0; } -static int hot_remove_disk(mddev_t * mddev, dev_t dev) +static int hot_remove_disk(struct mddev * mddev, dev_t dev) { char b[BDEVNAME_SIZE]; - mdk_rdev_t *rdev; + struct md_rdev *rdev; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(mddev, rdev); + if (rdev->raid_disk >= 0) goto busy; @@ -4396,11 +5882,11 @@ busy: return -EBUSY; } -static int hot_add_disk(mddev_t * mddev, dev_t dev) +static int hot_add_disk(struct mddev * mddev, dev_t dev) { char b[BDEVNAME_SIZE]; int err; - mdk_rdev_t *rdev; + struct md_rdev *rdev; if (!mddev->pers) return -ENODEV; @@ -4418,7 +5904,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) return -EINVAL; } - rdev = md_import_device (dev, -1, 0); + rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", @@ -4427,11 +5913,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) } if (mddev->persistent) - rdev->sb_start = calc_dev_sboffset(rdev->bdev); + rdev->sb_start = calc_dev_sboffset(rdev); else - rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; + rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; - rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; + rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING @@ -4452,13 +5938,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) * noticed in interrupt contexts ... 
*/ - if (rdev->desc_nr == mddev->max_disks) { - printk(KERN_WARNING "%s: can not hot-add to full array!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unbind_export; - } - rdev->raid_disk = -1; md_update_sb(mddev, 1); @@ -4472,17 +5951,14 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) md_new_event(mddev); return 0; -abort_unbind_export: - unbind_rdev_from_array(rdev); - abort_export: export_rdev(rdev); return err; } -static int set_bitmap_file(mddev_t *mddev, int fd) +static int set_bitmap_file(struct mddev *mddev, int fd) { - int err; + int err = 0; if (mddev->pers) { if (!mddev->pers->quiesce) @@ -4494,32 +5970,47 @@ static int set_bitmap_file(mddev_t *mddev, int fd) if (fd >= 0) { + struct inode *inode; if (mddev->bitmap) return -EEXIST; /* cannot add when bitmap is present */ - mddev->bitmap_file = fget(fd); + mddev->bitmap_info.file = fget(fd); - if (mddev->bitmap_file == NULL) { + if (mddev->bitmap_info.file == NULL) { printk(KERN_ERR "%s: error: failed to get bitmap file\n", mdname(mddev)); return -EBADF; } - err = deny_bitmap_write_access(mddev->bitmap_file); - if (err) { + inode = mddev->bitmap_info.file->f_mapping->host; + if (!S_ISREG(inode->i_mode)) { + printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", + mdname(mddev)); + err = -EBADF; + } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) { + printk(KERN_ERR "%s: error: bitmap file must open for write\n", + mdname(mddev)); + err = -EBADF; + } else if (atomic_read(&inode->i_writecount) != 1) { printk(KERN_ERR "%s: error: bitmap file is already in use\n", mdname(mddev)); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; + err = -EBUSY; + } + if (err) { + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; return err; } - mddev->bitmap_offset = 0; /* file overrides offset */ + mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { mddev->pers->quiesce(mddev, 1); - if (fd >= 0) + if (fd >= 0) { err = bitmap_create(mddev); + if (!err) + err = bitmap_load(mddev); + } if (fd < 0 || err) { bitmap_destroy(mddev); fd = -1; /* make sure to put the file */ @@ -4527,11 +6018,9 @@ static int set_bitmap_file(mddev_t *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_file) { - restore_bitmap_write_access(mddev->bitmap_file); - fput(mddev->bitmap_file); - } - mddev->bitmap_file = NULL; + if (mddev->bitmap_info.file) + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; } return err; @@ -4550,7 +6039,7 @@ static int set_bitmap_file(mddev_t *mddev, int fd) * The minor and patch _version numbers are also kept incase the * super_block handler wishes to interpret them. */ -static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) +static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) { if (info->raid_disks == 0) { @@ -4568,6 +6057,10 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->minor_version = info->minor_version; mddev->patch_version = info->patch_version; mddev->persistent = !info->not_persistent; + /* ensure mddev_put doesn't delete this now that there + * is some minimal configuration. 
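set_bitmap_file() above replaces deny_bitmap_write_access() with explicit checks: the bitmap must be a regular file, opened for writing, and not open for writing anywhere else. Only the call convention is sketched below; md0 and the bitmap path are assumptions, and in practice mdadm prepares and formats the bitmap file before handing its descriptor to the kernel:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>

int main(void)
{
        int md = open("/dev/md0", O_RDONLY);
        /* regular file, writable, not open for write elsewhere */
        int bm = open("/var/lib/md/md0-bitmap", O_RDWR);

        if (md < 0 || bm < 0 || ioctl(md, SET_BITMAP_FILE, bm) < 0) {
                perror("SET_BITMAP_FILE");
                return 1;
        }
        close(bm);
        close(md);
        return 0;
}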
+ */ + mddev->ctime = get_seconds(); return 0; } mddev->major_version = MD_MAJOR_VERSION; @@ -4577,7 +6070,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->level = info->level; mddev->clevel[0] = 0; - mddev->size = info->size; + mddev->dev_sectors = 2 * (sector_t)info->size; mddev->raid_disks = info->raid_disks; /* don't set md_minor, it is determined by which /dev/md* was * openned @@ -4590,7 +6083,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->external = 0; mddev->layout = info->layout; - mddev->chunk_size = info->chunk_size; + mddev->chunk_sectors = info->chunk_size >> 9; mddev->max_disks = MD_SB_DISKS; @@ -4598,8 +6091,9 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->flags = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); - mddev->default_bitmap_offset = MD_SB_BYTES >> 9; - mddev->bitmap_offset = 0; + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); + mddev->bitmap_info.offset = 0; mddev->reshape_position = MaxSector; @@ -4609,18 +6103,29 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) get_random_bytes(mddev->uuid, 16); mddev->new_level = mddev->level; - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->delta_disks = 0; + mddev->reshape_backwards = 0; return 0; } -static int update_size(mddev_t *mddev, sector_t num_sectors) +void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { - mdk_rdev_t * rdev; + WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + + if (mddev->external_size) + return; + + mddev->array_sectors = array_sectors; +} +EXPORT_SYMBOL(md_set_array_sectors); + +static int update_size(struct mddev *mddev, sector_t num_sectors) +{ + struct md_rdev *rdev; int rv; - struct list_head *tmp; int fit = (num_sectors == 0); if (mddev->pers->resize == NULL) @@ -4633,18 +6138,14 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) * sb_start or, if that is <data_offset, it must fit before the size * of each device. If num_sectors is zero, we find the largest size * that fits. - */ if (mddev->sync_thread) return -EBUSY; - if (mddev->bitmap) - /* Sorry, cannot grow a bitmap yet, just remove it, - * grow, and re-add. 
- */ - return -EBUSY; - rdev_for_each(rdev, tmp, mddev) { - sector_t avail; - avail = rdev->size * 2; + if (mddev->ro) + return -EROFS; + + rdev_for_each(rdev, mddev) { + sector_t avail = rdev->sectors; if (fit && (num_sectors == 0 || num_sectors > avail)) num_sectors = avail; @@ -4652,35 +6153,46 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) { - struct block_device *bdev; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } - } + if (!rv) + revalidate_disk(mddev->gendisk); return rv; } -static int update_raid_disks(mddev_t *mddev, int raid_disks) +static int update_raid_disks(struct mddev *mddev, int raid_disks) { int rv; + struct md_rdev *rdev; /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; + if (mddev->ro) + return -EROFS; if (raid_disks <= 0 || - raid_disks >= mddev->max_disks) + (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; + + rdev_for_each(rdev, mddev) { + if (mddev->raid_disks < raid_disks && + rdev->data_offset < rdev->new_data_offset) + return -EINVAL; + if (mddev->raid_disks > raid_disks && + rdev->data_offset > rdev->new_data_offset) + return -EINVAL; + } + mddev->delta_disks = raid_disks - mddev->raid_disks; + if (mddev->delta_disks < 0) + mddev->reshape_backwards = 1; + else if (mddev->delta_disks > 0) + mddev->reshape_backwards = 0; rv = mddev->pers->check_reshape(mddev); + if (rv < 0) { + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + } return rv; } @@ -4693,14 +6205,14 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) * Any differences that cannot be handled will cause an error. * Normally, only one change can be managed at a time. 
*/ -static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) +static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) { int rv = 0; int cnt = 0; int state = 0; /* calculate expected state,ignoring low bits */ - if (mddev->bitmap && mddev->bitmap_offset) + if (mddev->bitmap && mddev->bitmap_info.offset) state |= (1 << MD_SB_BITMAP_PRESENT); if (mddev->major_version != info->major_version || @@ -4710,30 +6222,41 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->level != info->level || /* mddev->layout != info->layout || */ !mddev->persistent != info->not_persistent|| - mddev->chunk_size != info->chunk_size || + mddev->chunk_sectors != info->chunk_size >> 9 || /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ ((state^info->state) & 0xfffffe00) ) return -EINVAL; /* Check there is only one change */ - if (info->size >= 0 && mddev->size != info->size) cnt++; - if (mddev->raid_disks != info->raid_disks) cnt++; - if (mddev->layout != info->layout) cnt++; - if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; - if (cnt == 0) return 0; - if (cnt > 1) return -EINVAL; + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) + cnt++; + if (mddev->raid_disks != info->raid_disks) + cnt++; + if (mddev->layout != info->layout) + cnt++; + if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) + cnt++; + if (cnt == 0) + return 0; + if (cnt > 1) + return -EINVAL; if (mddev->layout != info->layout) { /* Change layout * we don't need to do anything at the md level, the * personality will take care of it all. */ - if (mddev->pers->reconfig == NULL) + if (mddev->pers->check_reshape == NULL) return -EINVAL; - else - return mddev->pers->reconfig(mddev, info->layout, -1); + else { + mddev->new_layout = info->layout; + rv = mddev->pers->check_reshape(mddev); + if (rv) + mddev->new_layout = mddev->layout; + return rv; + } } - if (info->size >= 0 && mddev->size != info->size) + if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); if (mddev->raid_disks != info->raid_disks) @@ -4748,11 +6271,16 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) /* add the bitmap */ if (mddev->bitmap) return -EEXIST; - if (mddev->default_bitmap_offset == 0) + if (mddev->bitmap_info.default_offset == 0) return -EINVAL; - mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; + mddev->bitmap_info.space = + mddev->bitmap_info.default_space; mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); + if (!rv) + rv = bitmap_load(mddev); if (rv) bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); @@ -4760,31 +6288,37 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) /* remove the bitmap */ if (!mddev->bitmap) return -ENOENT; - if (mddev->bitmap->file) + if (mddev->bitmap->storage.file) return -EINVAL; mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); - mddev->bitmap_offset = 0; + mddev->bitmap_info.offset = 0; } } md_update_sb(mddev, 1); return rv; } -static int set_disk_faulty(mddev_t *mddev, dev_t dev) +static int set_disk_faulty(struct mddev *mddev, dev_t dev) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; + int err = 0; if (mddev->pers == NULL) return -ENODEV; - rdev = find_rdev(mddev, dev); + rcu_read_lock(); + rdev = find_rdev_rcu(mddev, dev); if (!rdev) - return -ENODEV; - - md_error(mddev, rdev); - return 0; + err = -ENODEV; + else { + 
md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + err = -EBUSY; + } + rcu_read_unlock(); + return err; } /* @@ -4795,59 +6329,140 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev) */ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) { - mddev_t *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = bdev->bd_disk->private_data; geo->heads = 2; geo->sectors = 4; - geo->cylinders = get_capacity(mddev->gendisk) / 8; + geo->cylinders = mddev->array_sectors / 8; return 0; } -static int md_ioctl(struct inode *inode, struct file *file, +static inline bool md_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case ADD_NEW_DISK: + case BLKROSET: + case GET_ARRAY_INFO: + case GET_BITMAP_FILE: + case GET_DISK_INFO: + case HOT_ADD_DISK: + case HOT_REMOVE_DISK: + case PRINT_RAID_DEBUG: + case RAID_AUTORUN: + case RAID_VERSION: + case RESTART_ARRAY_RW: + case RUN_ARRAY: + case SET_ARRAY_INFO: + case SET_BITMAP_FILE: + case SET_DISK_FAULTY: + case STOP_ARRAY: + case STOP_ARRAY_RO: + return true; + default: + return false; + } +} + +static int md_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { int err = 0; void __user *argp = (void __user *)arg; - mddev_t *mddev = NULL; + struct mddev *mddev = NULL; + int ro; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + if (!md_ioctl_valid(cmd)) + return -ENOTTY; + + switch (cmd) { + case RAID_VERSION: + case GET_ARRAY_INFO: + case GET_DISK_INFO: + break; + default: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + } /* * Commands dealing with the RAID driver but not any * particular array: */ - switch (cmd) - { - case RAID_VERSION: - err = get_version(argp); - goto done; + switch (cmd) { + case RAID_VERSION: + err = get_version(argp); + goto done; - case PRINT_RAID_DEBUG: - err = 0; - md_print_devices(); - goto done; + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done; #ifndef MODULE - case RAID_AUTORUN: - err = 0; - autostart_arrays(arg); - goto done; + case RAID_AUTORUN: + err = 0; + autostart_arrays(arg); + goto done; #endif - default:; + default:; } /* * Commands creating/starting a new array: */ - mddev = inode->i_bdev->bd_disk->private_data; + mddev = bdev->bd_disk->private_data; if (!mddev) { BUG(); goto abort; } + /* Some actions do not requires the mutex */ + switch (cmd) { + case GET_ARRAY_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_array_info(mddev, argp); + goto abort; + + case GET_DISK_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_disk_info(mddev, argp); + goto abort; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, new_decode_dev(arg)); + goto abort; + } + + if (cmd == ADD_NEW_DISK) + /* need to ensure md_delayed_delete() has completed */ + flush_workqueue(md_misc_wq); + + if (cmd == HOT_REMOVE_DISK) + /* need to ensure recovery thread has run */ + wait_event_interruptible_timeout(mddev->sb_wait, + !test_bit(MD_RECOVERY_NEEDED, + &mddev->flags), + msecs_to_jiffies(5000)); + if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { + /* Need to flush page cache, and ensure no-one else opens + * and writes + */ + mutex_lock(&mddev->open_mutex); + if (atomic_read(&mddev->openers) > 1) { + mutex_unlock(&mddev->open_mutex); + err = -EBUSY; + goto abort; + } + set_bit(MD_STILL_CLOSED, &mddev->flags); + mutex_unlock(&mddev->open_mutex); + sync_blockdev(bdev); + } err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -4856,50 +6471,44 @@ static int md_ioctl(struct 
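The md_ioctl() rework above whitelists commands in md_ioctl_valid(), lets RAID_VERSION/GET_ARRAY_INFO/GET_DISK_INFO through without CAP_SYS_ADMIN, and answers GET_ARRAY_INFO, GET_DISK_INFO and SET_DISK_FAULTY without taking the reconfig mutex. Purely as a usage illustration, a small userspace probe exercising GET_ARRAY_INFO; it assumes the uapi headers linux/major.h and linux/raid/md_u.h are installed:

/* build: cc -o mdinfo mdinfo.c ; run: ./mdinfo /dev/md0 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/major.h>
#include <linux/raid/md_u.h>

int main(int argc, char **argv)
{
	mdu_array_info_t info;
	const char *dev = argc > 1 ? argv[1] : "/dev/md0";
	int fd = open(dev, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* one of the commands md now answers without CAP_SYS_ADMIN and
	 * without taking the reconfig mutex */
	if (ioctl(fd, GET_ARRAY_INFO, &info) < 0) {
		perror("GET_ARRAY_INFO");
		close(fd);
		return 1;
	}
	printf("%s: level=%d raid_disks=%d active=%d chunk=%dK\n",
	       dev, info.level, info.raid_disks, info.active_disks,
	       info.chunk_size / 1024);
	close(fd);
	return 0;
}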
inode *inode, struct file *file, goto abort; } - switch (cmd) - { - case SET_ARRAY_INFO: - { - mdu_array_info_t info; - if (!arg) - memset(&info, 0, sizeof(info)); - else if (copy_from_user(&info, argp, sizeof(info))) { - err = -EFAULT; - goto abort_unlock; - } - if (mddev->pers) { - err = update_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't update" - " array info. %d\n", err); - goto abort_unlock; - } - goto done_unlock; - } - if (!list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: array %s already has disks!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - if (mddev->raid_disks) { - printk(KERN_WARNING - "md: array %s already initialised!\n", - mdname(mddev)); - err = -EBUSY; - goto abort_unlock; - } - err = set_array_info(mddev, &info); - if (err) { - printk(KERN_WARNING "md: couldn't set" - " array info. %d\n", err); - goto abort_unlock; - } + if (cmd == SET_ARRAY_INFO) { + mdu_array_info_t info; + if (!arg) + memset(&info, 0, sizeof(info)); + else if (copy_from_user(&info, argp, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + if (mddev->pers) { + err = update_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't update" + " array info. %d\n", err); + goto abort_unlock; } goto done_unlock; - - default:; + } + if (!list_empty(&mddev->disks)) { + printk(KERN_WARNING + "md: array %s already has disks!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + if (mddev->raid_disks) { + printk(KERN_WARNING + "md: array %s already initialised!\n", + mdname(mddev)); + err = -EBUSY; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk(KERN_WARNING "md: couldn't set" + " array info. %d\n", err); + goto abort_unlock; + } + goto done_unlock; } /* @@ -4918,32 +6527,73 @@ static int md_ioctl(struct inode *inode, struct file *file, /* * Commands even a read-only array can execute: */ - switch (cmd) - { - case GET_ARRAY_INFO: - err = get_array_info(mddev, argp); - goto done_unlock; - - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto done_unlock; - - case GET_DISK_INFO: - err = get_disk_info(mddev, argp); + switch (cmd) { + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop(mddev, 0, bdev); + goto done_unlock; + + case STOP_ARRAY_RO: + err = md_set_readonly(mddev, bdev); + goto done_unlock; + + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, new_decode_dev(arg)); + goto done_unlock; + + case ADD_NEW_DISK: + /* We can support ADD_NEW_DISK on read-only arrays + * on if we are re-adding a preexisting device. + * So require mddev->pers and MD_DISK_SYNC. + */ + if (mddev->pers) { + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else if (!(info.state & (1<<MD_DISK_SYNC))) + /* Need to clear read-only for this */ + break; + else + err = add_new_disk(mddev, &info); goto done_unlock; + } + break; - case RESTART_ARRAY_RW: - err = restart_array(mddev); + case BLKROSET: + if (get_user(ro, (int __user *)(arg))) { + err = -EFAULT; goto done_unlock; + } + err = -EINVAL; - case STOP_ARRAY: - err = do_md_stop (mddev, 0, 1); + /* if the bdev is going readonly the value of mddev->ro + * does not matter, no writes are coming + */ + if (ro) goto done_unlock; - case STOP_ARRAY_RO: - err = do_md_stop (mddev, 1, 1); + /* are we are already prepared for writes? 
*/ + if (mddev->ro != 1) goto done_unlock; + /* transitioning to readauto need only happen for + * arrays that call md_write_start + */ + if (mddev->pers) { + err = restart_array(mddev); + if (err == 0) { + mddev->ro = 2; + set_disk_ro(mddev->gendisk, 0); + } + } + goto done_unlock; } /* @@ -4956,54 +6606,58 @@ static int md_ioctl(struct inode *inode, struct file *file, if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { if (mddev->ro == 2) { mddev->ro = 0; - sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify_dirent_safe(mddev->sysfs_state); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + /* mddev_unlock will wake thread */ + /* If a device failed while we were read-only, we + * need to make sure the metadata is updated now. + */ + if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + mddev_unlock(mddev); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_DEVS, &mddev->flags) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + mddev_lock_nointr(mddev); + } } else { err = -EROFS; goto abort_unlock; } } - switch (cmd) + switch (cmd) { + case ADD_NEW_DISK: { - case ADD_NEW_DISK: - { - mdu_disk_info_t info; - if (copy_from_user(&info, argp, sizeof(info))) - err = -EFAULT; - else - err = add_new_disk(mddev, &info); - goto done_unlock; - } - - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; - - case HOT_ADD_DISK: - err = hot_add_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } - case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto done_unlock; + case HOT_ADD_DISK: + err = hot_add_disk(mddev, new_decode_dev(arg)); + goto done_unlock; - case RUN_ARRAY: - err = do_md_run (mddev); - goto done_unlock; + case RUN_ARRAY: + err = do_md_run(mddev); + goto done_unlock; - case SET_BITMAP_FILE: - err = set_bitmap_file(mddev, (int)arg); - goto done_unlock; + case SET_BITMAP_FILE: + err = set_bitmap_file(mddev, (int)arg); + goto done_unlock; - default: - err = -EINVAL; - goto abort_unlock; + default: + err = -EINVAL; + goto abort_unlock; } done_unlock: abort_unlock: + if (mddev->hold_active == UNTIL_IOCTL && + err != -EINVAL) + mddev->hold_active = 0; mddev_unlock(mddev); return err; @@ -5013,68 +6667,103 @@ done: abort: return err; } +#ifdef CONFIG_COMPAT +static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* These take in integer arg, do not convert */ + break; + default: + arg = (unsigned long)compat_ptr(arg); + break; + } + + return md_ioctl(bdev, mode, cmd, arg); +} +#endif /* CONFIG_COMPAT */ -static int md_open(struct inode *inode, struct file *file) +static int md_open(struct block_device *bdev, fmode_t mode) { /* * Succeed if we can lock the mddev, which confirms that * it isn't being stopped right now. */ - mddev_t *mddev = inode->i_bdev->bd_disk->private_data; + struct mddev *mddev = mddev_find(bdev->bd_dev); int err; - if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) + if (!mddev) + return -ENODEV; + + if (mddev->gendisk != bdev->bd_disk) { + /* we are racing with mddev_put which is discarding this + * bd_disk. 
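The BLKROSET branch above is what a plain "blockdev --setro/--setrw /dev/mdX" ends up hitting: clearing the flag on a read-only array only moves it to read-auto (mddev->ro = 2), so the superblock stays untouched until a write actually arrives. A hypothetical userspace equivalent using the generic block-layer ioctl:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int ro, fd;

	if (argc < 3) {
		fprintf(stderr, "usage: %s /dev/mdX 0|1\n", argv[0]);
		return 1;
	}
	ro = atoi(argv[2]);              /* 1 = read-only, 0 = writable */
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* BLKROSET takes a pointer to int; md's ioctl handler gets first
	 * crack at it before the generic block-layer fallback */
	if (ioctl(fd, BLKROSET, &ro) < 0) {
		perror("BLKROSET");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}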
+ */ + mddev_put(mddev); + /* Wait until bdev->bd_disk is definitely gone */ + flush_workqueue(md_misc_wq); + /* Then retry the open from the top */ + return -ERESTARTSYS; + } + BUG_ON(mddev != bdev->bd_disk->private_data); + + if ((err = mutex_lock_interruptible(&mddev->open_mutex))) goto out; err = 0; - mddev_get(mddev); atomic_inc(&mddev->openers); - mddev_unlock(mddev); + clear_bit(MD_STILL_CLOSED, &mddev->flags); + mutex_unlock(&mddev->open_mutex); - check_disk_change(inode->i_bdev); + check_disk_change(bdev); out: return err; } -static int md_release(struct inode *inode, struct file * file) +static void md_release(struct gendisk *disk, fmode_t mode) { - mddev_t *mddev = inode->i_bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; BUG_ON(!mddev); atomic_dec(&mddev->openers); mddev_put(mddev); - - return 0; } static int md_media_changed(struct gendisk *disk) { - mddev_t *mddev = disk->private_data; + struct mddev *mddev = disk->private_data; return mddev->changed; } static int md_revalidate(struct gendisk *disk) { - mddev_t *mddev = disk->private_data; + struct mddev *mddev = disk->private_data; mddev->changed = 0; return 0; } -static struct block_device_operations md_fops = +static const struct block_device_operations md_fops = { .owner = THIS_MODULE, .open = md_open, .release = md_release, .ioctl = md_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = md_compat_ioctl, +#endif .getgeo = md_getgeo, - .media_changed = md_media_changed, + .media_changed = md_media_changed, .revalidate_disk= md_revalidate, }; static int md_thread(void * arg) { - mdk_thread_t *thread = arg; + struct md_thread *thread = arg; /* * md_thread is a 'system-thread', it's priority should be very @@ -5106,28 +6795,28 @@ static int md_thread(void * arg) thread->timeout); clear_bit(THREAD_WAKEUP, &thread->flags); - - thread->run(thread->mddev); + if (!kthread_should_stop()) + thread->run(thread); } return 0; } -void md_wakeup_thread(mdk_thread_t *thread) +void md_wakeup_thread(struct md_thread *thread) { if (thread) { - dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); + pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); set_bit(THREAD_WAKEUP, &thread->flags); wake_up(&thread->wqueue); } } -mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, - const char *name) +struct md_thread *md_register_thread(void (*run) (struct md_thread *), + struct mddev *mddev, const char *name) { - mdk_thread_t *thread; + struct md_thread *thread; - thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); + thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); if (!thread) return NULL; @@ -5136,7 +6825,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, thread->run = run; thread->mddev = mddev; thread->timeout = MAX_SCHEDULE_TIMEOUT; - thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); + thread->tsk = kthread_run(md_thread, thread, + "%s_%s", + mdname(thread->mddev), + name); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; @@ -5144,15 +6836,24 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, return thread; } -void md_unregister_thread(mdk_thread_t *thread) +void md_unregister_thread(struct md_thread **threadp) { - dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); + struct md_thread *thread = *threadp; + if (!thread) + return; + pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); + /* Locking ensures that mddev_unlock does not wake_up a + * non-existent 
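The md_thread()/md_wakeup_thread() hunks above keep the long-standing pattern of a per-array helper thread that sleeps until a THREAD_WAKEUP bit is set, clears the bit, and then calls thread->run(). Purely as an illustration of that pattern (a userspace analogue, not kernel API), built from a flag, a mutex and a condition variable; build with cc -pthread:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool wakeup, stop;

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	for (;;) {
		/* wait_event_interruptible_timeout(thread->wqueue, ...) */
		while (!wakeup && !stop)
			pthread_cond_wait(&cond, &lock);
		if (stop)                      /* kthread_should_stop() */
			break;
		wakeup = false;                /* clear_bit(THREAD_WAKEUP, ...) */
		pthread_mutex_unlock(&lock);
		printf("worker: running handler\n");   /* thread->run(thread) */
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void wake_worker(void)                  /* md_wakeup_thread() analogue */
{
	pthread_mutex_lock(&lock);
	wakeup = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	wake_worker();
	sleep(1);                              /* let the handler run once */

	pthread_mutex_lock(&lock);             /* md_unregister_thread() analogue */
	stop = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}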
thread + */ + spin_lock(&pers_lock); + *threadp = NULL; + spin_unlock(&pers_lock); kthread_stop(thread->tsk); kfree(thread); } -void md_error(mddev_t *mddev, mdk_rdev_t *rdev) +void md_error(struct mddev *mddev, struct md_rdev *rdev) { if (!mddev) { MD_BUG(); @@ -5162,26 +6863,17 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!rdev || test_bit(Faulty, &rdev->flags)) return; - if (mddev->external) - set_bit(Blocked, &rdev->flags); -/* - dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", - mdname(mddev), - MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), - __builtin_return_address(0),__builtin_return_address(1), - __builtin_return_address(2),__builtin_return_address(3)); -*/ - if (!mddev->pers) - return; - if (!mddev->pers->error_handler) + if (!mddev->pers || !mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - set_bit(StateChanged, &rdev->flags); + sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); md_new_event_inintr(mddev); } @@ -5190,12 +6882,11 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) static void status_unused(struct seq_file *seq) { int i = 0; - mdk_rdev_t *rdev; - struct list_head *tmp; + struct md_rdev *rdev; seq_printf(seq, "unused devices: "); - rdev_for_each_list(rdev, tmp, pending_raid_disks) { + list_for_each_entry(rdev, &pending_raid_disks, same_set) { char b[BDEVNAME_SIZE]; i++; seq_printf(seq, "%s ", @@ -5208,39 +6899,45 @@ static void status_unused(struct seq_file *seq) } -static void status_resync(struct seq_file *seq, mddev_t * mddev) +static void status_resync(struct seq_file *seq, struct mddev * mddev) { - sector_t max_blocks, resync, res; - unsigned long dt, db, rt; + sector_t max_sectors, resync, res; + unsigned long dt, db; + sector_t rt; int scale; unsigned int per_milli; - resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + if (mddev->curr_resync <= 3) + resync = 0; + else + resync = mddev->curr_resync + - atomic_read(&mddev->recovery_active); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - max_blocks = mddev->resync_max_sectors >> 1; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sectors = mddev->resync_max_sectors; else - max_blocks = mddev->size; + max_sectors = mddev->dev_sectors; /* * Should not happen. */ - if (!max_blocks) { + if (!max_sectors) { MD_BUG(); return; } /* Pick 'scale' such that (resync>>scale)*1000 will fit - * in a sector_t, and (max_blocks>>scale) will fit in a + * in a sector_t, and (max_sectors>>scale) will fit in a * u32, as those are the requirements for sector_div. * Thus 'scale' must be at least 10 */ scale = 10; if (sizeof(sector_t) > sizeof(unsigned long)) { - while ( max_blocks/2 > (1ULL<<(scale+32))) + while ( max_sectors/2 > (1ULL<<(scale+32))) scale++; } res = (resync>>scale)*1000; - sector_div(res, (u32)((max_blocks>>scale)+1)); + sector_div(res, (u32)((max_sectors>>scale)+1)); per_milli = res; { @@ -5261,25 +6958,35 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
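The status_resync() hunk above renames max_blocks to max_sectors but keeps the same trick for the progress figure: both operands are shifted right by 'scale' until the divisor fits in the u32 that sector_div() requires and (resync >> scale) * 1000 cannot overflow. A standalone rendering of that calculation; plain 64-bit division stands in for sector_div(), and the shift is widened unconditionally instead of only when sector_t is 64-bit:

#include <stdint.h>
#include <stdio.h>

static unsigned int per_milli(uint64_t resync, uint64_t max_sectors)
{
	int scale = 10;
	uint64_t res;

	/* grow 'scale' until (max_sectors >> scale) fits in 32 bits and
	 * (resync >> scale) * 1000 cannot overflow 64 bits */
	while (max_sectors / 2 > (1ULL << (scale + 32)))
		scale++;

	res = (resync >> scale) * 1000;
	res /= (uint32_t)((max_sectors >> scale) + 1);   /* sector_div() */
	return (unsigned int)res;
}

int main(void)
{
	uint64_t max = 6000000000ULL;   /* ~2.8 TiB worth of sectors */
	uint64_t done = max / 3;
	unsigned int pm = per_milli(done, max);

	printf("resync = %u.%u%%\n", pm / 10, pm % 10);
	return 0;
}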
"resync" : "recovery"))), per_milli/10, per_milli % 10, - (unsigned long long) resync, - (unsigned long long) max_blocks); + (unsigned long long) resync/2, + (unsigned long long) max_sectors/2); /* - * We do not want to overflow, so the order of operands and - * the * 100 / 100 trick are important. We do a +1 to be - * safe against division by zero. We only estimate anyway. - * * dt: time from mark until now * db: blocks written from mark until now * rt: remaining time + * + * rt is a sector_t, so could be 32bit or 64bit. + * So we divide before multiply in case it is 32bit and close + * to the limit. + * We scale the divisor (db) by 32 to avoid losing precision + * near the end of resync when the number of remaining sectors + * is close to 'db'. + * We then divide rt by 32 after multiplying by db to compensate. + * The '+1' avoids division by zero if db is very small. */ dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) - mddev->resync_mark_cnt; - rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; - seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + rt = max_sectors - resync; /* number of remaining sectors */ + sector_div(rt, db/32+1); + rt *= dt; + rt >>= 5; + + seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, + ((unsigned long)rt % 60)/6); seq_printf(seq, " speed=%ldK/sec", db/2/dt); } @@ -5288,7 +6995,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) { struct list_head *tmp; loff_t l = *pos; - mddev_t *mddev; + struct mddev *mddev; if (l >= 0x10000) return NULL; @@ -5299,7 +7006,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) spin_lock(&all_mddevs_lock); list_for_each(tmp,&all_mddevs) if (!l--) { - mddev = list_entry(tmp, mddev_t, all_mddevs); + mddev = list_entry(tmp, struct mddev, all_mddevs); mddev_get(mddev); spin_unlock(&all_mddevs_lock); return mddev; @@ -5313,7 +7020,7 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct list_head *tmp; - mddev_t *next_mddev, *mddev = v; + struct mddev *next_mddev, *mddev = v; ++*pos; if (v == (void*)2) @@ -5325,7 +7032,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) else tmp = mddev->all_mddevs.next; if (tmp != &all_mddevs) - next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); + next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); else { next_mddev = (void*)2; *pos = 0x10000; @@ -5340,27 +7047,20 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void md_seq_stop(struct seq_file *seq, void *v) { - mddev_t *mddev = v; + struct mddev *mddev = v; if (mddev && v != (void*)1 && v != (void*)2) mddev_put(mddev); } -struct mdstat_info { - int event; -}; - static int md_seq_show(struct seq_file *seq, void *v) { - mddev_t *mddev = v; - sector_t size; - struct list_head *tmp2; - mdk_rdev_t *rdev; - struct mdstat_info *mi = seq->private; - struct bitmap *bitmap; + struct mddev *mddev = v; + sector_t sectors; + struct md_rdev *rdev; if (v == (void*)1) { - struct mdk_personality *pers; + struct md_personality *pers; seq_printf(seq, "Personalities : "); spin_lock(&pers_lock); list_for_each_entry(pers, &pers_list, list) @@ -5368,7 +7068,7 @@ static int md_seq_show(struct seq_file *seq, void *v) spin_unlock(&pers_lock); seq_printf(seq, "\n"); - mi->event = atomic_read(&md_event_count); + seq->poll_event = atomic_read(&md_event_count); return 0; } if 
(v == (void*)2) { @@ -5390,8 +7090,8 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " %s", mddev->pers->name); } - size = 0; - rdev_for_each(rdev, tmp2, mddev) { + sectors = 0; + rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -5400,9 +7100,12 @@ static int md_seq_show(struct seq_file *seq, void *v) if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; - } else if (rdev->raid_disk < 0) + } + if (rdev->raid_disk < 0) seq_printf(seq, "(S)"); /* spare */ - size += rdev->size; + if (test_bit(Replacement, &rdev->flags)) + seq_printf(seq, "(R)"); + sectors += rdev->sectors; } if (!list_empty(&mddev->disks)) { @@ -5412,7 +7115,7 @@ static int md_seq_show(struct seq_file *seq, void *v) mddev->array_sectors / 2); else seq_printf(seq, "\n %llu blocks", - (unsigned long long)size); + (unsigned long long)sectors / 2); } if (mddev->persistent) { if (mddev->major_version != 0 || @@ -5428,13 +7131,13 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " super non-persistent"); if (mddev->pers) { - mddev->pers->status (seq, mddev); + mddev->pers->status(seq, mddev); seq_printf(seq, "\n "); if (mddev->pers->sync_request) { if (mddev->curr_resync > 2) { - status_resync (seq, mddev); + status_resync(seq, mddev); seq_printf(seq, "\n "); - } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + } else if (mddev->curr_resync >= 1) seq_printf(seq, "\tresync=DELAYED\n "); else if (mddev->recovery_cp < MaxSector) seq_printf(seq, "\tresync=PENDING\n "); @@ -5442,27 +7145,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - if ((bitmap = mddev->bitmap)) { - unsigned long chunk_kb; - unsigned long flags; - spin_lock_irqsave(&bitmap->lock, flags); - chunk_kb = bitmap->chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - bitmap->pages - bitmap->missing_pages, - bitmap->pages, - (bitmap->pages - bitmap->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : bitmap->chunksize, - chunk_kb ? 
"KB" : "B"); - if (bitmap->file) { - seq_printf(seq, ", file: "); - seq_path(seq, &bitmap->file->f_path, " \t\n"); - } - - seq_printf(seq, "\n"); - spin_unlock_irqrestore(&bitmap->lock, flags); - } + bitmap_status(seq, mddev->bitmap); seq_printf(seq, "\n"); } @@ -5471,7 +7154,7 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } -static struct seq_operations md_seq_ops = { +static const struct seq_operations md_seq_ops = { .start = md_seq_start, .next = md_seq_next, .stop = md_seq_stop, @@ -5480,34 +7163,32 @@ static struct seq_operations md_seq_ops = { static int md_seq_open(struct inode *inode, struct file *file) { + struct seq_file *seq; int error; - struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); - if (mi == NULL) - return -ENOMEM; error = seq_open(file, &md_seq_ops); if (error) - kfree(mi); - else { - struct seq_file *p = file->private_data; - p->private = mi; - mi->event = atomic_read(&md_event_count); - } + return error; + + seq = file->private_data; + seq->poll_event = atomic_read(&md_event_count); return error; } +static int md_unloading; static unsigned int mdstat_poll(struct file *filp, poll_table *wait) { - struct seq_file *m = filp->private_data; - struct mdstat_info *mi = m->private; + struct seq_file *seq = filp->private_data; int mask; + if (md_unloading) + return POLLIN|POLLRDNORM|POLLERR|POLLPRI;; poll_wait(filp, &md_event_waiters, wait); /* always allow read */ mask = POLLIN | POLLRDNORM; - if (mi->event != atomic_read(&md_event_count)) + if (seq->poll_event != atomic_read(&md_event_count)) mask |= POLLERR | POLLPRI; return mask; } @@ -5521,7 +7202,7 @@ static const struct file_operations md_seq_fops = { .poll = mdstat_poll, }; -int register_md_personality(struct mdk_personality *p) +int register_md_personality(struct md_personality *p) { spin_lock(&pers_lock); list_add_tail(&p->list, &pers_list); @@ -5530,7 +7211,7 @@ int register_md_personality(struct mdk_personality *p) return 0; } -int unregister_md_personality(struct mdk_personality *p) +int unregister_md_personality(struct md_personality *p) { printk(KERN_INFO "md: %s personality unregistered\n", p->name); spin_lock(&pers_lock); @@ -5539,19 +7220,19 @@ int unregister_md_personality(struct mdk_personality *p) return 0; } -static int is_mddev_idle(mddev_t *mddev) +static int is_mddev_idle(struct mddev *mddev, int init) { - mdk_rdev_t * rdev; + struct md_rdev * rdev; int idle; - long curr_events; + int curr_events; idle = 1; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = disk_stat_read(disk, sectors[0]) + - disk_stat_read(disk, sectors[1]) - - atomic_read(&disk->sync_io); + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and * disk_stats is counted when it completes. @@ -5574,7 +7255,7 @@ static int is_mddev_idle(mddev_t *mddev) * always make curr_events less than last_events. 
* */ - if (curr_events - rdev->last_events > 4096) { + if (init || curr_events - rdev->last_events > 64) { rdev->last_events = curr_events; idle = 0; } @@ -5583,13 +7264,14 @@ static int is_mddev_idle(mddev_t *mddev) return idle; } -void md_done_sync(mddev_t *mddev, int blocks, int ok) +void md_done_sync(struct mddev *mddev, int blocks, int ok) { /* another "blocks" (512byte) blocks have been synced */ atomic_sub(blocks, &mddev->recovery_active); wake_up(&mddev->recovery_wait); if (!ok) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(MD_RECOVERY_ERROR, &mddev->recovery); md_wakeup_thread(mddev->thread); // stop recovery, signal do_sync .... } @@ -5601,7 +7283,7 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) * in superblock) before writing, schedule a superblock update * and wait for it to complete. */ -void md_write_start(mddev_t *mddev, struct bio *bi) +void md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) @@ -5624,19 +7306,19 @@ void md_write_start(mddev_t *mddev, struct bio *bi) if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); md_wakeup_thread(mddev->thread); did_change = 1; } spin_unlock_irq(&mddev->write_lock); } if (did_change) - sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } -void md_write_end(mddev_t *mddev) +void md_write_end(struct mddev *mddev) { if (atomic_dec_and_test(&mddev->writes_pending)) { if (mddev->safemode == 2) @@ -5655,7 +7337,7 @@ void md_write_end(mddev_t *mddev) * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock * is dropped, so return -EAGAIN after notifying userspace. */ -int md_allow_write(mddev_t *mddev) +int md_allow_write(struct mddev *mddev) { if (!mddev->pers) return 0; @@ -5668,16 +7350,17 @@ int md_allow_write(mddev_t *mddev) if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; spin_unlock_irq(&mddev->write_lock); md_update_sb(mddev, 0); - sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock_irq(&mddev->write_lock); - if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) return -EAGAIN; else return 0; @@ -5686,40 +7369,49 @@ EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -void md_do_sync(mddev_t *mddev) +#define UPDATE_FREQUENCY (5*60*HZ) +void md_do_sync(struct md_thread *thread) { - mddev_t *mddev2; + struct mddev *mddev = thread->mddev; + struct mddev *mddev2; unsigned int currspeed = 0, window; sector_t max_sectors,j, io_sectors; unsigned long mark[SYNC_MARKS]; + unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; struct list_head *tmp; sector_t last_check; int skipped = 0; - struct list_head *rtmp; - mdk_rdev_t *rdev; - char *desc; + struct md_rdev *rdev; + char *desc, *action = NULL; + struct blk_plug plug; /* just incase thread restarts... 
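is_mddev_idle() above switches the event counters to plain int and tightens the slack from 4096 to 64 sectors; because the delta is evaluated as a small signed value, the test keeps working when the underlying per-disk sector counters wrap. A small illustration of that wraparound-tolerant comparison (the 64-sector threshold is the only number taken from the hunk):

#include <stdio.h>

/* non-zero when the disk saw more than the allowed slack of non-resync
 * I/O since the last check */
static int disk_is_busy(unsigned int curr_events, unsigned int last_events)
{
	/* interpret the unsigned difference as a small signed delta */
	return (int)(curr_events - last_events) > 64;
}

int main(void)
{
	unsigned int before = 0xffffffe0u;   /* counter about to wrap */
	unsigned int after  = 0x00000010u;   /* 48 sectors later      */

	printf("delta=%d busy=%d\n", (int)(after - before),
	       disk_is_busy(after, before));
	return 0;
}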
*/ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) return; - if (mddev->ro) /* never try to sync a read-only array */ + if (mddev->ro) {/* never try to sync a read-only array */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); return; + } if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { desc = "data-check"; - else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + action = "check"; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { desc = "requested-resync"; - else + action = "repair"; + } else desc = "resync"; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) desc = "reshape"; else desc = "recovery"; + mddev->last_sync_action = action ?: desc; + /* we overload curr_resync somewhat here. * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync @@ -5740,10 +7432,8 @@ void md_do_sync(mddev_t *mddev) mddev->curr_resync = 2; try_again: - if (kthread_should_stop()) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; - } for_each_mddev(mddev2, tmp) { if (mddev2 == mddev) continue; @@ -5761,14 +7451,20 @@ void md_do_sync(mddev_t *mddev) * time 'round when curr_resync == 2 */ continue; - prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); - if (!kthread_should_stop() && + /* We need to wait 'interruptible' so as not to + * contribute to the load average, and not to + * be caught by 'softlockup' + */ + prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { printk(KERN_INFO "md: delaying %s of %s" " until %s has finished (they" " share one or more physical units)\n", desc, mdname(mddev), mdname(mddev2)); mddev_put(mddev2); + if (signal_pending(current)) + flush_signals(current); schedule(); finish_wait(&resync_wait, &wq); goto try_again; @@ -5784,7 +7480,7 @@ void md_do_sync(mddev_t *mddev) * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; @@ -5792,17 +7488,32 @@ void md_do_sync(mddev_t *mddev) j = mddev->recovery_cp; } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - max_sectors = mddev->size << 1; + max_sectors = mddev->resync_max_sectors; else { /* recovery follows the physical size of devices */ - max_sectors = mddev->size << 1; + max_sectors = mddev->dev_sectors; j = MaxSector; - rdev_for_each(rdev, rtmp, mddev) + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) j = rdev->recovery_offset; + rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. 
+ */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } } printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); @@ -5812,7 +7523,7 @@ void md_do_sync(mddev_t *mddev) "(but not more than %d KB/sec) for %s.\n", speed_max(mddev), desc); - is_mddev_idle(mddev); /* this also initializes IO event counters */ + is_mddev_idle(mddev, 1); /* this initializes IO event counters */ io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { @@ -5827,36 +7538,71 @@ void md_do_sync(mddev_t *mddev) * Tune reconstruction: */ window = 32*(PAGE_SIZE/512); - printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", - window/2,(unsigned long long) max_sectors/2); + printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", + window/2, (unsigned long long)max_sectors/2); atomic_set(&mddev->recovery_active, 0); last_check = 0; if (j>2) { - printk(KERN_INFO + printk(KERN_INFO "md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; - } + } else + mddev->curr_resync = 3; /* no longer delayed */ + mddev->curr_resync_completed = j; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + md_new_event(mddev); + update_time = jiffies; + blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; skipped = 0; - if (j >= mddev->resync_max) { - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + ((mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) || + time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || + (j - mddev->curr_resync_completed)*2 + >= mddev->resync_max - mddev->curr_resync_completed + )) { + /* time to update curr_resync_completed */ wait_event(mddev->recovery_wait, - mddev->resync_max > j - || kthread_should_stop()); + atomic_read(&mddev->recovery_active) == 0); + mddev->curr_resync_completed = j; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + j > mddev->recovery_cp) + mddev->recovery_cp = j; + update_time = jiffies; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + } + + while (j >= mddev->resync_max && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + /* As this condition is controlled by user-space, + * we can block indefinitely, so use '_interruptible' + * to avoid triggering warnings. 
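The new block above decides when md_do_sync() should publish a checkpoint: only after another 1/16th of the device has been covered, or five minutes (UPDATE_FREQUENCY) have passed, or the window is closing on the user-set resync_max. A simplified predicate with the same shape; it collapses curr_resync and j into one progress value and uses a plain seconds clock instead of jiffies, and all names are stand-ins:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static int should_checkpoint(uint64_t j, uint64_t completed,
			     uint64_t max_sectors, uint64_t resync_max,
			     time_t now, time_t last_update)
{
	return (j > completed && j - completed > (max_sectors >> 4)) ||
	       now - last_update >= 5 * 60 ||
	       (j - completed) * 2 >= resync_max - completed;
}

int main(void)
{
	uint64_t max = 1000000000ULL;          /* device size in sectors */
	time_t now = time(NULL);

	/* 1% done, checked a moment ago: no checkpoint yet */
	printf("%d\n", should_checkpoint(10000000, 0, max, max, now, now));
	/* 10% done: past the 1/16th mark, write one out */
	printf("%d\n", should_checkpoint(100000000, 0, max, max, now, now));
	return 0;
}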
+ */ + flush_signals(current); /* just in case */ + wait_event_interruptible(mddev->recovery_wait, + mddev->resync_max > j + || test_bit(MD_RECOVERY_INTR, + &mddev->recovery)); } - if (kthread_should_stop()) - goto interrupted; + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; + break; } if (!skipped) { /* actual IO requested */ @@ -5864,11 +7610,15 @@ void md_do_sync(mddev_t *mddev) atomic_add(sectors, &mddev->recovery_active); } + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; + j += sectors; - if (j>1) mddev->curr_resync = j; + if (j > 2) + mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) - /* this is the earliers that rebuilt will be + /* this is the earliest that rebuild will be * visible in /proc/mdstat */ md_new_event(mddev); @@ -5877,10 +7627,6 @@ void md_do_sync(mddev_t *mddev) continue; last_check = io_sectors; - - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - break; - repeat: if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { /* step marks */ @@ -5893,10 +7639,8 @@ void md_do_sync(mddev_t *mddev) last_mark = next; } - - if (kthread_should_stop()) - goto interrupted; - + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; /* * this loop exits only if either when we are slower than @@ -5906,7 +7650,6 @@ void md_do_sync(mddev_t *mddev) * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast) */ - blk_unplug(mddev->queue); cond_resched(); currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 @@ -5914,19 +7657,19 @@ void md_do_sync(mddev_t *mddev) if (currspeed > speed_min(mddev)) { if ((currspeed > speed_max(mddev)) || - !is_mddev_idle(mddev)) { + !is_mddev_idle(mddev, 0)) { msleep(500); goto repeat; } } } - printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); + printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, + test_bit(MD_RECOVERY_INTR, &mddev->recovery) + ? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop */ - out: - blk_unplug(mddev->queue); - + blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); /* tell personality that we are finished */ @@ -5940,95 +7683,106 @@ void md_do_sync(mddev_t *mddev) printk(KERN_INFO "md: checkpointing %s of %s.\n", desc, mdname(mddev)); - mddev->recovery_cp = mddev->curr_resync; + if (test_bit(MD_RECOVERY_ERROR, + &mddev->recovery)) + mddev->recovery_cp = + mddev->curr_resync_completed; + else + mddev->recovery_cp = + mddev->curr_resync; } } else mddev->recovery_cp = MaxSector; } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - rdev_for_each(rdev, rtmp, mddev) + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); } } + skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); - skip: + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + /* We completed so min/max setting can be forgotten if used. 
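The tail of the resync loop above throttles the rebuild: the current rate is the I/O issued since the last mark converted to KiB/sec, and the thread sleeps 500 ms whenever it is above speed_min() and either exceeds speed_max() or the member disks are busy with other I/O. A rough standalone sketch; speed_min/speed_max/array_is_idle stand in for the mddev accessors and the exact rounding of the kernel expression is not reproduced:

#include <stdint.h>
#include <stdio.h>

static unsigned long current_speed_kib(uint64_t io_sectors, uint64_t mark_cnt,
				       unsigned long elapsed_sec)
{
	/* sectors -> KiB, divided by elapsed seconds (+1 avoids div by 0) */
	return (unsigned long)((io_sectors - mark_cnt) / 2) / (elapsed_sec + 1);
}

static int should_throttle(unsigned long currspeed, unsigned long speed_min,
			   unsigned long speed_max, int array_is_idle)
{
	return currspeed > speed_min &&
	       (currspeed > speed_max || !array_is_idle);
}

int main(void)
{
	unsigned long speed = current_speed_kib(2000000, 0, 4);

	printf("currspeed=%luK/sec throttle=%d\n", speed,
	       should_throttle(speed, 1000, 100000, 1));
	return 0;
}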
*/ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = mddev->curr_resync_completed; mddev->curr_resync = 0; - mddev->resync_min = 0; - mddev->resync_max = MaxSector; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); return; - - interrupted: - /* - * got a signal, exit. - */ - printk(KERN_INFO - "md: md_do_sync() got signal ... exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - } EXPORT_SYMBOL_GPL(md_do_sync); - -static int remove_and_add_spares(mddev_t *mddev) +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) { - mdk_rdev_t *rdev; - struct list_head *rtmp; + struct md_rdev *rdev; int spares = 0; + int removed = 0; - rdev_for_each(rdev, rtmp, mddev) - if (rdev->raid_disk >= 0 && + rdev_for_each(rdev, mddev) + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( - mddev, rdev->raid_disk)==0) { - char nm[20]; - sprintf(nm,"rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + mddev, rdev) == 0) { + sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = -1; + removed++; } } + if (removed && mddev->kobj.sd) + sysfs_notify(&mddev->kobj, NULL, "degraded"); - if (mddev->degraded && ! mddev->ro) { - rdev_for_each(rdev, rtmp, mddev) { - if (rdev->raid_disk >= 0 && - !test_bit(In_sync, &rdev->flags) && - !test_bit(Blocked, &rdev->flags)) - spares++; - if (rdev->raid_disk < 0 - && !test_bit(Faulty, &rdev->flags)) { - rdev->recovery_offset = 0; - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { - char nm[20]; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, - &rdev->kobj, nm)) - printk(KERN_WARNING - "md: cannot register " - "%s for %s\n", - nm, mdname(mddev)); - spares++; - md_new_event(mddev); - } else - break; - } + if (this) + goto no_add; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && + !test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + spares++; + if (rdev->raid_disk >= 0) + continue; + if (test_bit(Faulty, &rdev->flags)) + continue; + if (mddev->ro && + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) + continue; + + if (rdev->saved_raid_disk < 0) + rdev->recovery_offset = 0; + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; + spares++; + md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); } } +no_add: + if (removed) + set_bit(MD_CHANGE_DEVS, &mddev->flags); return spares; } + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -6051,20 +7805,13 @@ static int remove_and_add_spares(mddev_t *mddev) * 5/ If array is degraded, try to add spares devices * 6/ If array has spares or is not in-sync, start a resync thread. 
*/ -void md_check_recovery(mddev_t *mddev) +void md_check_recovery(struct mddev *mddev) { - mdk_rdev_t *rdev; - struct list_head *rtmp; - + if (mddev->suspended) + return; if (mddev->bitmap) - bitmap_daemon_work(mddev->bitmap); - - if (test_and_clear_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags)) - sysfs_notify(&mddev->kobj, NULL, "array_state"); - - if (mddev->ro) - return; + bitmap_daemon_work(mddev); if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { @@ -6078,7 +7825,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( - (mddev->flags && !mddev->external) || + (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || @@ -6091,10 +7838,19 @@ void md_check_recovery(mddev_t *mddev) int spares = 0; if (mddev->ro) { - /* Only thing we do on a ro array is remove - * failed devices. + /* On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself + * is in-sync. + * As we only add devices that are already in-sync, + * we can activate the spares immediately. + */ + remove_and_add_spares(mddev, NULL); + /* There is no thread, but we need to call + * ->spare_active and clear saved_raid_disk */ - remove_and_add_spares(mddev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } @@ -6108,24 +7864,18 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; did_change = 1; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; spin_unlock_irq(&mddev->write_lock); if (did_change) - sysfs_notify(&mddev->kobj, NULL, "array_state"); + sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags) + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - rdev_for_each(rdev, rtmp, mddev) - if (test_and_clear_bit(StateChanged, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, "state"); - - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { /* resync/recovery still happening */ @@ -6133,60 +7883,38 @@ void md_check_recovery(mddev_t *mddev) goto unlock; } if (mddev->sync_thread) { - /* resync has finished, collect result */ - md_unregister_thread(mddev->sync_thread); - mddev->sync_thread = NULL; - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - } - md_update_sb(mddev, 1); - - /* if array is no-longer degraded, then any saved_raid_disk - * information must be scrapped - */ - if (!mddev->degraded) - rdev_for_each(rdev, rtmp, mddev) - rdev->saved_raid_disk = -1; - - mddev->recovery = 0; - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); - md_new_event(mddev); + md_reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". 
*/ + mddev->curr_resync_completed = 0; set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); /* Clear some bits that don't mean anything, but * might be left set */ clear_bit(MD_RECOVERY_INTR, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || + test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) goto unlock; /* no recovery is running. * remove any failed drives, then * add spares if possible. - * Spare are also removed and re-added, to allow + * Spares are also removed and re-added, to allow * the personality to fail the re-add. */ if (mddev->reshape_position != MaxSector) { - if (mddev->pers->check_reshape(mddev) != 0) + if (mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev) != 0) /* Cannot proceed */ goto unlock; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev))) { + } else if ((spares = remove_and_add_spares(mddev, NULL))) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -6199,7 +7927,7 @@ void md_check_recovery(mddev_t *mddev) goto unlock; if (mddev->pers->sync_request) { - if (spares && mddev->bitmap && ! mddev->bitmap->file) { + if (spares) { /* We are adding a device or devices to an array * which has the bitmap stored on all devices. * So make sure all bitmap pages get written @@ -6208,66 +7936,627 @@ void md_check_recovery(mddev_t *mddev) } mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "%s_resync"); + "resync"); if (!mddev->sync_thread) { printk(KERN_ERR "%s: could not start resync" " thread...\n", mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ - mddev->recovery = 0; + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); } else md_wakeup_thread(mddev->sync_thread); - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(mddev); } unlock: + wake_up(&mddev->sb_wait); + if (!mddev->sync_thread) { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) - sysfs_notify(&mddev->kobj, NULL, "sync_action"); + if (mddev->sysfs_action) + sysfs_notify_dirent_safe(mddev->sysfs_action); } mddev_unlock(mddev); } } -void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) +void md_reap_sync_thread(struct mddev *mddev) { - sysfs_notify(&rdev->kobj, NULL, "state"); + struct md_rdev *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(&mddev->sync_thread); + wake_up(&resync_wait); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) { + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + } + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + + /* If array is no-longer degraded, then any saved_raid_disk + * information must be scrapped. 
+ */ + if (!mddev->degraded) + rdev_for_each(rdev, mddev) + rdev->saved_raid_disk = -1; + + md_update_sb(mddev, 1); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); +} + +void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, - !test_bit(Blocked, &rdev->flags), + !test_bit(Blocked, &rdev->flags) && + !test_bit(BlockedBadBlocks, &rdev->flags), msecs_to_jiffies(5000)); rdev_dec_pending(rdev, mddev); } EXPORT_SYMBOL(md_wait_for_blocked_rdev); -static int md_notify_reboot(struct notifier_block *this, - unsigned long code, void *x) +void md_finish_reshape(struct mddev *mddev) { - struct list_head *tmp; - mddev_t *mddev; + /* called be personality module when reshape completes. */ + struct md_rdev *rdev; - if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { + rdev_for_each(rdev, mddev) { + if (rdev->data_offset > rdev->new_data_offset) + rdev->sectors += rdev->data_offset - rdev->new_data_offset; + else + rdev->sectors -= rdev->new_data_offset - rdev->data_offset; + rdev->data_offset = rdev->new_data_offset; + } +} +EXPORT_SYMBOL(md_finish_reshape); - printk(KERN_INFO "md: stopping all md devices.\n"); +/* Bad block management. + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so md_is_badblock + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * We return + * 0 if there are no known bad blocks in the range + * 1 if there are known bad block which are all acknowledged + * -1 if there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. 
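To make the packing described in the comment above concrete, here is a userspace model of the 64-bit bad-block entry: bits 0-8 store (length - 1), bits 9-62 the start sector, and bit 63 the acknowledged flag. The helpers mirror the BB_MAKE/BB_OFFSET/BB_LEN/BB_ACK macros in drivers/md/md.h, but the names below are local re-implementations, not the kernel ones:

#include <stdint.h>
#include <stdio.h>

static uint64_t bb_make(uint64_t start, int len, int ack)
{
	return (start << 9) | (uint64_t)(len - 1) | ((uint64_t)!!ack << 63);
}

static uint64_t bb_offset(uint64_t e) { return (e >> 9) & ((1ULL << 54) - 1); }
static int      bb_len(uint64_t e)    { return (int)(e & 511) + 1; }
static int      bb_ack(uint64_t e)    { return (int)(e >> 63); }

int main(void)
{
	uint64_t e = bb_make(123456789, 8, 1);  /* 8 bad sectors at 123456789 */

	printf("start=%llu len=%d ack=%d\n",
	       (unsigned long long)bb_offset(e), bb_len(e), bb_ack(e));
	return 0;
}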
+ */ +int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo; + u64 *p = bb->page; + int rv; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<<bb->shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + lo = 0; + rv = 0; + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a < target) + /* This could still be the one, earlier ranges + * could not. */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all range that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } - for_each_mddev(mddev, tmp) - if (mddev_trylock(mddev)) { - /* Force a switch to readonly even array - * appears to still be in use. Hence - * the '100'. + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(md_is_badblock); + +/* + * Add a range of bad blocks to the table. + * This might extend the table, or might contract it + * if two adjacent ranges can be merged. + * We binary-search to find the 'insertion' point, then + * decide how best to handle it. 
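md_is_badblock() above searches for "the last range that starts at-or-before the given endpoint" and then walks backwards over anything that overlaps. Stripped of the seqlock retry and the overlap walk, the core binary search looks like this as a standalone sketch over a sorted array of range starts:

#include <stdint.h>
#include <stdio.h>

/* Return the index of the last range whose start is strictly before
 * 'target' (the first sector after the queried span), or -1 if none is.
 * Invariant, as in md_is_badblock(): entries before lo and at-or-after hi
 * cannot be the last range starting before target, and hi - lo shrinks
 * until only one candidate is left. */
static int last_range_before(const uint64_t *starts, int count, uint64_t target)
{
	int lo = 0, hi = count;

	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;

		if (starts[mid] < target)
			lo = mid;        /* mid could still be the one */
		else
			hi = mid;        /* mid and later are out */
	}
	if (count == 0 || starts[lo] >= target)
		return -1;
	return lo;
}

int main(void)
{
	uint64_t starts[] = { 100, 400, 900, 2048 };

	printf("%d\n", last_range_before(starts, 4, 1000)); /* 2: range at 900 */
	printf("%d\n", last_range_before(starts, 4, 50));   /* -1: none before */
	return 0;
}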
+ */ +static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 1; + unsigned long flags; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 0; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + s >>= bb->shift; + next += (1<<bb->shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irqsave(&bb->lock, flags); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal */ - do_md_stop (mddev, 1, 100); - mddev_unlock(mddev); + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; } - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ... + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). 
+ * Need to add a range just before 'hi' */ + if (bb->count >= MD_MAX_BADBLOCKS) { + /* No room for more */ + rv = 0; + break; + } else { + int this_sectors = sectors; + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + write_sequnlock_irqrestore(&bb->lock, flags); + + return rv; +} + +int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) +{ + int rv; + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; + rv = md_set_badblocks(&rdev->badblocks, + s, sectors, 0); + if (rv) { + /* Make sure they get written out promptly */ + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); + md_wakeup_thread(rdev->mddev->thread); + } + return rv; +} +EXPORT_SYMBOL_GPL(rdev_set_badblocks); + +/* + * Remove a range of bad blocks from the table. + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + */ +static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better the think a block is bad when it + * isn't than to think a block is not bad when it is. */ - mdelay(1000*1); + s += (1<<bb->shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + if (a < target) + lo = mid; + else + hi = mid; } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MD_MAX_BADBLOCKS) { + rv = -ENOSPC; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. 
*/ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} + +int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) +{ + if (is_new) + s += rdev->new_data_offset; + else + s += rdev->data_offset; + return md_clear_badblocks(&rdev->badblocks, + s, sectors); +} +EXPORT_SYMBOL_GPL(rdev_clear_badblocks); + +/* + * Acknowledge all bad blocks in a list. + * This only succeeds if ->changed is clear. It is used by + * in-kernel metadata updates + */ +void md_ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(md_ack_all_badblocks); + +/* sysfs access to bad-blocks list. + * We present two files. + * 'bad-blocks' lists sector numbers and lengths of ranges that + * are recorded as bad. The list is truncated to fit within + * the one-page limit of sysfs. + * Writing "sector length" to this file adds an acknowledged + * bad block list. + * 'unacknowledged-bad-blocks' lists bad blocks that have not yet + * been acknowledged. Writing to this file adds bad blocks + * without acknowledging them. This is largely for testing. + */ + +static ssize_t +badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} + +#define DO_DEBUG 1 + +static ssize_t +badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) +{ + unsigned long long sector; + int length; + char newline; +#ifdef DO_DEBUG + /* Allow clearing via sysfs *only* for testing/debugging. 
+ * Normally only a successful write may clear a badblock + */ + int clear = 0; + if (page[0] == '-') { + clear = 1; + page++; + } +#endif /* DO_DEBUG */ + + switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + +#ifdef DO_DEBUG + if (clear) { + md_clear_badblocks(bb, sector, length); + return len; + } +#endif /* DO_DEBUG */ + if (md_set_badblocks(bb, sector, length, !unack)) + return len; + else + return -ENOSPC; +} + +static int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct list_head *tmp; + struct mddev *mddev; + int need_delay = 0; + + for_each_mddev(mddev, tmp) { + if (mddev_trylock(mddev)) { + if (mddev->pers) + __md_stop_writes(mddev); + if (mddev->persistent) + mddev->safemode = 2; + mddev_unlock(mddev); + } + need_delay = 1; + } + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... + */ + if (need_delay) + mdelay(1000*1); + return NOTIFY_DONE; } @@ -6279,20 +8568,31 @@ static struct notifier_block md_notifier = { static void md_geninit(void) { - dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); } static int __init md_init(void) { - if (register_blkdev(MAJOR_NR, "md")) - return -1; - if ((mdp_major=register_blkdev(0, "mdp"))<=0) { - unregister_blkdev(MAJOR_NR, "md"); - return -1; - } - blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, + int ret = -ENOMEM; + + md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); + if (!md_wq) + goto err_wq; + + md_misc_wq = alloc_workqueue("md_misc", 0, 0); + if (!md_misc_wq) + goto err_misc_wq; + + if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) + goto err_md; + + if ((ret = register_blkdev(0, "mdp")) < 0) + goto err_mdp; + mdp_major = ret; + + blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); @@ -6301,9 +8601,17 @@ static int __init md_init(void) raid_table_header = register_sysctl_table(raid_root_table); md_geninit(); - return (0); -} + return 0; +err_mdp: + unregister_blkdev(MD_MAJOR, "md"); +err_md: + destroy_workqueue(md_misc_wq); +err_misc_wq: + destroy_workqueue(md_wq); +err_wq: + return ret; +} #ifndef MODULE @@ -6335,7 +8643,7 @@ void md_autodetect_dev(dev_t dev) static void autostart_arrays(int part) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; struct detected_devices_node *node_detected_dev; dev_t dev; int i_scanned, i_passed; @@ -6375,27 +8683,36 @@ static void autostart_arrays(int part) static __exit void md_exit(void) { - mddev_t *mddev; + struct mddev *mddev; struct list_head *tmp; + int delay = 1; - blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); + blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); - unregister_blkdev(MAJOR_NR,"md"); + unregister_blkdev(MD_MAJOR,"md"); unregister_blkdev(mdp_major, "mdp"); unregister_reboot_notifier(&md_notifier); unregister_sysctl_table(raid_table_header); + + /* We cannot unload the modules while some process is + * waiting for us in select() or poll() - wake them up + */ + 
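The rewritten md_init() above unwinds its workqueues and block-device majors with the usual goto ladder; the standalone sketch below shows the same idiom with two hypothetical resources and is not md code.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct workqueue_struct *demo_misc_wq;

static int __init demo_wq_init(void)
{
	int ret = -ENOMEM;

	demo_wq = alloc_workqueue("demo", WQ_MEM_RECLAIM, 0);
	if (!demo_wq)
		goto err_wq;

	demo_misc_wq = alloc_workqueue("demo_misc", 0, 0);
	if (!demo_misc_wq)
		goto err_misc_wq;

	return 0;

	/* unwind in the reverse order of allocation */
err_misc_wq:
	destroy_workqueue(demo_wq);
err_wq:
	return ret;
}

static void __exit demo_wq_exit(void)
{
	destroy_workqueue(demo_misc_wq);
	destroy_workqueue(demo_wq);
}

module_init(demo_wq_init);
module_exit(demo_wq_exit);
MODULE_LICENSE("GPL");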
md_unloading = 1; + while (waitqueue_active(&md_event_waiters)) { + /* not safe to leave yet */ + wake_up(&md_event_waiters); + msleep(delay); + delay += delay; + } remove_proc_entry("mdstat", NULL); + for_each_mddev(mddev, tmp) { - struct gendisk *disk = mddev->gendisk; - if (!disk) - continue; export_array(mddev); - del_gendisk(disk); - put_disk(disk); - mddev->gendisk = NULL; - mddev_put(mddev); + mddev->hold_active = 0; } + destroy_workqueue(md_misc_wq); + destroy_workqueue(md_wq); } subsys_initcall(md_init); @@ -6419,6 +8736,7 @@ static int set_ro(const char *val, struct kernel_param *kp) module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); +module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); EXPORT_SYMBOL(register_md_personality); EXPORT_SYMBOL(unregister_md_personality); @@ -6430,6 +8748,8 @@ EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_check_recovery); +EXPORT_SYMBOL(md_reap_sync_thread); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); diff --git a/drivers/md/md.h b/drivers/md/md.h new file mode 100644 index 00000000000..a49d991f3fe --- /dev/null +++ b/drivers/md/md.h @@ -0,0 +1,629 @@ +/* + md.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_MD_H +#define _MD_MD_H + +#include <linux/blkdev.h> +#include <linux/kobject.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/workqueue.h> + +#define MaxSector (~(sector_t)0) + +/* Bad block numbers are stored sorted in a single page. + * 64bits is used for each block or extent. + * 54 bits are sector number, 9 bits are extent size, + * 1 bit is an 'acknowledged' flag. + */ +#define MD_MAX_BADBLOCKS (PAGE_SIZE/8) + +/* + * MD's 'extended' device + */ +struct md_rdev { + struct list_head same_set; /* RAID devices within the same set */ + + sector_t sectors; /* Device size (in 512bytes sectors) */ + struct mddev *mddev; /* RAID array if running */ + int last_events; /* IO event timestamp */ + + /* + * If meta_bdev is non-NULL, it means that a separate device is + * being used to store the metadata (superblock/bitmap) which + * would otherwise be contained on the same device as the data (bdev). 
+ */ + struct block_device *meta_bdev; + struct block_device *bdev; /* block device handle */ + + struct page *sb_page, *bb_page; + int sb_loaded; + __u64 sb_events; + sector_t data_offset; /* start of data in array */ + sector_t new_data_offset;/* only relevant while reshaping */ + sector_t sb_start; /* offset of the super block (in 512byte sectors) */ + int sb_size; /* bytes in the superblock */ + int preferred_minor; /* autorun support */ + + struct kobject kobj; + + /* A device can be in one of three states based on two flags: + * Not working: faulty==1 in_sync==0 + * Fully working: faulty==0 in_sync==1 + * Working, but not + * in sync with array + * faulty==0 in_sync==0 + * + * It can never have faulty==1, in_sync==1 + * This reduces the burden of testing multiple flags in many cases + */ + + unsigned long flags; /* bit set of 'enum flag_bits' bits. */ + wait_queue_head_t blocked_wait; + + int desc_nr; /* descriptor index in the superblock */ + int raid_disk; /* role of device in array */ + int new_raid_disk; /* role that the device will have in + * the array after a level-change completes. + */ + int saved_raid_disk; /* role that device used to have in the + * array and could again if we did a partial + * resync from the bitmap + */ + sector_t recovery_offset;/* If this device has been partially + * recovered, this is where we were + * up to. + */ + + atomic_t nr_pending; /* number of pending requests. + * only maintained for arrays that + * support hot removal + */ + atomic_t read_errors; /* number of consecutive read errors that + * we have tried to ignore. + */ + struct timespec last_read_error; /* monotonic time since our + * last read error + */ + atomic_t corrected_errors; /* number of corrected read errors, + * for reporting to userspace and storing + * in superblock. + */ + struct work_struct del_work; /* used for delayed sysfs removal */ + + struct kernfs_node *sysfs_state; /* handle for 'state' + * sysfs entry */ + + struct badblocks { + int count; /* count of bad blocks */ + int unacked_exist; /* there probably are unacknowledged + * bad blocks. This is only cleared + * when a read discovers none + */ + int shift; /* shift from sectors to block size + * a -ve shift means badblocks are + * disabled.*/ + u64 *page; /* badblock list */ + int changed; + seqlock_t lock; + + sector_t sector; + sector_t size; /* in sectors */ + } badblocks; +}; +enum flag_bits { + Faulty, /* device is known to have a fault */ + In_sync, /* device is in_sync with rest of array */ + Bitmap_sync, /* ..actually, not quite In_sync. Need a + * bitmap-based recovery to get fully in sync + */ + Unmerged, /* device is being added to array and should + * be considerred for bvec_merge_fn but not + * yet for actual IO + */ + WriteMostly, /* Avoid reading if at all possible */ + AutoDetected, /* added by auto-detect */ + Blocked, /* An error occurred but has not yet + * been acknowledged by the metadata + * handler, so don't allow writes + * until it is cleared */ + WriteErrorSeen, /* A write error has been seen on this + * device + */ + FaultRecorded, /* Intermediate state for clearing + * Blocked. The Fault is/will-be + * recorded in the metadata, but that + * metadata hasn't been stored safely + * on disk yet. + */ + BlockedBadBlocks, /* A writer is blocked because they + * found an unacknowledged bad-block. + * This can safely be cleared at any + * time, and the writer will re-check. + * It may be set at any time, and at + * worst the writer will timeout and + * re-check. 
So setting it as + * accurately as possible is good, but + * not absolutely critical. + */ + WantReplacement, /* This device is a candidate to be + * hot-replaced, either because it has + * reported some faults, or because + * of explicit request. + */ + Replacement, /* This device is a replacement for + * a want_replacement device with same + * raid_disk number. + */ +}; + +#define BB_LEN_MASK (0x00000000000001FFULL) +#define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) +#define BB_ACK_MASK (0x8000000000000000ULL) +#define BB_MAX_LEN 512 +#define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) +#define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) +#define BB_ACK(x) (!!((x) & BB_ACK_MASK)) +#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) + +extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); +static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + if (unlikely(rdev->badblocks.count)) { + int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, + sectors, + first_bad, bad_sectors); + if (rv) + *first_bad -= rdev->data_offset; + return rv; + } + return 0; +} +extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); +extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); +extern void md_ack_all_badblocks(struct badblocks *bb); + +struct mddev { + void *private; + struct md_personality *pers; + dev_t unit; + int md_minor; + struct list_head disks; + unsigned long flags; +#define MD_CHANGE_DEVS 0 /* Some device status has changed */ +#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ +#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ +#define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */ +#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ +#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since + * md_ioctl checked on it. + */ + + int suspended; + atomic_t active_io; + int ro; + int sysfs_active; /* set when sysfs deletes + * are happening, so run/ + * takeover/stop are not safe + */ + int ready; /* See when safe to pass + * IO requests down */ + struct gendisk *gendisk; + + struct kobject kobj; + int hold_active; +#define UNTIL_IOCTL 1 +#define UNTIL_STOP 2 + + /* Superblock information */ + int major_version, + minor_version, + patch_version; + int persistent; + int external; /* metadata is + * managed externally */ + char metadata_type[17]; /* externally set*/ + int chunk_sectors; + time_t ctime, utime; + int level, layout; + char clevel[16]; + int raid_disks; + int max_disks; + sector_t dev_sectors; /* used size of + * component devices */ + sector_t array_sectors; /* exported array size */ + int external_size; /* size managed + * externally */ + __u64 events; + /* If the last 'event' was simply a clean->dirty transition, and + * we didn't write it to the spares, then it is safe and simple + * to just decrement the event count on a dirty->clean transition. + * So we record that possibility here. + */ + int can_decrease_events; + + char uuid[16]; + + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). 
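The 0/1/-1 convention returned by is_badblock() above (no known bad blocks, bad blocks that are all acknowledged, bad blocks not yet acknowledged) is what the raid personalities consult before issuing writes; a condensed caller-side sketch follows, with the actual raid1/raid5 decisions reduced to comments, assuming it is compiled in md context:

#include "md.h"

static void example_check_before_write(struct md_rdev *rdev,
				       sector_t sector, int sectors)
{
	sector_t first_bad;
	int bad_sectors;

	switch (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)) {
	case 0:
		/* no known bad blocks in the range: write normally */
		break;
	case 1:
		/* bad blocks, but all acknowledged in the metadata:
		 * only this range needs special handling, the device
		 * as a whole stays usable
		 */
		break;
	case -1:
		/* unacknowledged bad blocks: the writer must block,
		 * e.g. via md_wait_for_blocked_rdev(), until the
		 * metadata handler has recorded them
		 */
		break;
	}
}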
+ */ + sector_t reshape_position; + int delta_disks, new_level, new_layout; + int new_chunk_sectors; + int reshape_backwards; + + struct md_thread *thread; /* management thread */ + struct md_thread *sync_thread; /* doing resync or reconstruct */ + + /* 'last_sync_action' is initialized to "none". It is set when a + * sync operation (i.e "data-check", "requested-resync", "resync", + * "recovery", or "reshape") is started. It holds this value even + * when the sync thread is "frozen" (interrupted) or "idle" (stopped + * or finished). It is overwritten when a new sync operation is begun. + */ + char *last_sync_action; + sector_t curr_resync; /* last block scheduled */ + /* As resync requests can complete out of order, we cannot easily track + * how much resync has been completed. So we occasionally pause until + * everything completes, then set curr_resync_completed to curr_resync. + * As such it may be well behind the real resync mark, but it is a value + * we are certain of. + */ + sector_t curr_resync_completed; + unsigned long resync_mark; /* a recent timestamp */ + sector_t resync_mark_cnt;/* blocks written at resync_mark */ + sector_t curr_mark_cnt; /* blocks scheduled now */ + + sector_t resync_max_sectors; /* may be set by personality */ + + atomic64_t resync_mismatches; /* count of sectors where + * parity/replica mismatch found + */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; + /* if zero, use the system-wide default */ + int sync_speed_min; + int sync_speed_max; + + /* resync even though the same disks are shared among md-devices */ + int parallel_resync; + + int ok_start_degraded; + /* recovery/resync flags + * NEEDED: we might need to start a resync/recover + * RUNNING: a thread is running, or about to be started + * SYNC: actually doing a resync, not a recovery + * RECOVER: doing recovery, or need to try it. + * INTR: resync needs to be aborted for some reason + * DONE: thread is done and is waiting to be reaped + * REQUEST: user-space has requested a sync (used with SYNC) + * CHECK: user-space request for check-only, no repair + * RESHAPE: A reshape is happening + * ERROR: sync-action interrupted because io-error + * + * If neither SYNC or RESHAPE are set, then it is a recovery. + */ +#define MD_RECOVERY_RUNNING 0 +#define MD_RECOVERY_SYNC 1 +#define MD_RECOVERY_RECOVER 2 +#define MD_RECOVERY_INTR 3 +#define MD_RECOVERY_DONE 4 +#define MD_RECOVERY_NEEDED 5 +#define MD_RECOVERY_REQUESTED 6 +#define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 +#define MD_RECOVERY_FROZEN 9 +#define MD_RECOVERY_ERROR 10 + + unsigned long recovery; + /* If a RAID personality determines that recovery (of a particular + * device) will fail due to a read error on the source device, it + * takes a copy of this number and does not attempt recovery again + * until this number changes. + */ + int recovery_disabled; + + int in_sync; /* know to not need resync */ + /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so + * that we are never stopping an array while it is open. + * 'reconfig_mutex' protects all other reconfiguration. + * These locks are separate due to conflicting interactions + * with bdev->bd_mutex. + * Lock ordering is: + * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk + * bd_mutex -> open_mutex: e.g. 
__blkdev_get -> md_open + */ + struct mutex open_mutex; + struct mutex reconfig_mutex; + atomic_t active; /* general refcount */ + atomic_t openers; /* number of active opens */ + + int changed; /* True if we might need to + * reread partition info */ + int degraded; /* whether md should consider + * adding a spare + */ + int merge_check_needed; /* at least one + * member device + * has a + * merge_bvec_fn */ + + atomic_t recovery_active; /* blocks scheduled, but not written */ + wait_queue_head_t recovery_wait; + sector_t recovery_cp; + sector_t resync_min; /* user requested sync + * starts here */ + sector_t resync_max; /* resync should pause + * when it gets here */ + + struct kernfs_node *sysfs_state; /* handle for 'array_state' + * file in sysfs. + */ + struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ + + struct work_struct del_work; /* used for delayed sysfs removal */ + + spinlock_t write_lock; + wait_queue_head_t sb_wait; /* for waiting on superblock updates */ + atomic_t pending_writes; /* number of active superblock writes */ + + unsigned int safemode; /* if set, update "clean" superblock + * when no writes pending. + */ + unsigned int safemode_delay; + struct timer_list safemode_timer; + atomic_t writes_pending; + struct request_queue *queue; /* for plugging ... */ + + struct bitmap *bitmap; /* the bitmap for the device */ + struct { + struct file *file; /* the bitmap file */ + loff_t offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + * For external metadata, offset + * from start of device. + */ + unsigned long space; /* space available at this offset */ + loff_t default_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ + unsigned long default_space; /* space available at + * default offset */ + struct mutex mutex; + unsigned long chunksize; + unsigned long daemon_sleep; /* how many jiffies between updates? */ + unsigned long max_write_behind; /* write-behind mode */ + int external; + } bitmap_info; + + atomic_t max_corr_read_errors; /* max read retries */ + struct list_head all_mddevs; + + struct attribute_group *to_remove; + + struct bio_set *bio_set; + + /* Generic flush handling. + * The last to finish preflush schedules a worker to submit + * the rest of the request (without the REQ_FLUSH flag). 
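The flush handling described just above counts outstanding pre-flush bios in flush_pending and lets the last completion hand the rest of the request to a workqueue; the core "last one out schedules the worker" pattern looks like this in isolation (hypothetical names, flush_work assumed to be set up with INIT_WORK()):

#include <linux/atomic.h>
#include <linux/workqueue.h>

struct demo_flush {			/* stand-in for the mddev fields below */
	atomic_t flush_pending;
	struct work_struct flush_work;
};

static void demo_flush_endio(struct demo_flush *df)
{
	/* called once per member device as its pre-flush completes;
	 * only the final completion queues the follow-up work
	 */
	if (atomic_dec_and_test(&df->flush_pending))
		schedule_work(&df->flush_work);
}

static void demo_start_flush(struct demo_flush *df, int nr_devices)
{
	int i;

	/* one reference per device plus one for the submitter, so the
	 * count cannot reach zero before every bio has been issued
	 */
	atomic_set(&df->flush_pending, 1);
	for (i = 0; i < nr_devices; i++) {
		atomic_inc(&df->flush_pending);
		/* a flush bio would be submitted to device i here */
	}
	demo_flush_endio(df);		/* drop the submitter's reference */
}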
+ */ + struct bio *flush_bio; + atomic_t flush_pending; + struct work_struct flush_work; + struct work_struct event_work; /* used by dm to report failure event */ + void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); +}; + + +static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); +} + +static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) +{ + atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); +} + +struct md_personality +{ + char *name; + int level; + struct list_head list; + struct module *owner; + void (*make_request)(struct mddev *mddev, struct bio *bio); + int (*run)(struct mddev *mddev); + int (*stop)(struct mddev *mddev); + void (*status)(struct seq_file *seq, struct mddev *mddev); + /* error_handler must set ->faulty and clear ->in_sync + * if appropriate, and should abort recovery if needed + */ + void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); + int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); + int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); + int (*spare_active) (struct mddev *mddev); + sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); + int (*resize) (struct mddev *mddev, sector_t sectors); + sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); + int (*check_reshape) (struct mddev *mddev); + int (*start_reshape) (struct mddev *mddev); + void (*finish_reshape) (struct mddev *mddev); + /* quiesce moves between quiescence states + * 0 - fully active + * 1 - no new requests allowed + * others - reserved + */ + void (*quiesce) (struct mddev *mddev, int state); + /* takeover is used to transition an array from one + * personality to another. The new personality must be able + * to handle the data in the current layout. + * e.g. 2drive raid1 -> 2drive raid5 + * ndrive raid5 -> degraded n+1drive raid6 with special layout + * If the takeover succeeds, a new 'private' structure is returned. + * This needs to be installed and then ->run used to activate the + * array. + */ + void *(*takeover) (struct mddev *mddev); +}; + + +struct md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct mddev *, char *); + ssize_t (*store)(struct mddev *, const char *, size_t); +}; +extern struct attribute_group md_bitmap_group; + +static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) +{ + if (sd) + return sysfs_get_dirent(sd, name); + return sd; +} +static inline void sysfs_notify_dirent_safe(struct kernfs_node *sd) +{ + if (sd) + sysfs_notify_dirent(sd); +} + +static inline char * mdname (struct mddev * mddev) +{ + return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; +} + +static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + char nm[20]; + if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { + sprintf(nm, "rd%d", rdev->raid_disk); + return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + } else + return 0; +} + +static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + char nm[20]; + if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } +} + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. 
Dont touch 'tmp' though. + */ +#define rdev_for_each_list(rdev, tmp, head) \ + list_for_each_entry_safe(rdev, tmp, head, same_set) + +/* + * iterates through the 'same array disks' ringlist + */ +#define rdev_for_each(rdev, mddev) \ + list_for_each_entry(rdev, &((mddev)->disks), same_set) + +#define rdev_for_each_safe(rdev, tmp, mddev) \ + list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) + +#define rdev_for_each_rcu(rdev, mddev) \ + list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) + +struct md_thread { + void (*run) (struct md_thread *thread); + struct mddev *mddev; + wait_queue_head_t wqueue; + unsigned long flags; + struct task_struct *tsk; + unsigned long timeout; + void *private; +}; + +#define THREAD_WAKEUP 0 + +static inline void safe_put_page(struct page *p) +{ + if (p) put_page(p); +} + +extern int register_md_personality(struct md_personality *p); +extern int unregister_md_personality(struct md_personality *p); +extern struct md_thread *md_register_thread( + void (*run)(struct md_thread *thread), + struct mddev *mddev, + const char *name); +extern void md_unregister_thread(struct md_thread **threadp); +extern void md_wakeup_thread(struct md_thread *thread); +extern void md_check_recovery(struct mddev *mddev); +extern void md_reap_sync_thread(struct mddev *mddev); +extern void md_write_start(struct mddev *mddev, struct bio *bi); +extern void md_write_end(struct mddev *mddev); +extern void md_done_sync(struct mddev *mddev, int blocks, int ok); +extern void md_error(struct mddev *mddev, struct md_rdev *rdev); +extern void md_finish_reshape(struct mddev *mddev); + +extern int mddev_congested(struct mddev *mddev, int bits); +extern void md_flush_request(struct mddev *mddev, struct bio *bio); +extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page); +extern void md_super_wait(struct mddev *mddev); +extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, + struct page *page, int rw, bool metadata_op); +extern void md_do_sync(struct md_thread *thread); +extern void md_new_event(struct mddev *mddev); +extern int md_allow_write(struct mddev *mddev); +extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); +extern int md_check_no_bitmap(struct mddev *mddev); +extern int md_integrity_register(struct mddev *mddev); +extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); +extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); + +extern void mddev_init(struct mddev *mddev); +extern int md_run(struct mddev *mddev); +extern void md_stop(struct mddev *mddev); +extern void md_stop_writes(struct mddev *mddev); +extern int md_rdev_init(struct md_rdev *rdev); +extern void md_rdev_clear(struct md_rdev *rdev); + +extern void mddev_suspend(struct mddev *mddev); +extern void mddev_resume(struct mddev *mddev); +extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, + struct mddev *mddev); +extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, + struct mddev *mddev); + +extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); +static inline int mddev_check_plugged(struct mddev *mddev) +{ + return !!blk_check_plugged(md_unplug, mddev, + sizeof(struct blk_plug_cb)); +} +#endif /* _MD_MD_H */ diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c deleted file mode 100644 index b61d5767aae..00000000000 --- 
a/drivers/md/mktables.c +++ /dev/null @@ -1,120 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2 or (at your - * option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * mktables.c - * - * Make RAID-6 tables. This is a host user space program to be run at - * compile time. - */ - -#include <stdio.h> -#include <string.h> -#include <inttypes.h> -#include <stdlib.h> -#include <time.h> - -static uint8_t gfmul(uint8_t a, uint8_t b) -{ - uint8_t v = 0; - - while (b) { - if (b & 1) - v ^= a; - a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); - b >>= 1; - } - - return v; -} - -static uint8_t gfpow(uint8_t a, int b) -{ - uint8_t v = 1; - - b %= 255; - if (b < 0) - b += 255; - - while (b) { - if (b & 1) - v = gfmul(v, a); - a = gfmul(a, a); - b >>= 1; - } - - return v; -} - -int main(int argc, char *argv[]) -{ - int i, j, k; - uint8_t v; - uint8_t exptbl[256], invtbl[256]; - - printf("#include \"raid6.h\"\n"); - - /* Compute multiplication table */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfmul[256][256] =\n" - "{\n"); - for (i = 0; i < 256; i++) { - printf("\t{\n"); - for (j = 0; j < 256; j += 8) { - printf("\t\t"); - for (k = 0; k < 8; k++) - printf("0x%02x,%c", gfmul(i, j + k), - (k == 7) ? '\n' : ' '); - } - printf("\t},\n"); - } - printf("};\n"); - - /* Compute power-of-2 table (exponent) */ - v = 1; - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfexp[256] =\n" "{\n"); - for (i = 0; i < 256; i += 8) { - printf("\t"); - for (j = 0; j < 8; j++) { - exptbl[i + j] = v; - printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); - v = gfmul(v, 2); - if (v == 1) - v = 0; /* For entry 255, not a real entry */ - } - } - printf("};\n"); - - /* Compute inverse table x^-1 == x^254 */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfinv[256] =\n" "{\n"); - for (i = 0; i < 256; i += 8) { - printf("\t"); - for (j = 0; j < 8; j++) { - invtbl[i + j] = v = gfpow(i + j, 254); - printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); - } - } - printf("};\n"); - - /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ - printf("\nconst u8 __attribute__((aligned(256)))\n" - "raid6_gfexi[256] =\n" "{\n"); - for (i = 0; i < 256; i += 8) { - printf("\t"); - for (j = 0; j < 8; j++) - printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1], - (j == 7) ? '\n' : ' '); - } - printf("};\n"); - - return 0; -} diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index c4779ccba1c..849ad39f547 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -19,23 +19,20 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ +#include <linux/blkdev.h> #include <linux/module.h> +#include <linux/raid/md_u.h> +#include <linux/seq_file.h> #include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/raid/multipath.h> -#include <linux/buffer_head.h> -#include <asm/atomic.h> - -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER -#define MD_PERSONALITY +#include "md.h" +#include "multipath.h" #define MAX_WORK_PER_DISK 128 #define NR_RESERVED_BUFS 32 -static int multipath_map (multipath_conf_t *conf) +static int multipath_map (struct mpconf *conf) { int i, disks = conf->raid_disks; @@ -46,7 +43,7 @@ static int multipath_map (multipath_conf_t *conf) rcu_read_lock(); for (i = 0; i < disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); @@ -62,8 +59,8 @@ static int multipath_map (multipath_conf_t *conf) static void multipath_reschedule_retry (struct multipath_bh *mp_bh) { unsigned long flags; - mddev_t *mddev = mp_bh->mddev; - multipath_conf_t *conf = mddev_to_conf(mddev); + struct mddev *mddev = mp_bh->mddev; + struct mpconf *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&mp_bh->retry_list, &conf->retry_list); @@ -80,7 +77,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) { struct bio *bio = mp_bh->master_bio; - multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); + struct mpconf *conf = mp_bh->mddev->private; bio_endio(bio, err); mempool_free(mp_bh, conf->pool); @@ -89,13 +86,13 @@ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) static void multipath_end_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); - multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); - mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; + struct multipath_bh *mp_bh = bio->bi_private; + struct mpconf *conf = mp_bh->mddev->private; + struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; if (uptodate) multipath_end_bh_io(mp_bh, 0); - else if (!bio_rw_ahead(bio)) { + else if (!(bio->bi_rw & REQ_RAHEAD)) { /* * oops, IO error: */ @@ -103,54 +100,22 @@ static void multipath_end_request(struct bio *bio, int error) md_error (mp_bh->mddev, rdev); printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", bdevname(rdev->bdev,b), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); multipath_reschedule_retry(mp_bh); } else multipath_end_bh_io(mp_bh, error); rdev_dec_pending(rdev, conf->mddev); } -static void unplug_slaves(mddev_t *mddev) +static void multipath_make_request(struct mddev *mddev, struct bio * bio) { - multipath_conf_t *conf = mddev_to_conf(mddev); - int i; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags) - && atomic_read(&rdev->nr_pending)) { - struct request_queue *r_queue = bdev_get_queue(rdev->bdev); - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - blk_unplug(r_queue); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); -} - -static void multipath_unplug(struct request_queue *q) -{ - unplug_slaves(q->queuedata); -} - - -static int multipath_make_request (struct request_queue *q, struct 
bio * bio) -{ - mddev_t *mddev = q->queuedata; - multipath_conf_t *conf = mddev_to_conf(mddev); + struct mpconf *conf = mddev->private; struct multipath_bh * mp_bh; struct multipath_info *multipath; - const int rw = bio_data_dir(bio); - if (unlikely(bio_barrier(bio))) { - bio_endio(bio, -EOPNOTSUPP); - return 0; + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); + return; } mp_bh = mempool_alloc(conf->pool, GFP_NOIO); @@ -158,34 +123,31 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) mp_bh->master_bio = bio; mp_bh->mddev = mddev; - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); - mp_bh->path = multipath_map(conf); if (mp_bh->path < 0) { bio_endio(bio, -EIO); mempool_free(mp_bh, conf->pool); - return 0; + return; } multipath = conf->multipaths + mp_bh->path; mp_bh->bio = *bio; - mp_bh->bio.bi_sector += multipath->rdev->data_offset; + mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; mp_bh->bio.bi_bdev = multipath->rdev->bdev; - mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST); + mp_bh->bio.bi_rw |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; generic_make_request(&mp_bh->bio); - return 0; + return; } -static void multipath_status (struct seq_file *seq, mddev_t *mddev) +static void multipath_status (struct seq_file *seq, struct mddev *mddev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + struct mpconf *conf = mddev->private; int i; seq_printf (seq, " [%d/%d] [", conf->raid_disks, - conf->working_disks); + conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", conf->multipaths[i].rdev && @@ -195,13 +157,16 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev) static int multipath_congested(void *data, int bits) { - mddev_t *mddev = data; - multipath_conf_t *conf = mddev_to_conf(mddev); + struct mddev *mddev = data; + struct mpconf *conf = mddev->private; int i, ret = 0; + if (mddev_congested(mddev, bits)) + return 1; + rcu_read_lock(); for (i = 0; i < mddev->raid_disks ; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->multipaths[i].rdev); + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -219,41 +184,42 @@ static int multipath_congested(void *data, int bits) /* * Careful, this can execute in IRQ contexts as well! */ -static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) +static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + struct mpconf *conf = mddev->private; + char b[BDEVNAME_SIZE]; - if (conf->working_disks <= 1) { + if (conf->raid_disks - mddev->degraded <= 1) { /* * Uh oh, we can do nothing if this is our last path, but * first check if this is a queued request for a device * which has just failed. */ printk(KERN_ALERT - "multipath: only one IO path left and IO error.\n"); + "multipath: only one IO path left and IO error.\n"); /* leave it active... 
it's all we have */ - } else { - /* - * Mark disk as unusable - */ - if (!test_bit(Faulty, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - clear_bit(In_sync, &rdev->flags); - set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - conf->working_disks--; - mddev->degraded++; - printk(KERN_ALERT "multipath: IO failure on %s," - " disabling IO path.\n" - "multipath: Operation continuing" - " on %d IO paths.\n", - bdevname (rdev->bdev,b), - conf->working_disks); - } + return; } + /* + * Mark disk as unusable + */ + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded++; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT "multipath: IO failure on %s," + " disabling IO path.\n" + "multipath: Operation continuing" + " on %d IO paths.\n", + bdevname(rdev->bdev, b), + conf->raid_disks - mddev->degraded); } -static void print_multipath_conf (multipath_conf_t *conf) +static void print_multipath_conf (struct mpconf *conf) { int i; struct multipath_info *tmp; @@ -263,7 +229,7 @@ static void print_multipath_conf (multipath_conf_t *conf) printk("(conf==NULL)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->working_disks, + printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { @@ -277,9 +243,9 @@ static void print_multipath_conf (multipath_conf_t *conf) } -static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) { - multipath_conf_t *conf = mddev->private; + struct mpconf *conf = mddev->private; struct request_queue *q; int err = -EEXIST; int path; @@ -295,24 +261,29 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) for (path = first; path <= last; path++) if ((p=conf->multipaths+path)->rdev == NULL) { q = rdev->bdev->bd_disk->queue; - blk_queue_stack_limits(mddev->queue, q); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. + * violating it, so limit ->max_segments to one, lying + * within a single page. * (Note: it is very unlikely that a device with * merge_bvec_fn will be involved in multipath.) 
*/ - if (q->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + if (q->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } - conf->working_disks++; + spin_lock_irq(&conf->device_lock); mddev->degraded--; rdev->raid_disk = path; set_bit(In_sync, &rdev->flags); + spin_unlock_irq(&conf->device_lock); rcu_assign_pointer(p->rdev, rdev); err = 0; + md_integrity_add_rdev(rdev, mddev); break; } @@ -321,17 +292,16 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) return err; } -static int multipath_remove_disk(mddev_t *mddev, int number) +static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) { - multipath_conf_t *conf = mddev->private; + struct mpconf *conf = mddev->private; int err = 0; - mdk_rdev_t *rdev; + int number = rdev->raid_disk; struct multipath_info *p = conf->multipaths + number; print_multipath_conf(conf); - rdev = p->rdev; - if (rdev) { + if (rdev == p->rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { printk(KERN_ERR "hot-remove-disk, slot %d is identified" @@ -345,7 +315,9 @@ static int multipath_remove_disk(mddev_t *mddev, int number) /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; + goto abort; } + err = md_integrity_register(mddev); } abort: @@ -363,12 +335,13 @@ abort: * 3. Performs writes following reads for array syncronising. */ -static void multipathd (mddev_t *mddev) +static void multipathd(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct multipath_bh *mp_bh; struct bio *bio; unsigned long flags; - multipath_conf_t *conf = mddev_to_conf(mddev); + struct mpconf *conf = mddev->private; struct list_head *head = &conf->retry_list; md_check_recovery(mddev); @@ -382,23 +355,24 @@ static void multipathd (mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); bio = &mp_bh->bio; - bio->bi_sector = mp_bh->master_bio->bi_sector; + bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; if ((mp_bh->path = multipath_map (conf))<0) { printk(KERN_ALERT "multipath: %s: unrecoverable IO read" " error for block %llu\n", bdevname(bio->bi_bdev,b), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); multipath_end_bh_io(mp_bh, -EIO); } else { printk(KERN_ERR "multipath: %s: redirecting sector %llu" " to another IO path\n", bdevname(bio->bi_bdev,b), - (unsigned long long)bio->bi_sector); + (unsigned long long)bio->bi_iter.bi_sector); *bio = *(mp_bh->master_bio); - bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; + bio->bi_iter.bi_sector += + conf->multipaths[mp_bh->path].rdev->data_offset; bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; - bio->bi_rw |= (1 << BIO_RW_FAILFAST); + bio->bi_rw |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; bio->bi_private = mp_bh; generic_make_request(bio); @@ -407,13 +381,24 @@ static void multipathd (mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); } -static int multipath_run (mddev_t *mddev) +static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return mddev->dev_sectors; +} + +static int multipath_run (struct mddev *mddev) { - multipath_conf_t *conf; + struct mpconf *conf; int disk_idx; struct multipath_info *disk; - mdk_rdev_t *rdev; - struct list_head *tmp; + struct 
md_rdev *rdev; + int working_disks; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; if (mddev->level != LEVEL_MULTIPATH) { printk("multipath: %s: raid level not set to multipath IO (%d)\n", @@ -425,9 +410,8 @@ static int multipath_run (mddev_t *mddev) * bookkeeping area. [whatever we allocate in multipath_run(), * should be freed in multipath_stop()] */ - mddev->queue->queue_lock = &mddev->queue->__queue_lock; - conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL); + conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); mddev->private = conf; if (!conf) { printk(KERN_ERR @@ -445,8 +429,8 @@ static int multipath_run (mddev_t *mddev) goto out_free_conf; } - conf->working_disks = 0; - rdev_for_each(rdev, tmp, mddev) { + working_disks = 0; + rdev_for_each(rdev, mddev) { disk_idx = rdev->raid_disk; if (disk_idx < 0 || disk_idx >= mddev->raid_disks) @@ -454,18 +438,20 @@ static int multipath_run (mddev_t *mddev) disk = conf->multipaths + disk_idx; disk->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); /* as we don't honour merge_bvec_fn, we must never risk * violating it, not that we ever expect a device with * a merge_bvec_fn to be involved in multipath */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { + blk_queue_max_segments(mddev->queue, 1); + blk_queue_segment_boundary(mddev->queue, + PAGE_CACHE_SIZE - 1); + } if (!test_bit(Faulty, &rdev->flags)) - conf->working_disks++; + working_disks++; } conf->raid_disks = mddev->raid_disks; @@ -473,14 +459,14 @@ static int multipath_run (mddev_t *mddev) spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); - if (!conf->working_disks) { + if (!working_disks) { printk(KERN_ERR "multipath: no operational IO paths for %s\n", mdname(mddev)); goto out_free_conf; } - mddev->degraded = conf->raid_disks - conf->working_disks; + mddev->degraded = conf->raid_disks - working_disks; - conf->pool = mempool_create_kzalloc_pool(NR_RESERVED_BUFS, + conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, sizeof(struct multipath_bh)); if (conf->pool == NULL) { printk(KERN_ERR @@ -490,7 +476,8 @@ static int multipath_run (mddev_t *mddev) } { - mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath"); + mddev->thread = md_register_thread(multipathd, mddev, + "multipath"); if (!mddev->thread) { printk(KERN_ERR "multipath: couldn't allocate thread" " for %s\n", mdname(mddev)); @@ -500,16 +487,19 @@ static int multipath_run (mddev_t *mddev) printk(KERN_INFO "multipath: array %s active with %d out of %d IO paths\n", - mdname(mddev), conf->working_disks, mddev->raid_disks); + mdname(mddev), conf->raid_disks - mddev->degraded, + mddev->raid_disks); /* * Ok, everything is just fine now */ - mddev->array_sectors = mddev->size * 2; + md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); - mddev->queue->unplug_fn = multipath_unplug; mddev->queue->backing_dev_info.congested_fn = multipath_congested; mddev->queue->backing_dev_info.congested_data = mddev; + if (md_integrity_register(mddev)) + goto out_free_conf; + return 0; out_free_conf: @@ -523,12 +513,11 @@ out: } -static int multipath_stop (mddev_t *mddev) +static int multipath_stop (struct mddev *mddev) { - multipath_conf_t *conf = mddev_to_conf(mddev); + struct mpconf *conf = mddev->private; - md_unregister_thread(mddev->thread); - 
mddev->thread = NULL; + md_unregister_thread(&mddev->thread); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ mempool_destroy(conf->pool); kfree(conf->multipaths); @@ -537,7 +526,7 @@ static int multipath_stop (mddev_t *mddev) return 0; } -static struct mdk_personality multipath_personality = +static struct md_personality multipath_personality = { .name = "multipath", .level = LEVEL_MULTIPATH, @@ -549,6 +538,7 @@ static struct mdk_personality multipath_personality = .error_handler = multipath_error, .hot_add_disk = multipath_add_disk, .hot_remove_disk= multipath_remove_disk, + .size = multipath_size, }; static int __init multipath_init (void) @@ -564,6 +554,7 @@ static void __exit multipath_exit (void) module_init(multipath_init); module_exit(multipath_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("simple multi-path personality for MD"); MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ MODULE_ALIAS("md-multipath"); MODULE_ALIAS("md-level--4"); diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h new file mode 100644 index 00000000000..717c60f6289 --- /dev/null +++ b/drivers/md/multipath.h @@ -0,0 +1,31 @@ +#ifndef _MULTIPATH_H +#define _MULTIPATH_H + +struct multipath_info { + struct md_rdev *rdev; +}; + +struct mpconf { + struct mddev *mddev; + struct multipath_info *multipaths; + int raid_disks; + spinlock_t device_lock; + struct list_head retry_list; + + mempool_t *pool; +}; + +/* + * this is our 'private' 'collective' MULTIPATH buffer head. + * it contains information about what kind of IO operations were started + * for this MULTIPATH operation, and about their status: + */ + +struct multipath_bh { + struct mddev *mddev; + struct bio *master_bio; + struct bio bio; + int path; + struct list_head retry_list; +}; +#endif diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig new file mode 100644 index 00000000000..0c2dec7aec2 --- /dev/null +++ b/drivers/md/persistent-data/Kconfig @@ -0,0 +1,18 @@ +config DM_PERSISTENT_DATA + tristate + depends on BLK_DEV_DM + select LIBCRC32C + select DM_BUFIO + ---help--- + Library providing immutable on-disk data structure support for + device-mapper targets such as the thin provisioning target. + +config DM_DEBUG_BLOCK_STACK_TRACING + boolean "Keep stack trace of persistent data block lock holders" + depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA + select STACKTRACE + ---help--- + Enable this for messages that may help debug problems with the + block manager locking used by thin provisioning and caching. + + If unsure, say N. diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile new file mode 100644 index 00000000000..ff528792c35 --- /dev/null +++ b/drivers/md/persistent-data/Makefile @@ -0,0 +1,12 @@ +obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o +dm-persistent-data-objs := \ + dm-array.o \ + dm-bitset.o \ + dm-block-manager.o \ + dm-space-map-common.o \ + dm-space-map-disk.o \ + dm-space-map-metadata.o \ + dm-transaction-manager.o \ + dm-btree.o \ + dm-btree-remove.o \ + dm-btree-spine.o diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c new file mode 100644 index 00000000000..1d75b1dc1e2 --- /dev/null +++ b/drivers/md/persistent-data/dm-array.c @@ -0,0 +1,819 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. 
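For context on how a personality such as the multipath module above plugs into this framework: it fills in a struct md_personality and registers it at module load. The sketch below is a hypothetical "demo" personality that only shows the registration flow; its callbacks are stubs and it could not actually run an array.

#include <linux/module.h>
#include "md.h"

static void demo_make_request(struct mddev *mddev, struct bio *bio)
{
	bio_endio(bio, -EIO);		/* a real personality maps and resubmits the bio */
}

static int demo_run(struct mddev *mddev)
{
	return -EINVAL;			/* a real personality allocates its conf here */
}

static int demo_stop(struct mddev *mddev)
{
	return 0;
}

static struct md_personality demo_personality = {
	.name		= "demo",
	.level		= 99,		/* made-up level number */
	.owner		= THIS_MODULE,
	.make_request	= demo_make_request,
	.run		= demo_run,
	.stop		= demo_stop,
};

static int __init demo_md_init(void)
{
	return register_md_personality(&demo_personality);
}

static void __exit demo_md_exit(void)
{
	unregister_md_personality(&demo_personality);
}

module_init(demo_md_init);
module_exit(demo_md_exit);
MODULE_LICENSE("GPL");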
+ */ + +#include "dm-array.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "array" + +/*----------------------------------------------------------------*/ + +/* + * The array is implemented as a fully populated btree, which points to + * blocks that contain the packed values. This is more space efficient + * than just using a btree since we don't store 1 key per value. + */ +struct array_block { + __le32 csum; + __le32 max_entries; + __le32 nr_entries; + __le32 value_size; + __le64 blocknr; /* Block this node is supposed to live in. */ +} __packed; + +/*----------------------------------------------------------------*/ + +/* + * Validator methods. As usual we calculate a checksum, and also write the + * block location into the header (paranoia about ssds remapping areas by + * mistake). + */ +#define CSUM_XOR 595846735 + +static void array_block_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t size_of_block) +{ + struct array_block *bh_le = dm_block_data(b); + + bh_le->blocknr = cpu_to_le64(dm_block_location(b)); + bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, + size_of_block - sizeof(__le32), + CSUM_XOR)); +} + +static int array_block_check(struct dm_block_validator *v, + struct dm_block *b, + size_t size_of_block) +{ + struct array_block *bh_le = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) { + DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu", + (unsigned long long) le64_to_cpu(bh_le->blocknr), + (unsigned long long) dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries, + size_of_block - sizeof(__le32), + CSUM_XOR)); + if (csum_disk != bh_le->csum) { + DMERR_LIMIT("array_block_check failed: csum %u != wanted %u", + (unsigned) le32_to_cpu(csum_disk), + (unsigned) le32_to_cpu(bh_le->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator array_validator = { + .name = "array", + .prepare_for_write = array_block_prepare_for_write, + .check = array_block_check +}; + +/*----------------------------------------------------------------*/ + +/* + * Functions for manipulating the array blocks. + */ + +/* + * Returns a pointer to a value within an array block. + * + * index - The index into _this_ specific block. + */ +static void *element_at(struct dm_array_info *info, struct array_block *ab, + unsigned index) +{ + unsigned char *entry = (unsigned char *) (ab + 1); + + entry += index * info->value_type.size; + + return entry; +} + +/* + * Utility function that calls one of the value_type methods on every value + * in an array block. + */ +static void on_entries(struct dm_array_info *info, struct array_block *ab, + void (*fn)(void *, const void *)) +{ + unsigned i, nr_entries = le32_to_cpu(ab->nr_entries); + + for (i = 0; i < nr_entries; i++) + fn(info->value_type.context, element_at(info, ab, i)); +} + +/* + * Increment every value in an array block. + */ +static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab) +{ + struct dm_btree_value_type *vt = &info->value_type; + + if (vt->inc) + on_entries(info, ab, vt->inc); +} + +/* + * Decrement every value in an array block. 
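 *
 * Like inc_ablock_entries() above, this simply walks every packed value
 * with on_entries() and applies the value_type's dec method.  The values
 * themselves start immediately after the 24 byte struct array_block
 * header, which is the layout element_at() relies on: entry i lives at
 * (unsigned char *)(ab + 1) + i * value_size.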
+ */ +static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab) +{ + struct dm_btree_value_type *vt = &info->value_type; + + if (vt->dec) + on_entries(info, ab, vt->dec); +} + +/* + * Each array block can hold this many values. + */ +static uint32_t calc_max_entries(size_t value_size, size_t size_of_block) +{ + return (size_of_block - sizeof(struct array_block)) / value_size; +} + +/* + * Allocate a new array block. The caller will need to unlock block. + */ +static int alloc_ablock(struct dm_array_info *info, size_t size_of_block, + uint32_t max_entries, + struct dm_block **block, struct array_block **ab) +{ + int r; + + r = dm_tm_new_block(info->btree_info.tm, &array_validator, block); + if (r) + return r; + + (*ab) = dm_block_data(*block); + (*ab)->max_entries = cpu_to_le32(max_entries); + (*ab)->nr_entries = cpu_to_le32(0); + (*ab)->value_size = cpu_to_le32(info->value_type.size); + + return 0; +} + +/* + * Pad an array block out with a particular value. Every instance will + * cause an increment of the value_type. new_nr must always be more than + * the current number of entries. + */ +static void fill_ablock(struct dm_array_info *info, struct array_block *ab, + const void *value, unsigned new_nr) +{ + unsigned i; + uint32_t nr_entries; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + BUG_ON(new_nr < le32_to_cpu(ab->nr_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = nr_entries; i < new_nr; i++) { + if (vt->inc) + vt->inc(vt->context, value); + memcpy(element_at(info, ab, i), value, vt->size); + } + ab->nr_entries = cpu_to_le32(new_nr); +} + +/* + * Remove some entries from the back of an array block. Every value + * removed will be decremented. new_nr must be <= the current number of + * entries. + */ +static void trim_ablock(struct dm_array_info *info, struct array_block *ab, + unsigned new_nr) +{ + unsigned i; + uint32_t nr_entries; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + BUG_ON(new_nr > le32_to_cpu(ab->nr_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = nr_entries; i > new_nr; i--) + if (vt->dec) + vt->dec(vt->context, element_at(info, ab, i - 1)); + ab->nr_entries = cpu_to_le32(new_nr); +} + +/* + * Read locks a block, and coerces it to an array block. The caller must + * unlock 'block' when finished. + */ +static int get_ablock(struct dm_array_info *info, dm_block_t b, + struct dm_block **block, struct array_block **ab) +{ + int r; + + r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block); + if (r) + return r; + + *ab = dm_block_data(*block); + return 0; +} + +/* + * Unlocks an array block. + */ +static int unlock_ablock(struct dm_array_info *info, struct dm_block *block) +{ + return dm_tm_unlock(info->btree_info.tm, block); +} + +/*----------------------------------------------------------------*/ + +/* + * Btree manipulation. + */ + +/* + * Looks up an array block in the btree, and then read locks it. + * + * index is the index of the index of the array_block, (ie. the array index + * / max_entries). 
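 *
 * Worked example (the numbers are illustrative, not taken from this
 * patch): with 4096 byte metadata blocks, the 24 byte array_block header
 * and 8 byte values, calc_max_entries() gives (4096 - 24) / 8 = 509
 * entries per array block, so array index 1000 is looked up with btree
 * key 1000 / 509 = 1 and then read from entry 1000 % 509 = 491 of that
 * block.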
+ */ +static int lookup_ablock(struct dm_array_info *info, dm_block_t root, + unsigned index, struct dm_block **block, + struct array_block **ab) +{ + int r; + uint64_t key = index; + __le64 block_le; + + r = dm_btree_lookup(&info->btree_info, root, &key, &block_le); + if (r) + return r; + + return get_ablock(info, le64_to_cpu(block_le), block, ab); +} + +/* + * Insert an array block into the btree. The block is _not_ unlocked. + */ +static int insert_ablock(struct dm_array_info *info, uint64_t index, + struct dm_block *block, dm_block_t *root) +{ + __le64 block_le = cpu_to_le64(dm_block_location(block)); + + __dm_bless_for_disk(block_le); + return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root); +} + +/* + * Looks up an array block in the btree. Then shadows it, and updates the + * btree to point to this new shadow. 'root' is an input/output parameter + * for both the current root block, and the new one. + */ +static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, + unsigned index, struct dm_block **block, + struct array_block **ab) +{ + int r, inc; + uint64_t key = index; + dm_block_t b; + __le64 block_le; + + /* + * lookup + */ + r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le); + if (r) + return r; + b = le64_to_cpu(block_le); + + /* + * shadow + */ + r = dm_tm_shadow_block(info->btree_info.tm, b, + &array_validator, block, &inc); + if (r) + return r; + + *ab = dm_block_data(*block); + if (inc) + inc_ablock_entries(info, *ab); + + /* + * Reinsert. + * + * The shadow op will often be a noop. Only insert if it really + * copied data. + */ + if (dm_block_location(*block) != b) { + /* + * dm_tm_shadow_block will have already decremented the old + * block, but it is still referenced by the btree. We + * increment to stop the insert decrementing it below zero + * when overwriting the old value. + */ + dm_tm_inc(info->btree_info.tm, b); + r = insert_ablock(info, index, *block, root); + } + + return r; +} + +/* + * Allocate an new array block, and fill it with some values. + */ +static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block, + uint32_t max_entries, + unsigned block_index, uint32_t nr, + const void *value, dm_block_t *root) +{ + int r; + struct dm_block *block; + struct array_block *ab; + + r = alloc_ablock(info, size_of_block, max_entries, &block, &ab); + if (r) + return r; + + fill_ablock(info, ab, value, nr); + r = insert_ablock(info, block_index, block, root); + unlock_ablock(info, block); + + return r; +} + +static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block, + unsigned begin_block, unsigned end_block, + unsigned max_entries, const void *value, + dm_block_t *root) +{ + int r = 0; + + for (; !r && begin_block != end_block; begin_block++) + r = insert_new_ablock(info, size_of_block, max_entries, begin_block, max_entries, value, root); + + return r; +} + +/* + * There are a bunch of functions involved with resizing an array. This + * structure holds information that commonly needed by them. Purely here + * to reduce parameter count. + */ +struct resize { + /* + * Describes the array. + */ + struct dm_array_info *info; + + /* + * The current root of the array. This gets updated. + */ + dm_block_t root; + + /* + * Metadata block size. Used to calculate the nr entries in an + * array block. + */ + size_t size_of_block; + + /* + * Maximum nr entries in an array block. + */ + unsigned max_entries; + + /* + * nr of completely full blocks in the array. 
+ * + * 'old' refers to before the resize, 'new' after. + */ + unsigned old_nr_full_blocks, new_nr_full_blocks; + + /* + * Number of entries in the final block. 0 iff only full blocks in + * the array. + */ + unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block; + + /* + * The default value used when growing the array. + */ + const void *value; +}; + +/* + * Removes a consecutive set of array blocks from the btree. The values + * in block are decremented as a side effect of the btree remove. + * + * begin_index - the index of the first array block to remove. + * end_index - the one-past-the-end value. ie. this block is not removed. + */ +static int drop_blocks(struct resize *resize, unsigned begin_index, + unsigned end_index) +{ + int r; + + while (begin_index != end_index) { + uint64_t key = begin_index++; + r = dm_btree_remove(&resize->info->btree_info, resize->root, + &key, &resize->root); + if (r) + return r; + } + + return 0; +} + +/* + * Calculates how many blocks are needed for the array. + */ +static unsigned total_nr_blocks_needed(unsigned nr_full_blocks, + unsigned nr_entries_in_last_block) +{ + return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0); +} + +/* + * Shrink an array. + */ +static int shrink(struct resize *resize) +{ + int r; + unsigned begin, end; + struct dm_block *block; + struct array_block *ab; + + /* + * Lose some blocks from the back? + */ + if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) { + begin = total_nr_blocks_needed(resize->new_nr_full_blocks, + resize->new_nr_entries_in_last_block); + end = total_nr_blocks_needed(resize->old_nr_full_blocks, + resize->old_nr_entries_in_last_block); + + r = drop_blocks(resize, begin, end); + if (r) + return r; + } + + /* + * Trim the new tail block + */ + if (resize->new_nr_entries_in_last_block) { + r = shadow_ablock(resize->info, &resize->root, + resize->new_nr_full_blocks, &block, &ab); + if (r) + return r; + + trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block); + unlock_ablock(resize->info, block); + } + + return 0; +} + +/* + * Grow an array. 
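 *
 * Worked example of the bookkeeping in struct resize (illustrative
 * numbers only): growing from 1000 to 2000 entries with 509 entries per
 * array block gives old_nr_full_blocks = 1 with 491 entries in the old
 * tail block, and new_nr_full_blocks = 3 with 473 entries in the new
 * tail block.  grow_needs_more_blocks() first pads the old tail block
 * out to 509 entries, then inserts block 2 completely full, and finally
 * adds a new tail block holding the remaining 473 values:
 * 3 * 509 + 473 = 2000.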
+ */ +static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries) +{ + int r; + struct dm_block *block; + struct array_block *ab; + + r = shadow_ablock(resize->info, &resize->root, + resize->old_nr_full_blocks, &block, &ab); + if (r) + return r; + + fill_ablock(resize->info, ab, resize->value, new_nr_entries); + unlock_ablock(resize->info, block); + + return r; +} + +static int grow_add_tail_block(struct resize *resize) +{ + return insert_new_ablock(resize->info, resize->size_of_block, + resize->max_entries, + resize->new_nr_full_blocks, + resize->new_nr_entries_in_last_block, + resize->value, &resize->root); +} + +static int grow_needs_more_blocks(struct resize *resize) +{ + int r; + unsigned old_nr_blocks = resize->old_nr_full_blocks; + + if (resize->old_nr_entries_in_last_block > 0) { + old_nr_blocks++; + + r = grow_extend_tail_block(resize, resize->max_entries); + if (r) + return r; + } + + r = insert_full_ablocks(resize->info, resize->size_of_block, + old_nr_blocks, + resize->new_nr_full_blocks, + resize->max_entries, resize->value, + &resize->root); + if (r) + return r; + + if (resize->new_nr_entries_in_last_block) + r = grow_add_tail_block(resize); + + return r; +} + +static int grow(struct resize *resize) +{ + if (resize->new_nr_full_blocks > resize->old_nr_full_blocks) + return grow_needs_more_blocks(resize); + + else if (resize->old_nr_entries_in_last_block) + return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block); + + else + return grow_add_tail_block(resize); +} + +/*----------------------------------------------------------------*/ + +/* + * These are the value_type functions for the btree elements, which point + * to array blocks. + */ +static void block_inc(void *context, const void *value) +{ + __le64 block_le; + struct dm_array_info *info = context; + + memcpy(&block_le, value, sizeof(block_le)); + dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le)); +} + +static void block_dec(void *context, const void *value) +{ + int r; + uint64_t b; + __le64 block_le; + uint32_t ref_count; + struct dm_block *block; + struct array_block *ab; + struct dm_array_info *info = context; + + memcpy(&block_le, value, sizeof(block_le)); + b = le64_to_cpu(block_le); + + r = dm_tm_ref(info->btree_info.tm, b, &ref_count); + if (r) { + DMERR_LIMIT("couldn't get reference count for block %llu", + (unsigned long long) b); + return; + } + + if (ref_count == 1) { + /* + * We're about to drop the last reference to this ablock. + * So we need to decrement the ref count of the contents. 
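 *
 * (If we skipped this, dm_tm_dec() below would release the array block
 * itself but the references held by its packed values would leak, since
 * nothing else knows about them.)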
+ */ + r = get_ablock(info, b, &block, &ab); + if (r) { + DMERR_LIMIT("couldn't get array block %llu", + (unsigned long long) b); + return; + } + + dec_ablock_entries(info, ab); + unlock_ablock(info, block); + } + + dm_tm_dec(info->btree_info.tm, b); +} + +static int block_equal(void *context, const void *value1, const void *value2) +{ + return !memcmp(value1, value2, sizeof(__le64)); +} + +/*----------------------------------------------------------------*/ + +void dm_array_info_init(struct dm_array_info *info, + struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt) +{ + struct dm_btree_value_type *bvt = &info->btree_info.value_type; + + memcpy(&info->value_type, vt, sizeof(info->value_type)); + info->btree_info.tm = tm; + info->btree_info.levels = 1; + + bvt->context = info; + bvt->size = sizeof(__le64); + bvt->inc = block_inc; + bvt->dec = block_dec; + bvt->equal = block_equal; +} +EXPORT_SYMBOL_GPL(dm_array_info_init); + +int dm_array_empty(struct dm_array_info *info, dm_block_t *root) +{ + return dm_btree_empty(&info->btree_info, root); +} +EXPORT_SYMBOL_GPL(dm_array_empty); + +static int array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) +{ + int r; + struct resize resize; + + if (old_size == new_size) + return 0; + + resize.info = info; + resize.root = root; + resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + resize.max_entries = calc_max_entries(info->value_type.size, + resize.size_of_block); + + resize.old_nr_full_blocks = old_size / resize.max_entries; + resize.old_nr_entries_in_last_block = old_size % resize.max_entries; + resize.new_nr_full_blocks = new_size / resize.max_entries; + resize.new_nr_entries_in_last_block = new_size % resize.max_entries; + resize.value = value; + + r = ((new_size > old_size) ? 
grow : shrink)(&resize); + if (r) + return r; + + *new_root = resize.root; + return 0; +} + +int dm_array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + int r = array_resize(info, root, old_size, new_size, value, new_root); + __dm_unbless_for_disk(value); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_resize); + +int dm_array_del(struct dm_array_info *info, dm_block_t root) +{ + return dm_btree_del(&info->btree_info, root); +} +EXPORT_SYMBOL_GPL(dm_array_del); + +int dm_array_get_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, void *value_le) +{ + int r; + struct dm_block *block; + struct array_block *ab; + size_t size_of_block; + unsigned entry, max_entries; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + + r = lookup_ablock(info, root, index / max_entries, &block, &ab); + if (r) + return r; + + entry = index % max_entries; + if (entry >= le32_to_cpu(ab->nr_entries)) + r = -ENODATA; + else + memcpy(value_le, element_at(info, ab, entry), + info->value_type.size); + + unlock_ablock(info, block); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_get_value); + +static int array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) +{ + int r; + struct dm_block *block; + struct array_block *ab; + size_t size_of_block; + unsigned max_entries; + unsigned entry; + void *old_value; + struct dm_btree_value_type *vt = &info->value_type; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + + r = shadow_ablock(info, &root, index / max_entries, &block, &ab); + if (r) + return r; + *new_root = root; + + entry = index % max_entries; + if (entry >= le32_to_cpu(ab->nr_entries)) { + r = -ENODATA; + goto out; + } + + old_value = element_at(info, ab, entry); + if (vt->dec && + (!vt->equal || !vt->equal(vt->context, old_value, value))) { + vt->dec(vt->context, old_value); + if (vt->inc) + vt->inc(vt->context, value); + } + + memcpy(old_value, value, info->value_type.size); + +out: + unlock_ablock(info, block); + return r; +} + +int dm_array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + int r; + + r = array_set_value(info, root, index, value, new_root); + __dm_unbless_for_disk(value); + return r; +} +EXPORT_SYMBOL_GPL(dm_array_set_value); + +struct walk_info { + struct dm_array_info *info; + int (*fn)(void *context, uint64_t key, void *leaf); + void *context; +}; + +static int walk_ablock(void *context, uint64_t *keys, void *leaf) +{ + struct walk_info *wi = context; + + int r; + unsigned i; + __le64 block_le; + unsigned nr_entries, max_entries; + struct dm_block *block; + struct array_block *ab; + + memcpy(&block_le, leaf, sizeof(block_le)); + r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab); + if (r) + return r; + + max_entries = le32_to_cpu(ab->max_entries); + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = 0; i < nr_entries; i++) { + r = wi->fn(wi->context, keys[0] * max_entries + i, + element_at(wi->info, ab, i)); + + if (r) + break; + } + + unlock_ablock(wi->info, block); + return r; +} + +int dm_array_walk(struct dm_array_info *info, dm_block_t root, + int (*fn)(void *, uint64_t key, void *leaf), + void 
*context) +{ + struct walk_info wi; + + wi.info = info; + wi.fn = fn; + wi.context = context; + + return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi); +} +EXPORT_SYMBOL_GPL(dm_array_walk); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h new file mode 100644 index 00000000000..ea177d6fa58 --- /dev/null +++ b/drivers/md/persistent-data/dm-array.h @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_ARRAY_H +#define _LINUX_DM_ARRAY_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * The dm-array is a persistent version of an array. It packs the data + * more efficiently than a btree which will result in less disk space use, + * and a performance boost. The element get and set operations are still + * O(ln(n)), but with a much smaller constant. + * + * The value type structure is reused from the btree type to support proper + * reference counting of values. + * + * The arrays implicitly know their length, and bounds are checked for + * lookups and updated. It doesn't store this in an accessible place + * because it would waste a whole metadata block. Make sure you store the + * size along with the array root in your encompassing data. + * + * Array entries are indexed via an unsigned integer starting from zero. + * Arrays are not sparse; if you resize an array to have 'n' entries then + * 'n - 1' will be the last valid index. + * + * Typical use: + * + * a) initialise a dm_array_info structure. This describes the array + * values and ties it into a specific transaction manager. It holds no + * instance data; the same info can be used for many similar arrays if + * you wish. + * + * b) Get yourself a root. The root is the index of a block of data on the + * disk that holds a particular instance of an array. You may have a + * pre existing root in your metadata that you wish to use, or you may + * want to create a brand new, empty array with dm_array_empty(). + * + * Like the other data structures in this library, dm_array objects are + * immutable between transactions. Update functions will return you the + * root for a _new_ array. If you've incremented the old root, via + * dm_tm_inc(), before calling the update function you may continue to use + * it in parallel with the new root. + * + * c) resize an array with dm_array_resize(). + * + * d) Get a value from the array with dm_array_get_value(). + * + * e) Set a value in the array with dm_array_set_value(). + * + * f) Walk an array of values in index order with dm_array_walk(). More + * efficient than making many calls to dm_array_get_value(). + * + * g) Destroy the array with dm_array_del(). This tells the transaction + * manager that you're no longer using this data structure so it can + * recycle it's blocks. (dm_array_dec() would be a better name for it, + * but del is in keeping with dm_btree_del()). + */ + +/* + * Describes an array. Don't initialise this structure yourself, use the + * init function below. + */ +struct dm_array_info { + struct dm_transaction_manager *tm; + struct dm_btree_value_type value_type; + struct dm_btree_info btree_info; +}; + +/* + * Sets up a dm_array_info structure. You don't need to do anything with + * this structure when you finish using it. + * + * info - the structure being filled in. 
+ * tm - the transaction manager that should supervise this structure. + * vt - describes the leaf values. + */ +void dm_array_info_init(struct dm_array_info *info, + struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt); + +/* + * Create an empty, zero length array. + * + * info - describes the array + * root - on success this will be filled out with the root block + */ +int dm_array_empty(struct dm_array_info *info, dm_block_t *root); + +/* + * Resizes the array. + * + * info - describes the array + * root - the root block of the array on disk + * old_size - the caller is responsible for remembering the size of + * the array + * new_size - can be bigger or smaller than old_size + * value - if we're growing the array the new entries will have this value + * new_root - on success, points to the new root block + * + * If growing the inc function for 'value' will be called the appropriate + * number of times. So if the caller is holding a reference they may want + * to drop it. + */ +int dm_array_resize(struct dm_array_info *info, dm_block_t root, + uint32_t old_size, uint32_t new_size, + const void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * Frees a whole array. The value_type's decrement operation will be called + * for all values in the array + */ +int dm_array_del(struct dm_array_info *info, dm_block_t root); + +/* + * Lookup a value in the array + * + * info - describes the array + * root - root block of the array + * index - array index + * value - the value to be read. Will be in on-disk format of course. + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_array_get_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, void *value); + +/* + * Set an entry in the array. + * + * info - describes the array + * root - root block of the array + * index - array index + * value - value to be written to disk. Make sure you confirm the value is + * in on-disk format with__dm_bless_for_disk() before calling. + * new_root - the new root block + * + * The old value being overwritten will be decremented, the new value + * incremented. + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_array_set_value(struct dm_array_info *info, dm_block_t root, + uint32_t index, const void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * Walk through all the entries in an array. + * + * info - describes the array + * root - root block of the array + * fn - called back for every element + * context - passed to the callback + */ +int dm_array_walk(struct dm_array_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t key, void *leaf), + void *context); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_ARRAY_H */ diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c new file mode 100644 index 00000000000..36f7cc2c710 --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.c @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. 
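 *
 * dm-bitset is a thin wrapper around the dm-array interface declared in
 * dm-array.h above.  For reference, a bare dm-array is driven roughly
 * like the sketch below (illustrative only: error handling is elided,
 * "tm" is assumed to be an existing transaction manager, and the trivial
 * value type mirrors the bitset_bvt defined later in this file):
 *
 *	struct dm_array_info info;
 *	struct dm_btree_value_type vt = {
 *		.context = NULL,
 *		.size = sizeof(__le64),
 *		.inc = NULL,
 *		.dec = NULL,
 *		.equal = NULL,
 *	};
 *	__le64 zero = cpu_to_le64(0), v;
 *	dm_block_t root;
 *
 *	dm_array_info_init(&info, tm, &vt);
 *	dm_array_empty(&info, &root);
 *	__dm_bless_for_disk(&zero);
 *	dm_array_resize(&info, root, 0, 100, &zero, &root);
 *	v = cpu_to_le64(42);
 *	__dm_bless_for_disk(&v);
 *	dm_array_set_value(&info, root, 7, &v, &root);
 *	dm_array_get_value(&info, root, 7, &v);
 *	dm_array_del(&info, root);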
+ */ + +#include "dm-bitset.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "bitset" +#define BITS_PER_ARRAY_ENTRY 64 + +/*----------------------------------------------------------------*/ + +static struct dm_btree_value_type bitset_bvt = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL, +}; + +/*----------------------------------------------------------------*/ + +void dm_disk_bitset_init(struct dm_transaction_manager *tm, + struct dm_disk_bitset *info) +{ + dm_array_info_init(&info->array_info, tm, &bitset_bvt); + info->current_index_set = false; +} +EXPORT_SYMBOL_GPL(dm_disk_bitset_init); + +int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root) +{ + return dm_array_empty(&info->array_info, root); +} +EXPORT_SYMBOL_GPL(dm_bitset_empty); + +int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root, + uint32_t old_nr_entries, uint32_t new_nr_entries, + bool default_value, dm_block_t *new_root) +{ + uint32_t old_blocks = dm_div_up(old_nr_entries, BITS_PER_ARRAY_ENTRY); + uint32_t new_blocks = dm_div_up(new_nr_entries, BITS_PER_ARRAY_ENTRY); + __le64 value = default_value ? cpu_to_le64(~0) : cpu_to_le64(0); + + __dm_bless_for_disk(&value); + return dm_array_resize(&info->array_info, root, old_blocks, new_blocks, + &value, new_root); +} +EXPORT_SYMBOL_GPL(dm_bitset_resize); + +int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root) +{ + return dm_array_del(&info->array_info, root); +} +EXPORT_SYMBOL_GPL(dm_bitset_del); + +int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, + dm_block_t *new_root) +{ + int r; + __le64 value; + + if (!info->current_index_set || !info->dirty) + return 0; + + value = cpu_to_le64(info->current_bits); + + __dm_bless_for_disk(&value); + r = dm_array_set_value(&info->array_info, root, info->current_index, + &value, new_root); + if (r) + return r; + + info->current_index_set = false; + info->dirty = false; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_flush); + +static int read_bits(struct dm_disk_bitset *info, dm_block_t root, + uint32_t array_index) +{ + int r; + __le64 value; + + r = dm_array_get_value(&info->array_info, root, array_index, &value); + if (r) + return r; + + info->current_bits = le64_to_cpu(value); + info->current_index_set = true; + info->current_index = array_index; + info->dirty = false; + + return 0; +} + +static int get_array_entry(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned array_index = index / BITS_PER_ARRAY_ENTRY; + + if (info->current_index_set) { + if (info->current_index == array_index) + return 0; + + r = dm_bitset_flush(info, root, new_root); + if (r) + return r; + } + + return read_bits(info, root, array_index); +} + +int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + set_bit(b, (unsigned long *) &info->current_bits); + info->dirty = true; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_set_bit); + +int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + clear_bit(b, (unsigned long *) &info->current_bits); + info->dirty = true; + + 
return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_clear_bit); + +int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root, bool *result) +{ + int r; + unsigned b = index % BITS_PER_ARRAY_ENTRY; + + r = get_array_entry(info, root, index, new_root); + if (r) + return r; + + *result = test_bit(b, (unsigned long *) &info->current_bits); + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_test_bit); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h new file mode 100644 index 00000000000..c2287d672ef --- /dev/null +++ b/drivers/md/persistent-data/dm-bitset.h @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_BITSET_H +#define _LINUX_DM_BITSET_H + +#include "dm-array.h" + +/*----------------------------------------------------------------*/ + +/* + * This bitset type is a thin wrapper round a dm_array of 64bit words. It + * uses a tiny, one word cache to reduce the number of array lookups and so + * increase performance. + * + * Like the dm-array that it's based on, the caller needs to keep track of + * the size of the bitset separately. The underlying dm-array implicitly + * knows how many words it's storing and will return -ENODATA if you try + * and access an out of bounds word. However, an out of bounds bit in the + * final word will _not_ be detected, you have been warned. + * + * Bits are indexed from zero. + + * Typical use: + * + * a) Initialise a dm_disk_bitset structure with dm_disk_bitset_init(). + * This describes the bitset and includes the cache. It's not called it + * dm_bitset_info in line with other data structures because it does + * include instance data. + * + * b) Get yourself a root. The root is the index of a block of data on the + * disk that holds a particular instance of an bitset. You may have a + * pre existing root in your metadata that you wish to use, or you may + * want to create a brand new, empty bitset with dm_bitset_empty(). + * + * Like the other data structures in this library, dm_bitset objects are + * immutable between transactions. Update functions will return you the + * root for a _new_ array. If you've incremented the old root, via + * dm_tm_inc(), before calling the update function you may continue to use + * it in parallel with the new root. + * + * Even read operations may trigger the cache to be flushed and as such + * return a root for a new, updated bitset. + * + * c) resize a bitset with dm_bitset_resize(). + * + * d) Set a bit with dm_bitset_set_bit(). + * + * e) Clear a bit with dm_bitset_clear_bit(). + * + * f) Test a bit with dm_bitset_test_bit(). + * + * g) Flush all updates from the cache with dm_bitset_flush(). + * + * h) Destroy the bitset with dm_bitset_del(). This tells the transaction + * manager that you're no longer using this data structure so it can + * recycle it's blocks. (dm_bitset_dec() would be a better name for it, + * but del is in keeping with dm_btree_del()). + */ + +/* + * Opaque object. Unlike dm_array_info, you should have one of these per + * bitset. Initialise with dm_disk_bitset_init(). + */ +struct dm_disk_bitset { + struct dm_array_info array_info; + + uint32_t current_index; + uint64_t current_bits; + + bool current_index_set:1; + bool dirty:1; +}; + +/* + * Sets up a dm_disk_bitset structure. You don't need to do anything with + * this structure when you finish using it. 
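 *
 * A minimal end-to-end sketch (hypothetical caller, error handling
 * elided).  Each call takes the current root and hands back the root of
 * the updated bitset, which must be used for the next call:
 *
 *	struct dm_disk_bitset bits;
 *	dm_block_t root;
 *
 *	dm_disk_bitset_init(tm, &bits);
 *	dm_bitset_empty(&bits, &root);
 *	dm_bitset_resize(&bits, root, 0, 128, false, &root);
 *	dm_bitset_set_bit(&bits, root, 42, &root);
 *	dm_bitset_flush(&bits, root, &root);
 *	dm_bitset_del(&bits, root);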
+ * + * tm - the transaction manager that should supervise this structure + * info - the structure being initialised + */ +void dm_disk_bitset_init(struct dm_transaction_manager *tm, + struct dm_disk_bitset *info); + +/* + * Create an empty, zero length bitset. + * + * info - describes the bitset + * new_root - on success, points to the new root block + */ +int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root); + +/* + * Resize the bitset. + * + * info - describes the bitset + * old_root - the root block of the array on disk + * old_nr_entries - the number of bits in the old bitset + * new_nr_entries - the number of bits you want in the new bitset + * default_value - the value for any new bits + * new_root - on success, points to the new root block + */ +int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t old_root, + uint32_t old_nr_entries, uint32_t new_nr_entries, + bool default_value, dm_block_t *new_root); + +/* + * Frees the bitset. + */ +int dm_bitset_del(struct dm_disk_bitset *info, dm_block_t root); + +/* + * Set a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_set_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root); + +/* + * Clears a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_clear_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root); + +/* + * Tests a bit. + * + * info - describes the bitset + * root - the root block of the bitset + * index - the bit index + * new_root - on success, points to the new root block (cached values may have been written) + * result - the bit value you're after + * + * -ENODATA will be returned if the index is out of bounds. + */ +int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, + uint32_t index, dm_block_t *new_root, bool *result); + +/* + * Flush any cached changes to disk. + * + * info - describes the bitset + * root - the root block of the bitset + * new_root - on success, points to the new root block + */ +int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, + dm_block_t *new_root); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_BITSET_H */ diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c new file mode 100644 index 00000000000..087411c95ff --- /dev/null +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -0,0 +1,636 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#include "dm-block-manager.h" +#include "dm-persistent-data-internal.h" +#include "../dm-bufio.h" + +#include <linux/crc32c.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/device-mapper.h> +#include <linux/stacktrace.h> + +#define DM_MSG_PREFIX "block manager" + +/*----------------------------------------------------------------*/ + +/* + * This is a read/write semaphore with a couple of differences. + * + * i) There is a restriction on the number of concurrent read locks that + * may be held at once. This is just an implementation detail. 
+ * + * ii) Recursive locking attempts are detected and return EINVAL. A stack + * trace is also emitted for the previous lock acquisition. + * + * iii) Priority is given to write locks. + */ +#define MAX_HOLDERS 4 +#define MAX_STACK 10 + +typedef unsigned long stack_entries[MAX_STACK]; + +struct block_lock { + spinlock_t lock; + __s32 count; + struct list_head waiters; + struct task_struct *holders[MAX_HOLDERS]; + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + struct stack_trace traces[MAX_HOLDERS]; + stack_entries entries[MAX_HOLDERS]; +#endif +}; + +struct waiter { + struct list_head list; + struct task_struct *task; + int wants_write; +}; + +static unsigned __find_holder(struct block_lock *lock, + struct task_struct *task) +{ + unsigned i; + + for (i = 0; i < MAX_HOLDERS; i++) + if (lock->holders[i] == task) + break; + + BUG_ON(i == MAX_HOLDERS); + return i; +} + +/* call this *after* you increment lock->count */ +static void __add_holder(struct block_lock *lock, struct task_struct *task) +{ + unsigned h = __find_holder(lock, NULL); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + struct stack_trace *t; +#endif + + get_task_struct(task); + lock->holders[h] = task; + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + t = lock->traces + h; + t->nr_entries = 0; + t->max_entries = MAX_STACK; + t->entries = lock->entries[h]; + t->skip = 2; + save_stack_trace(t); +#endif +} + +/* call this *before* you decrement lock->count */ +static void __del_holder(struct block_lock *lock, struct task_struct *task) +{ + unsigned h = __find_holder(lock, task); + lock->holders[h] = NULL; + put_task_struct(task); +} + +static int __check_holder(struct block_lock *lock) +{ + unsigned i; +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + static struct stack_trace t; + static stack_entries entries; +#endif + + for (i = 0; i < MAX_HOLDERS; i++) { + if (lock->holders[i] == current) { + DMERR("recursive lock detected in metadata"); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + DMERR("previously held here:"); + print_stack_trace(lock->traces + i, 4); + + DMERR("subsequent acquisition attempted here:"); + t.nr_entries = 0; + t.max_entries = MAX_STACK; + t.entries = entries; + t.skip = 3; + save_stack_trace(&t); + print_stack_trace(&t, 4); +#endif + return -EINVAL; + } + } + + return 0; +} + +static void __wait(struct waiter *w) +{ + for (;;) { + set_task_state(current, TASK_UNINTERRUPTIBLE); + + if (!w->task) + break; + + schedule(); + } + + set_task_state(current, TASK_RUNNING); +} + +static void __wake_waiter(struct waiter *w) +{ + struct task_struct *task; + + list_del(&w->list); + task = w->task; + smp_mb(); + w->task = NULL; + wake_up_process(task); +} + +/* + * We either wake a few readers or a single writer. 
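 *
 * (lock->count semantics: 0 means unlocked, a positive value is the
 * number of read holders, capped at MAX_HOLDERS, and -1 means a single
 * write holder.  A queued writer is only woken once the count has
 * dropped back to zero.)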
+ */ +static void __wake_many(struct block_lock *lock) +{ + struct waiter *w, *tmp; + + BUG_ON(lock->count < 0); + list_for_each_entry_safe(w, tmp, &lock->waiters, list) { + if (lock->count >= MAX_HOLDERS) + return; + + if (w->wants_write) { + if (lock->count > 0) + return; /* still read locked */ + + lock->count = -1; + __add_holder(lock, w->task); + __wake_waiter(w); + return; + } + + lock->count++; + __add_holder(lock, w->task); + __wake_waiter(w); + } +} + +static void bl_init(struct block_lock *lock) +{ + int i; + + spin_lock_init(&lock->lock); + lock->count = 0; + INIT_LIST_HEAD(&lock->waiters); + for (i = 0; i < MAX_HOLDERS; i++) + lock->holders[i] = NULL; +} + +static int __available_for_read(struct block_lock *lock) +{ + return lock->count >= 0 && + lock->count < MAX_HOLDERS && + list_empty(&lock->waiters); +} + +static int bl_down_read(struct block_lock *lock) +{ + int r; + struct waiter w; + + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) { + spin_unlock(&lock->lock); + return r; + } + + if (__available_for_read(lock)) { + lock->count++; + __add_holder(lock, current); + spin_unlock(&lock->lock); + return 0; + } + + get_task_struct(current); + + w.task = current; + w.wants_write = 0; + list_add_tail(&w.list, &lock->waiters); + spin_unlock(&lock->lock); + + __wait(&w); + put_task_struct(current); + return 0; +} + +static int bl_down_read_nonblock(struct block_lock *lock) +{ + int r; + + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) + goto out; + + if (__available_for_read(lock)) { + lock->count++; + __add_holder(lock, current); + r = 0; + } else + r = -EWOULDBLOCK; + +out: + spin_unlock(&lock->lock); + return r; +} + +static void bl_up_read(struct block_lock *lock) +{ + spin_lock(&lock->lock); + BUG_ON(lock->count <= 0); + __del_holder(lock, current); + --lock->count; + if (!list_empty(&lock->waiters)) + __wake_many(lock); + spin_unlock(&lock->lock); +} + +static int bl_down_write(struct block_lock *lock) +{ + int r; + struct waiter w; + + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) { + spin_unlock(&lock->lock); + return r; + } + + if (lock->count == 0 && list_empty(&lock->waiters)) { + lock->count = -1; + __add_holder(lock, current); + spin_unlock(&lock->lock); + return 0; + } + + get_task_struct(current); + w.task = current; + w.wants_write = 1; + + /* + * Writers given priority. We know there's only one mutator in the + * system, so ignoring the ordering reversal. + */ + list_add(&w.list, &lock->waiters); + spin_unlock(&lock->lock); + + __wait(&w); + put_task_struct(current); + + return 0; +} + +static void bl_up_write(struct block_lock *lock) +{ + spin_lock(&lock->lock); + __del_holder(lock, current); + lock->count = 0; + if (!list_empty(&lock->waiters)) + __wake_many(lock); + spin_unlock(&lock->lock); +} + +static void report_recursive_bug(dm_block_t b, int r) +{ + if (r == -EINVAL) + DMERR("recursive acquisition of block %llu requested.", + (unsigned long long) b); +} + +/*----------------------------------------------------------------*/ + +/* + * Block manager is currently implemented using dm-bufio. struct + * dm_block_manager and struct dm_block map directly onto a couple of + * structs in the bufio interface. I want to retain the freedom to move + * away from bufio in the future. So these structs are just cast within + * this .c file, rather than making it through to the public interface. 
+ */ +static struct dm_buffer *to_buffer(struct dm_block *b) +{ + return (struct dm_buffer *) b; +} + +dm_block_t dm_block_location(struct dm_block *b) +{ + return dm_bufio_get_block_number(to_buffer(b)); +} +EXPORT_SYMBOL_GPL(dm_block_location); + +void *dm_block_data(struct dm_block *b) +{ + return dm_bufio_get_block_data(to_buffer(b)); +} +EXPORT_SYMBOL_GPL(dm_block_data); + +struct buffer_aux { + struct dm_block_validator *validator; + struct block_lock lock; + int write_locked; +}; + +static void dm_block_manager_alloc_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + aux->validator = NULL; + bl_init(&aux->lock); +} + +static void dm_block_manager_write_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + if (aux->validator) { + aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf, + dm_bufio_get_block_size(dm_bufio_get_client(buf))); + } +} + +/*---------------------------------------------------------------- + * Public interface + *--------------------------------------------------------------*/ +struct dm_block_manager { + struct dm_bufio_client *bufio; + bool read_only:1; +}; + +struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, + unsigned block_size, + unsigned cache_size, + unsigned max_held_per_thread) +{ + int r; + struct dm_block_manager *bm; + + bm = kmalloc(sizeof(*bm), GFP_KERNEL); + if (!bm) { + r = -ENOMEM; + goto bad; + } + + bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread, + sizeof(struct buffer_aux), + dm_block_manager_alloc_callback, + dm_block_manager_write_callback); + if (IS_ERR(bm->bufio)) { + r = PTR_ERR(bm->bufio); + kfree(bm); + goto bad; + } + + bm->read_only = false; + + return bm; + +bad: + return ERR_PTR(r); +} +EXPORT_SYMBOL_GPL(dm_block_manager_create); + +void dm_block_manager_destroy(struct dm_block_manager *bm) +{ + dm_bufio_client_destroy(bm->bufio); + kfree(bm); +} +EXPORT_SYMBOL_GPL(dm_block_manager_destroy); + +unsigned dm_bm_block_size(struct dm_block_manager *bm) +{ + return dm_bufio_get_block_size(bm->bufio); +} +EXPORT_SYMBOL_GPL(dm_bm_block_size); + +dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) +{ + return dm_bufio_get_device_size(bm->bufio); +} + +static int dm_bm_validate_buffer(struct dm_block_manager *bm, + struct dm_buffer *buf, + struct buffer_aux *aux, + struct dm_block_validator *v) +{ + if (unlikely(!aux->validator)) { + int r; + if (!v) + return 0; + r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(bm->bufio)); + if (unlikely(r)) { + DMERR_LIMIT("%s validator check failed for block %llu", v->name, + (unsigned long long) dm_bufio_get_block_number(buf)); + return r; + } + aux->validator = v; + } else { + if (unlikely(aux->validator != v)) { + DMERR_LIMIT("validator mismatch (old=%s vs new=%s) for block %llu", + aux->validator->name, v ? 
v->name : "NULL", + (unsigned long long) dm_bufio_get_block_number(buf)); + return -EINVAL; + } + } + + return 0; +} +int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result) +{ + struct buffer_aux *aux; + void *p; + int r; + + p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_read(&aux->lock); + if (unlikely(r)) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + + aux->write_locked = 0; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_read(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bm_read_lock); + +int dm_bm_write_lock(struct dm_block_manager *bm, + dm_block_t b, struct dm_block_validator *v, + struct dm_block **result) +{ + struct buffer_aux *aux; + void *p; + int r; + + if (bm->read_only) + return -EPERM; + + p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_write(&aux->lock); + if (r) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + + aux->write_locked = 1; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_write(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bm_write_lock); + +int dm_bm_read_try_lock(struct dm_block_manager *bm, + dm_block_t b, struct dm_block_validator *v, + struct dm_block **result) +{ + struct buffer_aux *aux; + void *p; + int r; + + p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + if (unlikely(!p)) + return -EWOULDBLOCK; + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_read_nonblock(&aux->lock); + if (r < 0) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + aux->write_locked = 0; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_read(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; +} + +int dm_bm_write_lock_zero(struct dm_block_manager *bm, + dm_block_t b, struct dm_block_validator *v, + struct dm_block **result) +{ + int r; + struct buffer_aux *aux; + void *p; + + if (bm->read_only) + return -EPERM; + + p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + + memset(p, 0, dm_bm_block_size(bm)); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_write(&aux->lock); + if (r) { + dm_bufio_release(to_buffer(*result)); + return r; + } + + aux->write_locked = 1; + aux->validator = v; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero); + +int dm_bm_unlock(struct dm_block *b) +{ + struct buffer_aux *aux; + aux = dm_bufio_get_aux_data(to_buffer(b)); + + if (aux->write_locked) { + dm_bufio_mark_buffer_dirty(to_buffer(b)); + bl_up_write(&aux->lock); + } else + bl_up_read(&aux->lock); + + dm_bufio_release(to_buffer(b)); + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bm_unlock); + +int dm_bm_flush(struct dm_block_manager *bm) +{ + if (bm->read_only) + return -EPERM; + + return dm_bufio_write_dirty_buffers(bm->bufio); +} +EXPORT_SYMBOL_GPL(dm_bm_flush); + +void dm_bm_prefetch(struct dm_block_manager *bm, 
dm_block_t b) +{ + dm_bufio_prefetch(bm->bufio, b, 1); +} + +void dm_bm_set_read_only(struct dm_block_manager *bm) +{ + bm->read_only = true; +} +EXPORT_SYMBOL_GPL(dm_bm_set_read_only); + +void dm_bm_set_read_write(struct dm_block_manager *bm) +{ + bm->read_only = false; +} +EXPORT_SYMBOL_GPL(dm_bm_set_read_write); + +u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) +{ + return crc32c(~(u32) 0, data, len) ^ init_xor; +} +EXPORT_SYMBOL_GPL(dm_bm_checksum); + +/*----------------------------------------------------------------*/ + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); +MODULE_DESCRIPTION("Immutable metadata library for dm"); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h new file mode 100644 index 00000000000..1b95dfc1778 --- /dev/null +++ b/drivers/md/persistent-data/dm-block-manager.h @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_BLOCK_MANAGER_H +#define _LINUX_DM_BLOCK_MANAGER_H + +#include <linux/types.h> +#include <linux/blkdev.h> + +/*----------------------------------------------------------------*/ + +/* + * Block number. + */ +typedef uint64_t dm_block_t; +struct dm_block; + +dm_block_t dm_block_location(struct dm_block *b); +void *dm_block_data(struct dm_block *b); + +/*----------------------------------------------------------------*/ + +/* + * @name should be a unique identifier for the block manager, no longer + * than 32 chars. + * + * @max_held_per_thread should be the maximum number of locks, read or + * write, that an individual thread holds at any one time. + */ +struct dm_block_manager; +struct dm_block_manager *dm_block_manager_create( + struct block_device *bdev, unsigned block_size, + unsigned cache_size, unsigned max_held_per_thread); +void dm_block_manager_destroy(struct dm_block_manager *bm); + +unsigned dm_bm_block_size(struct dm_block_manager *bm); +dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm); + +/*----------------------------------------------------------------*/ + +/* + * The validator allows the caller to verify newly-read data and modify + * the data just before writing, e.g. to calculate checksums. It's + * important to be consistent with your use of validators. The only time + * you can change validators is if you call dm_bm_write_lock_zero. + */ +struct dm_block_validator { + const char *name; + void (*prepare_for_write)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); + + /* + * Return 0 if the checksum is valid or < 0 on error. + */ + int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); +}; + +/*----------------------------------------------------------------*/ + +/* + * You can have multiple concurrent readers or a single writer holding a + * block lock. + */ + +/* + * dm_bm_lock() locks a block and returns through @result a pointer to + * memory that holds a copy of that block. If you have write-locked the + * block then any changes you make to memory pointed to by @result will be + * written back to the disk sometime after dm_bm_unlock is called. 
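 *
 * A minimal read-side sketch (hypothetical caller; "loc", "buf" and
 * "my_validator" are assumptions, not part of this interface):
 *
 *	struct dm_block *b;
 *	int r = dm_bm_read_lock(bm, loc, &my_validator, &b);
 *	if (r)
 *		return r;
 *	memcpy(buf, dm_block_data(b), dm_bm_block_size(bm));
 *	dm_bm_unlock(b);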
+ */ +int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +/* + * The *_try_lock variants return -EWOULDBLOCK if the block isn't + * available immediately. + */ +int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +/* + * Use dm_bm_write_lock_zero() when you know you're going to + * overwrite the block completely. It saves a disk read. + */ +int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +int dm_bm_unlock(struct dm_block *b); + +/* + * It's a common idiom to have a superblock that should be committed last. + * + * @superblock should be write-locked on entry. It will be unlocked during + * this function. All dirty blocks are guaranteed to be written and flushed + * before the superblock. + * + * This method always blocks. + */ +int dm_bm_flush(struct dm_block_manager *bm); + +/* + * Request data is prefetched into the cache. + */ +void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); + +/* + * Switches the bm to a read only mode. Once read-only mode + * has been entered the following functions will return -EPERM. + * + * dm_bm_write_lock + * dm_bm_write_lock_zero + * dm_bm_flush_and_unlock + * + * Additionally you should not use dm_bm_unlock_move, however no error will + * be returned if you do. + */ +void dm_bm_set_read_only(struct dm_block_manager *bm); +void dm_bm_set_read_write(struct dm_block_manager *bm); + +u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); + +/*----------------------------------------------------------------*/ + +#endif /* _LINUX_DM_BLOCK_MANAGER_H */ diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h new file mode 100644 index 00000000000..37d367bb9aa --- /dev/null +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_BTREE_INTERNAL_H +#define DM_BTREE_INTERNAL_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * We'll need 2 accessor functions for n->csum and n->blocknr + * to support dm-btree-spine.c in that case. + */ + +enum node_flags { + INTERNAL_NODE = 1, + LEAF_NODE = 1 << 1 +}; + +/* + * Every btree node begins with this structure. Make sure it's a multiple + * of 8-bytes in size, otherwise the 64bit keys will be mis-aligned. + */ +struct node_header { + __le32 csum; + __le32 flags; + __le64 blocknr; /* Block this node is supposed to live in. */ + + __le32 nr_entries; + __le32 max_entries; + __le32 value_size; + __le32 padding; +} __packed; + +struct btree_node { + struct node_header header; + __le64 keys[0]; +} __packed; + + +void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, + struct dm_btree_value_type *vt); + +int new_block(struct dm_btree_info *info, struct dm_block **result); +int unlock_block(struct dm_btree_info *info, struct dm_block *b); + +/* + * Spines keep track of the rolling locks. There are 2 variants, read-only + * and one that uses shadowing. These are separate structs to allow the + * type checker to spot misuse, for example accidentally calling read_lock + * on a shadow spine. 
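 *
 * Typical read-only descent (illustrative sketch: error handling is
 * elided and info, root and child are assumed to exist).  Only the two
 * most recently stepped nodes stay locked as the spine rolls downwards:
 *
 *	struct ro_spine spine;
 *	struct btree_node *n;
 *
 *	init_ro_spine(&spine, info);
 *	ro_step(&spine, root);
 *	n = ro_node(&spine);
 *	ro_step(&spine, child);
 *	exit_ro_spine(&spine);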
+ */ +struct ro_spine { + struct dm_btree_info *info; + + int count; + struct dm_block *nodes[2]; +}; + +void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info); +int exit_ro_spine(struct ro_spine *s); +int ro_step(struct ro_spine *s, dm_block_t new_child); +void ro_pop(struct ro_spine *s); +struct btree_node *ro_node(struct ro_spine *s); + +struct shadow_spine { + struct dm_btree_info *info; + + int count; + struct dm_block *nodes[2]; + + dm_block_t root; +}; + +void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info); +int exit_shadow_spine(struct shadow_spine *s); + +int shadow_step(struct shadow_spine *s, dm_block_t b, + struct dm_btree_value_type *vt); + +/* + * The spine must have at least one entry before calling this. + */ +struct dm_block *shadow_current(struct shadow_spine *s); + +/* + * The spine must have at least two entries before calling this. + */ +struct dm_block *shadow_parent(struct shadow_spine *s); + +int shadow_has_parent(struct shadow_spine *s); + +int shadow_root(struct shadow_spine *s); + +/* + * Some inlines. + */ +static inline __le64 *key_ptr(struct btree_node *n, uint32_t index) +{ + return n->keys + index; +} + +static inline void *value_base(struct btree_node *n) +{ + return &n->keys[le32_to_cpu(n->header.max_entries)]; +} + +static inline void *value_ptr(struct btree_node *n, uint32_t index) +{ + uint32_t value_size = le32_to_cpu(n->header.value_size); + return value_base(n) + (value_size * index); +} + +/* + * Assumes the values are suitably-aligned and converts to core format. + */ +static inline uint64_t value64(struct btree_node *n, uint32_t index) +{ + __le64 *values_le = value_base(n); + + return le64_to_cpu(values_le[index]); +} + +/* + * Searching for a key within a single node. + */ +int lower_bound(struct btree_node *n, uint64_t key); + +extern struct dm_block_validator btree_node_validator; + +#endif /* DM_BTREE_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c new file mode 100644 index 00000000000..b88757cd0d1 --- /dev/null +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -0,0 +1,592 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-btree.h" +#include "dm-btree-internal.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> + +/* + * Removing an entry from a btree + * ============================== + * + * A very important constraint for our btree is that no node, except the + * root, may have fewer than a certain number of entries. + * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES). + * + * Ensuring this is complicated by the way we want to only ever hold the + * locks on 2 nodes concurrently, and only change nodes in a top to bottom + * fashion. + * + * Each node may have a left or right sibling. When decending the spine, + * if a node contains only MIN_ENTRIES then we try and increase this to at + * least MIN_ENTRIES + 1. We do this in the following ways: + * + * [A] No siblings => this can only happen if the node is the root, in which + * case we copy the childs contents over the root. 
+ * + * [B] No left sibling + * ==> rebalance(node, right sibling) + * + * [C] No right sibling + * ==> rebalance(left sibling, node) + * + * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD + * ==> delete node adding it's contents to left and right + * + * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD + * ==> rebalance(left, node, right) + * + * After these operations it's possible that the our original node no + * longer contains the desired sub tree. For this reason this rebalancing + * is performed on the children of the current node. This also avoids + * having a special case for the root. + * + * Once this rebalancing has occurred we can then step into the child node + * for internal nodes. Or delete the entry for leaf nodes. + */ + +/* + * Some little utilities for moving node data around. + */ +static void node_shift(struct btree_node *n, int shift) +{ + uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); + uint32_t value_size = le32_to_cpu(n->header.value_size); + + if (shift < 0) { + shift = -shift; + BUG_ON(shift > nr_entries); + BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); + memmove(key_ptr(n, 0), + key_ptr(n, shift), + (nr_entries - shift) * sizeof(__le64)); + memmove(value_ptr(n, 0), + value_ptr(n, shift), + (nr_entries - shift) * value_size); + } else { + BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); + memmove(key_ptr(n, shift), + key_ptr(n, 0), + nr_entries * sizeof(__le64)); + memmove(value_ptr(n, shift), + value_ptr(n, 0), + nr_entries * value_size); + } +} + +static void node_copy(struct btree_node *left, struct btree_node *right, int shift) +{ + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t value_size = le32_to_cpu(left->header.value_size); + BUG_ON(value_size != le32_to_cpu(right->header.value_size)); + + if (shift < 0) { + shift = -shift; + BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries)); + memcpy(key_ptr(left, nr_left), + key_ptr(right, 0), + shift * sizeof(__le64)); + memcpy(value_ptr(left, nr_left), + value_ptr(right, 0), + shift * value_size); + } else { + BUG_ON(shift > le32_to_cpu(right->header.max_entries)); + memcpy(key_ptr(right, 0), + key_ptr(left, nr_left - shift), + shift * sizeof(__le64)); + memcpy(value_ptr(right, 0), + value_ptr(left, nr_left - shift), + shift * value_size); + } +} + +/* + * Delete a specific entry from a leaf node. 
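+ *
+ * For illustration only: deleting index 1 from a leaf holding keys
+ * [k0 k1 k2 k3] shifts the tail down to give [k0 k2 k3], the values are
+ * moved in step so keys[i] and values[i] stay paired, and nr_entries
+ * drops from 4 to 3.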
+ */ +static void delete_at(struct btree_node *n, unsigned index) +{ + unsigned nr_entries = le32_to_cpu(n->header.nr_entries); + unsigned nr_to_copy = nr_entries - (index + 1); + uint32_t value_size = le32_to_cpu(n->header.value_size); + BUG_ON(index >= nr_entries); + + if (nr_to_copy) { + memmove(key_ptr(n, index), + key_ptr(n, index + 1), + nr_to_copy * sizeof(__le64)); + + memmove(value_ptr(n, index), + value_ptr(n, index + 1), + nr_to_copy * value_size); + } + + n->header.nr_entries = cpu_to_le32(nr_entries - 1); +} + +static unsigned merge_threshold(struct btree_node *n) +{ + return le32_to_cpu(n->header.max_entries) / 3; +} + +struct child { + unsigned index; + struct dm_block *block; + struct btree_node *n; +}; + +static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, + unsigned index, struct child *result) +{ + int r, inc; + dm_block_t root; + + result->index = index; + root = value64(parent, index); + + r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, + &result->block, &inc); + if (r) + return r; + + result->n = dm_block_data(result->block); + + if (inc) + inc_children(info->tm, result->n, vt); + + *((__le64 *) value_ptr(parent, index)) = + cpu_to_le64(dm_block_location(result->block)); + + return 0; +} + +static int exit_child(struct dm_btree_info *info, struct child *c) +{ + return dm_tm_unlock(info->tm, c->block); +} + +static void shift(struct btree_node *left, struct btree_node *right, int count) +{ + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); + + BUG_ON(max_entries != r_max_entries); + BUG_ON(nr_left - count > max_entries); + BUG_ON(nr_right + count > max_entries); + + if (!count) + return; + + if (count > 0) { + node_shift(right, count); + node_copy(left, right, count); + } else { + node_copy(left, right, count); + node_shift(right, count); + } + + left->header.nr_entries = cpu_to_le32(nr_left - count); + right->header.nr_entries = cpu_to_le32(nr_right + count); +} + +static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *r) +{ + struct btree_node *left = l->n; + struct btree_node *right = r->n; + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + unsigned threshold = 2 * merge_threshold(left) + 1; + + if (nr_left + nr_right < threshold) { + /* + * Merge + */ + node_copy(left, right, -nr_right); + left->header.nr_entries = cpu_to_le32(nr_left + nr_right); + delete_at(parent, r->index); + + /* + * We need to decrement the right block, but not it's + * children, since they're still referenced by left. + */ + dm_tm_dec(info->tm, dm_block_location(r->block)); + } else { + /* + * Rebalance. 
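+	 *
+	 * For illustration only: if this branch is taken with nr_left = 40
+	 * and nr_right = 100, then target_left = 70, so shift(left, right,
+	 * -30) moves thirty entries from the front of right into left, and
+	 * the parent's key for right is refreshed afterwards.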
+ */ + unsigned target_left = (nr_left + nr_right) / 2; + shift(left, right, nr_left - target_left); + *key_ptr(parent, r->index) = right->keys[0]; + } +} + +static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, + struct dm_btree_value_type *vt, unsigned left_index) +{ + int r; + struct btree_node *parent; + struct child left, right; + + parent = dm_block_data(shadow_current(s)); + + r = init_child(info, vt, parent, left_index, &left); + if (r) + return r; + + r = init_child(info, vt, parent, left_index + 1, &right); + if (r) { + exit_child(info, &left); + return r; + } + + __rebalance2(info, parent, &left, &right); + + r = exit_child(info, &left); + if (r) { + exit_child(info, &right); + return r; + } + + return exit_child(info, &right); +} + +/* + * We dump as many entries from center as possible into left, then the rest + * in right, then rebalance2. This wastes some cpu, but I want something + * simple atm. + */ +static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *c, struct child *r, + struct btree_node *left, struct btree_node *center, struct btree_node *right, + uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) +{ + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + unsigned shift = min(max_entries - nr_left, nr_center); + + BUG_ON(nr_left + shift > max_entries); + node_copy(left, center, -shift); + left->header.nr_entries = cpu_to_le32(nr_left + shift); + + if (shift != nr_center) { + shift = nr_center - shift; + BUG_ON((nr_right + shift) > max_entries); + node_shift(right, shift); + node_copy(center, right, shift); + right->header.nr_entries = cpu_to_le32(nr_right + shift); + } + *key_ptr(parent, r->index) = right->keys[0]; + + delete_at(parent, c->index); + r->index--; + + dm_tm_dec(info->tm, dm_block_location(c->block)); + __rebalance2(info, parent, l, r); +} + +/* + * Redistributes entries among 3 sibling nodes. 
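+ *
+ * For illustration only: with 2, 9 and 4 entries in left, center and
+ * right, the target is (2 + 9 + 4) / 3 = 5, and the two shifts below
+ * leave each of the three nodes holding exactly 5 entries.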
+ */ +static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *c, struct child *r, + struct btree_node *left, struct btree_node *center, struct btree_node *right, + uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) +{ + int s; + uint32_t max_entries = le32_to_cpu(left->header.max_entries); + unsigned target = (nr_left + nr_center + nr_right) / 3; + BUG_ON(target > max_entries); + + if (nr_left < nr_right) { + s = nr_left - target; + + if (s < 0 && nr_center < -s) { + /* not enough in central node */ + shift(left, center, nr_center); + s = nr_center - target; + shift(left, right, s); + nr_right += s; + } else + shift(left, center, s); + + shift(center, right, target - nr_right); + + } else { + s = target - nr_right; + if (s > 0 && nr_center < s) { + /* not enough in central node */ + shift(center, right, nr_center); + s = target - nr_center; + shift(left, right, s); + nr_left -= s; + } else + shift(center, right, s); + + shift(left, center, nr_left - target); + } + + *key_ptr(parent, c->index) = center->keys[0]; + *key_ptr(parent, r->index) = right->keys[0]; +} + +static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent, + struct child *l, struct child *c, struct child *r) +{ + struct btree_node *left = l->n; + struct btree_node *center = c->n; + struct btree_node *right = r->n; + + uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t nr_center = le32_to_cpu(center->header.nr_entries); + uint32_t nr_right = le32_to_cpu(right->header.nr_entries); + + unsigned threshold = merge_threshold(left) * 4 + 1; + + BUG_ON(left->header.max_entries != center->header.max_entries); + BUG_ON(center->header.max_entries != right->header.max_entries); + + if ((nr_left + nr_center + nr_right) < threshold) + delete_center_node(info, parent, l, c, r, left, center, right, + nr_left, nr_center, nr_right); + else + redistribute3(info, parent, l, c, r, left, center, right, + nr_left, nr_center, nr_right); +} + +static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, + struct dm_btree_value_type *vt, unsigned left_index) +{ + int r; + struct btree_node *parent = dm_block_data(shadow_current(s)); + struct child left, center, right; + + /* + * FIXME: fill out an array? 
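+	 *
+	 * Note (illustration only): once the three children are set up,
+	 * __rebalance3() collapses them into two nodes when together they
+	 * hold fewer than 4 * merge_threshold + 1 entries (e.g. fewer than
+	 * 169 if max_entries were 126), and redistributes them otherwise.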
+ */ + r = init_child(info, vt, parent, left_index, &left); + if (r) + return r; + + r = init_child(info, vt, parent, left_index + 1, ¢er); + if (r) { + exit_child(info, &left); + return r; + } + + r = init_child(info, vt, parent, left_index + 2, &right); + if (r) { + exit_child(info, &left); + exit_child(info, ¢er); + return r; + } + + __rebalance3(info, parent, &left, ¢er, &right); + + r = exit_child(info, &left); + if (r) { + exit_child(info, ¢er); + exit_child(info, &right); + return r; + } + + r = exit_child(info, ¢er); + if (r) { + exit_child(info, &right); + return r; + } + + r = exit_child(info, &right); + if (r) + return r; + + return 0; +} + +static int get_nr_entries(struct dm_transaction_manager *tm, + dm_block_t b, uint32_t *result) +{ + int r; + struct dm_block *block; + struct btree_node *n; + + r = dm_tm_read_lock(tm, b, &btree_node_validator, &block); + if (r) + return r; + + n = dm_block_data(block); + *result = le32_to_cpu(n->header.nr_entries); + + return dm_tm_unlock(tm, block); +} + +static int rebalance_children(struct shadow_spine *s, + struct dm_btree_info *info, + struct dm_btree_value_type *vt, uint64_t key) +{ + int i, r, has_left_sibling, has_right_sibling; + uint32_t child_entries; + struct btree_node *n; + + n = dm_block_data(shadow_current(s)); + + if (le32_to_cpu(n->header.nr_entries) == 1) { + struct dm_block *child; + dm_block_t b = value64(n, 0); + + r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child); + if (r) + return r; + + memcpy(n, dm_block_data(child), + dm_bm_block_size(dm_tm_get_bm(info->tm))); + r = dm_tm_unlock(info->tm, child); + if (r) + return r; + + dm_tm_dec(info->tm, dm_block_location(child)); + return 0; + } + + i = lower_bound(n, key); + if (i < 0) + return -ENODATA; + + r = get_nr_entries(info->tm, value64(n, i), &child_entries); + if (r) + return r; + + has_left_sibling = i > 0; + has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); + + if (!has_left_sibling) + r = rebalance2(s, info, vt, i); + + else if (!has_right_sibling) + r = rebalance2(s, info, vt, i - 1); + + else + r = rebalance3(s, info, vt, i - 1); + + return r; +} + +static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index) +{ + int i = lower_bound(n, key); + + if ((i < 0) || + (i >= le32_to_cpu(n->header.nr_entries)) || + (le64_to_cpu(n->keys[i]) != key)) + return -ENODATA; + + *index = i; + + return 0; +} + +/* + * Prepares for removal from one level of the hierarchy. The caller must + * call delete_at() to remove the entry at index. + */ +static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, + struct dm_btree_value_type *vt, dm_block_t root, + uint64_t key, unsigned *index) +{ + int i = *index, r; + struct btree_node *n; + + for (;;) { + r = shadow_step(s, root, vt); + if (r < 0) + break; + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. 
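+		 *
+		 * In other words: shadowing may have copied the child to a
+		 * new block, so the parent's value slot for index i is
+		 * rewritten to point at the shadow's location before we
+		 * descend any further.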
+ */ + if (shadow_has_parent(s)) { + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + n = dm_block_data(shadow_current(s)); + + if (le32_to_cpu(n->header.flags) & LEAF_NODE) + return do_leaf(n, key, index); + + r = rebalance_children(s, info, vt, key); + if (r) + break; + + n = dm_block_data(shadow_current(s)); + if (le32_to_cpu(n->header.flags) & LEAF_NODE) + return do_leaf(n, key, index); + + i = lower_bound(n, key); + + /* + * We know the key is present, or else + * rebalance_children would have returned + * -ENODATA + */ + root = value64(n, i); + } + + return r; +} + +static struct dm_btree_value_type le64_type = { + .context = NULL, + .size = sizeof(__le64), + .inc = NULL, + .dec = NULL, + .equal = NULL +}; + +int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, dm_block_t *new_root) +{ + unsigned level, last_level = info->levels - 1; + int index = 0, r = 0; + struct shadow_spine spine; + struct btree_node *n; + + init_shadow_spine(&spine, info); + for (level = 0; level < info->levels; level++) { + r = remove_raw(&spine, info, + (level == last_level ? + &info->value_type : &le64_type), + root, keys[level], (unsigned *)&index); + if (r < 0) + break; + + n = dm_block_data(shadow_current(&spine)); + if (level != last_level) { + root = value64(n, index); + continue; + } + + BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries)); + + if (info->value_type.dec) + info->value_type.dec(info->value_type.context, + value_ptr(n, index)); + + delete_at(n, index); + } + + *new_root = shadow_root(&spine); + exit_shadow_spine(&spine); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_remove); diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c new file mode 100644 index 00000000000..cf9fd676ae4 --- /dev/null +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -0,0 +1,251 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#include "dm-btree-internal.h" +#include "dm-transaction-manager.h" + +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "btree spine" + +/*----------------------------------------------------------------*/ + +#define BTREE_CSUM_XOR 121107 + +static int node_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size); + +static void node_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct btree_node *n = dm_block_data(b); + struct node_header *h = &n->header; + + h->blocknr = cpu_to_le64(dm_block_location(b)); + h->csum = cpu_to_le32(dm_bm_checksum(&h->flags, + block_size - sizeof(__le32), + BTREE_CSUM_XOR)); + + BUG_ON(node_check(v, b, 4096)); +} + +static int node_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct btree_node *n = dm_block_data(b); + struct node_header *h = &n->header; + size_t value_size; + __le32 csum_disk; + uint32_t flags; + + if (dm_block_location(b) != le64_to_cpu(h->blocknr)) { + DMERR_LIMIT("node_check failed: blocknr %llu != wanted %llu", + le64_to_cpu(h->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags, + block_size - sizeof(__le32), + BTREE_CSUM_XOR)); + if (csum_disk != h->csum) { + DMERR_LIMIT("node_check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(h->csum)); + return -EILSEQ; + } + + value_size = le32_to_cpu(h->value_size); + + if (sizeof(struct node_header) + + (sizeof(__le64) + value_size) * le32_to_cpu(h->max_entries) > block_size) { + DMERR_LIMIT("node_check failed: max_entries too large"); + return -EILSEQ; + } + + if (le32_to_cpu(h->nr_entries) > le32_to_cpu(h->max_entries)) { + DMERR_LIMIT("node_check failed: too many entries"); + return -EILSEQ; + } + + /* + * The node must be either INTERNAL or LEAF. 
+ */ + flags = le32_to_cpu(h->flags); + if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) { + DMERR_LIMIT("node_check failed: node is neither INTERNAL or LEAF"); + return -EILSEQ; + } + + return 0; +} + +struct dm_block_validator btree_node_validator = { + .name = "btree_node", + .prepare_for_write = node_prepare_for_write, + .check = node_check +}; + +/*----------------------------------------------------------------*/ + +static int bn_read_lock(struct dm_btree_info *info, dm_block_t b, + struct dm_block **result) +{ + return dm_tm_read_lock(info->tm, b, &btree_node_validator, result); +} + +static int bn_shadow(struct dm_btree_info *info, dm_block_t orig, + struct dm_btree_value_type *vt, + struct dm_block **result) +{ + int r, inc; + + r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator, + result, &inc); + if (!r && inc) + inc_children(info->tm, dm_block_data(*result), vt); + + return r; +} + +int new_block(struct dm_btree_info *info, struct dm_block **result) +{ + return dm_tm_new_block(info->tm, &btree_node_validator, result); +} + +int unlock_block(struct dm_btree_info *info, struct dm_block *b) +{ + return dm_tm_unlock(info->tm, b); +} + +/*----------------------------------------------------------------*/ + +void init_ro_spine(struct ro_spine *s, struct dm_btree_info *info) +{ + s->info = info; + s->count = 0; + s->nodes[0] = NULL; + s->nodes[1] = NULL; +} + +int exit_ro_spine(struct ro_spine *s) +{ + int r = 0, i; + + for (i = 0; i < s->count; i++) { + int r2 = unlock_block(s->info, s->nodes[i]); + if (r2 < 0) + r = r2; + } + + return r; +} + +int ro_step(struct ro_spine *s, dm_block_t new_child) +{ + int r; + + if (s->count == 2) { + r = unlock_block(s->info, s->nodes[0]); + if (r < 0) + return r; + s->nodes[0] = s->nodes[1]; + s->count--; + } + + r = bn_read_lock(s->info, new_child, s->nodes + s->count); + if (!r) + s->count++; + + return r; +} + +void ro_pop(struct ro_spine *s) +{ + BUG_ON(!s->count); + --s->count; + unlock_block(s->info, s->nodes[s->count]); +} + +struct btree_node *ro_node(struct ro_spine *s) +{ + struct dm_block *block; + + BUG_ON(!s->count); + block = s->nodes[s->count - 1]; + + return dm_block_data(block); +} + +/*----------------------------------------------------------------*/ + +void init_shadow_spine(struct shadow_spine *s, struct dm_btree_info *info) +{ + s->info = info; + s->count = 0; +} + +int exit_shadow_spine(struct shadow_spine *s) +{ + int r = 0, i; + + for (i = 0; i < s->count; i++) { + int r2 = unlock_block(s->info, s->nodes[i]); + if (r2 < 0) + r = r2; + } + + return r; +} + +int shadow_step(struct shadow_spine *s, dm_block_t b, + struct dm_btree_value_type *vt) +{ + int r; + + if (s->count == 2) { + r = unlock_block(s->info, s->nodes[0]); + if (r < 0) + return r; + s->nodes[0] = s->nodes[1]; + s->count--; + } + + r = bn_shadow(s->info, b, vt, s->nodes + s->count); + if (!r) { + if (!s->count) + s->root = dm_block_location(s->nodes[0]); + + s->count++; + } + + return r; +} + +struct dm_block *shadow_current(struct shadow_spine *s) +{ + BUG_ON(!s->count); + + return s->nodes[s->count - 1]; +} + +struct dm_block *shadow_parent(struct shadow_spine *s) +{ + BUG_ON(s->count != 2); + + return s->count == 2 ? 
s->nodes[0] : NULL; +} + +int shadow_has_parent(struct shadow_spine *s) +{ + return s->count >= 2; +} + +int shadow_root(struct shadow_spine *s) +{ + return s->root; +} diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c new file mode 100644 index 00000000000..416060c2570 --- /dev/null +++ b/drivers/md/persistent-data/dm-btree.c @@ -0,0 +1,896 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-btree-internal.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "btree" + +/*---------------------------------------------------------------- + * Array manipulation + *--------------------------------------------------------------*/ +static void memcpy_disk(void *dest, const void *src, size_t len) + __dm_written_to_disk(src) +{ + memcpy(dest, src, len); + __dm_unbless_for_disk(src); +} + +static void array_insert(void *base, size_t elt_size, unsigned nr_elts, + unsigned index, void *elt) + __dm_written_to_disk(elt) +{ + if (index < nr_elts) + memmove(base + (elt_size * (index + 1)), + base + (elt_size * index), + (nr_elts - index) * elt_size); + + memcpy_disk(base + (elt_size * index), elt, elt_size); +} + +/*----------------------------------------------------------------*/ + +/* makes the assumption that no two keys are the same. */ +static int bsearch(struct btree_node *n, uint64_t key, int want_hi) +{ + int lo = -1, hi = le32_to_cpu(n->header.nr_entries); + + while (hi - lo > 1) { + int mid = lo + ((hi - lo) / 2); + uint64_t mid_key = le64_to_cpu(n->keys[mid]); + + if (mid_key == key) + return mid; + + if (mid_key < key) + lo = mid; + else + hi = mid; + } + + return want_hi ? hi : lo; +} + +int lower_bound(struct btree_node *n, uint64_t key) +{ + return bsearch(n, key, 0); +} + +void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, + struct dm_btree_value_type *vt) +{ + unsigned i; + uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); + + if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) + for (i = 0; i < nr_entries; i++) + dm_tm_inc(tm, value64(n, i)); + else if (vt->inc) + for (i = 0; i < nr_entries; i++) + vt->inc(vt->context, value_ptr(n, i)); +} + +static int insert_at(size_t value_size, struct btree_node *node, unsigned index, + uint64_t key, void *value) + __dm_written_to_disk(value) +{ + uint32_t nr_entries = le32_to_cpu(node->header.nr_entries); + __le64 key_le = cpu_to_le64(key); + + if (index > nr_entries || + index >= le32_to_cpu(node->header.max_entries)) { + DMERR("too many entries in btree node for insert"); + __dm_unbless_for_disk(value); + return -ENOMEM; + } + + __dm_bless_for_disk(&key_le); + + array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le); + array_insert(value_base(node), value_size, nr_entries, index, value); + node->header.nr_entries = cpu_to_le32(nr_entries + 1); + + return 0; +} + +/*----------------------------------------------------------------*/ + +/* + * We want 3n entries (for some n). This works more nicely for repeated + * insert remove loops than (2n + 1). 
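+ *
+ * For illustration only: with a 4096 byte block, a 32 byte node header
+ * and 8 byte values, (4096 - 32) / (8 + 8) = 254 entries would fit,
+ * which calc_max_entries() rounds down to 3 * 84 = 252.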
+ */ +static uint32_t calc_max_entries(size_t value_size, size_t block_size) +{ + uint32_t total, n; + size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */ + + block_size -= sizeof(struct node_header); + total = block_size / elt_size; + n = total / 3; /* rounds down */ + + return 3 * n; +} + +int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root) +{ + int r; + struct dm_block *b; + struct btree_node *n; + size_t block_size; + uint32_t max_entries; + + r = new_block(info, &b); + if (r < 0) + return r; + + block_size = dm_bm_block_size(dm_tm_get_bm(info->tm)); + max_entries = calc_max_entries(info->value_type.size, block_size); + + n = dm_block_data(b); + memset(n, 0, block_size); + n->header.flags = cpu_to_le32(LEAF_NODE); + n->header.nr_entries = cpu_to_le32(0); + n->header.max_entries = cpu_to_le32(max_entries); + n->header.value_size = cpu_to_le32(info->value_type.size); + + *root = dm_block_location(b); + return unlock_block(info, b); +} +EXPORT_SYMBOL_GPL(dm_btree_empty); + +/*----------------------------------------------------------------*/ + +/* + * Deletion uses a recursive algorithm, since we have limited stack space + * we explicitly manage our own stack on the heap. + */ +#define MAX_SPINE_DEPTH 64 +struct frame { + struct dm_block *b; + struct btree_node *n; + unsigned level; + unsigned nr_children; + unsigned current_child; +}; + +struct del_stack { + struct dm_btree_info *info; + struct dm_transaction_manager *tm; + int top; + struct frame spine[MAX_SPINE_DEPTH]; +}; + +static int top_frame(struct del_stack *s, struct frame **f) +{ + if (s->top < 0) { + DMERR("btree deletion stack empty"); + return -EINVAL; + } + + *f = s->spine + s->top; + + return 0; +} + +static int unprocessed_frames(struct del_stack *s) +{ + return s->top >= 0; +} + +static void prefetch_children(struct del_stack *s, struct frame *f) +{ + unsigned i; + struct dm_block_manager *bm = dm_tm_get_bm(s->tm); + + for (i = 0; i < f->nr_children; i++) + dm_bm_prefetch(bm, value64(f->n, i)); +} + +static bool is_internal_level(struct dm_btree_info *info, struct frame *f) +{ + return f->level < (info->levels - 1); +} + +static int push_frame(struct del_stack *s, dm_block_t b, unsigned level) +{ + int r; + uint32_t ref_count; + + if (s->top >= MAX_SPINE_DEPTH - 1) { + DMERR("btree deletion stack out of memory"); + return -ENOMEM; + } + + r = dm_tm_ref(s->tm, b, &ref_count); + if (r) + return r; + + if (ref_count > 1) + /* + * This is a shared node, so we can just decrement it's + * reference counter and leave the children. 
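+		 *
+		 * The children stay referenced through the remaining
+		 * owners of this node, so recursing into them here would
+		 * drop counts those owners still rely on.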
+ */ + dm_tm_dec(s->tm, b); + + else { + uint32_t flags; + struct frame *f = s->spine + ++s->top; + + r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b); + if (r) { + s->top--; + return r; + } + + f->n = dm_block_data(f->b); + f->level = level; + f->nr_children = le32_to_cpu(f->n->header.nr_entries); + f->current_child = 0; + + flags = le32_to_cpu(f->n->header.flags); + if (flags & INTERNAL_NODE || is_internal_level(s->info, f)) + prefetch_children(s, f); + } + + return 0; +} + +static void pop_frame(struct del_stack *s) +{ + struct frame *f = s->spine + s->top--; + + dm_tm_dec(s->tm, dm_block_location(f->b)); + dm_tm_unlock(s->tm, f->b); +} + +int dm_btree_del(struct dm_btree_info *info, dm_block_t root) +{ + int r; + struct del_stack *s; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + s->info = info; + s->tm = info->tm; + s->top = -1; + + r = push_frame(s, root, 0); + if (r) + goto out; + + while (unprocessed_frames(s)) { + uint32_t flags; + struct frame *f; + dm_block_t b; + + r = top_frame(s, &f); + if (r) + goto out; + + if (f->current_child >= f->nr_children) { + pop_frame(s); + continue; + } + + flags = le32_to_cpu(f->n->header.flags); + if (flags & INTERNAL_NODE) { + b = value64(f->n, f->current_child); + f->current_child++; + r = push_frame(s, b, f->level); + if (r) + goto out; + + } else if (is_internal_level(info, f)) { + b = value64(f->n, f->current_child); + f->current_child++; + r = push_frame(s, b, f->level + 1); + if (r) + goto out; + + } else { + if (info->value_type.dec) { + unsigned i; + + for (i = 0; i < f->nr_children; i++) + info->value_type.dec(info->value_type.context, + value_ptr(f->n, i)); + } + pop_frame(s); + } + } + +out: + kfree(s); + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_del); + +/*----------------------------------------------------------------*/ + +static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, + int (*search_fn)(struct btree_node *, uint64_t), + uint64_t *result_key, void *v, size_t value_size) +{ + int i, r; + uint32_t flags, nr_entries; + + do { + r = ro_step(s, block); + if (r < 0) + return r; + + i = search_fn(ro_node(s), key); + + flags = le32_to_cpu(ro_node(s)->header.flags); + nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries); + if (i < 0 || i >= nr_entries) + return -ENODATA; + + if (flags & INTERNAL_NODE) + block = value64(ro_node(s), i); + + } while (!(flags & LEAF_NODE)); + + *result_key = le64_to_cpu(ro_node(s)->keys[i]); + memcpy(v, value_ptr(ro_node(s), i), value_size); + + return 0; +} + +int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value_le) +{ + unsigned level, last_level = info->levels - 1; + int r = -ENODATA; + uint64_t rkey; + __le64 internal_value_le; + struct ro_spine spine; + + init_ro_spine(&spine, info); + for (level = 0; level < info->levels; level++) { + size_t size; + void *value_p; + + if (level == last_level) { + value_p = value_le; + size = info->value_type.size; + + } else { + value_p = &internal_value_le; + size = sizeof(uint64_t); + } + + r = btree_lookup_raw(&spine, root, keys[level], + lower_bound, &rkey, + value_p, size); + + if (!r) { + if (rkey != keys[level]) { + exit_ro_spine(&spine); + return -ENODATA; + } + } else { + exit_ro_spine(&spine); + return r; + } + + root = le64_to_cpu(internal_value_le); + } + exit_ro_spine(&spine); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_lookup); + +/* + * Splits a node by creating a sibling node and shifting half the nodes + * contents across. 
Assumes there is a parent node, and it has room for + * another child. + * + * Before: + * +--------+ + * | Parent | + * +--------+ + * | + * v + * +----------+ + * | A ++++++ | + * +----------+ + * + * + * After: + * +--------+ + * | Parent | + * +--------+ + * | | + * v +------+ + * +---------+ | + * | A* +++ | v + * +---------+ +-------+ + * | B +++ | + * +-------+ + * + * Where A* is a shadow of A. + */ +static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, + unsigned parent_index, uint64_t key) +{ + int r; + size_t size; + unsigned nr_left, nr_right; + struct dm_block *left, *right, *parent; + struct btree_node *ln, *rn, *pn; + __le64 location; + + left = shadow_current(s); + + r = new_block(s->info, &right); + if (r < 0) + return r; + + ln = dm_block_data(left); + rn = dm_block_data(right); + + nr_left = le32_to_cpu(ln->header.nr_entries) / 2; + nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left; + + ln->header.nr_entries = cpu_to_le32(nr_left); + + rn->header.flags = ln->header.flags; + rn->header.nr_entries = cpu_to_le32(nr_right); + rn->header.max_entries = ln->header.max_entries; + rn->header.value_size = ln->header.value_size; + memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0])); + + size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? + sizeof(uint64_t) : s->info->value_type.size; + memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), + size * nr_right); + + /* + * Patch up the parent + */ + parent = shadow_parent(s); + + pn = dm_block_data(parent); + location = cpu_to_le64(dm_block_location(left)); + __dm_bless_for_disk(&location); + memcpy_disk(value_ptr(pn, parent_index), + &location, sizeof(__le64)); + + location = cpu_to_le64(dm_block_location(right)); + __dm_bless_for_disk(&location); + + r = insert_at(sizeof(__le64), pn, parent_index + 1, + le64_to_cpu(rn->keys[0]), &location); + if (r) + return r; + + if (key < le64_to_cpu(rn->keys[0])) { + unlock_block(s->info, right); + s->nodes[1] = left; + } else { + unlock_block(s->info, left); + s->nodes[1] = right; + } + + return 0; +} + +/* + * Splits a node by creating two new children beneath the given node. 
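+ * This is used for the node at the top of the spine, which has no
+ * parent able to take on a new sibling, so the extra level is added
+ * beneath it instead (see the diagram below).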
+ * + * Before: + * +----------+ + * | A ++++++ | + * +----------+ + * + * + * After: + * +------------+ + * | A (shadow) | + * +------------+ + * | | + * +------+ +----+ + * | | + * v v + * +-------+ +-------+ + * | B +++ | | C +++ | + * +-------+ +-------+ + */ +static int btree_split_beneath(struct shadow_spine *s, uint64_t key) +{ + int r; + size_t size; + unsigned nr_left, nr_right; + struct dm_block *left, *right, *new_parent; + struct btree_node *pn, *ln, *rn; + __le64 val; + + new_parent = shadow_current(s); + + r = new_block(s->info, &left); + if (r < 0) + return r; + + r = new_block(s->info, &right); + if (r < 0) { + /* FIXME: put left */ + return r; + } + + pn = dm_block_data(new_parent); + ln = dm_block_data(left); + rn = dm_block_data(right); + + nr_left = le32_to_cpu(pn->header.nr_entries) / 2; + nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left; + + ln->header.flags = pn->header.flags; + ln->header.nr_entries = cpu_to_le32(nr_left); + ln->header.max_entries = pn->header.max_entries; + ln->header.value_size = pn->header.value_size; + + rn->header.flags = pn->header.flags; + rn->header.nr_entries = cpu_to_le32(nr_right); + rn->header.max_entries = pn->header.max_entries; + rn->header.value_size = pn->header.value_size; + + memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0])); + memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0])); + + size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? + sizeof(__le64) : s->info->value_type.size; + memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); + memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), + nr_right * size); + + /* new_parent should just point to l and r now */ + pn->header.flags = cpu_to_le32(INTERNAL_NODE); + pn->header.nr_entries = cpu_to_le32(2); + pn->header.max_entries = cpu_to_le32( + calc_max_entries(sizeof(__le64), + dm_bm_block_size( + dm_tm_get_bm(s->info->tm)))); + pn->header.value_size = cpu_to_le32(sizeof(__le64)); + + val = cpu_to_le64(dm_block_location(left)); + __dm_bless_for_disk(&val); + pn->keys[0] = ln->keys[0]; + memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64)); + + val = cpu_to_le64(dm_block_location(right)); + __dm_bless_for_disk(&val); + pn->keys[1] = rn->keys[0]; + memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); + + /* + * rejig the spine. This is ugly, since it knows too + * much about the spine + */ + if (s->nodes[0] != new_parent) { + unlock_block(s->info, s->nodes[0]); + s->nodes[0] = new_parent; + } + if (key < le64_to_cpu(rn->keys[0])) { + unlock_block(s->info, right); + s->nodes[1] = left; + } else { + unlock_block(s->info, left); + s->nodes[1] = right; + } + s->count = 2; + + return 0; +} + +static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, + struct dm_btree_value_type *vt, + uint64_t key, unsigned *index) +{ + int r, i = *index, top = 1; + struct btree_node *node; + + for (;;) { + r = shadow_step(s, root, vt); + if (r < 0) + return r; + + node = dm_block_data(shadow_current(s)); + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. + */ + if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. 
*/ + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + + __dm_bless_for_disk(&location); + memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + node = dm_block_data(shadow_current(s)); + + if (node->header.nr_entries == node->header.max_entries) { + if (top) + r = btree_split_beneath(s, key); + else + r = btree_split_sibling(s, root, i, key); + + if (r < 0) + return r; + } + + node = dm_block_data(shadow_current(s)); + + i = lower_bound(node, key); + + if (le32_to_cpu(node->header.flags) & LEAF_NODE) + break; + + if (i < 0) { + /* change the bounds on the lowest key */ + node->keys[0] = cpu_to_le64(key); + i = 0; + } + + root = value64(node, i); + top = 0; + } + + if (i < 0 || le64_to_cpu(node->keys[i]) != key) + i++; + + *index = i; + return 0; +} + +static int insert(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root, + int *inserted) + __dm_written_to_disk(value) +{ + int r, need_insert; + unsigned level, index = -1, last_level = info->levels - 1; + dm_block_t block = root; + struct shadow_spine spine; + struct btree_node *n; + struct dm_btree_value_type le64_type; + + le64_type.context = NULL; + le64_type.size = sizeof(__le64); + le64_type.inc = NULL; + le64_type.dec = NULL; + le64_type.equal = NULL; + + init_shadow_spine(&spine, info); + + for (level = 0; level < (info->levels - 1); level++) { + r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index); + if (r < 0) + goto bad; + + n = dm_block_data(shadow_current(&spine)); + need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) || + (le64_to_cpu(n->keys[index]) != keys[level])); + + if (need_insert) { + dm_block_t new_tree; + __le64 new_le; + + r = dm_btree_empty(info, &new_tree); + if (r < 0) + goto bad; + + new_le = cpu_to_le64(new_tree); + __dm_bless_for_disk(&new_le); + + r = insert_at(sizeof(uint64_t), n, index, + keys[level], &new_le); + if (r) + goto bad; + } + + if (level < last_level) + block = value64(n, index); + } + + r = btree_insert_raw(&spine, block, &info->value_type, + keys[level], &index); + if (r < 0) + goto bad; + + n = dm_block_data(shadow_current(&spine)); + need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) || + (le64_to_cpu(n->keys[index]) != keys[level])); + + if (need_insert) { + if (inserted) + *inserted = 1; + + r = insert_at(info->value_type.size, n, index, + keys[level], value); + if (r) + goto bad_unblessed; + } else { + if (inserted) + *inserted = 0; + + if (info->value_type.dec && + (!info->value_type.equal || + !info->value_type.equal( + info->value_type.context, + value_ptr(n, index), + value))) { + info->value_type.dec(info->value_type.context, + value_ptr(n, index)); + } + memcpy_disk(value_ptr(n, index), + value, info->value_type.size); + } + + *new_root = shadow_root(&spine); + exit_shadow_spine(&spine); + + return 0; + +bad: + __dm_unbless_for_disk(value); +bad_unblessed: + exit_shadow_spine(&spine); + return r; +} + +int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root) + __dm_written_to_disk(value) +{ + return insert(info, root, keys, value, new_root, NULL); +} +EXPORT_SYMBOL_GPL(dm_btree_insert); + +int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root, + int *inserted) + __dm_written_to_disk(value) +{ + return insert(info, root, keys, value, new_root, inserted); +} +EXPORT_SYMBOL_GPL(dm_btree_insert_notify); + 
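+/*
+ * Purely illustrative sketch (not part of this patch): a caller keeping
+ * a one-level tree of __le64 values, error handling elided, looks
+ * roughly like this.  The names tm, root, key and value are stand-ins
+ * for the caller's own state.
+ *
+ *	struct dm_btree_info info = {
+ *		.tm = tm,
+ *		.levels = 1,
+ *		.value_type = {
+ *			.size = sizeof(__le64),
+ *		},
+ *	};
+ *	dm_block_t root;
+ *	uint64_t key = 42;
+ *	__le64 value = cpu_to_le64(123), out;
+ *
+ *	dm_btree_empty(&info, &root);
+ *	__dm_bless_for_disk(&value);
+ *	dm_btree_insert(&info, root, &key, &value, &root);
+ *	dm_btree_lookup(&info, root, &key, &out);
+ */
+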
+/*----------------------------------------------------------------*/ + +static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest, + uint64_t *result_key, dm_block_t *next_block) +{ + int i, r; + uint32_t flags; + + do { + r = ro_step(s, block); + if (r < 0) + return r; + + flags = le32_to_cpu(ro_node(s)->header.flags); + i = le32_to_cpu(ro_node(s)->header.nr_entries); + if (!i) + return -ENODATA; + else + i--; + + if (find_highest) + *result_key = le64_to_cpu(ro_node(s)->keys[i]); + else + *result_key = le64_to_cpu(ro_node(s)->keys[0]); + + if (next_block || flags & INTERNAL_NODE) + block = value64(ro_node(s), i); + + } while (flags & INTERNAL_NODE); + + if (next_block) + *next_block = block; + return 0; +} + +static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root, + bool find_highest, uint64_t *result_keys) +{ + int r = 0, count = 0, level; + struct ro_spine spine; + + init_ro_spine(&spine, info); + for (level = 0; level < info->levels; level++) { + r = find_key(&spine, root, find_highest, result_keys + level, + level == info->levels - 1 ? NULL : &root); + if (r == -ENODATA) { + r = 0; + break; + + } else if (r) + break; + + count++; + } + exit_ro_spine(&spine); + + return r ? r : count; +} + +int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + return dm_btree_find_key(info, root, true, result_keys); +} +EXPORT_SYMBOL_GPL(dm_btree_find_highest_key); + +int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys) +{ + return dm_btree_find_key(info, root, false, result_keys); +} +EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key); + +/*----------------------------------------------------------------*/ + +/* + * FIXME: We shouldn't use a recursive algorithm when we have limited stack + * space. Also this only works for single level trees. + */ +static int walk_node(struct ro_spine *s, dm_block_t block, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context) +{ + int r; + unsigned i, nr; + struct btree_node *n; + uint64_t keys; + + r = ro_step(s, block); + n = ro_node(s); + + nr = le32_to_cpu(n->header.nr_entries); + for (i = 0; i < nr; i++) { + if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) { + r = walk_node(s, value64(n, i), fn, context); + if (r) + goto out; + } else { + keys = le64_to_cpu(*key_ptr(n, i)); + r = fn(context, &keys, value_ptr(n, i)); + if (r) + goto out; + } + } + +out: + ro_pop(s); + return r; +} + +int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context) +{ + int r; + struct ro_spine spine; + + BUG_ON(info->levels > 1); + + init_ro_spine(&spine, info); + r = walk_node(&spine, root, fn, context); + exit_ro_spine(&spine); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_walk); diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h new file mode 100644 index 00000000000..dacfc34180b --- /dev/null +++ b/drivers/md/persistent-data/dm-btree.h @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#ifndef _LINUX_DM_BTREE_H +#define _LINUX_DM_BTREE_H + +#include "dm-block-manager.h" + +struct dm_transaction_manager; + +/*----------------------------------------------------------------*/ + +/* + * Annotations used to check on-disk metadata is handled as little-endian. 
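+ *
+ * Roughly speaking, under sparse ("make C=1") these expand to the
+ * context-tracking annotations, so a value that is blessed for disk but
+ * never handed to a __dm_written_to_disk() function (or explicitly
+ * unblessed) should trigger a context imbalance warning.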
+ */ +#ifdef __CHECKER__ +# define __dm_written_to_disk(x) __releases(x) +# define __dm_reads_from_disk(x) __acquires(x) +# define __dm_bless_for_disk(x) __acquire(x) +# define __dm_unbless_for_disk(x) __release(x) +#else +# define __dm_written_to_disk(x) +# define __dm_reads_from_disk(x) +# define __dm_bless_for_disk(x) +# define __dm_unbless_for_disk(x) +#endif + +/*----------------------------------------------------------------*/ + +/* + * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized + * values. + */ + +/* + * Information about the values stored within the btree. + */ +struct dm_btree_value_type { + void *context; + + /* + * The size in bytes of each value. + */ + uint32_t size; + + /* + * Any of these methods can be safely set to NULL if you do not + * need the corresponding feature. + */ + + /* + * The btree is making a duplicate of the value, for instance + * because previously-shared btree nodes have now diverged. + * @value argument is the new copy that the copy function may modify. + * (Probably it just wants to increment a reference count + * somewhere.) This method is _not_ called for insertion of a new + * value: It is assumed the ref count is already 1. + */ + void (*inc)(void *context, const void *value); + + /* + * This value is being deleted. The btree takes care of freeing + * the memory pointed to by @value. Often the del function just + * needs to decrement a reference count somewhere. + */ + void (*dec)(void *context, const void *value); + + /* + * A test for equality between two values. When a value is + * overwritten with a new one, the old one has the dec method + * called _unless_ the new and old value are deemed equal. + */ + int (*equal)(void *context, const void *value1, const void *value2); +}; + +/* + * The shape and contents of a btree. + */ +struct dm_btree_info { + struct dm_transaction_manager *tm; + + /* + * Number of nested btrees. (Not the depth of a single tree.) + */ + unsigned levels; + struct dm_btree_value_type value_type; +}; + +/* + * Set up an empty tree. O(1). + */ +int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root); + +/* + * Delete a tree. O(n) - this is the slow one! It can also block, so + * please don't call it on an IO path. + */ +int dm_btree_del(struct dm_btree_info *info, dm_block_t root); + +/* + * All the lookup functions return -ENODATA if the key cannot be found. + */ + +/* + * Tries to find a key that matches exactly. O(ln(n)) + */ +int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value_le); + +/* + * Insertion (or overwrite an existing value). O(ln(n)) + */ +int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root) + __dm_written_to_disk(value); + +/* + * A variant of insert that indicates whether it actually inserted or just + * overwrote. Useful if you're keeping track of the number of entries in a + * tree. + */ +int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, void *value, dm_block_t *new_root, + int *inserted) + __dm_written_to_disk(value); + +/* + * Remove a key if present. This doesn't remove empty sub trees. Normally + * subtrees represent a separate entity, like a snapshot map, so this is + * correct behaviour. O(ln(n)). + */ +int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, + uint64_t *keys, dm_block_t *new_root); + +/* + * Returns < 0 on failure. Otherwise the number of key entries that have + * been filled out. 
Remember trees can have zero entries, and as such have + * no lowest key. + */ +int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys); + +/* + * Returns < 0 on failure. Otherwise the number of key entries that have + * been filled out. Remember trees can have zero entries, and as such have + * no highest key. + */ +int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, + uint64_t *result_keys); + +/* + * Iterate through the a btree, calling fn() on each entry. + * It only works for single level trees and is internally recursive, so + * monitor stack usage carefully. + */ +int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, + int (*fn)(void *context, uint64_t *keys, void *leaf), + void *context); + +#endif /* _LINUX_DM_BTREE_H */ diff --git a/drivers/md/persistent-data/dm-persistent-data-internal.h b/drivers/md/persistent-data/dm-persistent-data-internal.h new file mode 100644 index 00000000000..c49e26fff36 --- /dev/null +++ b/drivers/md/persistent-data/dm-persistent-data-internal.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _DM_PERSISTENT_DATA_INTERNAL_H +#define _DM_PERSISTENT_DATA_INTERNAL_H + +#include "dm-block-manager.h" + +static inline unsigned dm_hash_block(dm_block_t b, unsigned hash_mask) +{ + const unsigned BIG_PRIME = 4294967291UL; + + return (((unsigned) b) * BIG_PRIME) & hash_mask; +} + +#endif /* _PERSISTENT_DATA_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c new file mode 100644 index 00000000000..aacbe70c2c2 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -0,0 +1,751 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map-common.h" +#include "dm-transaction-manager.h" + +#include <linux/bitops.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "space map common" + +/*----------------------------------------------------------------*/ + +/* + * Index validator. 
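+ *
+ * As with the other validators in this patch: the stored blocknr must
+ * match where the block actually lives, and the checksum covers
+ * everything after the csum field itself, seeded with INDEX_CSUM_XOR.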
+ */ +#define INDEX_CSUM_XOR 160478 + +static void index_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_metadata_index *mi_le = dm_block_data(b); + + mi_le->blocknr = cpu_to_le64(dm_block_location(b)); + mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding, + block_size - sizeof(__le32), + INDEX_CSUM_XOR)); +} + +static int index_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_metadata_index *mi_le = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) { + DMERR_LIMIT("index_check failed: blocknr %llu != wanted %llu", + le64_to_cpu(mi_le->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding, + block_size - sizeof(__le32), + INDEX_CSUM_XOR)); + if (csum_disk != mi_le->csum) { + DMERR_LIMIT("index_check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator index_validator = { + .name = "index", + .prepare_for_write = index_prepare_for_write, + .check = index_check +}; + +/*----------------------------------------------------------------*/ + +/* + * Bitmap validator + */ +#define BITMAP_CSUM_XOR 240779 + +static void bitmap_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_bitmap_header *disk_header = dm_block_data(b); + + disk_header->blocknr = cpu_to_le64(dm_block_location(b)); + disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used, + block_size - sizeof(__le32), + BITMAP_CSUM_XOR)); +} + +static int bitmap_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_bitmap_header *disk_header = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) { + DMERR_LIMIT("bitmap check failed: blocknr %llu != wanted %llu", + le64_to_cpu(disk_header->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used, + block_size - sizeof(__le32), + BITMAP_CSUM_XOR)); + if (csum_disk != disk_header->csum) { + DMERR_LIMIT("bitmap check failed: csum %u != wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator dm_sm_bitmap_validator = { + .name = "sm_bitmap", + .prepare_for_write = bitmap_prepare_for_write, + .check = bitmap_check +}; + +/*----------------------------------------------------------------*/ + +#define ENTRIES_PER_WORD 32 +#define ENTRIES_SHIFT 5 + +static void *dm_bitmap_data(struct dm_block *b) +{ + return dm_block_data(b) + sizeof(struct disk_bitmap_header); +} + +#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL + +static unsigned bitmap_word_used(void *addr, unsigned b) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + + uint64_t bits = le64_to_cpu(*w_le); + uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH; + + return !(~bits & mask); +} + +static unsigned sm_lookup_bitmap(void *addr, unsigned b) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + unsigned hi, lo; + + b = (b & (ENTRIES_PER_WORD - 1)) << 1; + hi = !!test_bit_le(b, (void *) w_le); + lo = !!test_bit_le(b + 1, (void *) w_le); + return (hi << 1) | lo; +} + +static void sm_set_bitmap(void *addr, unsigned b, unsigned val) +{ + __le64 
*words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + + b = (b & (ENTRIES_PER_WORD - 1)) << 1; + + if (val & 2) + __set_bit_le(b, (void *) w_le); + else + __clear_bit_le(b, (void *) w_le); + + if (val & 1) + __set_bit_le(b + 1, (void *) w_le); + else + __clear_bit_le(b + 1, (void *) w_le); +} + +static int sm_find_free(void *addr, unsigned begin, unsigned end, + unsigned *result) +{ + while (begin < end) { + if (!(begin & (ENTRIES_PER_WORD - 1)) && + bitmap_word_used(addr, begin)) { + begin += ENTRIES_PER_WORD; + continue; + } + + if (!sm_lookup_bitmap(addr, begin)) { + *result = begin; + return 0; + } + + begin++; + } + + return -ENOSPC; +} + +/*----------------------------------------------------------------*/ + +static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + ll->tm = tm; + + ll->bitmap_info.tm = tm; + ll->bitmap_info.levels = 1; + + /* + * Because the new bitmap blocks are created via a shadow + * operation, the old entry has already had its reference count + * decremented and we don't need the btree to do any bookkeeping. + */ + ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry); + ll->bitmap_info.value_type.inc = NULL; + ll->bitmap_info.value_type.dec = NULL; + ll->bitmap_info.value_type.equal = NULL; + + ll->ref_count_info.tm = tm; + ll->ref_count_info.levels = 1; + ll->ref_count_info.value_type.size = sizeof(uint32_t); + ll->ref_count_info.value_type.inc = NULL; + ll->ref_count_info.value_type.dec = NULL; + ll->ref_count_info.value_type.equal = NULL; + + ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm)); + + if (ll->block_size > (1 << 30)) { + DMERR("block size too big to hold bitmaps"); + return -EINVAL; + } + + ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) * + ENTRIES_PER_BYTE; + ll->nr_blocks = 0; + ll->bitmap_root = 0; + ll->ref_count_root = 0; + ll->bitmap_index_changed = false; + + return 0; +} + +int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) +{ + int r; + dm_block_t i, nr_blocks, nr_indexes; + unsigned old_blocks, blocks; + + nr_blocks = ll->nr_blocks + extra_blocks; + old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block); + blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block); + + nr_indexes = dm_sector_div_up(nr_blocks, ll->entries_per_block); + if (nr_indexes > ll->max_entries(ll)) { + DMERR("space map too large"); + return -EINVAL; + } + + /* + * We need to set this before the dm_tm_new_block() call below. 
+ */ + ll->nr_blocks = nr_blocks; + for (i = old_blocks; i < blocks; i++) { + struct dm_block *b; + struct disk_index_entry idx; + + r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); + if (r < 0) + return r; + + idx.blocknr = cpu_to_le64(dm_block_location(b)); + + r = dm_tm_unlock(ll->tm, b); + if (r < 0) + return r; + + idx.nr_free = cpu_to_le32(ll->entries_per_block); + idx.none_free_before = 0; + + r = ll->save_ie(ll, i, &idx); + if (r < 0) + return r; + } + + return 0; +} + +int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) +{ + int r; + dm_block_t index = b; + struct disk_index_entry ie_disk; + struct dm_block *blk; + + b = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ie_disk); + if (r < 0) + return r; + + r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &blk); + if (r < 0) + return r; + + *result = sm_lookup_bitmap(dm_bitmap_data(blk), b); + + return dm_tm_unlock(ll->tm, blk); +} + +static int sm_ll_lookup_big_ref_count(struct ll_disk *ll, dm_block_t b, + uint32_t *result) +{ + __le32 le_rc; + int r; + + r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc); + if (r < 0) + return r; + + *result = le32_to_cpu(le_rc); + + return r; +} + +int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) +{ + int r = sm_ll_lookup_bitmap(ll, b, result); + + if (r) + return r; + + if (*result != 3) + return r; + + return sm_ll_lookup_big_ref_count(ll, b, result); +} + +int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, + dm_block_t end, dm_block_t *result) +{ + int r; + struct disk_index_entry ie_disk; + dm_block_t i, index_begin = begin; + dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block); + + /* + * FIXME: Use shifts + */ + begin = do_div(index_begin, ll->entries_per_block); + end = do_div(end, ll->entries_per_block); + + for (i = index_begin; i < index_end; i++, begin = 0) { + struct dm_block *blk; + unsigned position; + uint32_t bit_end; + + r = ll->load_ie(ll, i, &ie_disk); + if (r < 0) + return r; + + if (le32_to_cpu(ie_disk.nr_free) == 0) + continue; + + r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &blk); + if (r < 0) + return r; + + bit_end = (i == index_end - 1) ? end : ll->entries_per_block; + + r = sm_find_free(dm_bitmap_data(blk), + max_t(unsigned, begin, le32_to_cpu(ie_disk.none_free_before)), + bit_end, &position); + if (r == -ENOSPC) { + /* + * This might happen because we started searching + * part way through the bitmap. 
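+			 *
+			 * i.e. the entries counted in nr_free all lie
+			 * before the position we began scanning from
+			 * (begin or none_free_before), so move on to the
+			 * next index entry rather than failing the whole
+			 * allocation.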
+ */ + dm_tm_unlock(ll->tm, blk); + continue; + + } else if (r < 0) { + dm_tm_unlock(ll->tm, blk); + return r; + } + + r = dm_tm_unlock(ll->tm, blk); + if (r < 0) + return r; + + *result = i * ll->entries_per_block + (dm_block_t) position; + return 0; + } + + return -ENOSPC; +} + +static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, + int (*mutator)(void *context, uint32_t old, uint32_t *new), + void *context, enum allocation_event *ev) +{ + int r; + uint32_t bit, old, ref_count; + struct dm_block *nb; + dm_block_t index = b; + struct disk_index_entry ie_disk; + void *bm_le; + int inc; + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ie_disk); + if (r < 0) + return r; + + r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &nb, &inc); + if (r < 0) { + DMERR("dm_tm_shadow_block() failed"); + return r; + } + ie_disk.blocknr = cpu_to_le64(dm_block_location(nb)); + + bm_le = dm_bitmap_data(nb); + old = sm_lookup_bitmap(bm_le, bit); + + if (old > 2) { + r = sm_ll_lookup_big_ref_count(ll, b, &old); + if (r < 0) { + dm_tm_unlock(ll->tm, nb); + return r; + } + } + + r = mutator(context, old, &ref_count); + if (r) { + dm_tm_unlock(ll->tm, nb); + return r; + } + + if (ref_count <= 2) { + sm_set_bitmap(bm_le, bit, ref_count); + + r = dm_tm_unlock(ll->tm, nb); + if (r < 0) + return r; + + if (old > 2) { + r = dm_btree_remove(&ll->ref_count_info, + ll->ref_count_root, + &b, &ll->ref_count_root); + if (r) + return r; + } + + } else { + __le32 le_rc = cpu_to_le32(ref_count); + + sm_set_bitmap(bm_le, bit, 3); + r = dm_tm_unlock(ll->tm, nb); + if (r < 0) + return r; + + __dm_bless_for_disk(&le_rc); + r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, + &b, &le_rc, &ll->ref_count_root); + if (r < 0) { + DMERR("ref count insert failed"); + return r; + } + } + + if (ref_count && !old) { + *ev = SM_ALLOC; + ll->nr_allocated++; + le32_add_cpu(&ie_disk.nr_free, -1); + if (le32_to_cpu(ie_disk.none_free_before) == bit) + ie_disk.none_free_before = cpu_to_le32(bit + 1); + + } else if (old && !ref_count) { + *ev = SM_FREE; + ll->nr_allocated--; + le32_add_cpu(&ie_disk.nr_free, 1); + ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); + } + + return ll->save_ie(ll, index, &ie_disk); +} + +static int set_ref_count(void *context, uint32_t old, uint32_t *new) +{ + *new = *((uint32_t *) context); + return 0; +} + +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, + uint32_t ref_count, enum allocation_event *ev) +{ + return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); +} + +static int inc_ref_count(void *context, uint32_t old, uint32_t *new) +{ + *new = old + 1; + return 0; +} + +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +{ + return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); +} + +static int dec_ref_count(void *context, uint32_t old, uint32_t *new) +{ + if (!old) { + DMERR_LIMIT("unable to decrement a reference count below 0"); + return -EINVAL; + } + + *new = old - 1; + return 0; +} + +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +{ + return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev); +} + +int sm_ll_commit(struct ll_disk *ll) +{ + int r = 0; + + if (ll->bitmap_index_changed) { + r = ll->commit(ll); + if (!r) + ll->bitmap_index_changed = false; + } + + return r; +} + +/*----------------------------------------------------------------*/ + +static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry 
*ie) +{ + memcpy(ie, ll->mi_le.index + index, sizeof(*ie)); + return 0; +} + +static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + ll->bitmap_index_changed = true; + memcpy(ll->mi_le.index + index, ie, sizeof(*ie)); + return 0; +} + +static int metadata_ll_init_index(struct ll_disk *ll) +{ + int r; + struct dm_block *b; + + r = dm_tm_new_block(ll->tm, &index_validator, &b); + if (r < 0) + return r; + + memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); + ll->bitmap_root = dm_block_location(b); + + return dm_tm_unlock(ll->tm, b); +} + +static int metadata_ll_open(struct ll_disk *ll) +{ + int r; + struct dm_block *block; + + r = dm_tm_read_lock(ll->tm, ll->bitmap_root, + &index_validator, &block); + if (r) + return r; + + memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le)); + return dm_tm_unlock(ll->tm, block); +} + +static dm_block_t metadata_ll_max_entries(struct ll_disk *ll) +{ + return MAX_METADATA_BITMAPS; +} + +static int metadata_ll_commit(struct ll_disk *ll) +{ + int r, inc; + struct dm_block *b; + + r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc); + if (r) + return r; + + memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); + ll->bitmap_root = dm_block_location(b); + + return dm_tm_unlock(ll->tm, b); +} + +int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + int r; + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = metadata_ll_load_ie; + ll->save_ie = metadata_ll_save_ie; + ll->init_index = metadata_ll_init_index; + ll->open_index = metadata_ll_open; + ll->max_entries = metadata_ll_max_entries; + ll->commit = metadata_ll_commit; + + ll->nr_blocks = 0; + ll->nr_allocated = 0; + + r = ll->init_index(ll); + if (r < 0) + return r; + + r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root); + if (r < 0) + return r; + + return 0; +} + +int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct disk_sm_root *smr = root_le; + + if (len < sizeof(struct disk_sm_root)) { + DMERR("sm_metadata root too small"); + return -ENOMEM; + } + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = metadata_ll_load_ie; + ll->save_ie = metadata_ll_save_ie; + ll->init_index = metadata_ll_init_index; + ll->open_index = metadata_ll_open; + ll->max_entries = metadata_ll_max_entries; + ll->commit = metadata_ll_commit; + + ll->nr_blocks = le64_to_cpu(smr->nr_blocks); + ll->nr_allocated = le64_to_cpu(smr->nr_allocated); + ll->bitmap_root = le64_to_cpu(smr->bitmap_root); + ll->ref_count_root = le64_to_cpu(smr->ref_count_root); + + return ll->open_index(ll); +} + +/*----------------------------------------------------------------*/ + +static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); +} + +static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + __dm_bless_for_disk(ie); + return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, + &index, ie, &ll->bitmap_root); +} + +static int disk_ll_init_index(struct ll_disk *ll) +{ + return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root); +} + +static int disk_ll_open(struct ll_disk *ll) +{ + /* nothing to do */ + return 0; +} + +static dm_block_t disk_ll_max_entries(struct ll_disk *ll) +{ + return -1ULL; +} + +static int disk_ll_commit(struct ll_disk *ll) +{ + return 0; +} + 
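The bitmap blocks manipulated above through sm_lookup_bitmap() and sm_set_bitmap() pack four 2-bit entries per byte: 0 means unused, 1 and 2 are exact reference counts, and 3 means "three or more, consult the ref-count btree". The sketch below is illustrative only and is not the in-kernel helpers (which operate on little-endian words); it merely shows the packing arithmetic the code above relies on.

#include <stdint.h>

#define SKETCH_ENTRIES_PER_BYTE 4	/* 2 bits per entry, as in ENTRIES_PER_BYTE */

/* Read the 2-bit value for entry 'bit' from a packed bitmap. */
static unsigned sketch_lookup_bitmap(const uint8_t *bm, unsigned bit)
{
	unsigned shift = (bit % SKETCH_ENTRIES_PER_BYTE) * 2;

	return (bm[bit / SKETCH_ENTRIES_PER_BYTE] >> shift) & 0x3u;
}

/* Store a value in the range 0..3; callers use 3 to mean 'see the btree'. */
static void sketch_set_bitmap(uint8_t *bm, unsigned bit, unsigned val)
{
	unsigned shift = (bit % SKETCH_ENTRIES_PER_BYTE) * 2;

	bm[bit / SKETCH_ENTRIES_PER_BYTE] &= (uint8_t)~(0x3u << shift);
	bm[bit / SKETCH_ENTRIES_PER_BYTE] |= (uint8_t)((val & 0x3u) << shift);
}

sm_ll_mutate() above follows exactly this scheme: counts of 0, 1 and 2 live entirely in the bitmap, while anything larger is flagged as 3 in the bitmap and stored precisely in the ref-count btree, keyed by block number.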
+int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + int r; + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = disk_ll_load_ie; + ll->save_ie = disk_ll_save_ie; + ll->init_index = disk_ll_init_index; + ll->open_index = disk_ll_open; + ll->max_entries = disk_ll_max_entries; + ll->commit = disk_ll_commit; + + ll->nr_blocks = 0; + ll->nr_allocated = 0; + + r = ll->init_index(ll); + if (r < 0) + return r; + + r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root); + if (r < 0) + return r; + + return 0; +} + +int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct disk_sm_root *smr = root_le; + + if (len < sizeof(struct disk_sm_root)) { + DMERR("sm_metadata root too small"); + return -ENOMEM; + } + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = disk_ll_load_ie; + ll->save_ie = disk_ll_save_ie; + ll->init_index = disk_ll_init_index; + ll->open_index = disk_ll_open; + ll->max_entries = disk_ll_max_entries; + ll->commit = disk_ll_commit; + + ll->nr_blocks = le64_to_cpu(smr->nr_blocks); + ll->nr_allocated = le64_to_cpu(smr->nr_allocated); + ll->bitmap_root = le64_to_cpu(smr->bitmap_root); + ll->ref_count_root = le64_to_cpu(smr->ref_count_root); + + return ll->open_index(ll); +} + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h new file mode 100644 index 00000000000..b3078d5eda0 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_SPACE_MAP_COMMON_H +#define DM_SPACE_MAP_COMMON_H + +#include "dm-btree.h" + +/*----------------------------------------------------------------*/ + +/* + * Low level disk format + * + * Bitmap btree + * ------------ + * + * Each value stored in the btree is an index_entry. This points to a + * block that is used as a bitmap. Within the bitmap hold 2 bits per + * entry, which represent UNUSED = 0, REF_COUNT = 1, REF_COUNT = 2 and + * REF_COUNT = many. + * + * Refcount btree + * -------------- + * + * Any entry that has a ref count higher than 2 gets entered in the ref + * count tree. The leaf values for this tree is the 32-bit ref count. + */ + +struct disk_index_entry { + __le64 blocknr; + __le32 nr_free; + __le32 none_free_before; +} __packed; + + +#define MAX_METADATA_BITMAPS 255 +struct disk_metadata_index { + __le32 csum; + __le32 padding; + __le64 blocknr; + + struct disk_index_entry index[MAX_METADATA_BITMAPS]; +} __packed; + +struct ll_disk; + +typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result); +typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie); +typedef int (*init_index_fn)(struct ll_disk *ll); +typedef int (*open_index_fn)(struct ll_disk *ll); +typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll); +typedef int (*commit_fn)(struct ll_disk *ll); + +struct ll_disk { + struct dm_transaction_manager *tm; + struct dm_btree_info bitmap_info; + struct dm_btree_info ref_count_info; + + uint32_t block_size; + uint32_t entries_per_block; + dm_block_t nr_blocks; + dm_block_t nr_allocated; + + /* + * bitmap_root may be a btree root or a simple index. 
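+	 * (a btree root for the disk space map; for the metadata space map it
+	 * is the location of the block holding the struct disk_metadata_index).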
+ */ + dm_block_t bitmap_root; + + dm_block_t ref_count_root; + + struct disk_metadata_index mi_le; + load_ie_fn load_ie; + save_ie_fn save_ie; + init_index_fn init_index; + open_index_fn open_index; + max_index_entries_fn max_entries; + commit_fn commit; + bool bitmap_index_changed:1; +}; + +struct disk_sm_root { + __le64 nr_blocks; + __le64 nr_allocated; + __le64 bitmap_root; + __le64 ref_count_root; +} __packed; + +#define ENTRIES_PER_BYTE 4 + +struct disk_bitmap_header { + __le32 csum; + __le32 not_used; + __le64 blocknr; +} __packed; + +enum allocation_event { + SM_NONE, + SM_ALLOC, + SM_FREE, +}; + +/*----------------------------------------------------------------*/ + +int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks); +int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result); +int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result); +int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, + dm_block_t end, dm_block_t *result); +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); +int sm_ll_commit(struct ll_disk *ll); + +int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm); +int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len); + +int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm); +int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len); + +/*----------------------------------------------------------------*/ + +#endif /* DM_SPACE_MAP_COMMON_H */ diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c new file mode 100644 index 00000000000..cfbf9617e46 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -0,0 +1,303 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map-common.h" +#include "dm-space-map-disk.h" +#include "dm-space-map.h" +#include "dm-transaction-manager.h" + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "space map disk" + +/*----------------------------------------------------------------*/ + +/* + * Space map interface. 
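+ *
+ * The disk space map keeps a second copy of the low-level state (old_ll),
+ * reflecting the last commit.  Free-block searches and free-space counts
+ * use that copy, so blocks freed within the current transaction are not
+ * reallocated until the transaction has committed.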
+ */ +struct sm_disk { + struct dm_space_map sm; + + struct ll_disk ll; + struct ll_disk old_ll; + + dm_block_t begin; + dm_block_t nr_allocated_this_transaction; +}; + +static void sm_disk_destroy(struct dm_space_map *sm) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + kfree(smd); +} + +static int sm_disk_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + return sm_ll_extend(&smd->ll, extra_blocks); +} + +static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + *count = smd->old_ll.nr_blocks; + + return 0; +} + +static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + *count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction; + + return 0; +} + +static int sm_disk_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + return sm_ll_lookup(&smd->ll, b, result); +} + +static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b, + int *result) +{ + int r; + uint32_t count; + + r = sm_disk_get_count(sm, b, &count); + if (r) + return r; + + return count > 1; +} + +static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + int r; + uint32_t old_count; + enum allocation_event ev; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_ll_insert(&smd->ll, b, count, &ev); + if (!r) { + switch (ev) { + case SM_NONE: + break; + + case SM_ALLOC: + /* + * This _must_ be free in the prior transaction + * otherwise we've lost atomicity. + */ + smd->nr_allocated_this_transaction++; + break; + + case SM_FREE: + /* + * It's only free if it's also free in the last + * transaction. + */ + r = sm_ll_lookup(&smd->old_ll, b, &old_count); + if (r) + return r; + + if (!old_count) + smd->nr_allocated_this_transaction--; + break; + } + } + + return r; +} + +static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + int r; + enum allocation_event ev; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_ll_inc(&smd->ll, b, &ev); + if (!r && (ev == SM_ALLOC)) + /* + * This _must_ be free in the prior transaction + * otherwise we've lost atomicity. 
+ */ + smd->nr_allocated_this_transaction++; + + return r; +} + +static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + enum allocation_event ev; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + return sm_ll_dec(&smd->ll, b, &ev); +} + +static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + int r; + enum allocation_event ev; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + /* FIXME: we should loop round a couple of times */ + r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b); + if (r) + return r; + + smd->begin = *b + 1; + r = sm_ll_inc(&smd->ll, *b, &ev); + if (!r) { + BUG_ON(ev != SM_ALLOC); + smd->nr_allocated_this_transaction++; + } + + return r; +} + +static int sm_disk_commit(struct dm_space_map *sm) +{ + int r; + dm_block_t nr_free; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_disk_get_nr_free(sm, &nr_free); + if (r) + return r; + + r = sm_ll_commit(&smd->ll); + if (r) + return r; + + memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll)); + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; + + r = sm_disk_get_nr_free(sm, &nr_free); + if (r) + return r; + + return 0; +} + +static int sm_disk_root_size(struct dm_space_map *sm, size_t *result) +{ + *result = sizeof(struct disk_sm_root); + + return 0; +} + +static int sm_disk_copy_root(struct dm_space_map *sm, void *where_le, size_t max) +{ + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + struct disk_sm_root root_le; + + root_le.nr_blocks = cpu_to_le64(smd->ll.nr_blocks); + root_le.nr_allocated = cpu_to_le64(smd->ll.nr_allocated); + root_le.bitmap_root = cpu_to_le64(smd->ll.bitmap_root); + root_le.ref_count_root = cpu_to_le64(smd->ll.ref_count_root); + + if (max < sizeof(root_le)) + return -ENOSPC; + + memcpy(where_le, &root_le, sizeof(root_le)); + + return 0; +} + +/*----------------------------------------------------------------*/ + +static struct dm_space_map ops = { + .destroy = sm_disk_destroy, + .extend = sm_disk_extend, + .get_nr_blocks = sm_disk_get_nr_blocks, + .get_nr_free = sm_disk_get_nr_free, + .get_count = sm_disk_get_count, + .count_is_more_than_one = sm_disk_count_is_more_than_one, + .set_count = sm_disk_set_count, + .inc_block = sm_disk_inc_block, + .dec_block = sm_disk_dec_block, + .new_block = sm_disk_new_block, + .commit = sm_disk_commit, + .root_size = sm_disk_root_size, + .copy_root = sm_disk_copy_root, + .register_threshold_callback = NULL +}; + +struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, + dm_block_t nr_blocks) +{ + int r; + struct sm_disk *smd; + + smd = kmalloc(sizeof(*smd), GFP_KERNEL); + if (!smd) + return ERR_PTR(-ENOMEM); + + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; + memcpy(&smd->sm, &ops, sizeof(smd->sm)); + + r = sm_ll_new_disk(&smd->ll, tm); + if (r) + goto bad; + + r = sm_ll_extend(&smd->ll, nr_blocks); + if (r) + goto bad; + + r = sm_disk_commit(&smd->sm); + if (r) + goto bad; + + return &smd->sm; + +bad: + kfree(smd); + return ERR_PTR(r); +} +EXPORT_SYMBOL_GPL(dm_sm_disk_create); + +struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct sm_disk *smd; + + smd = kmalloc(sizeof(*smd), GFP_KERNEL); + if (!smd) + return ERR_PTR(-ENOMEM); + + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; + memcpy(&smd->sm, &ops, sizeof(smd->sm)); + + r = sm_ll_open_disk(&smd->ll, tm, root_le, len); + if (r) + goto bad; + + r = sm_disk_commit(&smd->sm); 
+ if (r) + goto bad; + + return &smd->sm; + +bad: + kfree(smd); + return ERR_PTR(r); +} +EXPORT_SYMBOL_GPL(dm_sm_disk_open); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-space-map-disk.h b/drivers/md/persistent-data/dm-space-map-disk.h new file mode 100644 index 00000000000..447a0a9a2d9 --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-disk.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_SPACE_MAP_DISK_H +#define _LINUX_DM_SPACE_MAP_DISK_H + +#include "dm-block-manager.h" + +struct dm_space_map; +struct dm_transaction_manager; + +/* + * Unfortunately we have to use two-phase construction due to the cycle + * between the tm and sm. + */ +struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, + dm_block_t nr_blocks); + +struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, + void *root, size_t len); + +#endif /* _LINUX_DM_SPACE_MAP_DISK_H */ diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c new file mode 100644 index 00000000000..786b689bdfc --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -0,0 +1,794 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map.h" +#include "dm-space-map-common.h" +#include "dm-space-map-metadata.h" + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "space map metadata" + +/*----------------------------------------------------------------*/ + +/* + * An edge triggered threshold. + */ +struct threshold { + bool threshold_set; + bool value_set; + dm_block_t threshold; + dm_block_t current_value; + dm_sm_threshold_fn fn; + void *context; +}; + +static void threshold_init(struct threshold *t) +{ + t->threshold_set = false; + t->value_set = false; +} + +static void set_threshold(struct threshold *t, dm_block_t value, + dm_sm_threshold_fn fn, void *context) +{ + t->threshold_set = true; + t->threshold = value; + t->fn = fn; + t->context = context; +} + +static bool below_threshold(struct threshold *t, dm_block_t value) +{ + return t->threshold_set && value <= t->threshold; +} + +static bool threshold_already_triggered(struct threshold *t) +{ + return t->value_set && below_threshold(t, t->current_value); +} + +static void check_threshold(struct threshold *t, dm_block_t value) +{ + if (below_threshold(t, value) && + !threshold_already_triggered(t)) + t->fn(t->context); + + t->value_set = true; + t->current_value = value; +} + +/*----------------------------------------------------------------*/ + +/* + * Space map interface. + * + * The low level disk format is written using the standard btree and + * transaction manager. This means that performing disk operations may + * cause us to recurse into the space map in order to allocate new blocks. + * For this reason we have a pool of pre-allocated blocks large enough to + * service any metadata_ll_disk operation. + */ + +/* + * FIXME: we should calculate this based on the size of the device. + * Only the metadata space map needs this functionality. 
+ */ +#define MAX_RECURSIVE_ALLOCATIONS 1024 + +enum block_op_type { + BOP_INC, + BOP_DEC +}; + +struct block_op { + enum block_op_type type; + dm_block_t block; +}; + +struct bop_ring_buffer { + unsigned begin; + unsigned end; + struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1]; +}; + +static void brb_init(struct bop_ring_buffer *brb) +{ + brb->begin = 0; + brb->end = 0; +} + +static bool brb_empty(struct bop_ring_buffer *brb) +{ + return brb->begin == brb->end; +} + +static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) +{ + unsigned r = old + 1; + return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r; +} + +static int brb_push(struct bop_ring_buffer *brb, + enum block_op_type type, dm_block_t b) +{ + struct block_op *bop; + unsigned next = brb_next(brb, brb->end); + + /* + * We don't allow the last bop to be filled, this way we can + * differentiate between full and empty. + */ + if (next == brb->begin) + return -ENOMEM; + + bop = brb->bops + brb->end; + bop->type = type; + bop->block = b; + + brb->end = next; + + return 0; +} + +static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) +{ + struct block_op *bop; + + if (brb_empty(brb)) + return -ENODATA; + + bop = brb->bops + brb->begin; + result->type = bop->type; + result->block = bop->block; + + brb->begin = brb_next(brb, brb->begin); + + return 0; +} + +/*----------------------------------------------------------------*/ + +struct sm_metadata { + struct dm_space_map sm; + + struct ll_disk ll; + struct ll_disk old_ll; + + dm_block_t begin; + + unsigned recursion_count; + unsigned allocated_this_transaction; + struct bop_ring_buffer uncommitted; + + struct threshold threshold; +}; + +static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) +{ + int r = brb_push(&smm->uncommitted, type, b); + + if (r) { + DMERR("too many recursive allocations"); + return -ENOMEM; + } + + return 0; +} + +static int commit_bop(struct sm_metadata *smm, struct block_op *op) +{ + int r = 0; + enum allocation_event ev; + + switch (op->type) { + case BOP_INC: + r = sm_ll_inc(&smm->ll, op->block, &ev); + break; + + case BOP_DEC: + r = sm_ll_dec(&smm->ll, op->block, &ev); + break; + } + + return r; +} + +static void in(struct sm_metadata *smm) +{ + smm->recursion_count++; +} + +static int out(struct sm_metadata *smm) +{ + int r = 0; + + /* + * If we're not recursing then very bad things are happening. + */ + if (!smm->recursion_count) { + DMERR("lost track of recursion depth"); + return -ENOMEM; + } + + if (smm->recursion_count == 1) { + while (!brb_empty(&smm->uncommitted)) { + struct block_op bop; + + r = brb_pop(&smm->uncommitted, &bop); + if (r) { + DMERR("bug in bop ring buffer"); + break; + } + + r = commit_bop(smm, &bop); + if (r) + break; + } + } + + smm->recursion_count--; + + return r; +} + +/* + * When using the out() function above, we often want to combine an error + * code for the operation run in the recursive context with that from + * out(). + */ +static int combine_errors(int r1, int r2) +{ + return r1 ? 
r1 : r2; +} + +static int recursing(struct sm_metadata *smm) +{ + return smm->recursion_count; +} + +static void sm_metadata_destroy(struct dm_space_map *sm) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + kfree(smm); +} + +static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *count = smm->ll.nr_blocks; + + return 0; +} + +static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *count = smm->old_ll.nr_blocks - smm->old_ll.nr_allocated - + smm->allocated_this_transaction; + + return 0; +} + +static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + int r; + unsigned i; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + unsigned adjustment = 0; + + /* + * We may have some uncommitted adjustments to add. This list + * should always be really short. + */ + for (i = smm->uncommitted.begin; + i != smm->uncommitted.end; + i = brb_next(&smm->uncommitted, i)) { + struct block_op *op = smm->uncommitted.bops + i; + + if (op->block != b) + continue; + + switch (op->type) { + case BOP_INC: + adjustment++; + break; + + case BOP_DEC: + adjustment--; + break; + } + } + + r = sm_ll_lookup(&smm->ll, b, result); + if (r) + return r; + + *result += adjustment; + + return 0; +} + +static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, + dm_block_t b, int *result) +{ + int r, adjustment = 0; + unsigned i; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + uint32_t rc; + + /* + * We may have some uncommitted adjustments to add. This list + * should always be really short. + */ + for (i = smm->uncommitted.begin; + i != smm->uncommitted.end; + i = brb_next(&smm->uncommitted, i)) { + + struct block_op *op = smm->uncommitted.bops + i; + + if (op->block != b) + continue; + + switch (op->type) { + case BOP_INC: + adjustment++; + break; + + case BOP_DEC: + adjustment--; + break; + } + } + + if (adjustment > 1) { + *result = 1; + return 0; + } + + r = sm_ll_lookup_bitmap(&smm->ll, b, &rc); + if (r) + return r; + + if (rc == 3) + /* + * We err on the side of caution, and always return true. 
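+		 * A bitmap value of 3 only means "3 or more"; without a
+		 * ref-count btree lookup we cannot apply the uncommitted
+		 * adjustment exactly, so reporting the block as shared is
+		 * the safe answer.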
+ */ + *result = 1; + else + *result = rc + adjustment > 1; + + return 0; +} + +static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + int r, r2; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + if (smm->recursion_count) { + DMERR("cannot recurse set_count()"); + return -EINVAL; + } + + in(smm); + r = sm_ll_insert(&smm->ll, b, count, &ev); + r2 = out(smm); + + return combine_errors(r, r2); +} + +static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + int r, r2 = 0; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + if (recursing(smm)) + r = add_bop(smm, BOP_INC, b); + else { + in(smm); + r = sm_ll_inc(&smm->ll, b, &ev); + r2 = out(smm); + } + + return combine_errors(r, r2); +} + +static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + int r, r2 = 0; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + if (recursing(smm)) + r = add_bop(smm, BOP_DEC, b); + else { + in(smm); + r = sm_ll_dec(&smm->ll, b, &ev); + r2 = out(smm); + } + + return combine_errors(r, r2); +} + +static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) +{ + int r, r2 = 0; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b); + if (r) + return r; + + smm->begin = *b + 1; + + if (recursing(smm)) + r = add_bop(smm, BOP_INC, *b); + else { + in(smm); + r = sm_ll_inc(&smm->ll, *b, &ev); + r2 = out(smm); + } + + if (!r) + smm->allocated_this_transaction++; + + return combine_errors(r, r2); +} + +static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + dm_block_t count; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + int r = sm_metadata_new_block_(sm, b); + if (r) { + DMERR_LIMIT("unable to allocate new metadata block"); + return r; + } + + r = sm_metadata_get_nr_free(sm, &count); + if (r) { + DMERR_LIMIT("couldn't get free block count"); + return r; + } + + check_threshold(&smm->threshold, count); + + return r; +} + +static int sm_metadata_commit(struct dm_space_map *sm) +{ + int r; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + r = sm_ll_commit(&smm->ll); + if (r) + return r; + + memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); + smm->begin = 0; + smm->allocated_this_transaction = 0; + + return 0; +} + +static int sm_metadata_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + set_threshold(&smm->threshold, threshold, fn, context); + + return 0; +} + +static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result) +{ + *result = sizeof(struct disk_sm_root); + + return 0; +} + +static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t max) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + struct disk_sm_root root_le; + + root_le.nr_blocks = cpu_to_le64(smm->ll.nr_blocks); + root_le.nr_allocated = cpu_to_le64(smm->ll.nr_allocated); + root_le.bitmap_root = cpu_to_le64(smm->ll.bitmap_root); + root_le.ref_count_root = cpu_to_le64(smm->ll.ref_count_root); + + if (max < sizeof(root_le)) + return -ENOSPC; + + memcpy(where_le, &root_le, sizeof(root_le)); + + return 0; +} + +static int 
sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks); + +static struct dm_space_map ops = { + .destroy = sm_metadata_destroy, + .extend = sm_metadata_extend, + .get_nr_blocks = sm_metadata_get_nr_blocks, + .get_nr_free = sm_metadata_get_nr_free, + .get_count = sm_metadata_get_count, + .count_is_more_than_one = sm_metadata_count_is_more_than_one, + .set_count = sm_metadata_set_count, + .inc_block = sm_metadata_inc_block, + .dec_block = sm_metadata_dec_block, + .new_block = sm_metadata_new_block, + .commit = sm_metadata_commit, + .root_size = sm_metadata_root_size, + .copy_root = sm_metadata_copy_root, + .register_threshold_callback = sm_metadata_register_threshold_callback +}; + +/*----------------------------------------------------------------*/ + +/* + * When a new space map is created that manages its own space. We use + * this tiny bootstrap allocator. + */ +static void sm_bootstrap_destroy(struct dm_space_map *sm) +{ +} + +static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + DMERR("bootstrap doesn't support extend"); + + return -EINVAL; +} + +static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + return smm->ll.nr_blocks; +} + +static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + *count = smm->ll.nr_blocks - smm->begin; + + return 0; +} + +static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + return b < smm->begin ? 1 : 0; +} + +static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, + dm_block_t b, int *result) +{ + *result = 0; + + return 0; +} + +static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + DMERR("bootstrap doesn't support set_count"); + + return -EINVAL; +} + +static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + /* + * We know the entire device is unused. 
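+	 * so we can hand out blocks simply by advancing smm->begin; the
+	 * reference counts for these blocks are applied later, once the
+	 * real space map ops have been restored.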
+ */ + if (smm->begin == smm->ll.nr_blocks) + return -ENOSPC; + + *b = smm->begin++; + + return 0; +} + +static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + return add_bop(smm, BOP_INC, b); +} + +static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + return add_bop(smm, BOP_DEC, b); +} + +static int sm_bootstrap_commit(struct dm_space_map *sm) +{ + return 0; +} + +static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result) +{ + DMERR("bootstrap doesn't support root_size"); + + return -EINVAL; +} + +static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, + size_t max) +{ + DMERR("bootstrap doesn't support copy_root"); + + return -EINVAL; +} + +static struct dm_space_map bootstrap_ops = { + .destroy = sm_bootstrap_destroy, + .extend = sm_bootstrap_extend, + .get_nr_blocks = sm_bootstrap_get_nr_blocks, + .get_nr_free = sm_bootstrap_get_nr_free, + .get_count = sm_bootstrap_get_count, + .count_is_more_than_one = sm_bootstrap_count_is_more_than_one, + .set_count = sm_bootstrap_set_count, + .inc_block = sm_bootstrap_inc_block, + .dec_block = sm_bootstrap_dec_block, + .new_block = sm_bootstrap_new_block, + .commit = sm_bootstrap_commit, + .root_size = sm_bootstrap_root_size, + .copy_root = sm_bootstrap_copy_root, + .register_threshold_callback = NULL +}; + +/*----------------------------------------------------------------*/ + +static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + int r, i; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + dm_block_t old_len = smm->ll.nr_blocks; + + /* + * Flick into a mode where all blocks get allocated in the new area. + */ + smm->begin = old_len; + memcpy(sm, &bootstrap_ops, sizeof(*sm)); + + /* + * Extend. + */ + r = sm_ll_extend(&smm->ll, extra_blocks); + if (r) + goto out; + + /* + * We repeatedly increment then commit until the commit doesn't + * allocate any new blocks. + */ + do { + for (i = old_len; !r && i < smm->begin; i++) { + r = sm_ll_inc(&smm->ll, i, &ev); + if (r) + goto out; + } + old_len = smm->begin; + + r = sm_ll_commit(&smm->ll); + if (r) + goto out; + + } while (old_len != smm->begin); + +out: + /* + * Switch back to normal behaviour. 
+ */ + memcpy(sm, &ops, sizeof(*sm)); + return r; +} + +/*----------------------------------------------------------------*/ + +struct dm_space_map *dm_sm_metadata_init(void) +{ + struct sm_metadata *smm; + + smm = kmalloc(sizeof(*smm), GFP_KERNEL); + if (!smm) + return ERR_PTR(-ENOMEM); + + memcpy(&smm->sm, &ops, sizeof(smm->sm)); + + return &smm->sm; +} + +int dm_sm_metadata_create(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + dm_block_t nr_blocks, + dm_block_t superblock) +{ + int r; + dm_block_t i; + enum allocation_event ev; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + smm->begin = superblock + 1; + smm->recursion_count = 0; + smm->allocated_this_transaction = 0; + brb_init(&smm->uncommitted); + threshold_init(&smm->threshold); + + memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); + + r = sm_ll_new_metadata(&smm->ll, tm); + if (r) + return r; + + if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS) + nr_blocks = DM_SM_METADATA_MAX_BLOCKS; + r = sm_ll_extend(&smm->ll, nr_blocks); + if (r) + return r; + + memcpy(&smm->sm, &ops, sizeof(smm->sm)); + + /* + * Now we need to update the newly created data structures with the + * allocated blocks that they were built from. + */ + for (i = superblock; !r && i < smm->begin; i++) + r = sm_ll_inc(&smm->ll, i, &ev); + + if (r) + return r; + + return sm_metadata_commit(sm); +} + +int dm_sm_metadata_open(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + + r = sm_ll_open_metadata(&smm->ll, tm, root_le, len); + if (r) + return r; + + smm->begin = 0; + smm->recursion_count = 0; + smm->allocated_this_transaction = 0; + brb_init(&smm->uncommitted); + threshold_init(&smm->threshold); + + memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); + return 0; +} diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h new file mode 100644 index 00000000000..64df923974d --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map-metadata.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef DM_SPACE_MAP_METADATA_H +#define DM_SPACE_MAP_METADATA_H + +#include "dm-transaction-manager.h" + +#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT) + +/* + * The metadata device is currently limited in size. + * + * We have one block of index, which can hold 255 index entries. Each + * index entry contains allocation info about ~16k metadata blocks. + */ +#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64)) +#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE) + +/* + * Unfortunately we have to use two-phase construction due to the cycle + * between the tm and sm. + */ +struct dm_space_map *dm_sm_metadata_init(void); + +/* + * Create a fresh space map. + */ +int dm_sm_metadata_create(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + dm_block_t nr_blocks, + dm_block_t superblock); + +/* + * Open from a previously-recorded root. 
+ */ +int dm_sm_metadata_open(struct dm_space_map *sm, + struct dm_transaction_manager *tm, + void *root_le, size_t len); + +#endif /* DM_SPACE_MAP_METADATA_H */ diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h new file mode 100644 index 00000000000..3e6d1153b7c --- /dev/null +++ b/drivers/md/persistent-data/dm-space-map.h @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_SPACE_MAP_H +#define _LINUX_DM_SPACE_MAP_H + +#include "dm-block-manager.h" + +typedef void (*dm_sm_threshold_fn)(void *context); + +/* + * struct dm_space_map keeps a record of how many times each block in a device + * is referenced. It needs to be fixed on disk as part of the transaction. + */ +struct dm_space_map { + void (*destroy)(struct dm_space_map *sm); + + /* + * You must commit before allocating the newly added space. + */ + int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks); + + /* + * Extensions do not appear in this count until after commit has + * been called. + */ + int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count); + + /* + * Space maps must never allocate a block from the previous + * transaction, in case we need to rollback. This complicates the + * semantics of get_nr_free(), it should return the number of blocks + * that are available for allocation _now_. For instance you may + * have blocks with a zero reference count that will not be + * available for allocation until after the next commit. + */ + int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count); + + int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result); + int (*count_is_more_than_one)(struct dm_space_map *sm, dm_block_t b, + int *result); + int (*set_count)(struct dm_space_map *sm, dm_block_t b, uint32_t count); + + int (*commit)(struct dm_space_map *sm); + + int (*inc_block)(struct dm_space_map *sm, dm_block_t b); + int (*dec_block)(struct dm_space_map *sm, dm_block_t b); + + /* + * new_block will increment the returned block. + */ + int (*new_block)(struct dm_space_map *sm, dm_block_t *b); + + /* + * The root contains all the information needed to fix the space map. + * Generally this info is small, so squirrel it away in a disk block + * along with other info. + */ + int (*root_size)(struct dm_space_map *sm, size_t *result); + int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len); + + /* + * You can register one threshold callback which is edge-triggered + * when the free space in the space map drops below the threshold. 
+ */ + int (*register_threshold_callback)(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context); +}; + +/*----------------------------------------------------------------*/ + +static inline void dm_sm_destroy(struct dm_space_map *sm) +{ + sm->destroy(sm); +} + +static inline int dm_sm_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + return sm->extend(sm, extra_blocks); +} + +static inline int dm_sm_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + return sm->get_nr_blocks(sm, count); +} + +static inline int dm_sm_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + return sm->get_nr_free(sm, count); +} + +static inline int dm_sm_get_count(struct dm_space_map *sm, dm_block_t b, + uint32_t *result) +{ + return sm->get_count(sm, b, result); +} + +static inline int dm_sm_count_is_more_than_one(struct dm_space_map *sm, + dm_block_t b, int *result) +{ + return sm->count_is_more_than_one(sm, b, result); +} + +static inline int dm_sm_set_count(struct dm_space_map *sm, dm_block_t b, + uint32_t count) +{ + return sm->set_count(sm, b, count); +} + +static inline int dm_sm_commit(struct dm_space_map *sm) +{ + return sm->commit(sm); +} + +static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + return sm->inc_block(sm, b); +} + +static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + return sm->dec_block(sm, b); +} + +static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + return sm->new_block(sm, b); +} + +static inline int dm_sm_root_size(struct dm_space_map *sm, size_t *result) +{ + return sm->root_size(sm, result); +} + +static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len) +{ + return sm->copy_root(sm, copy_to_here_le, len); +} + +static inline int dm_sm_register_threshold_callback(struct dm_space_map *sm, + dm_block_t threshold, + dm_sm_threshold_fn fn, + void *context) +{ + if (sm->register_threshold_callback) + return sm->register_threshold_callback(sm, threshold, fn, context); + + return -EINVAL; +} + + +#endif /* _LINUX_DM_SPACE_MAP_H */ diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c new file mode 100644 index 00000000000..3bc30a0ae3d --- /dev/null +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -0,0 +1,382 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ +#include "dm-transaction-manager.h" +#include "dm-space-map.h" +#include "dm-space-map-disk.h" +#include "dm-space-map-metadata.h" +#include "dm-persistent-data-internal.h" + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/device-mapper.h> + +#define DM_MSG_PREFIX "transaction manager" + +/*----------------------------------------------------------------*/ + +struct shadow_info { + struct hlist_node hlist; + dm_block_t where; +}; + +/* + * It would be nice if we scaled with the size of transaction. 
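+ * For now the shadow table is a fixed 256-bucket hash, protected by a
+ * spinlock and emptied at every commit.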
+ */ +#define DM_HASH_SIZE 256 +#define DM_HASH_MASK (DM_HASH_SIZE - 1) + +struct dm_transaction_manager { + int is_clone; + struct dm_transaction_manager *real; + + struct dm_block_manager *bm; + struct dm_space_map *sm; + + spinlock_t lock; + struct hlist_head buckets[DM_HASH_SIZE]; +}; + +/*----------------------------------------------------------------*/ + +static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) +{ + int r = 0; + unsigned bucket = dm_hash_block(b, DM_HASH_MASK); + struct shadow_info *si; + + spin_lock(&tm->lock); + hlist_for_each_entry(si, tm->buckets + bucket, hlist) + if (si->where == b) { + r = 1; + break; + } + spin_unlock(&tm->lock); + + return r; +} + +/* + * This can silently fail if there's no memory. We're ok with this since + * creating redundant shadows causes no harm. + */ +static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) +{ + unsigned bucket; + struct shadow_info *si; + + si = kmalloc(sizeof(*si), GFP_NOIO); + if (si) { + si->where = b; + bucket = dm_hash_block(b, DM_HASH_MASK); + spin_lock(&tm->lock); + hlist_add_head(&si->hlist, tm->buckets + bucket); + spin_unlock(&tm->lock); + } +} + +static void wipe_shadow_table(struct dm_transaction_manager *tm) +{ + struct shadow_info *si; + struct hlist_node *tmp; + struct hlist_head *bucket; + int i; + + spin_lock(&tm->lock); + for (i = 0; i < DM_HASH_SIZE; i++) { + bucket = tm->buckets + i; + hlist_for_each_entry_safe(si, tmp, bucket, hlist) + kfree(si); + + INIT_HLIST_HEAD(bucket); + } + + spin_unlock(&tm->lock); +} + +/*----------------------------------------------------------------*/ + +static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, + struct dm_space_map *sm) +{ + int i; + struct dm_transaction_manager *tm; + + tm = kmalloc(sizeof(*tm), GFP_KERNEL); + if (!tm) + return ERR_PTR(-ENOMEM); + + tm->is_clone = 0; + tm->real = NULL; + tm->bm = bm; + tm->sm = sm; + + spin_lock_init(&tm->lock); + for (i = 0; i < DM_HASH_SIZE; i++) + INIT_HLIST_HEAD(tm->buckets + i); + + return tm; +} + +struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real) +{ + struct dm_transaction_manager *tm; + + tm = kmalloc(sizeof(*tm), GFP_KERNEL); + if (tm) { + tm->is_clone = 1; + tm->real = real; + } + + return tm; +} +EXPORT_SYMBOL_GPL(dm_tm_create_non_blocking_clone); + +void dm_tm_destroy(struct dm_transaction_manager *tm) +{ + if (!tm->is_clone) + wipe_shadow_table(tm); + + kfree(tm); +} +EXPORT_SYMBOL_GPL(dm_tm_destroy); + +int dm_tm_pre_commit(struct dm_transaction_manager *tm) +{ + int r; + + if (tm->is_clone) + return -EWOULDBLOCK; + + r = dm_sm_commit(tm->sm); + if (r < 0) + return r; + + return dm_bm_flush(tm->bm); +} +EXPORT_SYMBOL_GPL(dm_tm_pre_commit); + +int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) +{ + if (tm->is_clone) + return -EWOULDBLOCK; + + wipe_shadow_table(tm); + dm_bm_unlock(root); + + return dm_bm_flush(tm->bm); +} +EXPORT_SYMBOL_GPL(dm_tm_commit); + +int dm_tm_new_block(struct dm_transaction_manager *tm, + struct dm_block_validator *v, + struct dm_block **result) +{ + int r; + dm_block_t new_block; + + if (tm->is_clone) + return -EWOULDBLOCK; + + r = dm_sm_new_block(tm->sm, &new_block); + if (r < 0) + return r; + + r = dm_bm_write_lock_zero(tm->bm, new_block, v, result); + if (r < 0) { + dm_sm_dec_block(tm->sm, new_block); + return r; + } + + /* + * New blocks count as shadows in that they don't need to be + * shadowed again. 
+ */ + insert_shadow(tm, new_block); + + return 0; +} + +static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, + struct dm_block_validator *v, + struct dm_block **result) +{ + int r; + dm_block_t new; + struct dm_block *orig_block; + + r = dm_sm_new_block(tm->sm, &new); + if (r < 0) + return r; + + r = dm_sm_dec_block(tm->sm, orig); + if (r < 0) + return r; + + r = dm_bm_read_lock(tm->bm, orig, v, &orig_block); + if (r < 0) + return r; + + /* + * It would be tempting to use dm_bm_unlock_move here, but some + * code, such as the space maps, keeps using the old data structures + * secure in the knowledge they won't be changed until the next + * transaction. Using unlock_move would force a synchronous read + * since the old block would no longer be in the cache. + */ + r = dm_bm_write_lock_zero(tm->bm, new, v, result); + if (r) { + dm_bm_unlock(orig_block); + return r; + } + + memcpy(dm_block_data(*result), dm_block_data(orig_block), + dm_bm_block_size(tm->bm)); + + dm_bm_unlock(orig_block); + return r; +} + +int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, + struct dm_block_validator *v, struct dm_block **result, + int *inc_children) +{ + int r; + + if (tm->is_clone) + return -EWOULDBLOCK; + + r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children); + if (r < 0) + return r; + + if (is_shadow(tm, orig) && !*inc_children) + return dm_bm_write_lock(tm->bm, orig, v, result); + + r = __shadow_block(tm, orig, v, result); + if (r < 0) + return r; + insert_shadow(tm, dm_block_location(*result)); + + return r; +} +EXPORT_SYMBOL_GPL(dm_tm_shadow_block); + +int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **blk) +{ + if (tm->is_clone) + return dm_bm_read_try_lock(tm->real->bm, b, v, blk); + + return dm_bm_read_lock(tm->bm, b, v, blk); +} +EXPORT_SYMBOL_GPL(dm_tm_read_lock); + +int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b) +{ + return dm_bm_unlock(b); +} +EXPORT_SYMBOL_GPL(dm_tm_unlock); + +void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_inc_block(tm->sm, b); +} +EXPORT_SYMBOL_GPL(dm_tm_inc); + +void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b) +{ + /* + * The non-blocking clone doesn't support this. 
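+	 * Adjusting a reference count may need to shadow or allocate
+	 * space map blocks, which can issue I/O.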
+ */ + BUG_ON(tm->is_clone); + + dm_sm_dec_block(tm->sm, b); +} +EXPORT_SYMBOL_GPL(dm_tm_dec); + +int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, + uint32_t *result) +{ + if (tm->is_clone) + return -EWOULDBLOCK; + + return dm_sm_get_count(tm->sm, b, result); +} + +struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) +{ + return tm->bm; +} + +/*----------------------------------------------------------------*/ + +static int dm_tm_create_internal(struct dm_block_manager *bm, + dm_block_t sb_location, + struct dm_transaction_manager **tm, + struct dm_space_map **sm, + int create, + void *sm_root, size_t sm_len) +{ + int r; + + *sm = dm_sm_metadata_init(); + if (IS_ERR(*sm)) + return PTR_ERR(*sm); + + *tm = dm_tm_create(bm, *sm); + if (IS_ERR(*tm)) { + dm_sm_destroy(*sm); + return PTR_ERR(*tm); + } + + if (create) { + r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm), + sb_location); + if (r) { + DMERR("couldn't create metadata space map"); + goto bad; + } + + } else { + r = dm_sm_metadata_open(*sm, *tm, sm_root, sm_len); + if (r) { + DMERR("couldn't open metadata space map"); + goto bad; + } + } + + return 0; + +bad: + dm_tm_destroy(*tm); + dm_sm_destroy(*sm); + return r; +} + +int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + struct dm_transaction_manager **tm, + struct dm_space_map **sm) +{ + return dm_tm_create_internal(bm, sb_location, tm, sm, 1, NULL, 0); +} +EXPORT_SYMBOL_GPL(dm_tm_create_with_sm); + +int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + void *sm_root, size_t root_len, + struct dm_transaction_manager **tm, + struct dm_space_map **sm) +{ + return dm_tm_create_internal(bm, sb_location, tm, sm, 0, sm_root, root_len); +} +EXPORT_SYMBOL_GPL(dm_tm_open_with_sm); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h new file mode 100644 index 00000000000..2772ed2a781 --- /dev/null +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef _LINUX_DM_TRANSACTION_MANAGER_H +#define _LINUX_DM_TRANSACTION_MANAGER_H + +#include "dm-block-manager.h" + +struct dm_transaction_manager; +struct dm_space_map; + +/*----------------------------------------------------------------*/ + +/* + * This manages the scope of a transaction. It also enforces immutability + * of the on-disk data structures by limiting access to writeable blocks. + * + * Clients should not fiddle with the block manager directly. + */ + +void dm_tm_destroy(struct dm_transaction_manager *tm); + +/* + * The non-blocking version of a transaction manager is intended for use in + * fast path code that needs to do lookups e.g. a dm mapping function. + * You create the non-blocking variant from a normal tm. The interface is + * the same, except that most functions will just return -EWOULDBLOCK. + * Methods that return void yet may block should not be called on a clone + * viz. dm_tm_inc, dm_tm_dec. Call dm_tm_destroy() as you would with a normal + * tm when you've finished with it. You may not destroy the original prior + * to clones. + */ +struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transaction_manager *real); + +/* + * We use a 2-phase commit here. + * + * i) Make all changes for the transaction *except* for the superblock. 
+ * Then call dm_tm_pre_commit() to flush them to disk. + * + * ii) Lock your superblock. Update. Then call dm_tm_commit() which will + * unlock the superblock and flush it. No other blocks should be updated + * during this period. Care should be taken to never unlock a partially + * updated superblock; perform any operations that could fail *before* you + * take the superblock lock. + */ +int dm_tm_pre_commit(struct dm_transaction_manager *tm); +int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock); + +/* + * These methods are the only way to get hold of a writeable block. + */ + +/* + * dm_tm_new_block() is pretty self-explanatory. Make sure you do actually + * write to the whole of @data before you unlock, otherwise you could get + * a data leak. (The other option is for tm_new_block() to zero new blocks + * before handing them out, which will be redundant in most, if not all, + * cases). + * Zeroes the new block and returns with write lock held. + */ +int dm_tm_new_block(struct dm_transaction_manager *tm, + struct dm_block_validator *v, + struct dm_block **result); + +/* + * dm_tm_shadow_block() allocates a new block and copies the data from @orig + * to it. It then decrements the reference count on original block. Use + * this to update the contents of a block in a data structure, don't + * confuse this with a clone - you shouldn't access the orig block after + * this operation. Because the tm knows the scope of the transaction it + * can optimise requests for a shadow of a shadow to a no-op. Don't forget + * to unlock when you've finished with the shadow. + * + * The @inc_children flag is used to tell the caller whether it needs to + * adjust reference counts for children. (Data in the block may refer to + * other blocks.) + * + * Shadowing implicitly drops a reference on @orig so you must not have + * it locked when you call this. + */ +int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, + struct dm_block_validator *v, + struct dm_block **result, int *inc_children); + +/* + * Read access. You can lock any block you want. If there's a write lock + * on it outstanding then it'll block. + */ +int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, + struct dm_block_validator *v, + struct dm_block **result); + +int dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); + +/* + * Functions for altering the reference count of a block directly. + */ +void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b); + +void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b); + +int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, + uint32_t *result); + +struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); + +/* + * A little utility that ties the knot by producing a transaction manager + * that has a space map managed by the transaction manager... + * + * Returns a tm that has an open transaction to write the new disk sm. + * Caller should store the new sm root and commit. + * + * The superblock location is passed so the metadata space map knows it + * shouldn't be used. 
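+ *
+ * A rough usage sketch (SUPERBLOCK_LOCATION, sb_validator and sblock are
+ * illustrative names, not part of this patch):
+ *
+ *	r = dm_tm_create_with_sm(bm, SUPERBLOCK_LOCATION, &tm, &sm);
+ *	... build the initial on-disk structures using tm ...
+ *	r = dm_tm_pre_commit(tm);
+ *	r = dm_bm_write_lock(dm_tm_get_bm(tm), SUPERBLOCK_LOCATION,
+ *			     &sb_validator, &sblock);
+ *	... copy the space map root into sblock using dm_sm_copy_root() ...
+ *	r = dm_tm_commit(tm, sblock);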
+ */ +int dm_tm_create_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + struct dm_transaction_manager **tm, + struct dm_space_map **sm); + +int dm_tm_open_with_sm(struct dm_block_manager *bm, dm_block_t sb_location, + void *sm_root, size_t root_len, + struct dm_transaction_manager **tm, + struct dm_space_map **sm); + +#endif /* _LINUX_DM_TRANSACTION_MANAGER_H */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 18361063566..407a99e46f6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -18,35 +18,26 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include <linux/blkdev.h> +#include <linux/seq_file.h> #include <linux/module.h> -#include <linux/raid/raid0.h> - -#define MAJOR_NR MD_MAJOR -#define MD_DRIVER -#define MD_PERSONALITY - -static void raid0_unplug(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; - raid0_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t **devlist = conf->strip_zone[0].dev; - int i; - - for (i=0; i<mddev->raid_disks; i++) { - struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); - - blk_unplug(r_queue); - } -} +#include <linux/slab.h> +#include "md.h" +#include "raid0.h" +#include "raid5.h" static int raid0_congested(void *data, int bits) { - mddev_t *mddev = data; - raid0_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t **devlist = conf->strip_zone[0].dev; + struct mddev *mddev = data; + struct r0conf *conf = mddev->private; + struct md_rdev **devlist = conf->devlist; + int raid_disks = conf->strip_zone[0].nb_dev; int i, ret = 0; - for (i = 0; i < mddev->raid_disks && !ret ; i++) { + if (mddev_congested(mddev, bits)) + return 1; + + for (i = 0; i < raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); @@ -54,68 +45,110 @@ static int raid0_congested(void *data, int bits) return ret; } +/* + * inform the user of the raid configuration +*/ +static void dump_zones(struct mddev *mddev) +{ + int j, k; + sector_t zone_size = 0; + sector_t zone_start = 0; + char b[BDEVNAME_SIZE]; + struct r0conf *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; + printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n", + mdname(mddev), + conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); + for (j = 0; j < conf->nr_strip_zones; j++) { + printk(KERN_INFO "md: zone%d=[", j); + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) + printk(KERN_CONT "%s%s", k?"/":"", + bdevname(conf->devlist[j*raid_disks + + k]->bdev, b)); + printk(KERN_CONT "]\n"); + + zone_size = conf->strip_zone[j].zone_end - zone_start; + printk(KERN_INFO " zone-offset=%10lluKB, " + "device-offset=%10lluKB, size=%10lluKB\n", + (unsigned long long)zone_start>>1, + (unsigned long long)conf->strip_zone[j].dev_start>>1, + (unsigned long long)zone_size>>1); + zone_start = conf->strip_zone[j].zone_end; + } + printk(KERN_INFO "\n"); +} -static int create_strip_zones (mddev_t *mddev) +static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) { - int i, c, j; - sector_t current_offset, curr_zone_offset; - sector_t min_spacing; - raid0_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; - struct list_head *tmp1, *tmp2; + int i, c, err; + sector_t curr_zone_end, sectors; + struct md_rdev *smallest, *rdev1, *rdev2, *rdev, **dev; struct strip_zone *zone; int cnt; char b[BDEVNAME_SIZE]; - - /* - * The number of 'same size groups' - */ - conf->nr_strip_zones = 0; - - rdev_for_each(rdev1, tmp1, 
mddev) { - printk("raid0: looking at %s\n", - bdevname(rdev1->bdev,b)); + char b2[BDEVNAME_SIZE]; + struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + bool discard_supported = false; + + if (!conf) + return -ENOMEM; + rdev_for_each(rdev1, mddev) { + pr_debug("md/raid0:%s: looking at %s\n", + mdname(mddev), + bdevname(rdev1->bdev, b)); c = 0; - rdev_for_each(rdev2, tmp2, mddev) { - printk("raid0: comparing %s(%llu)", - bdevname(rdev1->bdev,b), - (unsigned long long)rdev1->size); - printk(" with %s(%llu)\n", - bdevname(rdev2->bdev,b), - (unsigned long long)rdev2->size); + + /* round size to chunk_size */ + sectors = rdev1->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev1->sectors = sectors * mddev->chunk_sectors; + + rdev_for_each(rdev2, mddev) { + pr_debug("md/raid0:%s: comparing %s(%llu)" + " with %s(%llu)\n", + mdname(mddev), + bdevname(rdev1->bdev,b), + (unsigned long long)rdev1->sectors, + bdevname(rdev2->bdev,b2), + (unsigned long long)rdev2->sectors); if (rdev2 == rdev1) { - printk("raid0: END\n"); + pr_debug("md/raid0:%s: END\n", + mdname(mddev)); break; } - if (rdev2->size == rdev1->size) - { + if (rdev2->sectors == rdev1->sectors) { /* * Not unique, don't count it as a new * group */ - printk("raid0: EQUAL\n"); + pr_debug("md/raid0:%s: EQUAL\n", + mdname(mddev)); c = 1; break; } - printk("raid0: NOT EQUAL\n"); + pr_debug("md/raid0:%s: NOT EQUAL\n", + mdname(mddev)); } if (!c) { - printk("raid0: ==> UNIQUE\n"); + pr_debug("md/raid0:%s: ==> UNIQUE\n", + mdname(mddev)); conf->nr_strip_zones++; - printk("raid0: %d zones\n", conf->nr_strip_zones); + pr_debug("md/raid0:%s: %d zones\n", + mdname(mddev), conf->nr_strip_zones); } } - printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); - + pr_debug("md/raid0:%s: FINAL %d zones\n", + mdname(mddev), conf->nr_strip_zones); + err = -ENOMEM; conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); if (!conf->strip_zone) - return 1; - conf->devlist = kzalloc(sizeof(mdk_rdev_t*)* + goto abort; + conf->devlist = kzalloc(sizeof(struct md_rdev*)* conf->nr_strip_zones*mddev->raid_disks, GFP_KERNEL); if (!conf->devlist) - return 1; + goto abort; /* The first zone must contain all devices, so here we check that * there is a proper alignment of slots to devices and find them all @@ -123,123 +156,207 @@ static int create_strip_zones (mddev_t *mddev) zone = &conf->strip_zone[0]; cnt = 0; smallest = NULL; - zone->dev = conf->devlist; - rdev_for_each(rdev1, tmp1, mddev) { + dev = conf->devlist; + err = -EINVAL; + rdev_for_each(rdev1, mddev) { int j = rdev1->raid_disk; - if (j < 0 || j >= mddev->raid_disks) { - printk("raid0: bad disk number %d - aborting!\n", j); + if (mddev->level == 10) { + /* taking over a raid10-n2 array */ + j /= 2; + rdev1->new_raid_disk = j; + } + + if (mddev->level == 1) { + /* taiking over a raid1 array- + * we have only one active disk + */ + j = 0; + rdev1->new_raid_disk = j; + } + + if (j < 0) { + printk(KERN_ERR + "md/raid0:%s: remove inactive devices before converting to RAID0\n", + mdname(mddev)); goto abort; } - if (zone->dev[j]) { - printk("raid0: multiple devices for %d - aborting!\n", - j); + if (j >= mddev->raid_disks) { + printk(KERN_ERR "md/raid0:%s: bad disk number %d - " + "aborting!\n", mdname(mddev), j); goto abort; } - zone->dev[j] = rdev1; + if (dev[j]) { + printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " + "aborting!\n", mdname(mddev), j); + goto abort; + } + dev[j] = rdev1; - blk_queue_stack_limits(mddev->queue, - rdev1->bdev->bd_disk->queue); - 
/* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. - */ + disk_stack_limits(mddev->gendisk, rdev1->bdev, + rdev1->data_offset << 9); - if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) + conf->has_merge_bvec = 1; - if (!smallest || (rdev1->size <smallest->size)) + if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; + + if (blk_queue_discard(bdev_get_queue(rdev1->bdev))) + discard_supported = true; } if (cnt != mddev->raid_disks) { - printk("raid0: too few disks (%d of %d) - aborting!\n", - cnt, mddev->raid_disks); + printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " + "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); goto abort; } zone->nb_dev = cnt; - zone->size = smallest->size * cnt; - zone->zone_offset = 0; + zone->zone_end = smallest->sectors * cnt; - current_offset = smallest->size; - curr_zone_offset = zone->size; + curr_zone_end = zone->zone_end; /* now do the other zones */ for (i = 1; i < conf->nr_strip_zones; i++) { + int j; + zone = conf->strip_zone + i; - zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks; + dev = conf->devlist + i * mddev->raid_disks; - printk("raid0: zone %d\n", i); - zone->dev_offset = current_offset; + pr_debug("md/raid0:%s: zone %d\n", mdname(mddev), i); + zone->dev_start = smallest->sectors; smallest = NULL; c = 0; for (j=0; j<cnt; j++) { - char b[BDEVNAME_SIZE]; - rdev = conf->strip_zone[0].dev[j]; - printk("raid0: checking %s ...", bdevname(rdev->bdev,b)); - if (rdev->size > current_offset) - { - printk(" contained as device %d\n", c); - zone->dev[c] = rdev; - c++; - if (!smallest || (rdev->size <smallest->size)) { - smallest = rdev; - printk(" (%llu) is smallest!.\n", - (unsigned long long)rdev->size); - } - } else - printk(" nope.\n"); + rdev = conf->devlist[j]; + if (rdev->sectors <= zone->dev_start) { + pr_debug("md/raid0:%s: checking %s ... nope\n", + mdname(mddev), + bdevname(rdev->bdev, b)); + continue; + } + pr_debug("md/raid0:%s: checking %s ..." + " contained as device %d\n", + mdname(mddev), + bdevname(rdev->bdev, b), c); + dev[c] = rdev; + c++; + if (!smallest || rdev->sectors < smallest->sectors) { + smallest = rdev; + pr_debug("md/raid0:%s: (%llu) is smallest!.\n", + mdname(mddev), + (unsigned long long)rdev->sectors); + } } zone->nb_dev = c; - zone->size = (smallest->size - current_offset) * c; - printk("raid0: zone->nb_dev: %d, size: %llu\n", - zone->nb_dev, (unsigned long long)zone->size); + sectors = (smallest->sectors - zone->dev_start) * c; + pr_debug("md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", + mdname(mddev), + zone->nb_dev, (unsigned long long)sectors); - zone->zone_offset = curr_zone_offset; - curr_zone_offset += zone->size; + curr_zone_end += sectors; + zone->zone_end = curr_zone_end; - current_offset = smallest->size; - printk("raid0: current zone offset: %llu\n", - (unsigned long long)current_offset); + pr_debug("md/raid0:%s: current zone start: %llu\n", + mdname(mddev), + (unsigned long long)smallest->sectors); } + mddev->queue->backing_dev_info.congested_fn = raid0_congested; + mddev->queue->backing_dev_info.congested_data = mddev; - /* Now find appropriate hash spacing. - * We want a number which causes most hash entries to cover - * at most two strips, but the hash table must be at most - * 1 PAGE. 
We choose the smallest strip, or contiguous collection - * of strips, that has big enough size. We never consider the last - * strip though as it's size has no bearing on the efficacy of the hash - * table. + /* + * now since we have the hard sector sizes, we can make sure + * chunk size is a multiple of that sector size */ - conf->hash_spacing = curr_zone_offset; - min_spacing = curr_zone_offset; - sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*)); - for (i=0; i < conf->nr_strip_zones-1; i++) { - sector_t sz = 0; - for (j=i; j<conf->nr_strip_zones-1 && - sz < min_spacing ; j++) - sz += conf->strip_zone[j].size; - if (sz >= min_spacing && sz < conf->hash_spacing) - conf->hash_spacing = sz; + if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { + printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", + mdname(mddev), + mddev->chunk_sectors << 9); + goto abort; } - mddev->queue->unplug_fn = raid0_unplug; + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); - mddev->queue->backing_dev_info.congested_fn = raid0_congested; - mddev->queue->backing_dev_info.congested_data = mddev; + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); + *private_conf = conf; - printk("raid0: done.\n"); return 0; - abort: - return 1; +abort: + kfree(conf->strip_zone); + kfree(conf->devlist); + kfree(conf); + *private_conf = ERR_PTR(err); + return err; +} + +/* Find the zone which holds a particular offset + * Update *sectorp to be an offset in that zone + */ +static struct strip_zone *find_zone(struct r0conf *conf, + sector_t *sectorp) +{ + int i; + struct strip_zone *z = conf->strip_zone; + sector_t sector = *sectorp; + + for (i = 0; i < conf->nr_strip_zones; i++) + if (sector < z[i].zone_end) { + if (i) + *sectorp = sector - z[i-1].zone_end; + return z + i; + } + BUG(); +} + +/* + * remaps the bio to the target device. we separate two flows. 
+ * power 2 flow and a general flow for the sake of perfromance +*/ +static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, + sector_t sector, sector_t *sector_offset) +{ + unsigned int sect_in_chunk; + sector_t chunk; + struct r0conf *conf = mddev->private; + int raid_disks = conf->strip_zone[0].nb_dev; + unsigned int chunk_sects = mddev->chunk_sectors; + + if (is_power_of_2(chunk_sects)) { + int chunksect_bits = ffz(~chunk_sects); + /* find the sector offset inside the chunk */ + sect_in_chunk = sector & (chunk_sects - 1); + sector >>= chunksect_bits; + /* chunk in zone */ + chunk = *sector_offset; + /* quotient is the chunk in real device*/ + sector_div(chunk, zone->nb_dev << chunksect_bits); + } else{ + sect_in_chunk = sector_div(sector, chunk_sects); + chunk = *sector_offset; + sector_div(chunk, chunk_sects * zone->nb_dev); + } + /* + * position the bio over the real device + * real sector = chunk in device + starting of zone + * + the position in the chunk + */ + *sector_offset = (chunk * chunk_sects) + sect_in_chunk; + return conf->devlist[(zone - conf->strip_zone)*raid_disks + + sector_div(sector, zone->nb_dev)]; } /** - * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged + * raid0_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue * @bvm: properties of new bio * @biovec: the request that could be merged to it. @@ -250,102 +367,95 @@ static int raid0_mergeable_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - mddev_t *mddev = q->queuedata; + struct mddev *mddev = q->queuedata; + struct r0conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + sector_t sector_offset = sector; int max; - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; - - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; - if (max < 0) max = 0; /* bio_add cannot handle a negative return */ + struct strip_zone *zone; + struct md_rdev *rdev; + struct request_queue *subq; + + if (is_power_of_2(chunk_sectors)) + max = (chunk_sectors - ((sector & (chunk_sectors-1)) + + bio_sectors)) << 9; + else + max = (chunk_sectors - (sector_div(sector, chunk_sectors) + + bio_sectors)) << 9; + if (max < 0) + max = 0; /* bio_add cannot handle a negative return */ if (max <= biovec->bv_len && bio_sectors == 0) return biovec->bv_len; - else + if (max < biovec->bv_len) + /* too small already, no need to check further */ + return max; + if (!conf->has_merge_bvec) + return max; + + /* May need to check subordinate device */ + sector = sector_offset; + zone = find_zone(mddev->private, §or_offset); + rdev = map_sector(mddev, zone, sector, §or_offset); + subq = bdev_get_queue(rdev->bdev); + if (subq->merge_bvec_fn) { + bvm->bi_bdev = rdev->bdev; + bvm->bi_sector = sector_offset + zone->dev_start + + rdev->data_offset; + return min(max, subq->merge_bvec_fn(subq, bvm, biovec)); + } else return max; } -static int raid0_run (mddev_t *mddev) +static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks) { - unsigned cur=0, i=0, nb_zone; - s64 size; - raid0_conf_t *conf; - mdk_rdev_t *rdev; - struct list_head *tmp; - - if (mddev->chunk_size == 0) { - printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); - return -EINVAL; - } - printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n", - mdname(mddev), - mddev->chunk_size >> 
9, - (mddev->chunk_size>>1)-1); - blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9); - blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1); - mddev->queue->queue_lock = &mddev->queue->__queue_lock; + sector_t array_sectors = 0; + struct md_rdev *rdev; - conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL); - if (!conf) - goto out; - mddev->private = (void *)conf; - - conf->strip_zone = NULL; - conf->devlist = NULL; - if (create_strip_zones (mddev)) - goto out_free_conf; + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); - /* calculate array device size */ - mddev->array_sectors = 0; - rdev_for_each(rdev, tmp, mddev) - mddev->array_sectors += rdev->size * 2; - - printk("raid0 : md_size is %llu blocks.\n", - (unsigned long long)mddev->array_sectors / 2); - printk("raid0 : conf->hash_spacing is %llu blocks.\n", - (unsigned long long)conf->hash_spacing); - { - sector_t s = mddev->array_sectors / 2; - sector_t space = conf->hash_spacing; - int round; - conf->preshift = 0; - if (sizeof(sector_t) > sizeof(u32)) { - /*shift down space and s so that sector_div will work */ - while (space > (sector_t) (~(u32)0)) { - s >>= 1; - space >>= 1; - s += 1; /* force round-up */ - conf->preshift++; - } - } - round = sector_div(s, (u32)space) ? 1 : 0; - nb_zone = s + round; - } - printk("raid0 : nb_zone is %d.\n", nb_zone); - - printk("raid0 : Allocating %Zd bytes for hash.\n", - nb_zone*sizeof(struct strip_zone*)); - conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL); - if (!conf->hash_table) - goto out_free_conf; - size = conf->strip_zone[cur].size; - - conf->hash_table[0] = conf->strip_zone + cur; - for (i=1; i< nb_zone; i++) { - while (size <= conf->hash_spacing) { - cur++; - size += conf->strip_zone[cur].size; - } - size -= conf->hash_spacing; - conf->hash_table[i] = conf->strip_zone + cur; + rdev_for_each(rdev, mddev) + array_sectors += (rdev->sectors & + ~(sector_t)(mddev->chunk_sectors-1)); + + return array_sectors; +} + +static int raid0_stop(struct mddev *mddev); + +static int raid0_run(struct mddev *mddev) +{ + struct r0conf *conf; + int ret; + + if (mddev->chunk_sectors == 0) { + printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", + mdname(mddev)); + return -EINVAL; } - if (conf->preshift) { - conf->hash_spacing >>= conf->preshift; - /* round hash_spacing up so when we divide by it, we - * err on the side of too-low, which is safest - */ - conf->hash_spacing++; + if (md_check_no_bitmap(mddev)) + return -EINVAL; + blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + + /* if private is not null, we are here after takeover */ + if (mddev->private == NULL) { + ret = create_strip_zones(mddev, &conf); + if (ret < 0) + return ret; + mddev->private = conf; } + conf = mddev->private; + + /* calculate array device size */ + md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); + printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", + mdname(mddev), + (unsigned long long)mddev->array_sectors); /* calculate the max read-ahead size. * For read-ahead of large files to be effective, we need to * readahead at least twice a whole stripe. i.e. number of devices @@ -356,154 +466,253 @@ static int raid0_run (mddev_t *mddev) * chunksize should be used in that case. 
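For a sense of scale, here is a minimal userspace sketch of the stripe/read-ahead arithmetic described in the comment above, using invented values (4 members, a 512 KiB chunk, 4 KiB pages) rather than anything taken from this patch:

#include <stdio.h>

int main(void)
{
        unsigned int raid_disks = 4;          /* assumed member count */
        unsigned int chunk_sectors = 1024;    /* a 512 KiB chunk */
        unsigned int page_size = 4096;

        /* one full stripe, expressed in pages, as in raid0_run() below */
        unsigned int stripe = raid_disks * (chunk_sectors << 9) / page_size;

        printf("stripe = %u pages, ra_pages raised to >= %u pages (%u KiB)\n",
               stripe, 2 * stripe, 2 * stripe * page_size / 1024);
        return 0;
}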
*/ { - int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE; + int stripe = mddev->raid_disks * + (mddev->chunk_sectors << 9) / PAGE_SIZE; if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); - return 0; + dump_zones(mddev); -out_free_conf: - kfree(conf->strip_zone); - kfree(conf->devlist); - kfree(conf); - mddev->private = NULL; -out: - return -ENOMEM; + ret = md_integrity_register(mddev); + if (ret) + raid0_stop(mddev); + + return ret; } -static int raid0_stop (mddev_t *mddev) +static int raid0_stop(struct mddev *mddev) { - raid0_conf_t *conf = mddev_to_conf(mddev); + struct r0conf *conf = mddev->private; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - kfree(conf->hash_table); - conf->hash_table = NULL; kfree(conf->strip_zone); - conf->strip_zone = NULL; + kfree(conf->devlist); kfree(conf); mddev->private = NULL; - return 0; } -static int raid0_make_request (struct request_queue *q, struct bio *bio) +/* + * Is io distribute over 1 or more chunks ? +*/ +static inline int is_io_in_chunk_boundary(struct mddev *mddev, + unsigned int chunk_sects, struct bio *bio) +{ + if (likely(is_power_of_2(chunk_sects))) { + return chunk_sects >= + ((bio->bi_iter.bi_sector & (chunk_sects-1)) + + bio_sectors(bio)); + } else{ + sector_t sector = bio->bi_iter.bi_sector; + return chunk_sects >= (sector_div(sector, chunk_sects) + + bio_sectors(bio)); + } +} + +static void raid0_make_request(struct mddev *mddev, struct bio *bio) { - mddev_t *mddev = q->queuedata; - unsigned int sect_in_chunk, chunksize_bits, chunk_size, chunk_sects; - raid0_conf_t *conf = mddev_to_conf(mddev); struct strip_zone *zone; - mdk_rdev_t *tmp_dev; - sector_t chunk; - sector_t block, rsect; - const int rw = bio_data_dir(bio); + struct md_rdev *tmp_dev; + struct bio *split; - if (unlikely(bio_barrier(bio))) { - bio_endio(bio, -EOPNOTSUPP); - return 0; + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); + return; } - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); - - chunk_size = mddev->chunk_size >> 10; - chunk_sects = mddev->chunk_size >> 9; - chunksize_bits = ffz(~chunk_size); - block = bio->bi_sector >> 1; - - - if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) { - struct bio_pair *bp; - /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || - bio->bi_idx != 0) - goto bad_map; - /* This is a one page bio that upper layers - * refuse to split for us, so we need to split it. - */ - bp = bio_split(bio, bio_split_pool, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); - if (raid0_make_request(q, &bp->bio1)) - generic_make_request(&bp->bio1); - if (raid0_make_request(q, &bp->bio2)) - generic_make_request(&bp->bio2); - - bio_pair_release(bp); - return 0; + do { + sector_t sector = bio->bi_iter.bi_sector; + unsigned chunk_sects = mddev->chunk_sectors; + + unsigned sectors = chunk_sects - + (likely(is_power_of_2(chunk_sects)) + ? 
(sector & (chunk_sects-1)) + : sector_div(sector, chunk_sects)); + + if (sectors < bio_sectors(bio)) { + split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } + + zone = find_zone(mddev->private, §or); + tmp_dev = map_sector(mddev, zone, sector, §or); + split->bi_bdev = tmp_dev->bdev; + split->bi_iter.bi_sector = sector + zone->dev_start + + tmp_dev->data_offset; + + if (unlikely((split->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { + /* Just ignore it */ + bio_endio(split, 0); + } else + generic_make_request(split); + } while (split != bio); +} + +static void raid0_status(struct seq_file *seq, struct mddev *mddev) +{ + seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2); + return; +} + +static void *raid0_takeover_raid45(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct r0conf *priv_conf; + + if (mddev->degraded != 1) { + printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", + mdname(mddev), + mddev->degraded); + return ERR_PTR(-EINVAL); } - - { - sector_t x = block >> conf->preshift; - sector_div(x, (u32)conf->hash_spacing); - zone = conf->hash_table[x]; + rdev_for_each(rdev, mddev) { + /* check slot number for a disk */ + if (rdev->raid_disk == mddev->raid_disks-1) { + printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + rdev->sectors = mddev->dev_sectors; } - - while (block >= (zone->zone_offset + zone->size)) - zone++; - - sect_in_chunk = bio->bi_sector & ((chunk_size<<1) -1); + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks--; + mddev->delta_disks = -1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} - { - sector_t x = (block - zone->zone_offset) >> chunksize_bits; +static void *raid0_takeover_raid10(struct mddev *mddev) +{ + struct r0conf *priv_conf; + + /* Check layout: + * - far_copies must be 1 + * - near_copies must be 2 + * - disks number must be even + * - all mirrors must be already degraded + */ + if (mddev->layout != ((1 << 8) + 2)) { + printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n", + mdname(mddev), + mddev->layout); + return ERR_PTR(-EINVAL); + } + if (mddev->raid_disks & 1) { + printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + if (mddev->degraded != (mddev->raid_disks>>1)) { + printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->delta_disks = - mddev->raid_disks / 2; + mddev->raid_disks += mddev->delta_disks; + mddev->degraded = 0; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; +} - sector_div(x, zone->nb_dev); - chunk = x; +static void *raid0_takeover_raid1(struct mddev *mddev) +{ + struct r0conf *priv_conf; + int chunksect; - x = block >> chunksize_bits; - tmp_dev = zone->dev[sector_div(x, zone->nb_dev)]; + /* Check layout: + * - (N - 1) mirror drives must be already faulty + */ + if ((mddev->raid_disks - 1) != mddev->degraded) { + printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors 
drives must be already faulty!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); } - rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1) - + sect_in_chunk; - - bio->bi_bdev = tmp_dev->bdev; - bio->bi_sector = rsect + tmp_dev->data_offset; /* - * Let the main block layer submit the IO and resolve recursion: + * a raid1 doesn't have the notion of chunk size, so + * figure out the largest suitable size we can use. */ - return 1; - -bad_map: - printk("raid0_make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", chunk_size, - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); - - bio_io_error(bio); - return 0; + chunksect = 64 * 2; /* 64K by default */ + + /* The array must be an exact multiple of chunksize */ + while (chunksect && (mddev->array_sectors & (chunksect - 1))) + chunksect >>= 1; + + if ((chunksect << 9) < PAGE_SIZE) + /* array size does not allow a suitable chunk size */ + return ERR_PTR(-EINVAL); + + /* Set new parameters */ + mddev->new_level = 0; + mddev->new_layout = 0; + mddev->new_chunk_sectors = chunksect; + mddev->chunk_sectors = chunksect; + mddev->delta_disks = 1 - mddev->raid_disks; + mddev->raid_disks = 1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + create_strip_zones(mddev, &priv_conf); + return priv_conf; } -static void raid0_status (struct seq_file *seq, mddev_t *mddev) +static void *raid0_takeover(struct mddev *mddev) { -#undef MD_DEBUG -#ifdef MD_DEBUG - int j, k, h; - char b[BDEVNAME_SIZE]; - raid0_conf_t *conf = mddev_to_conf(mddev); + /* raid0 can take over: + * raid4 - if all data disks are active. + * raid5 - providing it is Raid4 layout and one disk is faulty + * raid10 - assuming we have all necessary active disks + * raid1 - with (N -1) mirror drives faulty + */ + if (mddev->level == 4) + return raid0_takeover_raid45(mddev); - h = 0; - for (j = 0; j < conf->nr_strip_zones; j++) { - seq_printf(seq, " z%d", j); - if (conf->hash_table[h] == conf->strip_zone+j) - seq_printf(seq, "(h%d)", h++); - seq_printf(seq, "=["); - for (k = 0; k < conf->strip_zone[j].nb_dev; k++) - seq_printf(seq, "%s/", bdevname( - conf->strip_zone[j].dev[k]->bdev,b)); + if (mddev->level == 5) { + if (mddev->layout == ALGORITHM_PARITY_N) + return raid0_takeover_raid45(mddev); - seq_printf(seq, "] zo=%d do=%d s=%d\n", - conf->strip_zone[j].zone_offset, - conf->strip_zone[j].dev_offset, - conf->strip_zone[j].size); + printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", + mdname(mddev), ALGORITHM_PARITY_N); } -#endif - seq_printf(seq, " %dk chunks", mddev->chunk_size/1024); - return; + + if (mddev->level == 10) + return raid0_takeover_raid10(mddev); + + if (mddev->level == 1) + return raid0_takeover_raid1(mddev); + + printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", + mddev->level); + + return ERR_PTR(-EINVAL); +} + +static void raid0_quiesce(struct mddev *mddev, int state) +{ } -static struct mdk_personality raid0_personality= +static struct md_personality raid0_personality= { .name = "raid0", .level = 0, @@ -512,6 +721,9 @@ static struct mdk_personality raid0_personality= .run = raid0_run, .stop = raid0_stop, .status = raid0_status, + .size = raid0_size, + .takeover = raid0_takeover, + .quiesce = raid0_quiesce, }; static int __init raid0_init (void) @@ -527,6 +739,7 @@ static void raid0_exit (void) module_init(raid0_init); module_exit(raid0_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID0 (striping) personality for MD"); MODULE_ALIAS("md-personality-2"); 
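The zone bookkeeping (zone_end, dev_start, nb_dev) and the chunk arithmetic in map_sector() above can be hard to follow in diff form. The following is a minimal, self-contained userspace sketch of the underlying striping arithmetic for the common single-zone case (all members the same size, power-of-two chunk). The device count, chunk size and example sector are invented, and the real driver additionally applies the per-zone offsets when members differ in size:

#include <stdio.h>

int main(void)
{
        unsigned long long array_sector = 100000;   /* assumed logical sector */
        unsigned int chunk_sects = 128;             /* 64 KiB chunk */
        unsigned int nb_dev = 4;                    /* members in the zone */

        /* offset inside the chunk, chunk number, member and member offset */
        unsigned long long sect_in_chunk = array_sector & (chunk_sects - 1);
        unsigned long long chunk_nr      = array_sector / chunk_sects;
        unsigned int       dev           = chunk_nr % nb_dev;
        unsigned long long dev_sector    = (chunk_nr / nb_dev) * chunk_sects
                                           + sect_in_chunk;

        printf("array sector %llu -> member %u, member sector %llu\n",
               array_sector, dev, dev_sector);
        return 0;
}

For unequal members, find_zone() first reduces the array sector to an offset within the matching zone, and the result is then shifted by that zone's dev_start (plus the rdev's data_offset), as done in raid0_make_request() above.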
/* RAID0 */ MODULE_ALIAS("md-raid0"); MODULE_ALIAS("md-level-0"); diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h new file mode 100644 index 00000000000..05539d9c97f --- /dev/null +++ b/drivers/md/raid0.h @@ -0,0 +1,19 @@ +#ifndef _RAID0_H +#define _RAID0_H + +struct strip_zone { + sector_t zone_end; /* Start of the next zone (in sectors) */ + sector_t dev_start; /* Zone offset in real dev (in sectors) */ + int nb_dev; /* # of devices attached to the zone */ +}; + +struct r0conf { + struct strip_zone *strip_zone; + struct md_rdev **devlist; /* lists of rdevs, pointed to + * by strip_zone->dev */ + int nr_strip_zones; + int has_merge_bvec; /* at least one member has + * a merge_bvec_fn */ +}; + +#endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 03a5ab705c2..56e24c072b6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -31,40 +31,52 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" -#include <linux/raid/raid1.h> -#include <linux/raid/bitmap.h> - -#define DEBUG 0 -#if DEBUG -#define PRINTK(x...) printk(x) -#else -#define PRINTK(x...) -#endif +#include <linux/slab.h> +#include <linux/delay.h> +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/ratelimit.h> +#include "md.h" +#include "raid1.h" +#include "bitmap.h" /* * Number of guaranteed r1bios in case of extreme VM load: */ #define NR_RAID1_BIOS 256 +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) -static void unplug_slaves(mddev_t *mddev); +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + +/* When there are this many requests queue to be written by + * the raid1 thread, we become 'congested' to provide back-pressure + * for writeback. 
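The IO_BLOCKED / IO_MADE_GOOD markers defined above rely on small integer values cast to a pointer type: they can never collide with a real bio address, so a bios[] slot can record why there is no bio without any extra state, and one comparison filters both. A userspace illustration of the same idiom (types and names are stand-ins, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

struct fake_bio { int dummy; };

#define IO_BLOCKED   ((struct fake_bio *)1)
#define IO_MADE_GOOD ((struct fake_bio *)2)
#define BIO_SPECIAL(b) ((unsigned long)(b) <= 2)

static void put_slot(struct fake_bio *b)
{
        if (b && !BIO_SPECIAL(b))       /* only real allocations are freed */
                free(b);
}

int main(void)
{
        struct fake_bio *slots[3] = { malloc(sizeof(struct fake_bio)),
                                      IO_BLOCKED, IO_MADE_GOOD };

        for (int i = 0; i < 3; i++) {
                printf("slot %d: %s\n", i,
                       BIO_SPECIAL(slots[i]) ? "marker" : "real bio");
                put_slot(slots[i]);
        }
        return 0;
}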
+ */ +static int max_queued_requests = 1024; -static void allow_barrier(conf_t *conf); -static void lower_barrier(conf_t *conf); +static void allow_barrier(struct r1conf *conf, sector_t start_next_window, + sector_t bi_sector); +static void lower_barrier(struct r1conf *conf); static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) { struct pool_info *pi = data; - r1bio_t *r1_bio; - int size = offsetof(r1bio_t, bios[pi->raid_disks]); + int size = offsetof(struct r1bio, bios[pi->raid_disks]); /* allocate a r1bio with room for raid_disks entries in the bios array */ - r1_bio = kzalloc(size, gfp_flags); - if (!r1_bio) - unplug_slaves(pi->mddev); - - return r1_bio; + return kzalloc(size, gfp_flags); } static void r1bio_pool_free(void *r1_bio, void *data) @@ -73,30 +85,30 @@ static void r1bio_pool_free(void *r1_bio, void *data) } #define RESYNC_BLOCK_SIZE (64*1024) -//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_DEPTH 32 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -#define RESYNC_WINDOW (2048*1024) +#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) +#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { struct pool_info *pi = data; - struct page *page; - r1bio_t *r1_bio; + struct r1bio *r1_bio; struct bio *bio; + int need_pages; int i, j; r1_bio = r1bio_pool_alloc(gfp_flags, pi); - if (!r1_bio) { - unplug_slaves(pi->mddev); + if (!r1_bio) return NULL; - } /* * Allocate bios : 1 for reading, n-1 for writing */ for (j = pi->raid_disks ; j-- ; ) { - bio = bio_alloc(gfp_flags, RESYNC_PAGES); + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); if (!bio) goto out_free_bio; r1_bio->bios[j] = bio; @@ -108,18 +120,15 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) * RESYNC_PAGES for each bio. 
*/ if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) - j = pi->raid_disks; + need_pages = pi->raid_disks; else - j = 1; - while(j--) { + need_pages = 1; + for (j = 0; j < need_pages; j++) { bio = r1_bio->bios[j]; - for (i = 0; i < RESYNC_PAGES; i++) { - page = alloc_page(gfp_flags); - if (unlikely(!page)) - goto out_free_pages; + bio->bi_vcnt = RESYNC_PAGES; - bio->bi_io_vec[i].bv_page = page; - } + if (bio_alloc_pages(bio, gfp_flags)) + goto out_free_pages; } /* If not user-requests, copy the page pointers to all bios */ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { @@ -134,12 +143,15 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; out_free_pages: - for (i=0; i < RESYNC_PAGES ; i++) - for (j=0 ; j < pi->raid_disks; j++) - safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); - j = -1; + while (--j >= 0) { + struct bio_vec *bv; + + bio_for_each_segment_all(bv, r1_bio->bios[j], i) + __free_page(bv->bv_page); + } + out_free_bio: - while ( ++j < pi->raid_disks ) + while (++j < pi->raid_disks) bio_put(r1_bio->bios[j]); r1bio_pool_free(r1_bio, data); return NULL; @@ -149,7 +161,7 @@ static void r1buf_pool_free(void *__r1_bio, void *data) { struct pool_info *pi = data; int i,j; - r1bio_t *r1bio = __r1_bio; + struct r1bio *r1bio = __r1_bio; for (i = 0; i < RESYNC_PAGES; i++) for (j = pi->raid_disks; j-- ;) { @@ -164,38 +176,32 @@ static void r1buf_pool_free(void *__r1_bio, void *data) r1bio_pool_free(r1bio, data); } -static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) +static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) { int i; - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { struct bio **bio = r1_bio->bios + i; - if (*bio && *bio != IO_BLOCKED) + if (!BIO_SPECIAL(*bio)) bio_put(*bio); *bio = NULL; } } -static void free_r1bio(r1bio_t *r1_bio) +static void free_r1bio(struct r1bio *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); - - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf); + struct r1conf *conf = r1_bio->mddev->private; put_all_bios(conf, r1_bio); mempool_free(r1_bio, conf->r1bio_pool); } -static void put_buf(r1bio_t *r1_bio) +static void put_buf(struct r1bio *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + struct r1conf *conf = r1_bio->mddev->private; int i; - for (i=0; i<conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2; i++) { struct bio *bio = r1_bio->bios[i]; if (bio->bi_end_io) rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); @@ -206,11 +212,11 @@ static void put_buf(r1bio_t *r1_bio) lower_barrier(conf); } -static void reschedule_retry(r1bio_t *r1_bio) +static void reschedule_retry(struct r1bio *r1_bio) { unsigned long flags; - mddev_t *mddev = r1_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); @@ -226,20 +232,52 @@ static void reschedule_retry(r1bio_t *r1_bio) * operation and are ready to return a success/failure code to the buffer * cache layer. 
*/ -static void raid_end_bio_io(r1bio_t *r1_bio) +static void call_bio_endio(struct r1bio *r1_bio) +{ + struct bio *bio = r1_bio->master_bio; + int done; + struct r1conf *conf = r1_bio->mddev->private; + sector_t start_next_window = r1_bio->start_next_window; + sector_t bi_sector = bio->bi_iter.bi_sector; + + if (bio->bi_phys_segments) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + bio->bi_phys_segments--; + done = (bio->bi_phys_segments == 0); + spin_unlock_irqrestore(&conf->device_lock, flags); + /* + * make_request() might be waiting for + * bi_phys_segments to decrease + */ + wake_up(&conf->wait_barrier); + } else + done = 1; + + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (done) { + bio_endio(bio, 0); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + allow_barrier(conf, start_next_window, bi_sector); + } +} + +static void raid_end_bio_io(struct r1bio *r1_bio) { struct bio *bio = r1_bio->master_bio; /* if nobody has done the final endio yet, do it now */ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { - PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", - (bio_data_dir(bio) == WRITE) ? "write" : "read", - (unsigned long long) bio->bi_sector, - (unsigned long long) bio->bi_sector + - (bio->bi_size >> 9) - 1); + pr_debug("raid1: sync end %s on sectors %llu-%llu\n", + (bio_data_dir(bio) == WRITE) ? "write" : "read", + (unsigned long long) bio->bi_iter.bi_sector, + (unsigned long long) bio_end_sector(bio) - 1); - bio_endio(bio, - test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + call_bio_endio(r1_bio); } free_r1bio(r1_bio); } @@ -247,20 +285,39 @@ static void raid_end_bio_io(r1bio_t *r1_bio) /* * Update disk head position estimator based on IRQ completion info. 
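call_bio_endio() above reuses bi_phys_segments as a count of sub-requests still outstanding against one master bio, so only the completion that drops the count to zero signals the caller. A small sketch of that accounting pattern, with a pthread mutex standing in for the kernel spinlock and all names invented:

#include <pthread.h>
#include <stdio.h>

struct master_req {
        pthread_mutex_t lock;
        int outstanding;        /* plays the role of bi_phys_segments */
};

static void sub_request_done(struct master_req *m)
{
        int done;

        pthread_mutex_lock(&m->lock);
        done = (--m->outstanding == 0);
        pthread_mutex_unlock(&m->lock);

        if (done)
                printf("master request completed\n");
}

int main(void)
{
        struct master_req m = { PTHREAD_MUTEX_INITIALIZER, 3 };

        for (int i = 0; i < 3; i++)
                sub_request_done(&m);   /* only the third call prints */
        return 0;
}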
*/ -static inline void update_head_pos(int disk, r1bio_t *r1_bio) +static inline void update_head_pos(int disk, struct r1bio *r1_bio) { - conf_t *conf = mddev_to_conf(r1_bio->mddev); + struct r1conf *conf = r1_bio->mddev->private; conf->mirrors[disk].head_position = r1_bio->sector + (r1_bio->sectors); } +/* + * Find the disk number which triggered given bio + */ +static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) +{ + int mirror; + struct r1conf *conf = r1_bio->mddev->private; + int raid_disks = conf->raid_disks; + + for (mirror = 0; mirror < raid_disks * 2; mirror++) + if (r1_bio->bios[mirror] == bio) + break; + + BUG_ON(mirror == raid_disks * 2); + update_head_pos(mirror, r1_bio); + + return mirror; +} + static void raid1_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + struct r1bio *r1_bio = bio->bi_private; int mirror; - conf_t *conf = mddev_to_conf(r1_bio->mddev); + struct r1conf *conf = r1_bio->mddev->private; mirror = r1_bio->read_disk; /* @@ -284,113 +341,154 @@ static void raid1_end_read_request(struct bio *bio, int error) spin_unlock_irqrestore(&conf->device_lock, flags); } - if (uptodate) + if (uptodate) { raid_end_bio_io(r1_bio); - else { + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + } else { /* * oops, read error: */ char b[BDEVNAME_SIZE]; - if (printk_ratelimit()) - printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", - bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); + printk_ratelimited( + KERN_ERR "md/raid1:%s: %s: " + "rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(conf->mirrors[mirror].rdev->bdev, + b), + (unsigned long long)r1_bio->sector); + set_bit(R1BIO_ReadError, &r1_bio->state); reschedule_retry(r1_bio); + /* don't drop the reference on read_disk yet */ } +} - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); +static void close_write(struct r1bio *r1_bio) +{ + /* it really is the end of this request */ + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = r1_bio->behind_page_count; + while (i--) + safe_put_page(r1_bio->behind_bvecs[i].bv_page); + kfree(r1_bio->behind_bvecs); + r1_bio->behind_bvecs = NULL; + } + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state), + test_bit(R1BIO_BehindIO, &r1_bio->state)); + md_write_end(r1_bio->mddev); +} + +static void r1_bio_write_done(struct r1bio *r1_bio) +{ + if (!atomic_dec_and_test(&r1_bio->remaining)) + return; + + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + close_write(r1_bio); + if (test_bit(R1BIO_MadeGood, &r1_bio->state)) + reschedule_retry(r1_bio); + else + raid_end_bio_io(r1_bio); + } } static void raid1_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); + struct r1bio *r1_bio = bio->bi_private; int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); - conf_t *conf = mddev_to_conf(r1_bio->mddev); + struct r1conf *conf = r1_bio->mddev->private; struct bio *to_put = NULL; + mirror = find_bio_disk(r1_bio, bio); - for (mirror = 0; mirror < conf->raid_disks; mirror++) - if (r1_bio->bios[mirror] == bio) - break; - - if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { - 
set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); - set_bit(R1BIO_BarrierRetry, &r1_bio->state); - r1_bio->mddev->barriers_work = 0; - /* Don't rdev_dec_pending in this branch - keep it for the retry */ + /* + * 'one mirror IO has finished' event handler: + */ + if (!uptodate) { + set_bit(WriteErrorSeen, + &conf->mirrors[mirror].rdev->flags); + if (!test_and_set_bit(WantReplacement, + &conf->mirrors[mirror].rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + conf->mddev->recovery); + + set_bit(R1BIO_WriteError, &r1_bio->state); } else { /* - * this branch is our 'one mirror IO has finished' event handler: + * Set R1BIO_Uptodate in our master bio, so that we + * will return a good error code for to the higher + * levels even if IO on some other mirrored buffer + * fails. + * + * The 'master' represents the composite IO operation + * to user-side. So if something waits for IO, then it + * will wait for the 'master' bio. */ + sector_t first_bad; + int bad_sectors; + r1_bio->bios[mirror] = NULL; to_put = bio; - if (!uptodate) { - md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } else - /* - * Set R1BIO_Uptodate in our master bio, so that - * we will return a good error code for to the higher - * levels even if IO on some other mirrored buffer fails. - * - * The 'master' represents the composite IO operation to - * user-side. So if something waits for IO, then it will - * wait for the 'master' bio. - */ + /* + * Do not set R1BIO_Uptodate if the current device is + * rebuilding or Faulty. This is because we cannot use + * such device for properly reading the data back (we could + * potentially use it, if the current write would have felt + * before rdev->recovery_offset, but for simplicity we don't + * check this here. + */ + if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) && + !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)) set_bit(R1BIO_Uptodate, &r1_bio->state); - update_head_pos(mirror, r1_bio); - - if (behind) { - if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) - atomic_dec(&r1_bio->behind_remaining); - - /* In behind mode, we ACK the master bio once the I/O has safely - * reached all non-writemostly disks. Setting the Returned bit - * ensures that this gets done only once -- we don't ever want to - * return -EIO here, instead we'll wait */ - - if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && - test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* Maybe we can return now */ - if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { - struct bio *mbio = r1_bio->master_bio; - PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", - (unsigned long long) mbio->bi_sector, - (unsigned long long) mbio->bi_sector + - (mbio->bi_size >> 9) - 1); - bio_endio(mbio, 0); - } + /* Maybe we can clear some bad blocks. */ + if (is_badblock(conf->mirrors[mirror].rdev, + r1_bio->sector, r1_bio->sectors, + &first_bad, &bad_sectors)) { + r1_bio->bios[mirror] = IO_MADE_GOOD; + set_bit(R1BIO_MadeGood, &r1_bio->state); + } + } + + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* + * In behind mode, we ACK the master bio once the I/O + * has safely reached all non-writemostly + * disks. 
Setting the Returned bit ensures that this + * gets done only once -- we don't ever want to return + * -EIO here, instead we'll wait + */ + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + pr_debug("raid1: behind end write sectors" + " %llu-%llu\n", + (unsigned long long) mbio->bi_iter.bi_sector, + (unsigned long long) bio_end_sector(mbio) - 1); + call_bio_endio(r1_bio); } } - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } + if (r1_bio->bios[mirror] == NULL) + rdev_dec_pending(conf->mirrors[mirror].rdev, + conf->mddev); + /* - * * Let's see if all mirrored write operations have finished * already. */ - if (atomic_dec_and_test(&r1_bio->remaining)) { - if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) - reschedule_retry(r1_bio); - else { - /* it really is the end of this request */ - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - /* free extra copy of the data pages */ - int i = bio->bi_vcnt; - while (i--) - safe_put_page(bio->bi_io_vec[i].bv_page); - } - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, - r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - behind); - md_write_end(r1_bio->mddev); - raid_end_bio_io(r1_bio); - } - } + r1_bio_write_done(r1_bio); if (to_put) bio_put(to_put); @@ -411,14 +509,19 @@ static void raid1_end_write_request(struct bio *bio, int error) * * The rdev for the device selected will have nr_pending incremented. */ -static int read_balance(conf_t *conf, r1bio_t *r1_bio) +static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) { - const unsigned long this_sector = r1_bio->sector; - int new_disk = conf->last_used, disk = new_disk; - int wonly_disk = -1; - const int sectors = r1_bio->sectors; - sector_t new_distance, current_distance; - mdk_rdev_t *rdev; + const sector_t this_sector = r1_bio->sector; + int sectors; + int best_good_sectors; + int best_disk, best_dist_disk, best_pending_disk; + int has_nonrot_disk; + int disk; + sector_t best_dist; + unsigned int min_pending; + struct md_rdev *rdev; + int choose_first; + int choose_next_idle; rcu_read_lock(); /* @@ -427,161 +530,236 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) * We take the first readable disk when above the resync window. 
*/ retry: - if (conf->mddev->recovery_cp < MaxSector && - (this_sector + sectors >= conf->next_resync)) { - /* Choose the first operation device, for consistancy */ - new_disk = 0; - - for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); - r1_bio->bios[new_disk] == IO_BLOCKED || - !rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(WriteMostly, &rdev->flags); - rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { - - if (rdev && test_bit(In_sync, &rdev->flags) && - r1_bio->bios[new_disk] != IO_BLOCKED) - wonly_disk = new_disk; - - if (new_disk == conf->raid_disks - 1) { - new_disk = wonly_disk; - break; - } - } - goto rb_out; - } + sectors = r1_bio->sectors; + best_disk = -1; + best_dist_disk = -1; + best_dist = MaxSector; + best_pending_disk = -1; + min_pending = UINT_MAX; + best_good_sectors = 0; + has_nonrot_disk = 0; + choose_next_idle = 0; + if (conf->mddev->recovery_cp < MaxSector && + (this_sector + sectors >= conf->next_resync)) + choose_first = 1; + else + choose_first = 0; - /* make sure the disk is operational */ - for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); - r1_bio->bios[new_disk] == IO_BLOCKED || - !rdev || !test_bit(In_sync, &rdev->flags) || - test_bit(WriteMostly, &rdev->flags); - rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + sector_t dist; + sector_t first_bad; + int bad_sectors; + unsigned int pending; + bool nonrot; - if (rdev && test_bit(In_sync, &rdev->flags) && - r1_bio->bios[new_disk] != IO_BLOCKED) - wonly_disk = new_disk; + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (r1_bio->bios[disk] == IO_BLOCKED + || rdev == NULL + || test_bit(Unmerged, &rdev->flags) + || test_bit(Faulty, &rdev->flags)) + continue; + if (!test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < this_sector + sectors) + continue; + if (test_bit(WriteMostly, &rdev->flags)) { + /* Don't balance among write-mostly, just + * use the first as a last resort */ + if (best_disk < 0) { + if (is_badblock(rdev, this_sector, sectors, + &first_bad, &bad_sectors)) { + if (first_bad < this_sector) + /* Cannot use this */ + continue; + best_good_sectors = first_bad - this_sector; + } else + best_good_sectors = sectors; + best_disk = disk; + } + continue; + } + /* This is a reasonable device to use. It might + * even be best. + */ + if (is_badblock(rdev, this_sector, sectors, + &first_bad, &bad_sectors)) { + if (best_dist < MaxSector) + /* already have a better device */ + continue; + if (first_bad <= this_sector) { + /* cannot read here. If this is the 'primary' + * device, then we must not read beyond + * bad_sectors from another device.. 
+ */ + bad_sectors -= (this_sector - first_bad); + if (choose_first && sectors > bad_sectors) + sectors = bad_sectors; + if (best_good_sectors > sectors) + best_good_sectors = sectors; - if (new_disk <= 0) - new_disk = conf->raid_disks; - new_disk--; - if (new_disk == disk) { - new_disk = wonly_disk; + } else { + sector_t good_sectors = first_bad - this_sector; + if (good_sectors > best_good_sectors) { + best_good_sectors = good_sectors; + best_disk = disk; + } + if (choose_first) + break; + } + continue; + } else + best_good_sectors = sectors; + + nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); + has_nonrot_disk |= nonrot; + pending = atomic_read(&rdev->nr_pending); + dist = abs(this_sector - conf->mirrors[disk].head_position); + if (choose_first) { + best_disk = disk; break; } - } - - if (new_disk < 0) - goto rb_out; + /* Don't change to another disk for sequential reads */ + if (conf->mirrors[disk].next_seq_sect == this_sector + || dist == 0) { + int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; + struct raid1_info *mirror = &conf->mirrors[disk]; - disk = new_disk; - /* now disk == new_disk == starting point for search */ - - /* - * Don't change to another disk for sequential reads: - */ - if (conf->next_seq_sect == this_sector) - goto rb_out; - if (this_sector == conf->mirrors[new_disk].head_position) - goto rb_out; - - current_distance = abs(this_sector - conf->mirrors[disk].head_position); - - /* Find the disk whose head is closest */ - - do { - if (disk <= 0) - disk = conf->raid_disks; - disk--; - - rdev = rcu_dereference(conf->mirrors[disk].rdev); + best_disk = disk; + /* + * If buffered sequential IO size exceeds optimal + * iosize, check if there is idle disk. If yes, choose + * the idle disk. read_balance could already choose an + * idle disk before noticing it's a sequential IO in + * this disk. This doesn't matter because this disk + * will idle, next time it will be utilized after the + * first disk has IO size exceeds optimal iosize. In + * this way, iosize of the first disk will be optimal + * iosize at least. iosize of the second disk might be + * small, but not a big deal since when the second disk + * starts IO, the first disk is likely still busy. + */ + if (nonrot && opt_iosize > 0 && + mirror->seq_start != MaxSector && + mirror->next_seq_sect > opt_iosize && + mirror->next_seq_sect - opt_iosize >= + mirror->seq_start) { + choose_next_idle = 1; + continue; + } + break; + } + /* If device is idle, use it */ + if (pending == 0) { + best_disk = disk; + break; + } - if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags) || - test_bit(WriteMostly, &rdev->flags)) + if (choose_next_idle) continue; - if (!atomic_read(&rdev->nr_pending)) { - new_disk = disk; - break; + if (min_pending > pending) { + min_pending = pending; + best_pending_disk = disk; } - new_distance = abs(this_sector - conf->mirrors[disk].head_position); - if (new_distance < current_distance) { - current_distance = new_distance; - new_disk = disk; - } - } while (disk != conf->last_used); - rb_out: + if (dist < best_dist) { + best_dist = dist; + best_dist_disk = disk; + } + } + /* + * If all disks are rotational, choose the closest disk. If any disk is + * non-rotational, choose the disk with less pending request even the + * disk is rotational, which might/might not be optimal for raids with + * mixed ratation/non-rotational disks depending on workload. 
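A condensed model of the fallback selection just described: take the smallest head distance when every candidate is rotational, and the least-loaded member when any non-rotational disk is present. The mirror fields and sample values below are invented, and the real read_balance() additionally handles bad blocks, write-mostly members and sequential-read affinity:

#include <stdbool.h>
#include <stdio.h>

struct mirror {
        bool faulty;
        bool nonrot;                    /* SSD-like, no seek penalty */
        unsigned int pending;           /* in-flight requests */
        unsigned long long head_pos;    /* last known head position */
};

static int pick_mirror(struct mirror *m, int n, unsigned long long sector)
{
        int best_dist_disk = -1, best_pending_disk = -1;
        unsigned long long best_dist = ~0ULL;
        unsigned int min_pending = ~0U;
        bool has_nonrot = false;

        for (int i = 0; i < n; i++) {
                unsigned long long dist;

                if (m[i].faulty)
                        continue;
                has_nonrot |= m[i].nonrot;
                if (m[i].pending < min_pending) {
                        min_pending = m[i].pending;
                        best_pending_disk = i;
                }
                dist = sector > m[i].head_pos ? sector - m[i].head_pos
                                              : m[i].head_pos - sector;
                if (dist < best_dist) {
                        best_dist = dist;
                        best_dist_disk = i;
                }
        }
        return has_nonrot ? best_pending_disk : best_dist_disk;
}

int main(void)
{
        struct mirror m[2] = {
                { .nonrot = false, .pending = 0, .head_pos = 1000 },
                { .nonrot = true,  .pending = 4, .head_pos = 900000 },
        };

        /* one SSD present, so the idle disk 0 wins despite the distance */
        printf("chosen mirror: %d\n", pick_mirror(m, 2, 1200));
        return 0;
}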
+ */ + if (best_disk == -1) { + if (has_nonrot_disk) + best_disk = best_pending_disk; + else + best_disk = best_dist_disk; + } - if (new_disk >= 0) { - rdev = rcu_dereference(conf->mirrors[new_disk].rdev); + if (best_disk >= 0) { + rdev = rcu_dereference(conf->mirrors[best_disk].rdev); if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); - if (!test_bit(In_sync, &rdev->flags)) { + if (test_bit(Faulty, &rdev->flags)) { /* cannot risk returning a device that failed * before we inc'ed nr_pending */ rdev_dec_pending(rdev, conf->mddev); goto retry; } - conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; + sectors = best_good_sectors; + + if (conf->mirrors[best_disk].next_seq_sect != this_sector) + conf->mirrors[best_disk].seq_start = this_sector; + + conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; } rcu_read_unlock(); + *max_sectors = sectors; - return new_disk; + return best_disk; } -static void unplug_slaves(mddev_t *mddev) +static int raid1_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { - conf_t *conf = mddev_to_conf(mddev); - int i; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - struct request_queue *r_queue = bdev_get_queue(rdev->bdev); - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - blk_unplug(r_queue); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); + struct mddev *mddev = q->queuedata; + struct r1conf *conf = mddev->private; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); + int max = biovec->bv_len; + + if (mddev->merge_check_needed) { + int disk; + rcu_read_lock(); + for (disk = 0; disk < conf->raid_disks * 2; disk++) { + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[disk].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = sector + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } } + rcu_read_unlock(); } - rcu_read_unlock(); -} + return max; -static void raid1_unplug(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; - - unplug_slaves(mddev); - md_wakeup_thread(mddev->thread); } -static int raid1_congested(void *data, int bits) +int md_raid1_congested(struct mddev *mddev, int bits) { - mddev_t *mddev = data; - conf_t *conf = mddev_to_conf(mddev); + struct r1conf *conf = mddev->private; int i, ret = 0; + if ((bits & (1 << BDI_async_congested)) && + conf->pending_count >= max_queued_requests) + return 1; + rcu_read_lock(); - for (i = 0; i < mddev->raid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + for (i = 0; i < conf->raid_disks * 2; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); + BUG_ON(!q); + /* Note the '|| 1' - when read_balance prefers * non-congested targets, it can be removed */ - if ((bits & (1<<BDI_write_congested)) || 1) + if ((bits & (1<<BDI_async_congested)) || 1) ret |= bdi_congested(&q->backing_dev_info, bits); else ret &= bdi_congested(&q->backing_dev_info, bits); @@ -590,37 +768,46 @@ static int raid1_congested(void *data, int bits) rcu_read_unlock(); return ret; } +EXPORT_SYMBOL_GPL(md_raid1_congested); + +static int raid1_congested(void 
*data, int bits) +{ + struct mddev *mddev = data; + return mddev_congested(mddev, bits) || + md_raid1_congested(mddev, bits); +} -static int flush_pending_writes(conf_t *conf) +static void flush_pending_writes(struct r1conf *conf) { /* Any writes that have been queued but are awaiting * bitmap updates get flushed here. - * We return 1 if any requests were actually submitted. */ - int rv = 0; - spin_lock_irq(&conf->device_lock); if (conf->pending_bio_list.head) { struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(conf->mddev->queue); + conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to * disk before proceeding w/ I/O */ bitmap_unplug(conf->mddev->bitmap); + wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } - rv = 1; } else spin_unlock_irq(&conf->device_lock); - return rv; } /* Barriers.... @@ -644,169 +831,312 @@ static int flush_pending_writes(conf_t *conf) * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. */ -#define RESYNC_DEPTH 32 - -static void raise_barrier(conf_t *conf) +static void raise_barrier(struct r1conf *conf) { spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, - conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + conf->resync_lock); /* block any new IO from starting */ conf->barrier++; - /* No wait for all pending IO to complete */ + /* For these conditions we must wait: + * A: while the array is in frozen state + * B: while barrier >= RESYNC_DEPTH, meaning resync reach + * the max count which allowed. + * C: next_resync + RESYNC_SECTORS > start_next_window, meaning + * next resync will reach to the window which normal bios are + * handling. 
+ */ wait_event_lock_irq(conf->wait_barrier, - !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + !conf->array_frozen && + conf->barrier < RESYNC_DEPTH && + (conf->start_next_window >= + conf->next_resync + RESYNC_SECTORS), + conf->resync_lock); spin_unlock_irq(&conf->resync_lock); } -static void lower_barrier(conf_t *conf) +static void lower_barrier(struct r1conf *conf) { unsigned long flags; + BUG_ON(conf->barrier <= 0); spin_lock_irqsave(&conf->resync_lock, flags); conf->barrier--; spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } -static void wait_barrier(conf_t *conf) +static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) +{ + bool wait = false; + + if (conf->array_frozen || !bio) + wait = true; + else if (conf->barrier && bio_data_dir(bio) == WRITE) { + if (conf->next_resync < RESYNC_WINDOW_SECTORS) + wait = true; + else if ((conf->next_resync - RESYNC_WINDOW_SECTORS + >= bio_end_sector(bio)) || + (conf->next_resync + NEXT_NORMALIO_DISTANCE + <= bio->bi_iter.bi_sector)) + wait = false; + else + wait = true; + } + + return wait; +} + +static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) { + sector_t sector = 0; + spin_lock_irq(&conf->resync_lock); - if (conf->barrier) { + if (need_to_wait_for_sync(conf, bio)) { conf->nr_waiting++; - wait_event_lock_irq(conf->wait_barrier, !conf->barrier, - conf->resync_lock, - raid1_unplug(conf->mddev->queue)); + /* Wait for the barrier to drop. + * However if there are already pending + * requests (preventing the barrier from + * rising completely), and the + * pre-process bio queue isn't empty, + * then don't wait, as we need to empty + * that queue to get the nr_pending + * count down. 
+ */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen && + (!conf->barrier || + ((conf->start_next_window < + conf->next_resync + RESYNC_SECTORS) && + current->bio_list && + !bio_list_empty(current->bio_list))), + conf->resync_lock); conf->nr_waiting--; } + + if (bio && bio_data_dir(bio) == WRITE) { + if (conf->next_resync + NEXT_NORMALIO_DISTANCE + <= bio->bi_iter.bi_sector) { + if (conf->start_next_window == MaxSector) + conf->start_next_window = + conf->next_resync + + NEXT_NORMALIO_DISTANCE; + + if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) + <= bio->bi_iter.bi_sector) + conf->next_window_requests++; + else + conf->current_window_requests++; + sector = conf->start_next_window; + } + } + conf->nr_pending++; spin_unlock_irq(&conf->resync_lock); + return sector; } -static void allow_barrier(conf_t *conf) +static void allow_barrier(struct r1conf *conf, sector_t start_next_window, + sector_t bi_sector) { unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); conf->nr_pending--; + if (start_next_window) { + if (start_next_window == conf->start_next_window) { + if (conf->start_next_window + NEXT_NORMALIO_DISTANCE + <= bi_sector) + conf->next_window_requests--; + else + conf->current_window_requests--; + } else + conf->current_window_requests--; + + if (!conf->current_window_requests) { + if (conf->next_window_requests) { + conf->current_window_requests = + conf->next_window_requests; + conf->next_window_requests = 0; + conf->start_next_window += + NEXT_NORMALIO_DISTANCE; + } else + conf->start_next_window = MaxSector; + } + } spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } -static void freeze_array(conf_t *conf) +static void freeze_array(struct r1conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quite. - * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+1 + * We wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (1) + * Thus the number queued (nr_queued) plus this request (extra) * must match the number of pending IOs (nr_pending) before * we continue. 
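/*
 * Editor's note: a simplified userspace model of the window hand-off that
 * allow_barrier() performs above.  It assumes the completing write was
 * counted against the current window; once that window drains, requests
 * queued for the next window are promoted and start_next_window advances by
 * NEXT_NORMALIO_DISTANCE, otherwise the window stops constraining resync.
 * The constant's value and the field names are assumptions for this sketch.
 */
#define NEXT_NORMALIO_DISTANCE	(3 * 65536)	/* sectors; assumed for the sketch */
#define SKETCH_MAX_SECTOR	(~0ULL)

struct window_state {
	unsigned long long start_next_window;
	int current_window_requests;
	int next_window_requests;
};

static void window_request_done(struct window_state *w)
{
	if (w->current_window_requests > 0)
		w->current_window_requests--;
	if (w->current_window_requests)
		return;
	if (w->next_window_requests) {
		/* promote the pending window */
		w->current_window_requests = w->next_window_requests;
		w->next_window_requests = 0;
		w->start_next_window += NEXT_NORMALIO_DISTANCE;
	} else {
		/* nothing is queued ahead of resync any more */
		w->start_next_window = SKETCH_MAX_SECTOR;
	}
}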
*/ spin_lock_irq(&conf->resync_lock); - conf->barrier++; - conf->nr_waiting++; - wait_event_lock_irq(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, - conf->resync_lock, - ({ flush_pending_writes(conf); - raid1_unplug(conf->mddev->queue); })); + conf->array_frozen = 1; + wait_event_lock_irq_cmd(conf->wait_barrier, + conf->nr_pending == conf->nr_queued+extra, + conf->resync_lock, + flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); } -static void unfreeze_array(conf_t *conf) +static void unfreeze_array(struct r1conf *conf) { /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); - conf->barrier--; - conf->nr_waiting--; + conf->array_frozen = 0; wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); } -/* duplicate the data pages for behind I/O */ -static struct page **alloc_behind_pages(struct bio *bio) +/* duplicate the data pages for behind I/O + */ +static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) { int i; struct bio_vec *bvec; - struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), + struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec), GFP_NOIO); - if (unlikely(!pages)) - goto do_sync_io; + if (unlikely(!bvecs)) + return; - bio_for_each_segment(bvec, bio, i) { - pages[i] = alloc_page(GFP_NOIO); - if (unlikely(!pages[i])) + bio_for_each_segment_all(bvec, bio, i) { + bvecs[i] = *bvec; + bvecs[i].bv_page = alloc_page(GFP_NOIO); + if (unlikely(!bvecs[i].bv_page)) goto do_sync_io; - memcpy(kmap(pages[i]) + bvec->bv_offset, - kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); - kunmap(pages[i]); + memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset, + kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); + kunmap(bvecs[i].bv_page); kunmap(bvec->bv_page); } - - return pages; + r1_bio->behind_bvecs = bvecs; + r1_bio->behind_page_count = bio->bi_vcnt; + set_bit(R1BIO_BehindIO, &r1_bio->state); + return; do_sync_io: - if (pages) - for (i = 0; i < bio->bi_vcnt && pages[i]; i++) - put_page(pages[i]); - kfree(pages); - PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); - return NULL; + for (i = 0; i < bio->bi_vcnt; i++) + if (bvecs[i].bv_page) + put_page(bvecs[i].bv_page); + kfree(bvecs); + pr_debug("%dB behind alloc failed, doing sync I/O\n", + bio->bi_iter.bi_size); } -static int make_request(struct request_queue *q, struct bio * bio) +struct raid1_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) { - mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); - mirror_info_t *mirror; - r1bio_t *r1_bio; + struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r1conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule || current->bio_list) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_barrier); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. 
*/ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + +static void make_request(struct mddev *mddev, struct bio * bio) +{ + struct r1conf *conf = mddev->private; + struct raid1_info *mirror; + struct r1bio *r1_bio; struct bio *read_bio; - int i, targets = 0, disks; + int i, disks; struct bitmap *bitmap; unsigned long flags; - struct bio_list bl; - struct page **behind_pages = NULL; const int rw = bio_data_dir(bio); - const int do_sync = bio_sync(bio); - int do_barriers; - mdk_rdev_t *blocked_rdev; + const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); + const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); + const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); + struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid1_plug_cb *plug = NULL; + int first_clone; + int sectors_handled; + int max_sectors; + sector_t start_next_window; /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. - * We test barriers_work *after* md_write_start as md_write_start - * may cause the first superblock write, and that will check out - * if barriers work. */ md_write_start(mddev, bio); /* wait on superblock update early */ - if (unlikely(!mddev->barriers_work && bio_barrier(bio))) { - if (rw == WRITE) - md_write_end(mddev); - bio_endio(bio, -EOPNOTSUPP); - return 0; + if (bio_data_dir(bio) == WRITE && + bio_end_sector(bio) > mddev->suspend_lo && + bio->bi_iter.bi_sector < mddev->suspend_hi) { + /* As the suspend_* range is controlled by + * userspace, we want an interruptible + * wait. + */ + DEFINE_WAIT(w); + for (;;) { + flush_signals(current); + prepare_to_wait(&conf->wait_barrier, + &w, TASK_INTERRUPTIBLE); + if (bio_end_sector(bio) <= mddev->suspend_lo || + bio->bi_iter.bi_sector >= mddev->suspend_hi) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); } - wait_barrier(conf); + start_next_window = wait_barrier(conf, bio); bitmap = mddev->bitmap; - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); - /* * make_request() can abort the operation when READA is being * used and no empty request is available. @@ -815,183 +1145,325 @@ static int make_request(struct request_queue *q, struct bio * bio) r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); r1_bio->master_bio = bio; - r1_bio->sectors = bio->bi_size >> 9; + r1_bio->sectors = bio_sectors(bio); r1_bio->state = 0; r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector; + r1_bio->sector = bio->bi_iter.bi_sector; + + /* We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r1_bio and no locking + * will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. 
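/*
 * Editor's sketch of the bi_phys_segments accounting described above: the
 * first time a request has to be split, the implicit "one outstanding r1_bio"
 * becomes an explicit count of 2; every further split just increments it.
 * The device_lock the driver takes around this is elided; this only models
 * the arithmetic.
 */
static void account_extra_r1bio(unsigned int *phys_segments)
{
	if (*phys_segments == 0)
		*phys_segments = 2;	/* the original r1_bio plus the new one */
	else
		(*phys_segments)++;
}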
+ */ + bio->bi_phys_segments = 0; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); if (rw == READ) { /* * read balancing logic: */ - int rdisk = read_balance(conf, r1_bio); + int rdisk; + +read_again: + rdisk = read_balance(conf, r1_bio, &max_sectors); if (rdisk < 0) { /* couldn't find anywhere to read from */ raid_end_bio_io(r1_bio); - return 0; + return; } mirror = conf->mirrors + rdisk; + if (test_bit(WriteMostly, &mirror->rdev->flags) && + bitmap) { + /* Reading from a write-mostly device must + * take care not to over-take any writes + * that are 'behind' + */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } r1_bio->read_disk = rdisk; - read_bio = bio_clone(bio, GFP_NOIO); + read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, + max_sectors); r1_bio->bios[rdisk] = read_bio; - read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; + read_bio->bi_iter.bi_sector = r1_bio->sector + + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r1_bio; - generic_make_request(read_bio); - return 0; + if (max_sectors < r1_bio->sectors) { + /* could not read all from this device, so we will + * need another r1_bio. + */ + + sectors_handled = (r1_bio->sector + max_sectors + - bio->bi_iter.bi_sector); + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + /* Cannot call generic_make_request directly + * as that will be queued in __make_request + * and subsequent mempool_alloc might block waiting + * for it. So hand bio over to raid1d. + */ + reschedule_retry(r1_bio); + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = bio; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_iter.bi_sector + + sectors_handled; + goto read_again; + } else + generic_make_request(read_bio); + return; } /* * WRITE: */ - /* first select target devices under spinlock and + if (conf->pending_count >= max_queued_requests) { + md_wakeup_thread(mddev->thread); + wait_event(conf->wait_barrier, + conf->pending_count < max_queued_requests); + } + /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio + * If there are known/acknowledged bad blocks on any device on + * which we have seen a write error, we want to avoid writing those + * blocks. + * This potentially requires several writes to write around + * the bad blocks. Each set of writes gets it's own r1bio + * with a set of bios attached. 
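/*
 * Editor's sketch of the clipping rule applied below when a device reports an
 * acknowledged bad range [first_bad, first_bad + bad_sectors): if the bad
 * range already covers the start of the request, the write issued to the
 * other devices is capped at the remaining bad length; otherwise the request
 * is trimmed so it ends just before the bad range.  Illustrative only.
 */
static unsigned long long clip_to_bad_range(unsigned long long sector,
					    unsigned long long max_sectors,
					    unsigned long long first_bad,
					    unsigned long long bad_sectors)
{
	unsigned long long limit;

	if (first_bad <= sector)
		/* front of the request is bad on this device */
		limit = bad_sectors - (sector - first_bad);
	else
		/* good run up to the start of the bad range */
		limit = first_bad - sector;

	return limit < max_sectors ? limit : max_sectors;
}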
*/ - disks = conf->raid_disks; -#if 0 - { static int first=1; - if (first) printk("First Write sector %llu disks %d\n", - (unsigned long long)r1_bio->sector, disks); - first = 0; - } -#endif + + disks = conf->raid_disks * 2; retry_write: + r1_bio->start_next_window = start_next_window; blocked_rdev = NULL; rcu_read_lock(); + max_sectors = r1_bio->sectors; for (i = 0; i < disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; break; } - if (rdev && !test_bit(Faulty, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - if (test_bit(Faulty, &rdev->flags)) { + r1_bio->bios[i] = NULL; + if (!rdev || test_bit(Faulty, &rdev->flags) + || test_bit(Unmerged, &rdev->flags)) { + if (i < conf->raid_disks) + set_bit(R1BIO_Degraded, &r1_bio->state); + continue; + } + + atomic_inc(&rdev->nr_pending); + if (test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + int bad_sectors; + int is_bad; + + is_bad = is_badblock(rdev, r1_bio->sector, + max_sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* mustn't write here until the bad block is + * acknowledged*/ + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + if (is_bad && first_bad <= r1_bio->sector) { + /* Cannot write here at all */ + bad_sectors -= (r1_bio->sector - first_bad); + if (bad_sectors < max_sectors) + /* mustn't write more than bad_sectors + * to other devices yet + */ + max_sectors = bad_sectors; rdev_dec_pending(rdev, mddev); - r1_bio->bios[i] = NULL; - } else - r1_bio->bios[i] = bio; - targets++; - } else - r1_bio->bios[i] = NULL; + /* We don't set R1BIO_Degraded as that + * only applies if the disk is + * missing, so it might be re-added, + * and we want to know to recover this + * chunk. + * In this case the device is here, + * and the fact that this chunk is not + * in-sync is recorded in the bad + * block log + */ + continue; + } + if (is_bad) { + int good_sectors = first_bad - r1_bio->sector; + if (good_sectors < max_sectors) + max_sectors = good_sectors; + } + } + r1_bio->bios[i] = bio; } rcu_read_unlock(); if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ int j; + sector_t old = start_next_window; for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); - - allow_barrier(conf); + r1_bio->state = 0; + allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); + start_next_window = wait_barrier(conf, bio); + /* + * We must make sure the multi r1bios of bio have + * the same value of bi_phys_segments + */ + if (bio->bi_phys_segments && old && + old != start_next_window) + /* Wait for the former r1bio(s) to complete */ + wait_event(conf->wait_barrier, + bio->bi_phys_segments == 1); goto retry_write; } - BUG_ON(targets == 0); /* we never fail the last device */ - - if (targets < conf->raid_disks) { - /* array is degraded, we will not clear the bitmap - * on I/O completion (see raid1_end_write_request) */ - set_bit(R1BIO_Degraded, &r1_bio->state); + if (max_sectors < r1_bio->sectors) { + /* We are splitting this write into multiple parts, so + * we need to prepare for allocating another r1_bio. 
+ */ + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); } + sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector; - /* do behind I/O ? */ - if (bitmap && - atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && - (behind_pages = alloc_behind_pages(bio)) != NULL) - set_bit(R1BIO_BehindIO, &r1_bio->state); - - atomic_set(&r1_bio->remaining, 0); + atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); - do_barriers = bio_barrier(bio); - if (do_barriers) - set_bit(R1BIO_Barrier, &r1_bio->state); - - bio_list_init(&bl); + first_clone = 1; for (i = 0; i < disks; i++) { struct bio *mbio; if (!r1_bio->bios[i]) continue; - mbio = bio_clone(bio, GFP_NOIO); - r1_bio->bios[i] = mbio; - - mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; - mbio->bi_bdev = conf->mirrors[i].rdev->bdev; - mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_barriers | do_sync; - mbio->bi_private = r1_bio; - - if (behind_pages) { + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); + + if (first_clone) { + /* do behind I/O ? + * Not if there are too many, or cannot + * allocate memory, or a reader on WriteMostly + * is waiting for behind writes to flush */ + if (bitmap && + (atomic_read(&bitmap->behind_writes) + < mddev->bitmap_info.max_write_behind) && + !waitqueue_active(&bitmap->behind_wait)) + alloc_behind_pages(mbio, r1_bio); + + bitmap_startwrite(bitmap, r1_bio->sector, + r1_bio->sectors, + test_bit(R1BIO_BehindIO, + &r1_bio->state)); + first_clone = 0; + } + if (r1_bio->behind_bvecs) { struct bio_vec *bvec; int j; - /* Yes, I really want the '__' version so that - * we clear any unused pointer in the io_vec, rather - * than leave them unchanged. This is important - * because when we come to free the pages, we won't - * know the originial bi_idx, so we just free - * them all + /* + * We trimmed the bio, so _all is legit */ - __bio_for_each_segment(bvec, mbio, j, 0) - bvec->bv_page = behind_pages[j]; + bio_for_each_segment_all(bvec, mbio, j) + bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } + r1_bio->bios[i] = mbio; + + mbio->bi_iter.bi_sector = (r1_bio->sector + + conf->mirrors[i].rdev->data_offset); + mbio->bi_bdev = conf->mirrors[i].rdev->bdev; + mbio->bi_end_io = raid1_end_write_request; + mbio->bi_rw = + WRITE | do_flush_fua | do_sync | do_discard | do_same; + mbio->bi_private = r1_bio; + atomic_inc(&r1_bio->remaining); - bio_list_add(&bl, mbio); + cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid1_plug_cb, cb); + else + plug = NULL; + spin_lock_irqsave(&conf->device_lock, flags); + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!plug) + md_wakeup_thread(mddev->thread); + } + /* Mustn't call r1_bio_write_done before this next test, + * as it could result in the bio being freed. + */ + if (sectors_handled < bio_sectors(bio)) { + r1_bio_write_done(r1_bio); + /* We need another r1_bio. 
It has already been counted + * in bio->bi_phys_segments + */ + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + r1_bio->master_bio = bio; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + goto retry_write; } - kfree(behind_pages); /* the behind pages are attached to the bios now */ - bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, - test_bit(R1BIO_BehindIO, &r1_bio->state)); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_merge(&conf->pending_bio_list, &bl); - bio_list_init(&bl); + r1_bio_write_done(r1_bio); - blk_plug_device(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); - - /* In case raid1d snuck into freeze_array */ + /* In case raid1d snuck in to freeze_array */ wake_up(&conf->wait_barrier); - - if (do_sync) - md_wakeup_thread(mddev->thread); -#if 0 - while ((bio = bio_list_pop(&bl)) != NULL) - generic_make_request(bio); -#endif - - return 0; } -static void status(struct seq_file *seq, mddev_t *mddev) +static void status(struct seq_file *seq, struct mddev *mddev) { - conf_t *conf = mddev_to_conf(mddev); + struct r1conf *conf = mddev->private; int i; seq_printf(seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); } @@ -1000,10 +1472,10 @@ static void status(struct seq_file *seq, mddev_t *mddev) } -static void error(mddev_t *mddev, mdk_rdev_t *rdev) +static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; - conf_t *conf = mddev_to_conf(mddev); + struct r1conf *conf = mddev->private; /* * If it is not operational, then we have already marked it as dead @@ -1012,12 +1484,17 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * else mark the drive as failed */ if (test_bit(In_sync, &rdev->flags) - && (conf->raid_disks - mddev->degraded) == 1) + && (conf->raid_disks - mddev->degraded) == 1) { /* * Don't fail the drive, act as though we were just a - * normal single drive + * normal single drive. + * However don't try a recovery from this drive as + * it is very likely to fail. 
*/ + conf->recovery_disabled = mddev->recovery_disabled; return; + } + set_bit(Blocked, &rdev->flags); if (test_and_clear_bit(In_sync, &rdev->flags)) { unsigned long flags; spin_lock_irqsave(&conf->device_lock, flags); @@ -1031,29 +1508,31 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) } else set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" - "raid1: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + printk(KERN_ALERT + "md/raid1:%s: Disk failure on %s, disabling device.\n" + "md/raid1:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->raid_disks - mddev->degraded); } -static void print_conf(conf_t *conf) +static void print_conf(struct r1conf *conf) { int i; - printk("RAID1 conf printout:\n"); + printk(KERN_DEBUG "RAID1 conf printout:\n"); if (!conf) { - printk("(!conf)\n"); + printk(KERN_DEBUG "(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, conf->raid_disks); rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev) - printk(" disk %d, wo:%d, o:%d, dev:%s\n", + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", i, !test_bit(In_sync, &rdev->flags), !test_bit(Faulty, &rdev->flags), bdevname(rdev->bdev,b)); @@ -1061,19 +1540,24 @@ static void print_conf(conf_t *conf) rcu_read_unlock(); } -static void close_sync(conf_t *conf) +static void close_sync(struct r1conf *conf) { - wait_barrier(conf); - allow_barrier(conf); + wait_barrier(conf, NULL); + allow_barrier(conf, 0, 0); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; + + conf->next_resync = 0; + conf->start_next_window = MaxSector; } -static int raid1_spare_active(mddev_t *mddev) +static int raid1_spare_active(struct mddev *mddev) { int i; - conf_t *conf = mddev->private; + struct r1conf *conf = mddev->private; + int count = 0; + unsigned long flags; /* * Find all failed disks within the RAID1 configuration @@ -1081,46 +1565,71 @@ static int raid1_spare_active(mddev_t *mddev) * Called under mddev lock, so rcu protection not needed. 
*/ for (i = 0; i < conf->raid_disks; i++) { - mdk_rdev_t *rdev = conf->mirrors[i].rdev; + struct md_rdev *rdev = conf->mirrors[i].rdev; + struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; + if (repl + && repl->recovery_offset == MaxSector + && !test_bit(Faulty, &repl->flags) + && !test_and_set_bit(In_sync, &repl->flags)) { + /* replacement has just become active */ + if (!rdev || + !test_and_clear_bit(In_sync, &rdev->flags)) + count++; + if (rdev) { + /* Replaced device not technically + * faulty, but we need to be sure + * it gets removed and never re-added + */ + set_bit(Faulty, &rdev->flags); + sysfs_notify_dirent_safe( + rdev->sysfs_state); + } + } if (rdev + && rdev->recovery_offset == MaxSector && !test_bit(Faulty, &rdev->flags) && !test_and_set_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded--; - spin_unlock_irqrestore(&conf->device_lock, flags); + count++; + sysfs_notify_dirent_safe(rdev->sysfs_state); } } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); print_conf(conf); - return 0; + return count; } -static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { - conf_t *conf = mddev->private; + struct r1conf *conf = mddev->private; int err = -EEXIST; int mirror = 0; - mirror_info_t *p; + struct raid1_info *p; int first = 0; - int last = mddev->raid_disks - 1; + int last = conf->raid_disks - 1; + struct request_queue *q = bdev_get_queue(rdev->bdev); + + if (mddev->recovery_disabled == conf->recovery_disabled) + return -EBUSY; if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; - for (mirror = first; mirror <= last; mirror++) - if ( !(p=conf->mirrors+mirror)->rdev) { + if (q->merge_bvec_fn) { + set_bit(Unmerged, &rdev->flags); + mddev->merge_check_needed = 1; + } - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + for (mirror = first; mirror <= last; mirror++) { + p = conf->mirrors+mirror; + if (!p->rdev) { + + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); p->head_position = 0; rdev->raid_disk = mirror; @@ -1133,30 +1642,60 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - + if (test_bit(WantReplacement, &p->rdev->flags) && + p[conf->raid_disks].rdev == NULL) { + /* Add this device as a replacement */ + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = mirror; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); + break; + } + } + if (err == 0 && test_bit(Unmerged, &rdev->flags)) { + /* Some requests might not have seen this new + * merge_bvec_fn. We must wait for them to complete + * before merging the device fully. + * First we make sure any code which has tested + * our function has submitted the request, then + * we wait for all outstanding requests to complete. 
+ */ + synchronize_sched(); + freeze_array(conf, 0); + unfreeze_array(conf); + clear_bit(Unmerged, &rdev->flags); + } + md_integrity_add_rdev(rdev, mddev); + if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; } -static int raid1_remove_disk(mddev_t *mddev, int number) +static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) { - conf_t *conf = mddev->private; + struct r1conf *conf = mddev->private; int err = 0; - mdk_rdev_t *rdev; - mirror_info_t *p = conf->mirrors+ number; + int number = rdev->raid_disk; + struct raid1_info *p = conf->mirrors + number; + + if (rdev != p->rdev) + p = conf->mirrors + conf->raid_disks + number; print_conf(conf); - rdev = p->rdev; - if (rdev) { + if (rdev == p->rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { err = -EBUSY; goto abort; } - /* Only remove non-faulty devices is recovery + /* Only remove non-faulty devices if recovery * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != conf->recovery_disabled && mddev->degraded < conf->raid_disks) { err = -EBUSY; goto abort; @@ -1167,7 +1706,23 @@ static int raid1_remove_disk(mddev_t *mddev, int number) /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; - } + goto abort; + } else if (conf->mirrors[conf->raid_disks + number].rdev) { + /* We just removed a device that is being replaced. + * Move down the replacement. We drain all IO before + * doing this to avoid confusion. + */ + struct md_rdev *repl = + conf->mirrors[conf->raid_disks + number].rdev; + freeze_array(conf, 0); + clear_bit(Replacement, &repl->flags); + p->rdev = repl; + conf->mirrors[conf->raid_disks + number].rdev = NULL; + unfreeze_array(conf); + clear_bit(WantReplacement, &rdev->flags); + } else + clear_bit(WantReplacement, &rdev->flags); + err = md_integrity_register(mddev); } abort: @@ -1178,14 +1733,10 @@ abort: static void end_sync_read(struct bio *bio, int error) { - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); - int i; + struct r1bio *r1_bio = bio->bi_private; + + update_head_pos(r1_bio->read_disk, r1_bio); - for (i=r1_bio->mddev->raid_disks; i--; ) - if (r1_bio->bios[i] == bio) - break; - BUG_ON(i < 0); - update_head_pos(i, r1_bio); /* * we have read a block, now it needs to be re-written, * or re-read if the read failed. @@ -1201,19 +1752,17 @@ static void end_sync_read(struct bio *bio, int error) static void end_sync_write(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); - mddev_t *mddev = r1_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); - int i; + struct r1bio *r1_bio = bio->bi_private; + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; int mirror=0; + sector_t first_bad; + int bad_sectors; + + mirror = find_bio_disk(r1_bio, bio); - for (i = 0; i < conf->raid_disks; i++) - if (r1_bio->bios[i] == bio) { - mirror = i; - break; - } if (!uptodate) { - int sync_blocks = 0; + sector_t sync_blocks = 0; sector_t s = r1_bio->sector; long sectors_to_go = r1_bio->sectors; /* make sure these bits doesn't get cleared. 
*/ @@ -1223,207 +1772,288 @@ static void end_sync_write(struct bio *bio, int error) s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); - md_error(mddev, conf->mirrors[mirror].rdev); - } - - update_head_pos(mirror, r1_bio); + set_bit(WriteErrorSeen, + &conf->mirrors[mirror].rdev->flags); + if (!test_and_set_bit(WantReplacement, + &conf->mirrors[mirror].rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + mddev->recovery); + set_bit(R1BIO_WriteError, &r1_bio->state); + } else if (is_badblock(conf->mirrors[mirror].rdev, + r1_bio->sector, + r1_bio->sectors, + &first_bad, &bad_sectors) && + !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, + r1_bio->sector, + r1_bio->sectors, + &first_bad, &bad_sectors) + ) + set_bit(R1BIO_MadeGood, &r1_bio->state); if (atomic_dec_and_test(&r1_bio->remaining)) { - md_done_sync(mddev, r1_bio->sectors, uptodate); - put_buf(r1_bio); + int s = r1_bio->sectors; + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, uptodate); + } } } -static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) +static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, + int sectors, struct page *page, int rw) { - conf_t *conf = mddev_to_conf(mddev); - int i; - int disks = conf->raid_disks; - struct bio *bio, *wbio; - - bio = r1_bio->bios[r1_bio->read_disk]; + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + /* success */ + return 1; + if (rw == WRITE) { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, + &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, & + rdev->mddev->recovery); + } + /* need to record an error - either for the block or the device */ + if (!rdev_set_badblocks(rdev, sector, sectors, 0)) + md_error(rdev->mddev, rdev); + return 0; +} +static int fix_sync_read_error(struct r1bio *r1_bio) +{ + /* Try some synchronous reads of other devices to get + * good data, much like with normal read errors. Only + * read into the pages we already have so we don't + * need to re-issue the read request. + * We don't need to freeze the array, because being in an + * active sync request, there is no normal IO, and + * no overlapping syncs. + * We don't need to check is_badblock() again as we + * made sure that anything with a bad block in range + * will have bi_end_io clear. + */ + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + struct bio *bio = r1_bio->bios[r1_bio->read_disk]; + sector_t sect = r1_bio->sector; + int sectors = r1_bio->sectors; + int idx = 0; - if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* We have read all readable devices. If we haven't - * got the block, then there is no hope left. - * If we have, then we want to do a comparison - * and skip the write if everything is the same. 
- * If any blocks failed to read, then we need to - * attempt an over-write - */ - int primary; - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { - for (i=0; i<mddev->raid_disks; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_read) - md_error(mddev, conf->mirrors[i].rdev); + while(sectors) { + int s = sectors; + int d = r1_bio->read_disk; + int success = 0; + struct md_rdev *rdev; + int start; - md_done_sync(mddev, r1_bio->sectors, 1); - put_buf(r1_bio); - return; - } - for (primary=0; primary<mddev->raid_disks; primary++) - if (r1_bio->bios[primary]->bi_end_io == end_sync_read && - test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { - r1_bio->bios[primary]->bi_end_io = NULL; - rdev_dec_pending(conf->mirrors[primary].rdev, mddev); - break; + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + do { + if (r1_bio->bios[d]->bi_end_io == end_sync_read) { + /* No rcu protection needed here devices + * can only be removed when no resync is + * active, and resync is currently active + */ + rdev = conf->mirrors[d].rdev; + if (sync_page_io(rdev, sect, s<<9, + bio->bi_io_vec[idx].bv_page, + READ, false)) { + success = 1; + break; + } } - r1_bio->read_disk = primary; - for (i=0; i<mddev->raid_disks; i++) - if (r1_bio->bios[i]->bi_end_io == end_sync_read) { - int j; - int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); - struct bio *pbio = r1_bio->bios[primary]; - struct bio *sbio = r1_bio->bios[i]; - - if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { - for (j = vcnt; j-- ; ) { - struct page *p, *s; - p = pbio->bi_io_vec[j].bv_page; - s = sbio->bi_io_vec[j].bv_page; - if (memcmp(page_address(p), - page_address(s), - PAGE_SIZE)) - break; - } - } else - j = 0; - if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; - if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { - sbio->bi_end_io = NULL; - rdev_dec_pending(conf->mirrors[i].rdev, mddev); - } else { - /* fixup the bio for reuse */ - int size; - sbio->bi_vcnt = vcnt; - sbio->bi_size = r1_bio->sectors << 9; - sbio->bi_idx = 0; - sbio->bi_phys_segments = 0; - sbio->bi_hw_segments = 0; - sbio->bi_hw_front_size = 0; - sbio->bi_hw_back_size = 0; - sbio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bi_next = NULL; - sbio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - sbio->bi_bdev = conf->mirrors[i].rdev->bdev; - size = sbio->bi_size; - for (j = 0; j < vcnt ; j++) { - struct bio_vec *bi; - bi = &sbio->bi_io_vec[j]; - bi->bv_offset = 0; - if (size > PAGE_SIZE) - bi->bv_len = PAGE_SIZE; - else - bi->bv_len = size; - size -= PAGE_SIZE; - memcpy(page_address(bi->bv_page), - page_address(pbio->bi_io_vec[j].bv_page), - PAGE_SIZE); - } + d++; + if (d == conf->raid_disks * 2) + d = 0; + } while (!success && d != r1_bio->read_disk); - } + if (!success) { + char b[BDEVNAME_SIZE]; + int abort = 0; + /* Cannot read from anywhere, this block is lost. + * Record a bad block on each device. If that doesn't + * work just disable and interrupt the recovery. + * Don't fail devices as that won't really help. 
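/*
 * Editor's sketch (userspace model) of the fallback described above: when no
 * mirror can supply this chunk during resync, a bad-block record is attempted
 * on every remaining device, and recovery is aborted only if that bookkeeping
 * itself fails somewhere.  The set_badblocks callback is a stand-in for
 * rdev_set_badblocks(), not a real kernel interface.
 */
static int lost_chunk_needs_abort(int (*set_badblocks)(int disk,
							unsigned long long sect,
							int sectors),
				  int ndisks, unsigned long long sect,
				  int sectors)
{
	int d, abort = 0;

	for (d = 0; d < ndisks; d++)
		if (!set_badblocks(d, sect, sectors))
			abort = 1;	/* could not record the loss durably */
	return abort;
}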
+ */ + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" + " for block %llu\n", + mdname(mddev), + bdevname(bio->bi_bdev, b), + (unsigned long long)r1_bio->sector); + for (d = 0; d < conf->raid_disks * 2; d++) { + rdev = conf->mirrors[d].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) + continue; + if (!rdev_set_badblocks(rdev, sect, s, 0)) + abort = 1; } - } - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { - /* ouch - failed to read all of that. - * Try some synchronous reads of other devices to get - * good data, much like with normal read errors. Only - * read into the pages we already have so we don't - * need to re-issue the read request. - * We don't need to freeze the array, because being in an - * active sync request, there is no normal IO, and - * no overlapping syncs. - */ - sector_t sect = r1_bio->sector; - int sectors = r1_bio->sectors; - int idx = 0; - - while(sectors) { - int s = sectors; - int d = r1_bio->read_disk; - int success = 0; - mdk_rdev_t *rdev; - - if (s > (PAGE_SIZE>>9)) - s = PAGE_SIZE >> 9; - do { - if (r1_bio->bios[d]->bi_end_io == end_sync_read) { - /* No rcu protection needed here devices - * can only be removed when no resync is - * active, and resync is currently active - */ - rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - bio->bi_io_vec[idx].bv_page, - READ)) { - success = 1; - break; - } - } - d++; - if (d == conf->raid_disks) - d = 0; - } while (!success && d != r1_bio->read_disk); - - if (success) { - int start = d; - /* write it back and re-read */ - set_bit(R1BIO_Uptodate, &r1_bio->state); - while (d != r1_bio->read_disk) { - if (d == 0) - d = conf->raid_disks; - d--; - if (r1_bio->bios[d]->bi_end_io != end_sync_read) - continue; - rdev = conf->mirrors[d].rdev; - atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - bio->bi_io_vec[idx].bv_page, - WRITE) == 0) - md_error(mddev, rdev); - } - d = start; - while (d != r1_bio->read_disk) { - if (d == 0) - d = conf->raid_disks; - d--; - if (r1_bio->bios[d]->bi_end_io != end_sync_read) - continue; - rdev = conf->mirrors[d].rdev; - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - bio->bi_io_vec[idx].bv_page, - READ) == 0) - md_error(mddev, rdev); - } - } else { - char b[BDEVNAME_SIZE]; - /* Cannot read from anywhere, array is toast */ - md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); - printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" - " for block %llu\n", - bdevname(bio->bi_bdev,b), - (unsigned long long)r1_bio->sector); + if (abort) { + conf->recovery_disabled = + mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_done_sync(mddev, r1_bio->sectors, 0); put_buf(r1_bio); - return; + return 0; } + /* Try next page */ sectors -= s; sect += s; - idx ++; + idx++; + continue; + } + + start = d; + /* write it back and re-read */ + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks * 2; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (r1_sync_page_io(rdev, sect, s, + bio->bi_io_vec[idx].bv_page, + WRITE) == 0) { + r1_bio->bios[d]->bi_end_io = NULL; + rdev_dec_pending(rdev, mddev); + } + } + d = start; + while (d != r1_bio->read_disk) { + if (d == 0) + d = conf->raid_disks * 2; + d--; + if (r1_bio->bios[d]->bi_end_io != end_sync_read) + continue; + rdev = conf->mirrors[d].rdev; + if (r1_sync_page_io(rdev, sect, s, + bio->bi_io_vec[idx].bv_page, + READ) 
!= 0) + atomic_add(s, &rdev->corrected_errors); + } + sectors -= s; + sect += s; + idx ++; + } + set_bit(R1BIO_Uptodate, &r1_bio->state); + set_bit(BIO_UPTODATE, &bio->bi_flags); + return 1; +} + +static int process_checks(struct r1bio *r1_bio) +{ + /* We have read all readable devices. If we haven't + * got the block, then there is no hope left. + * If we have, then we want to do a comparison + * and skip the write if everything is the same. + * If any blocks failed to read, then we need to + * attempt an over-write + */ + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + int primary; + int i; + int vcnt; + + /* Fix variable parts of all bios */ + vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); + for (i = 0; i < conf->raid_disks * 2; i++) { + int j; + int size; + int uptodate; + struct bio *b = r1_bio->bios[i]; + if (b->bi_end_io != end_sync_read) + continue; + /* fixup the bio for reuse, but preserve BIO_UPTODATE */ + uptodate = test_bit(BIO_UPTODATE, &b->bi_flags); + bio_reset(b); + if (!uptodate) + clear_bit(BIO_UPTODATE, &b->bi_flags); + b->bi_vcnt = vcnt; + b->bi_iter.bi_size = r1_bio->sectors << 9; + b->bi_iter.bi_sector = r1_bio->sector + + conf->mirrors[i].rdev->data_offset; + b->bi_bdev = conf->mirrors[i].rdev->bdev; + b->bi_end_io = end_sync_read; + b->bi_private = r1_bio; + + size = b->bi_iter.bi_size; + for (j = 0; j < vcnt ; j++) { + struct bio_vec *bi; + bi = &b->bi_io_vec[j]; + bi->bv_offset = 0; + if (size > PAGE_SIZE) + bi->bv_len = PAGE_SIZE; + else + bi->bv_len = size; + size -= PAGE_SIZE; } } + for (primary = 0; primary < conf->raid_disks * 2; primary++) + if (r1_bio->bios[primary]->bi_end_io == end_sync_read && + test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { + r1_bio->bios[primary]->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[primary].rdev, mddev); + break; + } + r1_bio->read_disk = primary; + for (i = 0; i < conf->raid_disks * 2; i++) { + int j; + struct bio *pbio = r1_bio->bios[primary]; + struct bio *sbio = r1_bio->bios[i]; + int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags); + if (sbio->bi_end_io != end_sync_read) + continue; + /* Now we can 'fixup' the BIO_UPTODATE flag */ + set_bit(BIO_UPTODATE, &sbio->bi_flags); + + if (uptodate) { + for (j = vcnt; j-- ; ) { + struct page *p, *s; + p = pbio->bi_io_vec[j].bv_page; + s = sbio->bi_io_vec[j].bv_page; + if (memcmp(page_address(p), + page_address(s), + sbio->bi_io_vec[j].bv_len)) + break; + } + } else + j = 0; + if (j >= 0) + atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); + if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) + && uptodate)) { + /* No need to write to this device. */ + sbio->bi_end_io = NULL; + rdev_dec_pending(conf->mirrors[i].rdev, mddev); + continue; + } + + bio_copy_data(sbio, pbio); + } + return 0; +} + +static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) +{ + struct r1conf *conf = mddev->private; + int i; + int disks = conf->raid_disks * 2; + struct bio *bio, *wbio; + + bio = r1_bio->bios[r1_bio->read_disk]; + + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) + /* ouch - failed to read all of that. 
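/*
 * Editor's note on process_checks() above: during a 'check'/'repair' pass the
 * pages read from each secondary are compared against the primary one page at
 * a time, and any difference counts the whole range as a mismatch and (unless
 * this is only a check pass) forces a rewrite.  A minimal userspace model of
 * that comparison:
 */
#include <string.h>

static int pages_differ(void * const *primary, void * const *secondary,
			int vcnt, size_t page_size)
{
	int j;

	for (j = 0; j < vcnt; j++)
		if (memcmp(primary[j], secondary[j], page_size))
			return 1;	/* at least one page differs */
	return 0;
}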
*/ + if (!fix_sync_read_error(r1_bio)) + return; + + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (process_checks(r1_bio) < 0) + return; /* * schedule writes */ @@ -1439,15 +2069,21 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) wbio->bi_rw = WRITE; wbio->bi_end_io = end_sync_write; atomic_inc(&r1_bio->remaining); - md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); + md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); generic_make_request(wbio); } if (atomic_dec_and_test(&r1_bio->remaining)) { /* if we're here, all write(s) have completed, so clean up */ - md_done_sync(mddev, r1_bio->sectors, 1); - put_buf(r1_bio); + int s = r1_bio->sectors; + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, 1); + } } } @@ -1456,19 +2092,19 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) * * 1. Retries failed read operations on working mirrors. * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. + * 3. Performs writes following reads for array synchronising. */ -static void fix_read_error(conf_t *conf, int read_disk, +static void fix_read_error(struct r1conf *conf, int read_disk, sector_t sect, int sectors) { - mddev_t *mddev = conf->mddev; + struct mddev *mddev = conf->mddev; while(sectors) { int s = sectors; int d = read_disk; int success = 0; int start; - mdk_rdev_t *rdev; + struct md_rdev *rdev; if (s > (PAGE_SIZE>>9)) s = PAGE_SIZE >> 9; @@ -1479,62 +2115,59 @@ static void fix_read_error(conf_t *conf, int read_disk, * which is the thread that might remove * a device. If raid1d ever becomes multi-threaded.... 
*/ + sector_t first_bad; + int bad_sectors; + rdev = conf->mirrors[d].rdev; if (rdev && - test_bit(In_sync, &rdev->flags) && - sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, - conf->tmppage, READ)) + (test_bit(In_sync, &rdev->flags) || + (!test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset >= sect + s)) && + is_badblock(rdev, sect, s, + &first_bad, &bad_sectors) == 0 && + sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false)) success = 1; else { d++; - if (d == conf->raid_disks) + if (d == conf->raid_disks * 2) d = 0; } } while (!success && d != read_disk); if (!success) { - /* Cannot read from anywhere -- bye bye array */ - md_error(mddev, conf->mirrors[read_disk].rdev); + /* Cannot read from anywhere - mark it bad */ + struct md_rdev *rdev = conf->mirrors[read_disk].rdev; + if (!rdev_set_badblocks(rdev, sect, s, 0)) + md_error(mddev, rdev); break; } /* write it back and re-read */ start = d; while (d != read_disk) { if (d==0) - d = conf->raid_disks; + d = conf->raid_disks * 2; d--; rdev = conf->mirrors[d].rdev; if (rdev && - test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) - == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - } + test_bit(In_sync, &rdev->flags)) + r1_sync_page_io(rdev, sect, s, + conf->tmppage, WRITE); } d = start; while (d != read_disk) { char b[BDEVNAME_SIZE]; if (d==0) - d = conf->raid_disks; + d = conf->raid_disks * 2; d--; rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags)) { - if (sync_page_io(rdev->bdev, - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) - == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else { + if (r1_sync_page_io(rdev, sect, s, + conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); printk(KERN_INFO - "raid1:%s: read error corrected " + "md/raid1:%s: read error corrected " "(%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect + @@ -1548,129 +2181,280 @@ static void fix_read_error(conf_t *conf, int read_disk, } } -static void raid1d(mddev_t *mddev) +static int narrow_write_error(struct r1bio *r1_bio, int i) +{ + struct mddev *mddev = r1_bio->mddev; + struct r1conf *conf = mddev->private; + struct md_rdev *rdev = conf->mirrors[i].rdev; + + /* bio has the data to be written to device 'i' where + * we just recently had a write error. + * We repeatedly clone the bio and trim down to one block, + * then try the write. Where the write fails we record + * a bad block. + * It is conceivable that the bio doesn't exactly align with + * blocks. We must handle this somehow. + * + * We currently own a reference on the rdev. 
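/*
 * Editor's note: the first chunk written by narrow_write_error() is sized so
 * that it ends on a badblocks-granularity boundary (block_sectors is a power
 * of two); the following chunks are then whole blocks.  The arithmetic in
 * isolation, with a couple of worked values:
 *
 *   sector = 1000, block_sectors = 8  ->  (1008 & ~7) - 1000 = 8
 *   sector = 1003, block_sectors = 8  ->  (1011 & ~7) - 1003 = 5
 */
static unsigned long long first_chunk_sectors(unsigned long long sector,
					      unsigned long long block_sectors)
{
	return ((sector + block_sectors) & ~(block_sectors - 1)) - sector;
}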
+ */ + + int block_sectors; + sector_t sector; + int sectors; + int sect_to_write = r1_bio->sectors; + int ok = 1; + + if (rdev->badblocks.shift < 0) + return 0; + + block_sectors = 1 << rdev->badblocks.shift; + sector = r1_bio->sector; + sectors = ((sector + block_sectors) + & ~(sector_t)(block_sectors - 1)) + - sector; + + while (sect_to_write) { + struct bio *wbio; + if (sectors > sect_to_write) + sectors = sect_to_write; + /* Write at 'sector' for 'sectors'*/ + + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + unsigned vcnt = r1_bio->behind_page_count; + struct bio_vec *vec = r1_bio->behind_bvecs; + + while (!vec->bv_page) { + vec++; + vcnt--; + } + + wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); + memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); + + wbio->bi_vcnt = vcnt; + } else { + wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + } + + wbio->bi_rw = WRITE; + wbio->bi_iter.bi_sector = r1_bio->sector; + wbio->bi_iter.bi_size = r1_bio->sectors << 9; + + bio_trim(wbio, sector - r1_bio->sector, sectors); + wbio->bi_iter.bi_sector += rdev->data_offset; + wbio->bi_bdev = rdev->bdev; + if (submit_bio_wait(WRITE, wbio) == 0) + /* failure! */ + ok = rdev_set_badblocks(rdev, sector, + sectors, 0) + && ok; + + bio_put(wbio); + sect_to_write -= sectors; + sector += sectors; + sectors = block_sectors; + } + return ok; +} + +static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio) +{ + int m; + int s = r1_bio->sectors; + for (m = 0; m < conf->raid_disks * 2 ; m++) { + struct md_rdev *rdev = conf->mirrors[m].rdev; + struct bio *bio = r1_bio->bios[m]; + if (bio->bi_end_io == NULL) + continue; + if (test_bit(BIO_UPTODATE, &bio->bi_flags) && + test_bit(R1BIO_MadeGood, &r1_bio->state)) { + rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); + } + if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && + test_bit(R1BIO_WriteError, &r1_bio->state)) { + if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) + md_error(conf->mddev, rdev); + } + } + put_buf(r1_bio); + md_done_sync(conf->mddev, s, 1); +} + +static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { - r1bio_t *r1_bio; + int m; + for (m = 0; m < conf->raid_disks * 2 ; m++) + if (r1_bio->bios[m] == IO_MADE_GOOD) { + struct md_rdev *rdev = conf->mirrors[m].rdev; + rdev_clear_badblocks(rdev, + r1_bio->sector, + r1_bio->sectors, 0); + rdev_dec_pending(rdev, conf->mddev); + } else if (r1_bio->bios[m] != NULL) { + /* This drive got a write error. We need to + * narrow down and record precise write + * errors. + */ + if (!narrow_write_error(r1_bio, m)) { + md_error(conf->mddev, + conf->mirrors[m].rdev); + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } + rdev_dec_pending(conf->mirrors[m].rdev, + conf->mddev); + } + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); +} + +static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) +{ + int disk; + int max_sectors; + struct mddev *mddev = conf->mddev; struct bio *bio; + char b[BDEVNAME_SIZE]; + struct md_rdev *rdev; + + clear_bit(R1BIO_ReadError, &r1_bio->state); + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. 
+ * This is all done synchronously while the array is + * frozen + */ + if (mddev->ro == 0) { + freeze_array(conf, 1); + fix_read_error(conf, r1_bio->read_disk, + r1_bio->sector, r1_bio->sectors); + unfreeze_array(conf); + } else + md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); + + bio = r1_bio->bios[r1_bio->read_disk]; + bdevname(bio->bi_bdev, b); +read_more: + disk = read_balance(conf, r1_bio, &max_sectors); + if (disk == -1) { + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" + " read error for block %llu\n", + mdname(mddev), b, (unsigned long long)r1_bio->sector); + raid_end_bio_io(r1_bio); + } else { + const unsigned long do_sync + = r1_bio->master_bio->bi_rw & REQ_SYNC; + if (bio) { + r1_bio->bios[r1_bio->read_disk] = + mddev->ro ? IO_BLOCKED : NULL; + bio_put(bio); + } + r1_bio->read_disk = disk; + bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, + max_sectors); + r1_bio->bios[r1_bio->read_disk] = bio; + rdev = conf->mirrors[disk].rdev; + printk_ratelimited(KERN_ERR + "md/raid1:%s: redirecting sector %llu" + " to other mirror: %s\n", + mdname(mddev), + (unsigned long long)r1_bio->sector, + bdevname(rdev->bdev, b)); + bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_end_io = raid1_end_read_request; + bio->bi_rw = READ | do_sync; + bio->bi_private = r1_bio; + if (max_sectors < r1_bio->sectors) { + /* Drat - have to split this up more */ + struct bio *mbio = r1_bio->master_bio; + int sectors_handled = (r1_bio->sector + max_sectors + - mbio->bi_iter.bi_sector); + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (mbio->bi_phys_segments == 0) + mbio->bi_phys_segments = 2; + else + mbio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + generic_make_request(bio); + bio = NULL; + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = mbio; + r1_bio->sectors = bio_sectors(mbio) - sectors_handled; + r1_bio->state = 0; + set_bit(R1BIO_ReadError, &r1_bio->state); + r1_bio->mddev = mddev; + r1_bio->sector = mbio->bi_iter.bi_sector + + sectors_handled; + + goto read_more; + } else + generic_make_request(bio); + } +} + +static void raid1d(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct r1bio *r1_bio; unsigned long flags; - conf_t *conf = mddev_to_conf(mddev); + struct r1conf *conf = mddev->private; struct list_head *head = &conf->retry_list; - int unplug=0; - mdk_rdev_t *rdev; + struct blk_plug plug; md_check_recovery(mddev); - + + blk_start_plug(&plug); for (;;) { - char b[BDEVNAME_SIZE]; - unplug += flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { spin_unlock_irqrestore(&conf->device_lock, flags); break; } - r1_bio = list_entry(head->prev, r1bio_t, retry_list); + r1_bio = list_entry(head->prev, struct r1bio, retry_list); list_del(head->prev); conf->nr_queued--; spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r1_bio->mddev; - conf = mddev_to_conf(mddev); + conf = mddev->private; if (test_bit(R1BIO_IsSync, &r1_bio->state)) { - sync_request_write(mddev, r1_bio); - unplug = 1; - } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { - /* some requests in the r1bio were BIO_RW_BARRIER - * requests which failed with -EOPNOTSUPP. Hohumm.. - * Better resubmit without the barrier. 
- * We know which devices to resubmit for, because - * all others have had their bios[] entry cleared. - * We already have a nr_pending reference on these rdevs. - */ - int i; - const int do_sync = bio_sync(r1_bio->master_bio); - clear_bit(R1BIO_BarrierRetry, &r1_bio->state); - clear_bit(R1BIO_Barrier, &r1_bio->state); - for (i=0; i < conf->raid_disks; i++) - if (r1_bio->bios[i]) - atomic_inc(&r1_bio->remaining); - for (i=0; i < conf->raid_disks; i++) - if (r1_bio->bios[i]) { - struct bio_vec *bvec; - int j; - - bio = bio_clone(r1_bio->master_bio, GFP_NOIO); - /* copy pages from the failed bio, as - * this might be a write-behind device */ - __bio_for_each_segment(bvec, bio, j, 0) - bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; - bio_put(r1_bio->bios[i]); - bio->bi_sector = r1_bio->sector + - conf->mirrors[i].rdev->data_offset; - bio->bi_bdev = conf->mirrors[i].rdev->bdev; - bio->bi_end_io = raid1_end_write_request; - bio->bi_rw = WRITE | do_sync; - bio->bi_private = r1_bio; - r1_bio->bios[i] = bio; - generic_make_request(bio); - } - } else { - int disk; - - /* we got a read error. Maybe the drive is bad. Maybe just - * the block and we can fix it. - * We freeze all other IO, and try reading the block from - * other devices. When we find one, we re-write - * and check it that fixes the read error. - * This is all done synchronously while the array is - * frozen + if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + handle_sync_write_finished(conf, r1_bio); + else + sync_request_write(mddev, r1_bio); + } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) || + test_bit(R1BIO_WriteError, &r1_bio->state)) + handle_write_finished(conf, r1_bio); + else if (test_bit(R1BIO_ReadError, &r1_bio->state)) + handle_read_error(conf, r1_bio); + else + /* just a partial read to be scheduled from separate + * context */ - if (mddev->ro == 0) { - freeze_array(conf); - fix_read_error(conf, r1_bio->read_disk, - r1_bio->sector, - r1_bio->sectors); - unfreeze_array(conf); - } + generic_make_request(r1_bio->bios[r1_bio->read_disk]); - bio = r1_bio->bios[r1_bio->read_disk]; - if ((disk=read_balance(conf, r1_bio)) == -1) { - printk(KERN_ALERT "raid1: %s: unrecoverable I/O" - " read error for block %llu\n", - bdevname(bio->bi_bdev,b), - (unsigned long long)r1_bio->sector); - raid_end_bio_io(r1_bio); - } else { - const int do_sync = bio_sync(r1_bio->master_bio); - r1_bio->bios[r1_bio->read_disk] = - mddev->ro ? IO_BLOCKED : NULL; - r1_bio->read_disk = disk; - bio_put(bio); - bio = bio_clone(r1_bio->master_bio, GFP_NOIO); - r1_bio->bios[r1_bio->read_disk] = bio; - rdev = conf->mirrors[disk].rdev; - if (printk_ratelimit()) - printk(KERN_ERR "raid1: %s: redirecting sector %llu to" - " another mirror\n", - bdevname(rdev->bdev,b), - (unsigned long long)r1_bio->sector); - bio->bi_sector = r1_bio->sector + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_end_io = raid1_end_read_request; - bio->bi_rw = READ | do_sync; - bio->bi_private = r1_bio; - unplug = 1; - generic_make_request(bio); - } - } + cond_resched(); + if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) + md_check_recovery(mddev); } - if (unplug) - unplug_slaves(mddev); + blk_finish_plug(&plug); } -static int init_resync(conf_t *conf) +static int init_resync(struct r1conf *conf) { int buffs; @@ -1694,29 +2478,26 @@ static int init_resync(conf_t *conf) * that can be installed to exclude normal IO requests. 
*/ -static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) +static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) { - conf_t *conf = mddev_to_conf(mddev); - r1bio_t *r1_bio; + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; struct bio *bio; sector_t max_sector, nr_sectors; int disk = -1; int i; int wonly = -1; int write_targets = 0, read_targets = 0; - int sync_blocks; + sector_t sync_blocks; int still_degraded = 0; + int good_sectors = RESYNC_SECTORS; + int min_bad = 0; /* number of sectors that are bad in all devices */ if (!conf->r1buf_pool) - { -/* - printk("sync start - bitmap %p\n", mddev->bitmap); -*/ if (init_resync(conf)) return 0; - } - max_sector = mddev->size << 1; + max_sector = mddev->dev_sectors; if (sector_nr >= max_sector) { /* If we aborted, we need to abort the * sync on the 'current' bitmap chunk (there will @@ -1759,11 +2540,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i msleep_interruptible(1000); bitmap_cond_end_sync(mddev->bitmap, sector_nr); + r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); raise_barrier(conf); conf->next_resync = sector_nr; - r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); rcu_read_lock(); /* * If we get a correctably read error during resync or recovery, @@ -1779,54 +2560,108 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i r1_bio->state = 0; set_bit(R1BIO_IsSync, &r1_bio->state); - for (i=0; i < conf->raid_disks; i++) { - mdk_rdev_t *rdev; + for (i = 0; i < conf->raid_disks * 2; i++) { + struct md_rdev *rdev; bio = r1_bio->bios[i]; - - /* take from bio_init */ - bio->bi_next = NULL; - bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_rw = READ; - bio->bi_vcnt = 0; - bio->bi_idx = 0; - bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; - bio->bi_size = 0; - bio->bi_end_io = NULL; - bio->bi_private = NULL; + bio_reset(bio); rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev == NULL || - test_bit(Faulty, &rdev->flags)) { - still_degraded = 1; - continue; + test_bit(Faulty, &rdev->flags)) { + if (i < conf->raid_disks) + still_degraded = 1; } else if (!test_bit(In_sync, &rdev->flags)) { bio->bi_rw = WRITE; bio->bi_end_io = end_sync_write; write_targets ++; } else { /* may need to read from here */ - bio->bi_rw = READ; - bio->bi_end_io = end_sync_read; - if (test_bit(WriteMostly, &rdev->flags)) { - if (wonly < 0) - wonly = i; - } else { - if (disk < 0) - disk = i; + sector_t first_bad = MaxSector; + int bad_sectors; + + if (is_badblock(rdev, sector_nr, good_sectors, + &first_bad, &bad_sectors)) { + if (first_bad > sector_nr) + good_sectors = first_bad - sector_nr; + else { + bad_sectors -= (sector_nr - first_bad); + if (min_bad == 0 || + min_bad > bad_sectors) + min_bad = bad_sectors; + } + } + if (sector_nr < first_bad) { + if (test_bit(WriteMostly, &rdev->flags)) { + if (wonly < 0) + wonly = i; + } else { + if (disk < 0) + disk = i; + } + bio->bi_rw = READ; + bio->bi_end_io = end_sync_read; + read_targets++; + } else if (!test_bit(WriteErrorSeen, &rdev->flags) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { + /* + * The device is suitable for reading (InSync), + * but has bad block(s) here. Let's try to correct them, + * if we are doing resync or repair. Otherwise, leave + * this device alone for this sync request. 
+ */ + bio->bi_rw = WRITE; + bio->bi_end_io = end_sync_write; + write_targets++; } - read_targets++; } - atomic_inc(&rdev->nr_pending); - bio->bi_sector = sector_nr + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_private = r1_bio; + if (bio->bi_end_io) { + atomic_inc(&rdev->nr_pending); + bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + bio->bi_private = r1_bio; + } } rcu_read_unlock(); if (disk < 0) disk = wonly; r1_bio->read_disk = disk; + if (read_targets == 0 && min_bad > 0) { + /* These sectors are bad on all InSync devices, so we + * need to mark them bad on all write targets + */ + int ok = 1; + for (i = 0 ; i < conf->raid_disks * 2 ; i++) + if (r1_bio->bios[i]->bi_end_io == end_sync_write) { + struct md_rdev *rdev = conf->mirrors[i].rdev; + ok = rdev_set_badblocks(rdev, sector_nr, + min_bad, 0 + ) && ok; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + *skipped = 1; + put_buf(r1_bio); + + if (!ok) { + /* Cannot record the badblocks, so need to + * abort the resync. + * If there are multiple read targets, could just + * fail the really bad ones ??? + */ + conf->recovery_disabled = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return 0; + } else + return min_bad; + + } + if (min_bad > 0 && min_bad < good_sectors) { + /* only resync enough to reach the next bad->good + * transition */ + good_sectors = min_bad; + } + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) /* extra read targets are also write targets */ write_targets += read_targets-1; @@ -1835,7 +2670,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* There is nowhere to write, so all non-sync * drives must be failed - so we are finished */ - sector_t rv = max_sector - sector_nr; + sector_t rv; + if (min_bad > 0) + max_sector = sector_nr + min_bad; + rv = max_sector - sector_nr; *skipped = 1; put_buf(r1_bio); return rv; @@ -1843,6 +2681,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (max_sector > mddev->resync_max) max_sector = mddev->resync_max; /* Don't do IO beyond here */ + if (max_sector > sector_nr + good_sectors) + max_sector = sector_nr + good_sectors; nr_sectors = 0; sync_blocks = 0; do { @@ -1859,11 +2699,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; BUG_ON(sync_blocks < (PAGE_SIZE>>9)); - if (len > (sync_blocks<<9)) + if ((len >> 9) > sync_blocks) len = sync_blocks<<9; } - for (i=0 ; i < conf->raid_disks; i++) { + for (i = 0 ; i < conf->raid_disks * 2; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io) { page = bio->bi_io_vec[bio->bi_vcnt].bv_page; @@ -1877,7 +2717,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i continue; /* remove last page from this bio */ bio->bi_vcnt--; - bio->bi_size -= len; + bio->bi_iter.bi_size -= len; bio->bi_flags &= ~(1<< BIO_SEG_VALID); } goto bio_full; @@ -1896,9 +2736,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { atomic_set(&r1_bio->remaining, read_targets); - for (i=0; i<conf->raid_disks; i++) { + for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { + read_targets--; md_sync_acct(bio->bi_bdev, nr_sectors); generic_make_request(bio); } @@ -1913,77 +2754,70 @@ static sector_t sync_request(mddev_t *mddev, 
sector_t sector_nr, int *skipped, i return nr_sectors; } -static int run(mddev_t *mddev) +static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks) { - conf_t *conf; - int i, j, disk_idx; - mirror_info_t *disk; - mdk_rdev_t *rdev; - struct list_head *tmp; + if (sectors) + return sectors; - if (mddev->level != 1) { - printk("raid1: %s: raid level not set to mirroring (%d)\n", - mdname(mddev), mddev->level); - goto out; - } - if (mddev->reshape_position != MaxSector) { - printk("raid1: %s: reshape_position set but not supported\n", - mdname(mddev)); - goto out; - } - /* - * copy the already verified devices into our private RAID1 - * bookkeeping area. [whatever we allocate in run(), - * should be freed in stop()] - */ - conf = kzalloc(sizeof(conf_t), GFP_KERNEL); - mddev->private = conf; + return mddev->dev_sectors; +} + +static struct r1conf *setup_conf(struct mddev *mddev) +{ + struct r1conf *conf; + int i; + struct raid1_info *disk; + struct md_rdev *rdev; + int err = -ENOMEM; + + conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL); if (!conf) - goto out_no_mem; + goto abort; - conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, + conf->mirrors = kzalloc(sizeof(struct raid1_info) + * mddev->raid_disks * 2, GFP_KERNEL); if (!conf->mirrors) - goto out_no_mem; + goto abort; conf->tmppage = alloc_page(GFP_KERNEL); if (!conf->tmppage) - goto out_no_mem; + goto abort; - conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); + conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); if (!conf->poolinfo) - goto out_no_mem; - conf->poolinfo->mddev = mddev; - conf->poolinfo->raid_disks = mddev->raid_disks; + goto abort; + conf->poolinfo->raid_disks = mddev->raid_disks * 2; conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, r1bio_pool_free, conf->poolinfo); if (!conf->r1bio_pool) - goto out_no_mem; + goto abort; - spin_lock_init(&conf->device_lock); - mddev->queue->queue_lock = &conf->device_lock; + conf->poolinfo->mddev = mddev; - rdev_for_each(rdev, tmp, mddev) { - disk_idx = rdev->raid_disk; + err = -EINVAL; + spin_lock_init(&conf->device_lock); + rdev_for_each(rdev, mddev) { + struct request_queue *q; + int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) continue; - disk = conf->mirrors + disk_idx; + if (test_bit(Replacement, &rdev->flags)) + disk = conf->mirrors + mddev->raid_disks + disk_idx; + else + disk = conf->mirrors + disk_idx; + if (disk->rdev) + goto abort; disk->rdev = rdev; - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. 
- */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); + q = bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) + mddev->merge_check_needed = 1; disk->head_position = 0; + disk->seq_start = MaxSector; } conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; @@ -1993,110 +2827,184 @@ static int run(mddev_t *mddev) init_waitqueue_head(&conf->wait_barrier); bio_list_init(&conf->pending_bio_list); - bio_list_init(&conf->flushing_bio_list); + conf->pending_count = 0; + conf->recovery_disabled = mddev->recovery_disabled - 1; + conf->start_next_window = MaxSector; + conf->current_window_requests = conf->next_window_requests = 0; - mddev->degraded = 0; - for (i = 0; i < conf->raid_disks; i++) { + err = -EIO; + for (i = 0; i < conf->raid_disks * 2; i++) { disk = conf->mirrors + i; + if (i < conf->raid_disks && + disk[conf->raid_disks].rdev) { + /* This slot has a replacement. */ + if (!disk->rdev) { + /* No original, just make the replacement + * a recovering spare + */ + disk->rdev = + disk[conf->raid_disks].rdev; + disk[conf->raid_disks].rdev = NULL; + } else if (!test_bit(In_sync, &disk->rdev->flags)) + /* Original is not in_sync - bad */ + goto abort; + } + if (!disk->rdev || !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; - mddev->degraded++; - if (disk->rdev) + if (disk->rdev && + (disk->rdev->saved_raid_disk < 0)) conf->fullsync = 1; } } - if (mddev->degraded == conf->raid_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", - mdname(mddev)); - goto out_free_conf; + + err = -ENOMEM; + conf->thread = md_register_thread(raid1d, mddev, "raid1"); + if (!conf->thread) { + printk(KERN_ERR + "md/raid1:%s: couldn't allocate thread\n", + mdname(mddev)); + goto abort; } - if (conf->raid_disks - mddev->degraded == 1) - mddev->recovery_cp = MaxSector; + return conf; + + abort: + if (conf) { + if (conf->r1bio_pool) + mempool_destroy(conf->r1bio_pool); + kfree(conf->mirrors); + safe_put_page(conf->tmppage); + kfree(conf->poolinfo); + kfree(conf); + } + return ERR_PTR(err); +} + +static int stop(struct mddev *mddev); +static int run(struct mddev *mddev) +{ + struct r1conf *conf; + int i; + struct md_rdev *rdev; + int ret; + bool discard_supported = false; + + if (mddev->level != 1) { + printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", + mdname(mddev), mddev->level); + return -EIO; + } + if (mddev->reshape_position != MaxSector) { + printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", + mdname(mddev)); + return -EIO; + } /* - * find the first working one and use it as a starting point - * to read balancing. + * copy the already verified devices into our private RAID1 + * bookkeeping area. 
[whatever we allocate in run(), + * should be freed in stop()] */ - for (j = 0; j < conf->raid_disks && - (!conf->mirrors[j].rdev || - !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++) - /* nothing */; - conf->last_used = j; + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; + if (IS_ERR(conf)) + return PTR_ERR(conf); - mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); - if (!mddev->thread) { - printk(KERN_ERR - "raid1: couldn't allocate thread for %s\n", - mdname(mddev)); - goto out_free_conf; + if (mddev->queue) + blk_queue_max_write_same_sectors(mddev->queue, 0); + + rdev_for_each(rdev, mddev) { + if (!mddev->gendisk) + continue; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } + mddev->degraded = 0; + for (i=0; i < conf->raid_disks; i++) + if (conf->mirrors[i].rdev == NULL || + !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || + test_bit(Faulty, &conf->mirrors[i].rdev->flags)) + mddev->degraded++; + + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; + + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "md/raid1:%s: not clean" + " -- starting background reconstruction\n", + mdname(mddev)); printk(KERN_INFO - "raid1: raid set %s active with %d out of %d mirrors\n", + "md/raid1:%s: active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, mddev->raid_disks); + /* * Ok, everything is just fine now */ - mddev->array_sectors = mddev->size * 2; - - mddev->queue->unplug_fn = raid1_unplug; - mddev->queue->backing_dev_info.congested_fn = raid1_congested; - mddev->queue->backing_dev_info.congested_data = mddev; + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; - return 0; + md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); -out_no_mem: - printk(KERN_ERR "raid1: couldn't allocate memory for %s\n", - mdname(mddev)); + if (mddev->queue) { + mddev->queue->backing_dev_info.congested_fn = raid1_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); -out_free_conf: - if (conf) { - if (conf->r1bio_pool) - mempool_destroy(conf->r1bio_pool); - kfree(conf->mirrors); - safe_put_page(conf->tmppage); - kfree(conf->poolinfo); - kfree(conf); - mddev->private = NULL; + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); } -out: - return -EIO; + + ret = md_integrity_register(mddev); + if (ret) + stop(mddev); + return ret; } -static int stop(mddev_t *mddev) +static int stop(struct mddev *mddev) { - conf_t *conf = mddev_to_conf(mddev); + struct r1conf *conf = mddev->private; struct bitmap *bitmap = mddev->bitmap; - int behind_wait = 0; /* wait for behind writes to complete */ - while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - behind_wait++; - printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); /* wait a second */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); /* need to kick something here to make sure I/O goes? 
*/ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); } - md_unregister_thread(mddev->thread); - mddev->thread = NULL; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ + freeze_array(conf, 0); + unfreeze_array(conf); + + md_unregister_thread(&mddev->thread); if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); + safe_put_page(conf->tmppage); kfree(conf->poolinfo); kfree(conf); mddev->private = NULL; return 0; } -static int raid1_resize(mddev_t *mddev, sector_t sectors) +static int raid1_resize(struct mddev *mddev, sector_t sectors) { /* no resync is happening, and there is enough space * on all devices, so we can resize. @@ -2105,20 +3013,29 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. */ - mddev->array_sectors = sectors; + sector_t newsize = raid1_size(mddev, sectors, 0); + if (mddev->external_size && + mddev->array_sectors > newsize) + return -EINVAL; + if (mddev->bitmap) { + int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0); + if (ret) + return ret; + } + md_set_array_sectors(mddev, newsize); set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; - if (mddev->array_sectors / 2 > mddev->size && - mddev->recovery_cp == MaxSector) { - mddev->recovery_cp = mddev->size << 1; + revalidate_disk(mddev->gendisk); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > mddev->dev_sectors) { + mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = mddev->array_sectors / 2; + mddev->dev_sectors = sectors; mddev->resync_max_sectors = sectors; return 0; } -static int raid1_reshape(mddev_t *mddev) +static int raid1_reshape(struct mddev *mddev) { /* We need to: * 1/ resize the r1bio_pool @@ -2133,17 +3050,17 @@ static int raid1_reshape(mddev_t *mddev) */ mempool_t *newpool, *oldpool; struct pool_info *newpoolinfo; - mirror_info_t *newmirrors; - conf_t *conf = mddev_to_conf(mddev); + struct raid1_info *newmirrors; + struct r1conf *conf = mddev->private; int cnt, raid_disks; unsigned long flags; int d, d2, err; /* Cannot change chunk_size, layout, or level */ - if (mddev->chunk_size != mddev->new_chunk || + if (mddev->chunk_sectors != mddev->new_chunk_sectors || mddev->layout != mddev->new_layout || mddev->level != mddev->new_level) { - mddev->new_chunk = mddev->chunk_size; + mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_layout = mddev->layout; mddev->new_level = mddev->level; return -EINVAL; @@ -2168,7 +3085,7 @@ static int raid1_reshape(mddev_t *mddev) if (!newpoolinfo) return -ENOMEM; newpoolinfo->mddev = mddev; - newpoolinfo->raid_disks = raid_disks; + newpoolinfo->raid_disks = raid_disks * 2; newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, r1bio_pool_free, newpoolinfo); @@ -2176,34 +3093,30 @@ static int raid1_reshape(mddev_t *mddev) kfree(newpoolinfo); return -ENOMEM; } - newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); + newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2, + GFP_KERNEL); if (!newmirrors) { kfree(newpoolinfo); mempool_destroy(newpool); return -ENOMEM; } - raise_barrier(conf); + freeze_array(conf, 0); /* ok, everything is stopped */ oldpool = conf->r1bio_pool; conf->r1bio_pool = newpool; for (d = d2 = 0; d < conf->raid_disks; d++) { - mdk_rdev_t *rdev = conf->mirrors[d].rdev; + struct md_rdev *rdev = conf->mirrors[d].rdev; if (rdev && rdev->raid_disk != d2) { - char nm[20]; - 
sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); + sysfs_unlink_rdev(mddev, rdev); rdev->raid_disk = d2; - sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - if (sysfs_create_link(&mddev->kobj, - &rdev->kobj, nm)) + sysfs_unlink_rdev(mddev, rdev); + if (sysfs_link_rdev(mddev, rdev)) printk(KERN_WARNING - "md/raid1: cannot register " - "%s for %s\n", - nm, mdname(mddev)); + "md/raid1:%s: cannot register rd%d\n", + mdname(mddev), rdev->raid_disk); } if (rdev) newmirrors[d2++].rdev = rdev; @@ -2219,8 +3132,7 @@ static int raid1_reshape(mddev_t *mddev) conf->raid_disks = mddev->raid_disks = raid_disks; mddev->delta_disks = 0; - conf->last_used = 0; /* just make sure it is in-range */ - lower_barrier(conf); + unfreeze_array(conf); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -2229,22 +3141,43 @@ static int raid1_reshape(mddev_t *mddev) return 0; } -static void raid1_quiesce(mddev_t *mddev, int state) +static void raid1_quiesce(struct mddev *mddev, int state) { - conf_t *conf = mddev_to_conf(mddev); + struct r1conf *conf = mddev->private; switch(state) { + case 2: /* wake for suspend */ + wake_up(&conf->wait_barrier); + break; case 1: - raise_barrier(conf); + freeze_array(conf, 0); break; case 0: - lower_barrier(conf); + unfreeze_array(conf); break; } } +static void *raid1_takeover(struct mddev *mddev) +{ + /* raid1 can take over: + * raid5 with 2 devices, any layout or chunk size + */ + if (mddev->level == 5 && mddev->raid_disks == 2) { + struct r1conf *conf; + mddev->new_level = 1; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + conf = setup_conf(mddev); + if (!IS_ERR(conf)) + /* Array must appear to be quiesced */ + conf->array_frozen = 1; + return conf; + } + return ERR_PTR(-EINVAL); +} -static struct mdk_personality raid1_personality = +static struct md_personality raid1_personality = { .name = "raid1", .level = 1, @@ -2259,8 +3192,10 @@ static struct mdk_personality raid1_personality = .spare_active = raid1_spare_active, .sync_request = sync_request, .resize = raid1_resize, + .size = raid1_size, .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, + .takeover = raid1_takeover, }; static int __init raid_init(void) @@ -2276,6 +3211,9 @@ static void raid_exit(void) module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); MODULE_ALIAS("md-personality-3"); /* RAID1 */ MODULE_ALIAS("md-raid1"); MODULE_ALIAS("md-level-1"); + +module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h new file mode 100644 index 00000000000..9bebca7bff2 --- /dev/null +++ b/drivers/md/raid1.h @@ -0,0 +1,178 @@ +#ifndef _RAID1_H +#define _RAID1_H + +struct raid1_info { + struct md_rdev *rdev; + sector_t head_position; + + /* When choose the best device for a read (read_balance()) + * we try to keep sequential reads one the same device + */ + sector_t next_seq_sect; + sector_t seq_start; +}; + +/* + * memory pools need a pointer to the mddev, so they can force an unplug + * when memory is tight, and a count of the number of drives that the + * pool was allocated for, so they know how much to allocate and free. + * mddev->raid_disks cannot be used, as it can change while a pool is active + * These two datums are stored in a kmalloced struct. + * The 'raid_disks' here is twice the raid_disks in r1conf. 
+ * This allows space for each 'real' device to have a replacement in the + * second half of the array. + */ + +struct pool_info { + struct mddev *mddev; + int raid_disks; +}; + +struct r1conf { + struct mddev *mddev; + struct raid1_info *mirrors; /* twice 'raid_disks' to + * allow for replacements. + */ + int raid_disks; + + /* During resync, read_balancing is only allowed on the part + * of the array that has been resynced. 'next_resync' tells us + * where that is. + */ + sector_t next_resync; + + /* When raid1 starts resync, we divide the array into four partitions + * |---------|--------------|---------------------|-------------| + * next_resync start_next_window end_window + * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE + * end_window = start_next_window + NEXT_NORMALIO_DISTANCE + * current_window_requests means the count of normal IO between + * start_next_window and end_window. + * next_window_requests means the count of normal IO after end_window. + * */ + sector_t start_next_window; + int current_window_requests; + int next_window_requests; + + spinlock_t device_lock; + + /* list of 'struct r1bio' that need to be processed by raid1d, + * whether to retry a read, writeout a resync or recovery + * block, or anything else. + */ + struct list_head retry_list; + + /* queue pending writes to be submitted on unplug */ + struct bio_list pending_bio_list; + int pending_count; + + /* for use when syncing mirrors: + * We don't allow both normal IO and resync/recovery IO at + * the same time - resync/recovery can only happen when there + * is no other IO. So when either is active, the other has to wait. + * See the more detailed description in raid1.c near raise_barrier(). + */ + wait_queue_head_t wait_barrier; + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + int array_frozen; + + /* Set to 1 if a full sync is needed (fresh device added). + * Cleared when a sync completes. + */ + int fullsync; + + /* When the same as mddev->recovery_disabled we don't allow + * recovery to be attempted as we expect a read error. + */ + int recovery_disabled; + + + /* poolinfo contains information about the content of the + * mempools - it changes when the array grows or shrinks + */ + struct pool_info *poolinfo; + mempool_t *r1bio_pool; + mempool_t *r1buf_pool; + + /* temporary buffer for synchronous IO when attempting to repair + * a read error. + */ + struct page *tmppage; + + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct md_thread *thread; +}; + +/* + * this is our 'private' RAID1 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct r1bio { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + atomic_t behind_remaining; /* number of write-behind ios remaining + * in this BehindIO request + */ + sector_t sector; + sector_t start_next_window; + int sectors; + unsigned long state; + struct mddev *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_disk; + + struct list_head retry_list; + /* Next two are only valid when R1BIO_BehindIO is set */ + struct bio_vec *behind_bvecs; + int behind_page_count; + /* + * if the IO is in WRITE direction, then multiple bios are used. + * We choose the number when they are allocated.
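The bios[] member declared just below is a flexible array: each r1bio is allocated with exactly as many trailing bio pointers as the pool was sized for, which is why nothing may be added after it. A standalone userspace sketch of that allocation pattern follows; the struct and helper names are illustrative rather than the kernel's, and the mempool machinery is replaced by plain calloc().

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Simplified stand-in for struct r1bio: a fixed header plus a
 * trailing array whose length is chosen at allocation time. */
struct demo_r1bio {
	int sectors;
	int read_disk;
	void *bios[];			/* flexible array member, like bios[0] below */
};

/* Allocate one object with room for raid_disks * 2 bio pointers,
 * mirroring how poolinfo->raid_disks sizes the real r1bio pool. */
static struct demo_r1bio *demo_alloc_r1bio(int raid_disks)
{
	size_t size = offsetof(struct demo_r1bio, bios) +
		      (size_t)raid_disks * 2 * sizeof(void *);

	return calloc(1, size);
}

int main(void)
{
	struct demo_r1bio *rb = demo_alloc_r1bio(4);	/* 4 disks plus replacement slots */

	if (!rb)
		return 1;
	printf("allocated header plus %d bio slots\n", 4 * 2);
	free(rb);
	return 0;
}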
+ */ + struct bio *bios[0]; + /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ +}; + +/* bits for r1bio.state */ +#define R1BIO_Uptodate 0 +#define R1BIO_IsSync 1 +#define R1BIO_Degraded 2 +#define R1BIO_BehindIO 3 +/* Set ReadError on bios that experience a readerror so that + * raid1d knows what to do with them. + */ +#define R1BIO_ReadError 4 +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, providing + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ +#define R1BIO_Returned 6 +/* If a write for this request means we can clear some + * known-bad-block records, we set this flag + */ +#define R1BIO_MadeGood 7 +#define R1BIO_WriteError 8 + +extern int md_raid1_congested(struct mddev *mddev, int bits); + +#endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e34cd0e6247..cb882aae9e2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -5,7 +5,7 @@ * * RAID-10 support for md. * - * Base on code in raid1.c. See raid1.c for futher copyright information. + * Base on code in raid1.c. See raid1.c for further copyright information. * * * This program is free software; you can redistribute it and/or modify @@ -18,9 +18,17 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "dm-bio-list.h" -#include <linux/raid/raid10.h> -#include <linux/raid/bitmap.h> +#include <linux/slab.h> +#include <linux/delay.h> +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/ratelimit.h> +#include <linux/kthread.h> +#include "md.h" +#include "raid10.h" +#include "raid0.h" +#include "bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. @@ -30,21 +38,36 @@ * near_copies (stored in low byte of layout) * far_copies (stored in second byte of layout) * far_offset (stored in bit 16 of layout ) + * use_far_sets (stored in bit 17 of layout ) * - * The data to be stored is divided into chunks using chunksize. - * Each device is divided into far_copies sections. - * In each section, chunks are laid out in a style similar to raid0, but - * near_copies copies of each chunk is stored (each on a different drive). - * The starting device for each section is offset near_copies from the starting - * device of the previous section. - * Thus they are (near_copies*far_copies) of each chunk, and each is on a different - * drive. - * near_copies and far_copies must be at least one, and their product is at most - * raid_disks. + * The data to be stored is divided into chunks using chunksize. Each device + * is divided into far_copies sections. In each section, chunks are laid out + * in a style similar to raid0, but near_copies copies of each chunk is stored + * (each on a different drive). The starting device for each section is offset + * near_copies from the starting device of the previous section. Thus there + * are (near_copies * far_copies) of each chunk, and each is on a different + * drive. near_copies and far_copies must be at least one, and their product + * is at most raid_disks. * * If far_offset is true, then the far_copies are handled a bit differently. - * The copies are still in different stripes, but instead of be very far apart - * on disk, there are adjacent stripes. 
+ * The copies are still in different stripes, but instead of being very far + * apart on disk, there are adjacent stripes. + * + * The far and offset algorithms are handled slightly differently if + * 'use_far_sets' is true. In this case, the array's devices are grouped into + * sets that are (near_copies * far_copies) in size. The far copied stripes + * are still shifted by 'near_copies' devices, but this shifting stays confined + * to the set rather than the entire array. This is done to improve the number + * of device combinations that can fail without causing the array to fail. + * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk + * on a device): + * A B C D A B C D E + * ... ... + * D A B C E A B C D + * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): + * [A B] [C D] [A B] [C D E] + * |...| |...| |...| | ... | + * [B A] [D C] [B A] [E C D] */ /* @@ -52,23 +75,43 @@ */ #define NR_RAID10_BIOS 256 -static void unplug_slaves(mddev_t *mddev); +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) -static void allow_barrier(conf_t *conf); -static void lower_barrier(conf_t *conf); +/* When there are this many requests queued to be written by + * the raid10 thread, we become 'congested' to provide back-pressure + * for writeback. 
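A compact way to see the near/far layout described in the comment above is to compute, for a given logical chunk, every (device, chunk row) pair that holds a copy. The sketch below is a userspace model of that mapping under simplifying assumptions: far_offset and use_far_sets are ignored, the stride is expressed in chunk rows, and all names are illustrative rather than the kernel's.

#include <stdio.h>

/* Minimal model of the raid10 geometry: near_copies adjacent copies
 * per stripe, plus far copies placed 'stride' chunk rows further
 * down each device and shifted by near_copies devices. */
struct demo_geo {
	int raid_disks;
	int near_copies;
	int far_copies;
	long stride;		/* chunk rows per far section on each device */
};

static void demo_find_phys(const struct demo_geo *geo, long chunk)
{
	long stripe = chunk * geo->near_copies;
	int dev = (int)(stripe % geo->raid_disks);
	long row = stripe / geo->raid_disks;	/* chunk row in the first section */

	for (int n = 0; n < geo->near_copies; n++) {
		int d = dev;
		long r = row;

		for (int f = 0; f < geo->far_copies; f++) {
			printf("chunk %ld copy %d -> dev %d, chunk row %ld\n",
			       chunk, n * geo->far_copies + f, d, r);
			d = (d + geo->near_copies) % geo->raid_disks;
			r += geo->stride;
		}
		dev++;
		if (dev >= geo->raid_disks) {
			dev = 0;	/* wrapped: the next near copy starts on the next row */
			row++;
		}
	}
}

int main(void)
{
	/* 5 devices, 2 near copies, 2 far copies: product <= raid_disks */
	struct demo_geo geo = { .raid_disks = 5, .near_copies = 2,
				.far_copies = 2, .stride = 100 };

	for (long chunk = 0; chunk < 3; chunk++)
		demo_find_phys(&geo, chunk);
	return 0;
}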
+ */ +static int max_queued_requests = 1024; + +static void allow_barrier(struct r10conf *conf); +static void lower_barrier(struct r10conf *conf); +static int _enough(struct r10conf *conf, int previous, int ignore); +static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, + int *skipped); +static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); +static void end_reshape_write(struct bio *bio, int error); +static void end_reshape(struct r10conf *conf); static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { - conf_t *conf = data; - r10bio_t *r10_bio; - int size = offsetof(struct r10bio_s, devs[conf->copies]); - - /* allocate a r10bio with room for raid_disks entries in the bios array */ - r10_bio = kzalloc(size, gfp_flags); - if (!r10_bio) - unplug_slaves(conf->mddev); + struct r10conf *conf = data; + int size = offsetof(struct r10bio, devs[conf->copies]); - return r10_bio; + /* allocate a r10bio with room for raid_disks entries in the + * bios array */ + return kzalloc(size, gfp_flags); } static void r10bio_pool_free(void *r10_bio, void *data) @@ -93,20 +136,19 @@ static void r10bio_pool_free(void *r10_bio, void *data) */ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) { - conf_t *conf = data; + struct r10conf *conf = data; struct page *page; - r10bio_t *r10_bio; + struct r10bio *r10_bio; struct bio *bio; int i, j; int nalloc; r10_bio = r10bio_pool_alloc(gfp_flags, conf); - if (!r10_bio) { - unplug_slaves(conf->mddev); + if (!r10_bio) return NULL; - } - if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) nalloc = conf->copies; /* resync */ else nalloc = 2; /* recovery */ @@ -115,23 +157,40 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) * Allocate bios. */ for (j = nalloc ; j-- ; ) { - bio = bio_alloc(gfp_flags, RESYNC_PAGES); + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); if (!bio) goto out_free_bio; r10_bio->devs[j].bio = bio; + if (!conf->have_replacement) + continue; + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); + if (!bio) + goto out_free_bio; + r10_bio->devs[j].repl_bio = bio; } /* * Allocate RESYNC_PAGES data pages and attach them * where needed. 
*/ for (j = 0 ; j < nalloc; j++) { + struct bio *rbio = r10_bio->devs[j].repl_bio; bio = r10_bio->devs[j].bio; for (i = 0; i < RESYNC_PAGES; i++) { - page = alloc_page(gfp_flags); + if (j > 0 && !test_bit(MD_RECOVERY_SYNC, + &conf->mddev->recovery)) { + /* we can share bv_page's during recovery + * and reshape */ + struct bio *rbio = r10_bio->devs[0].bio; + page = rbio->bi_io_vec[i].bv_page; + get_page(page); + } else + page = alloc_page(gfp_flags); if (unlikely(!page)) goto out_free_pages; bio->bi_io_vec[i].bv_page = page; + if (rbio) + rbio->bi_io_vec[i].bv_page = page; } } @@ -143,10 +202,14 @@ out_free_pages: while (j--) for (i = 0; i < RESYNC_PAGES ; i++) safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); - j = -1; + j = 0; out_free_bio: - while ( ++j < nalloc ) - bio_put(r10_bio->devs[j].bio); + for ( ; j < nalloc; j++) { + if (r10_bio->devs[j].bio) + bio_put(r10_bio->devs[j].bio); + if (r10_bio->devs[j].repl_bio) + bio_put(r10_bio->devs[j].repl_bio); + } r10bio_pool_free(r10_bio, conf); return NULL; } @@ -154,8 +217,8 @@ out_free_bio: static void r10buf_pool_free(void *__r10_bio, void *data) { int i; - conf_t *conf = data; - r10bio_t *r10bio = __r10_bio; + struct r10conf *conf = data; + struct r10bio *r10bio = __r10_bio; int j; for (j=0; j < conf->copies; j++) { @@ -167,50 +230,51 @@ static void r10buf_pool_free(void *__r10_bio, void *data) } bio_put(bio); } + bio = r10bio->devs[j].repl_bio; + if (bio) + bio_put(bio); } r10bio_pool_free(r10bio, conf); } -static void put_all_bios(conf_t *conf, r10bio_t *r10_bio) +static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) { int i; for (i = 0; i < conf->copies; i++) { struct bio **bio = & r10_bio->devs[i].bio; - if (*bio && *bio != IO_BLOCKED) + if (!BIO_SPECIAL(*bio)) + bio_put(*bio); + *bio = NULL; + bio = &r10_bio->devs[i].repl_bio; + if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) bio_put(*bio); *bio = NULL; } } -static void free_r10bio(r10bio_t *r10_bio) +static void free_r10bio(struct r10bio *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); - - /* - * Wake up any possible resync thread that waits for the device - * to go idle. - */ - allow_barrier(conf); + struct r10conf *conf = r10_bio->mddev->private; put_all_bios(conf, r10_bio); mempool_free(r10_bio, conf->r10bio_pool); } -static void put_buf(r10bio_t *r10_bio) +static void put_buf(struct r10bio *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + struct r10conf *conf = r10_bio->mddev->private; mempool_free(r10_bio, conf->r10buf_pool); lower_barrier(conf); } -static void reschedule_retry(r10bio_t *r10_bio) +static void reschedule_retry(struct r10bio *r10_bio) { unsigned long flags; - mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; spin_lock_irqsave(&conf->device_lock, flags); list_add(&r10_bio->retry_list, &conf->retry_list); @@ -228,36 +292,84 @@ static void reschedule_retry(r10bio_t *r10_bio) * operation and are ready to return a success/failure code to the buffer * cache layer. */ -static void raid_end_bio_io(r10bio_t *r10_bio) +static void raid_end_bio_io(struct r10bio *r10_bio) { struct bio *bio = r10_bio->master_bio; + int done; + struct r10conf *conf = r10_bio->mddev->private; - bio_endio(bio, - test_bit(R10BIO_Uptodate, &r10_bio->state) ? 
0 : -EIO); + if (bio->bi_phys_segments) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + bio->bi_phys_segments--; + done = (bio->bi_phys_segments == 0); + spin_unlock_irqrestore(&conf->device_lock, flags); + } else + done = 1; + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (done) { + bio_endio(bio, 0); + /* + * Wake up any possible resync thread that waits for the device + * to go idle. + */ + allow_barrier(conf); + } free_r10bio(r10_bio); } /* * Update disk head position estimator based on IRQ completion info. */ -static inline void update_head_pos(int slot, r10bio_t *r10_bio) +static inline void update_head_pos(int slot, struct r10bio *r10_bio) { - conf_t *conf = mddev_to_conf(r10_bio->mddev); + struct r10conf *conf = r10_bio->mddev->private; conf->mirrors[r10_bio->devs[slot].devnum].head_position = r10_bio->devs[slot].addr + (r10_bio->sectors); } +/* + * Find the disk number which triggered given bio + */ +static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, + struct bio *bio, int *slotp, int *replp) +{ + int slot; + int repl = 0; + + for (slot = 0; slot < conf->copies; slot++) { + if (r10_bio->devs[slot].bio == bio) + break; + if (r10_bio->devs[slot].repl_bio == bio) { + repl = 1; + break; + } + } + + BUG_ON(slot == conf->copies); + update_head_pos(slot, r10_bio); + + if (slotp) + *slotp = slot; + if (replp) + *replp = repl; + return r10_bio->devs[slot].devnum; +} + static void raid10_end_read_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); + struct r10bio *r10_bio = bio->bi_private; int slot, dev; - conf_t *conf = mddev_to_conf(r10_bio->mddev); + struct md_rdev *rdev; + struct r10conf *conf = r10_bio->mddev->private; slot = r10_bio->read_slot; dev = r10_bio->devs[slot].devnum; + rdev = r10_bio->devs[slot].rdev; /* * this branch is our 'one mirror IO has finished' event handler: */ @@ -274,41 +386,96 @@ static void raid10_end_read_request(struct bio *bio, int error) * wait for the 'master' bio. */ set_bit(R10BIO_Uptodate, &r10_bio->state); + } else { + /* If all other devices that store this block have + * failed, we want to return the error upwards rather + * than fail the last device. 
Here we redefine + * "uptodate" to mean "Don't want to retry" + */ + if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state), + rdev->raid_disk)) + uptodate = 1; + } + if (uptodate) { raid_end_bio_io(r10_bio); + rdev_dec_pending(rdev, conf->mddev); } else { /* - * oops, read error: + * oops, read error - keep the refcount on the rdev */ char b[BDEVNAME_SIZE]; - if (printk_ratelimit()) - printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", - bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); + printk_ratelimited(KERN_ERR + "md/raid10:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); + set_bit(R10BIO_ReadError, &r10_bio->state); reschedule_retry(r10_bio); } +} + +static void close_write(struct r10bio *r10_bio) +{ + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, + r10_bio->sectors, + !test_bit(R10BIO_Degraded, &r10_bio->state), + 0); + md_write_end(r10_bio->mddev); +} - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); +static void one_write_done(struct r10bio *r10_bio) +{ + if (atomic_dec_and_test(&r10_bio->remaining)) { + if (test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else { + close_write(r10_bio); + if (test_bit(R10BIO_MadeGood, &r10_bio->state)) + reschedule_retry(r10_bio); + else + raid_end_bio_io(r10_bio); + } + } } static void raid10_end_write_request(struct bio *bio, int error) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); - int slot, dev; - conf_t *conf = mddev_to_conf(r10_bio->mddev); - - for (slot = 0; slot < conf->copies; slot++) - if (r10_bio->devs[slot].bio == bio) - break; - dev = r10_bio->devs[slot].devnum; - + struct r10bio *r10_bio = bio->bi_private; + int dev; + int dec_rdev = 1; + struct r10conf *conf = r10_bio->mddev->private; + int slot, repl; + struct md_rdev *rdev = NULL; + + dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + + if (repl) + rdev = conf->mirrors[dev].replacement; + if (!rdev) { + smp_rmb(); + repl = 0; + rdev = conf->mirrors[dev].rdev; + } /* * this branch is our 'one mirror IO has finished' event handler: */ if (!uptodate) { - md_error(r10_bio->mddev, conf->mirrors[dev].rdev); - /* an I/O failed, we can't clear the bitmap */ - set_bit(R10BIO_Degraded, &r10_bio->state); - } else + if (repl) + /* Never record new bad blocks to replacement, + * just fail it. + */ + md_error(rdev->mddev, rdev); + else { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + set_bit(R10BIO_WriteError, &r10_bio->state); + dec_rdev = 0; + } + } else { /* * Set R10BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -318,39 +485,56 @@ static void raid10_end_write_request(struct bio *bio, int error) * user-side. So if something waits for IO, then it will * wait for the 'master' bio. */ - set_bit(R10BIO_Uptodate, &r10_bio->state); + sector_t first_bad; + int bad_sectors; - update_head_pos(slot, r10_bio); + /* + * Do not set R10BIO_Uptodate if the current device is + * rebuilding or Faulty. This is because we cannot use + * such device for properly reading the data back (we could + * potentially use it, if the current write would have felt + * before rdev->recovery_offset, but for simplicity we don't + * check this here. 
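raid_end_bio_io() above reuses bi_phys_segments on the master bio as a count of outstanding sub-requests, so a request that was split (around bad blocks, for instance) only completes upward once every piece has finished. A toy version of that counting pattern, with illustrative names and no locking, might look like this:

#include <stdio.h>

/* A master request split into several sub-requests; the upper layer
 * is notified only when the last sub-request completes, and any
 * failure seen along the way is remembered until then. */
struct demo_master {
	int segments;		/* outstanding sub-requests */
	int error;		/* sticky error status */
};

static void demo_split(struct demo_master *m, int nsubs)
{
	m->segments = nsubs;
	m->error = 0;
}

static void demo_end_sub(struct demo_master *m, int error)
{
	if (error)
		m->error = error;
	if (--m->segments == 0)		/* last sub-request done */
		printf("master request completes with status %d\n", m->error);
}

int main(void)
{
	struct demo_master m;

	demo_split(&m, 3);	/* e.g. a read split around two bad ranges */
	demo_end_sub(&m, 0);
	demo_end_sub(&m, 0);
	demo_end_sub(&m, 0);	/* the third completion ends the master */
	return 0;
}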
+ */ + if (test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) + set_bit(R10BIO_Uptodate, &r10_bio->state); + + /* Maybe we can clear some bad blocks. */ + if (is_badblock(rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors)) { + bio_put(bio); + if (repl) + r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; + else + r10_bio->devs[slot].bio = IO_MADE_GOOD; + dec_rdev = 0; + set_bit(R10BIO_MadeGood, &r10_bio->state); + } + } /* * * Let's see if all mirrored write operations have finished * already. */ - if (atomic_dec_and_test(&r10_bio->remaining)) { - /* clear the bitmap if all writes complete successfully */ - bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, - r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - 0); - md_write_end(r10_bio->mddev); - raid_end_bio_io(r10_bio); - } - - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + one_write_done(r10_bio); + if (dec_rdev) + rdev_dec_pending(rdev, conf->mddev); } - /* * RAID10 layout manager - * Aswell as the chunksize and raid_disks count, there are two + * As well as the chunksize and raid_disks count, there are two * parameters: near_copies and far_copies. * near_copies * far_copies must be <= raid_disks. * Normally one of these will be 1. * If both are 1, we get raid0. * If near_copies == raid_disks, we get raid1. * - * Chunks are layed out in raid0 style with near_copies copies of the + * Chunks are laid out in raid0 style with near_copies copies of the * first chunk, followed by near_copies copies of the next chunk and * so on. * If far_copies > 1, then after 1/far_copies of the array has been assigned @@ -366,79 +550,127 @@ static void raid10_end_write_request(struct bio *bio, int error) * sector offset to a virtual address */ -static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio) +static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) { int n,f; sector_t sector; sector_t chunk; sector_t stripe; int dev; - int slot = 0; + int last_far_set_start, last_far_set_size; + + last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; + last_far_set_start *= geo->far_set_size; + + last_far_set_size = geo->far_set_size; + last_far_set_size += (geo->raid_disks % geo->far_set_size); /* now calculate first sector/dev */ - chunk = r10bio->sector >> conf->chunk_shift; - sector = r10bio->sector & conf->chunk_mask; + chunk = r10bio->sector >> geo->chunk_shift; + sector = r10bio->sector & geo->chunk_mask; - chunk *= conf->near_copies; + chunk *= geo->near_copies; stripe = chunk; - dev = sector_div(stripe, conf->raid_disks); - if (conf->far_offset) - stripe *= conf->far_copies; + dev = sector_div(stripe, geo->raid_disks); + if (geo->far_offset) + stripe *= geo->far_copies; - sector += stripe << conf->chunk_shift; + sector += stripe << geo->chunk_shift; /* and calculate all the others */ - for (n=0; n < conf->near_copies; n++) { + for (n = 0; n < geo->near_copies; n++) { int d = dev; + int set; sector_t s = sector; - r10bio->devs[slot].addr = sector; r10bio->devs[slot].devnum = d; + r10bio->devs[slot].addr = s; slot++; - for (f = 1; f < conf->far_copies; f++) { - d += conf->near_copies; - if (d >= conf->raid_disks) - d -= conf->raid_disks; - s += conf->stride; + for (f = 1; f < geo->far_copies; f++) { + set = d / geo->far_set_size; + d += geo->near_copies; + + if ((geo->raid_disks % geo->far_set_size) && + (d > last_far_set_start)) { + d -= last_far_set_start; + d %= last_far_set_size; + d += last_far_set_start; + } else { + d %= geo->far_set_size; + d += 
geo->far_set_size * set; + } + s += geo->stride; r10bio->devs[slot].devnum = d; r10bio->devs[slot].addr = s; slot++; } dev++; - if (dev >= conf->raid_disks) { + if (dev >= geo->raid_disks) { dev = 0; - sector += (conf->chunk_mask + 1); + sector += (geo->chunk_mask + 1); } } - BUG_ON(slot != conf->copies); } -static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) +static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) +{ + struct geom *geo = &conf->geo; + + if (conf->reshape_progress != MaxSector && + ((r10bio->sector >= conf->reshape_progress) != + conf->mddev->reshape_backwards)) { + set_bit(R10BIO_Previous, &r10bio->state); + geo = &conf->prev; + } else + clear_bit(R10BIO_Previous, &r10bio->state); + + __raid10_find_phys(geo, r10bio); +} + +static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) { sector_t offset, chunk, vchunk; + /* Never use conf->prev as this is only called during resync + * or recovery, so reshape isn't happening + */ + struct geom *geo = &conf->geo; + int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; + int far_set_size = geo->far_set_size; + int last_far_set_start; + + if (geo->raid_disks % geo->far_set_size) { + last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; + last_far_set_start *= geo->far_set_size; + + if (dev >= last_far_set_start) { + far_set_size = geo->far_set_size; + far_set_size += (geo->raid_disks % geo->far_set_size); + far_set_start = last_far_set_start; + } + } - offset = sector & conf->chunk_mask; - if (conf->far_offset) { + offset = sector & geo->chunk_mask; + if (geo->far_offset) { int fc; - chunk = sector >> conf->chunk_shift; - fc = sector_div(chunk, conf->far_copies); - dev -= fc * conf->near_copies; - if (dev < 0) - dev += conf->raid_disks; + chunk = sector >> geo->chunk_shift; + fc = sector_div(chunk, geo->far_copies); + dev -= fc * geo->near_copies; + if (dev < far_set_start) + dev += far_set_size; } else { - while (sector >= conf->stride) { - sector -= conf->stride; - if (dev < conf->near_copies) - dev += conf->raid_disks - conf->near_copies; + while (sector >= geo->stride) { + sector -= geo->stride; + if (dev < (geo->near_copies + far_set_start)) + dev += far_set_size - geo->near_copies; else - dev -= conf->near_copies; + dev -= geo->near_copies; } - chunk = sector >> conf->chunk_shift; + chunk = sector >> geo->chunk_shift; } - vchunk = chunk * conf->raid_disks + dev; - sector_div(vchunk, conf->near_copies); - return (vchunk << conf->chunk_shift) + offset; + vchunk = chunk * geo->raid_disks + dev; + sector_div(vchunk, geo->near_copies); + return (vchunk << geo->chunk_shift) + offset; } /** @@ -448,25 +680,85 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset - * If near_copies == raid_disk, there are no striping issues, - * but in that case, the function isn't called at all. + * This requires checking for end-of-chunk if near_copies != raid_disks, + * and for subordinate merge_bvec_fns if merge_check_needed. 
*/ static int raid10_mergeable_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - mddev_t *mddev = q->queuedata; + struct mddev *mddev = q->queuedata; + struct r10conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; - - max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; - if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= biovec->bv_len && bio_sectors == 0) - return biovec->bv_len; - else - return max; + struct geom *geo = &conf->geo; + + chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; + if (conf->reshape_progress != MaxSector && + ((sector >= conf->reshape_progress) != + conf->mddev->reshape_backwards)) + geo = &conf->prev; + + if (geo->near_copies < geo->raid_disks) { + max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + + bio_sectors)) << 9; + if (max < 0) + /* bio_add cannot handle a negative return */ + max = 0; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + } else + max = biovec->bv_len; + + if (mddev->merge_check_needed) { + struct { + struct r10bio r10_bio; + struct r10dev devs[conf->copies]; + } on_stack; + struct r10bio *r10_bio = &on_stack.r10_bio; + int s; + if (conf->reshape_progress != MaxSector) { + /* Cannot give any guidance during reshape */ + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; + return 0; + } + r10_bio->sector = sector; + raid10_find_phys(conf, r10_bio); + rcu_read_lock(); + for (s = 0; s < conf->copies; s++) { + int disk = r10_bio->devs[s].devnum; + struct md_rdev *rdev = rcu_dereference( + conf->mirrors[disk].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = r10_bio->devs[s].addr + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + rdev = rcu_dereference(conf->mirrors[disk].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = + bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) { + bvm->bi_sector = r10_bio->devs[s].addr + + rdev->data_offset; + bvm->bi_bdev = rdev->bdev; + max = min(max, q->merge_bvec_fn( + q, bvm, biovec)); + } + } + } + rcu_read_unlock(); + } + return max; } /* @@ -488,16 +780,29 @@ static int raid10_mergeable_bvec(struct request_queue *q, * FIXME: possibly should rethink readbalancing and do it differently * depending on near_copies / far_copies geometry. */ -static int read_balance(conf_t *conf, r10bio_t *r10_bio) +static struct md_rdev *read_balance(struct r10conf *conf, + struct r10bio *r10_bio, + int *max_sectors) { - const unsigned long this_sector = r10_bio->sector; - int disk, slot, nslot; - const int sectors = r10_bio->sectors; - sector_t new_distance, current_distance; - mdk_rdev_t *rdev; + const sector_t this_sector = r10_bio->sector; + int disk, slot; + int sectors = r10_bio->sectors; + int best_good_sectors; + sector_t new_distance, best_dist; + struct md_rdev *best_rdev, *rdev = NULL; + int do_balance; + int best_slot; + struct geom *geo = &conf->geo; raid10_find_phys(conf, r10_bio); rcu_read_lock(); +retry: + sectors = r10_bio->sectors; + best_slot = -1; + best_rdev = NULL; + best_dist = MaxSector; + best_good_sectors = 0; + do_balance = 1; /* * Check if we can balance. 
We can balance on the whole * device if no resync is going on (recovery is ok), or below @@ -505,132 +810,122 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio) * above the resync window. */ if (conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) { - /* make sure that disk is operational */ - slot = 0; - disk = r10_bio->devs[slot].devnum; - - while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || - r10_bio->devs[slot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) { - slot++; - if (slot == conf->copies) { - slot = 0; - disk = -1; - break; - } - disk = r10_bio->devs[slot].devnum; - } - goto rb_out; - } + && (this_sector + sectors >= conf->next_resync)) + do_balance = 0; + for (slot = 0; slot < conf->copies ; slot++) { + sector_t first_bad; + int bad_sectors; + sector_t dev_sector; - /* make sure the disk is operational */ - slot = 0; - disk = r10_bio->devs[slot].devnum; - while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || - r10_bio->devs[slot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) { - slot ++; - if (slot == conf->copies) { - disk = -1; - goto rb_out; - } + if (r10_bio->devs[slot].bio == IO_BLOCKED) + continue; disk = r10_bio->devs[slot].devnum; - } - - - current_distance = abs(r10_bio->devs[slot].addr - - conf->mirrors[disk].head_position); - - /* Find the disk whose head is closest, - * or - for far > 1 - find the closest to partition beginning */ - - for (nslot = slot; nslot < conf->copies; nslot++) { - int ndisk = r10_bio->devs[nslot].devnum; - + rdev = rcu_dereference(conf->mirrors[disk].replacement); + if (rdev == NULL || test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags) || + r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) + rdev = rcu_dereference(conf->mirrors[disk].rdev); + if (rdev == NULL || + test_bit(Faulty, &rdev->flags) || + test_bit(Unmerged, &rdev->flags)) + continue; + if (!test_bit(In_sync, &rdev->flags) && + r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) + continue; - if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || - r10_bio->devs[nslot].bio == IO_BLOCKED || - !test_bit(In_sync, &rdev->flags)) + dev_sector = r10_bio->devs[slot].addr; + if (is_badblock(rdev, dev_sector, sectors, + &first_bad, &bad_sectors)) { + if (best_dist < MaxSector) + /* Already have a better slot */ + continue; + if (first_bad <= dev_sector) { + /* Cannot read here. If this is the + * 'primary' device, then we must not read + * beyond 'bad_sectors' from another device. + */ + bad_sectors -= (dev_sector - first_bad); + if (!do_balance && sectors > bad_sectors) + sectors = bad_sectors; + if (best_good_sectors > sectors) + best_good_sectors = sectors; + } else { + sector_t good_sectors = + first_bad - dev_sector; + if (good_sectors > best_good_sectors) { + best_good_sectors = good_sectors; + best_slot = slot; + best_rdev = rdev; + } + if (!do_balance) + /* Must read from here */ + break; + } continue; + } else + best_good_sectors = sectors; + + if (!do_balance) + break; /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later. 
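The selection loop above boils down to a simple heuristic: skip mirrors that are failed or not yet in sync, prefer an idle device on 'near' layouts, and otherwise take the mirror whose head is closest to the target sector. A standalone sketch of just that heuristic, leaving out the bad-block and far-copy handling and using illustrative field names:

#include <stdio.h>
#include <stdlib.h>

/* Toy mirror descriptor; the fields mimic, but are not, the kernel's. */
struct demo_mirror {
	long head_position;	/* last known head position, in sectors */
	int in_sync;
	int pending;		/* requests currently queued to this device */
};

static int demo_read_balance(const struct demo_mirror *m, int nmirrors,
			     long sector, int near_copies)
{
	long best_dist = 0;
	int best = -1;

	for (int i = 0; i < nmirrors; i++) {
		long dist;

		if (!m[i].in_sync)
			continue;
		/* an idle device wins outright on 'near' layouts */
		if (near_copies > 1 && m[i].pending == 0)
			return i;
		dist = labs(sector - m[i].head_position);
		if (best < 0 || dist < best_dist) {
			best_dist = dist;
			best = i;
		}
	}
	return best;		/* -1 means every mirror is unusable */
}

int main(void)
{
	struct demo_mirror m[2] = {
		{ .head_position = 1000, .in_sync = 1, .pending = 3 },
		{ .head_position = 9000, .in_sync = 1, .pending = 2 },
	};

	/* sector 1200 is closest to mirror 0's head */
	printf("read sector 1200 from mirror %d\n",
	       demo_read_balance(m, 2, 1200, 1));
	return 0;
}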
*/ - if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) { - disk = ndisk; - slot = nslot; + if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) break; - } /* for far > 1 always use the lowest address */ - if (conf->far_copies > 1) - new_distance = r10_bio->devs[nslot].addr; + if (geo->far_copies > 1) + new_distance = r10_bio->devs[slot].addr; else - new_distance = abs(r10_bio->devs[nslot].addr - - conf->mirrors[ndisk].head_position); - if (new_distance < current_distance) { - current_distance = new_distance; - disk = ndisk; - slot = nslot; + new_distance = abs(r10_bio->devs[slot].addr - + conf->mirrors[disk].head_position); + if (new_distance < best_dist) { + best_dist = new_distance; + best_slot = slot; + best_rdev = rdev; } } + if (slot >= conf->copies) { + slot = best_slot; + rdev = best_rdev; + } -rb_out: - r10_bio->read_slot = slot; -/* conf->next_seq_sect = this_sector + sectors;*/ - - if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL) - atomic_inc(&conf->mirrors[disk].rdev->nr_pending); - else - disk = -1; - rcu_read_unlock(); - - return disk; -} - -static void unplug_slaves(mddev_t *mddev) -{ - conf_t *conf = mddev_to_conf(mddev); - int i; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - struct request_queue *r_queue = bdev_get_queue(rdev->bdev); - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - blk_unplug(r_queue); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); + if (slot >= 0) { + atomic_inc(&rdev->nr_pending); + if (test_bit(Faulty, &rdev->flags)) { + /* Cannot risk returning a device that failed + * before we inc'ed nr_pending + */ + rdev_dec_pending(rdev, conf->mddev); + goto retry; } - } + r10_bio->read_slot = slot; + } else + rdev = NULL; rcu_read_unlock(); -} - -static void raid10_unplug(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; + *max_sectors = best_good_sectors; - unplug_slaves(q->queuedata); - md_wakeup_thread(mddev->thread); + return rdev; } -static int raid10_congested(void *data, int bits) +int md_raid10_congested(struct mddev *mddev, int bits) { - mddev_t *mddev = data; - conf_t *conf = mddev_to_conf(mddev); + struct r10conf *conf = mddev->private; int i, ret = 0; + if ((bits & (1 << BDI_async_congested)) && + conf->pending_count >= max_queued_requests) + return 1; + rcu_read_lock(); - for (i = 0; i < mddev->raid_disks && ret == 0; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); + for (i = 0; + (i < conf->geo.raid_disks || i < conf->prev.raid_disks) + && ret == 0; + i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -640,37 +935,48 @@ static int raid10_congested(void *data, int bits) rcu_read_unlock(); return ret; } +EXPORT_SYMBOL_GPL(md_raid10_congested); + +static int raid10_congested(void *data, int bits) +{ + struct mddev *mddev = data; + + return mddev_congested(mddev, bits) || + md_raid10_congested(mddev, bits); +} -static int flush_pending_writes(conf_t *conf) +static void flush_pending_writes(struct r10conf *conf) { /* Any writes that have been queued but are awaiting * bitmap updates get flushed here. - * We return 1 if any requests were actually submitted. 
*/ - int rv = 0; - spin_lock_irq(&conf->device_lock); if (conf->pending_bio_list.head) { struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); - blk_remove_plug(conf->mddev->queue); + conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to disk * before proceeding w/ I/O */ bitmap_unplug(conf->mddev->bitmap); + wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } - rv = 1; } else spin_unlock_irq(&conf->device_lock); - return rv; } + /* Barriers.... * Sometimes we need to suspend IO while we do something else, * either some resync/recovery, or reconfigure the array. @@ -693,29 +999,27 @@ static int flush_pending_writes(conf_t *conf) * lower_barrier when the particular background IO completes. */ -static void raise_barrier(conf_t *conf, int force) +static void raise_barrier(struct r10conf *conf, int force) { BUG_ON(force && !conf->barrier); spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting (unless 'force') */ wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, - conf->resync_lock, - raid10_unplug(conf->mddev->queue)); + conf->resync_lock); /* block any new IO from starting */ conf->barrier++; - /* No wait for all pending IO to complete */ + /* Now wait for all pending IO to complete */ wait_event_lock_irq(conf->wait_barrier, !conf->nr_pending && conf->barrier < RESYNC_DEPTH, - conf->resync_lock, - raid10_unplug(conf->mddev->queue)); + conf->resync_lock); spin_unlock_irq(&conf->resync_lock); } -static void lower_barrier(conf_t *conf) +static void lower_barrier(struct r10conf *conf) { unsigned long flags; spin_lock_irqsave(&conf->resync_lock, flags); @@ -724,21 +1028,33 @@ static void lower_barrier(conf_t *conf) wake_up(&conf->wait_barrier); } -static void wait_barrier(conf_t *conf) +static void wait_barrier(struct r10conf *conf) { spin_lock_irq(&conf->resync_lock); if (conf->barrier) { conf->nr_waiting++; - wait_event_lock_irq(conf->wait_barrier, !conf->barrier, - conf->resync_lock, - raid10_unplug(conf->mddev->queue)); + /* Wait for the barrier to drop. + * However if there are already pending + * requests (preventing the barrier from + * rising completely), and the + * pre-process bio queue isn't empty, + * then don't wait, as we need to empty + * that queue to get the nr_pending + * count down. + */ + wait_event_lock_irq(conf->wait_barrier, + !conf->barrier || + (conf->nr_pending && + current->bio_list && + !bio_list_empty(current->bio_list)), + conf->resync_lock); conf->nr_waiting--; } conf->nr_pending++; spin_unlock_irq(&conf->resync_lock); } -static void allow_barrier(conf_t *conf) +static void allow_barrier(struct r10conf *conf) { unsigned long flags; spin_lock_irqsave(&conf->resync_lock, flags); @@ -747,32 +1063,32 @@ static void allow_barrier(conf_t *conf) wake_up(&conf->wait_barrier); } -static void freeze_array(conf_t *conf) +static void freeze_array(struct r10conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quiet. * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+1 + * wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. 
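The barrier scheme described above (new normal I/O is held off while a resync raises the barrier, and the resync in turn waits for in-flight I/O to drain) can be pictured with ordinary pthread primitives. This is a simplified model only, assuming a hypothetical struct barrier_box; it deliberately omits nr_waiting, RESYNC_DEPTH and the freeze_array() refinement discussed in the surrounding comments:

#include <pthread.h>

struct barrier_box {
    pthread_mutex_t lock;
    pthread_cond_t  wait;
    int barrier;        /* a resync/reshape wants exclusive access */
    int nr_pending;     /* normal I/O currently in flight */
};

static void raise_barrier(struct barrier_box *b)
{
    pthread_mutex_lock(&b->lock);
    b->barrier++;                            /* block new normal I/O */
    while (b->nr_pending)                    /* drain what is in flight */
        pthread_cond_wait(&b->wait, &b->lock);
    pthread_mutex_unlock(&b->lock);
}

static void lower_barrier(struct barrier_box *b)
{
    pthread_mutex_lock(&b->lock);
    b->barrier--;
    pthread_cond_broadcast(&b->wait);        /* let normal I/O resume */
    pthread_mutex_unlock(&b->lock);
}

static void wait_barrier(struct barrier_box *b)   /* before normal I/O */
{
    pthread_mutex_lock(&b->lock);
    while (b->barrier)
        pthread_cond_wait(&b->wait, &b->lock);
    b->nr_pending++;
    pthread_mutex_unlock(&b->lock);
}

static void allow_barrier(struct barrier_box *b)  /* after normal I/O */
{
    pthread_mutex_lock(&b->lock);
    if (--b->nr_pending == 0)
        pthread_cond_broadcast(&b->wait);    /* wake a waiting resync */
    pthread_mutex_unlock(&b->lock);
}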
Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (1) + * Thus the number queued (nr_queued) plus this request (extra) * must match the number of pending IOs (nr_pending) before * we continue. */ spin_lock_irq(&conf->resync_lock); conf->barrier++; conf->nr_waiting++; - wait_event_lock_irq(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+1, - conf->resync_lock, - ({ flush_pending_writes(conf); - raid10_unplug(conf->mddev->queue); })); + wait_event_lock_irq_cmd(conf->wait_barrier, + conf->nr_pending == conf->nr_queued+extra, + conf->resync_lock, + flush_pending_writes(conf)); + spin_unlock_irq(&conf->resync_lock); } -static void unfreeze_array(conf_t *conf) +static void unfreeze_array(struct r10conf *conf) { /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); @@ -782,59 +1098,79 @@ static void unfreeze_array(conf_t *conf) spin_unlock_irq(&conf->resync_lock); } -static int make_request(struct request_queue *q, struct bio * bio) +static sector_t choose_data_offset(struct r10bio *r10_bio, + struct md_rdev *rdev) { - mddev_t *mddev = q->queuedata; - conf_t *conf = mddev_to_conf(mddev); - mirror_info_t *mirror; - r10bio_t *r10_bio; - struct bio *read_bio; - int i; - int chunk_sects = conf->chunk_mask + 1; - const int rw = bio_data_dir(bio); - const int do_sync = bio_sync(bio); - struct bio_list bl; - unsigned long flags; - mdk_rdev_t *blocked_rdev; + if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || + test_bit(R10BIO_Previous, &r10_bio->state)) + return rdev->data_offset; + else + return rdev->new_data_offset; +} - if (unlikely(bio_barrier(bio))) { - bio_endio(bio, -EOPNOTSUPP); - return 0; +struct raid10_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r10conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule || current->bio_list) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_barrier); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; } - /* If this request crosses a chunk boundary, we need to - * split it. This will only happen for 1 PAGE (or less) requests. - */ - if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) - > chunk_sects && - conf->near_copies < conf->raid_disks)) { - struct bio_pair *bp; - /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || - bio->bi_idx != 0) - goto bad_map; - /* This is a one page bio that upper layers - * refuse to split for us, so we need to split it. - */ - bp = bio_split(bio, bio_split_pool, - chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); - if (make_request(q, &bp->bio1)) - generic_make_request(&bp->bio1); - if (make_request(q, &bp->bio2)) - generic_make_request(&bp->bio2); - - bio_pair_release(bp); - return 0; - bad_map: - printk("raid10_make_request bug: can't convert block across chunks" - " or bigger than %dk %llu %d\n", chunk_sects/2, - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + /* we aren't scheduling, so we can do the write-out directly. 
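The plugging path above collects writes on a per-submitter list and later flushes them in one batch, either directly or by splicing them onto the shared pending list for the md thread. A toy model of that batching, with struct req and submit_req() as stand-ins for struct bio and generic_make_request() (illustrative only):

#include <stddef.h>

struct req {
    struct req *next;
    /* payload omitted */
};

struct plug {
    struct req *head, *tail;
    int count;
};

static void plug_add(struct plug *p, struct req *r)
{
    r->next = NULL;
    if (p->tail)
        p->tail->next = r;
    else
        p->head = r;
    p->tail = r;
    p->count++;
}

static void plug_flush(struct plug *p, void (*submit_req)(struct req *))
{
    struct req *r = p->head;

    p->head = p->tail = NULL;
    p->count = 0;
    while (r) {                       /* submit in arrival order */
        struct req *next = r->next;
        r->next = NULL;
        submit_req(r);
        r = next;
    }
}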
*/ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); - bio_io_error(bio); - return 0; + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); + bio = next; } + kfree(plug); +} - md_write_start(mddev, bio); +static void __make_request(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; + struct bio *read_bio; + int i; + const int rw = bio_data_dir(bio); + const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); + const unsigned long do_fua = (bio->bi_rw & REQ_FUA); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); + const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); + unsigned long flags; + struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid10_plug_cb *plug = NULL; + int sectors_handled; + int max_sectors; + int sectors; /* * Register the new request and wait if the reconstruction @@ -843,70 +1179,227 @@ static int make_request(struct request_queue *q, struct bio * bio) */ wait_barrier(conf); - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); + sectors = bio_sectors(bio); + while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + bio->bi_iter.bi_sector < conf->reshape_progress && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { + /* IO spans the reshape position. Need to wait for + * reshape to pass + */ + allow_barrier(conf); + wait_event(conf->wait_barrier, + conf->reshape_progress <= bio->bi_iter.bi_sector || + conf->reshape_progress >= bio->bi_iter.bi_sector + + sectors); + wait_barrier(conf); + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + bio_data_dir(bio) == WRITE && + (mddev->reshape_backwards + ? (bio->bi_iter.bi_sector < conf->reshape_safe && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) + : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe && + bio->bi_iter.bi_sector < conf->reshape_progress))) { + /* Need to update reshape_position in metadata */ + mddev->reshape_position = conf->reshape_progress; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + + conf->reshape_safe = mddev->reshape_position; + } r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; - r10_bio->sectors = bio->bi_size >> 9; + r10_bio->sectors = sectors; r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_sector; + r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; + /* We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r10_bio and no locking + * will be needed when the request completes. If it is + * non-zero, then it is the number of not-completed requests. 
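The bookkeeping described in the comment above amounts to a small completion counter: zero means "exactly one sub-request, no locking needed", otherwise the field holds the number of sub-requests still outstanding. A sketch of that convention (struct master_req is hypothetical; the real counter lives in bio->bi_phys_segments and is updated under device_lock):

struct master_req {
    int segments;       /* 0, or number of outstanding sub-requests */
};

/* Called when the request is first split into an extra piece. */
static void note_extra_piece(struct master_req *m)
{
    if (m->segments == 0)
        m->segments = 2;        /* the original piece plus the new one */
    else
        m->segments++;
}

/* Called as each piece completes; returns 1 when the whole request is done. */
static int piece_done(struct master_req *m)
{
    if (m->segments == 0)
        return 1;               /* it was never split */
    return --m->segments == 0;
}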
+ */ + bio->bi_phys_segments = 0; + clear_bit(BIO_SEG_VALID, &bio->bi_flags); + if (rw == READ) { /* * read balancing logic: */ - int disk = read_balance(conf, r10_bio); - int slot = r10_bio->read_slot; - if (disk < 0) { + struct md_rdev *rdev; + int slot; + +read_again: + rdev = read_balance(conf, r10_bio, &max_sectors); + if (!rdev) { raid_end_bio_io(r10_bio); - return 0; + return; } - mirror = conf->mirrors + disk; + slot = r10_bio->read_slot; - read_bio = bio_clone(bio, GFP_NOIO); + read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, + max_sectors); r10_bio->devs[slot].bio = read_bio; + r10_bio->devs[slot].rdev = rdev; - read_bio->bi_sector = r10_bio->devs[slot].addr + - mirror->rdev->data_offset; - read_bio->bi_bdev = mirror->rdev->bdev; + read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + + choose_data_offset(r10_bio, rdev); + read_bio->bi_bdev = rdev->bdev; read_bio->bi_end_io = raid10_end_read_request; read_bio->bi_rw = READ | do_sync; read_bio->bi_private = r10_bio; - generic_make_request(read_bio); - return 0; + if (max_sectors < r10_bio->sectors) { + /* Could not read all from this device, so we will + * need another r10_bio. + */ + sectors_handled = (r10_bio->sector + max_sectors + - bio->bi_iter.bi_sector); + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + /* Cannot call generic_make_request directly + * as that will be queued in __generic_make_request + * and subsequent mempool_alloc might block + * waiting for it. so hand bio over to raid10d. + */ + reschedule_retry(r10_bio); + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = bio_sectors(bio) - sectors_handled; + r10_bio->state = 0; + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_iter.bi_sector + + sectors_handled; + goto read_again; + } else + generic_make_request(read_bio); + return; } /* * WRITE: */ + if (conf->pending_count >= max_queued_requests) { + md_wakeup_thread(mddev->thread); + wait_event(conf->wait_barrier, + conf->pending_count < max_queued_requests); + } /* first select target devices under rcu_lock and * inc refcount on their rdev. Record them by setting * bios[x] to bio + * If there are known/acknowledged bad blocks on any device + * on which we have seen a write error, we want to avoid + * writing to those blocks. This potentially requires several + * writes to write around the bad blocks. Each set of writes + * gets its own r10_bio with a set of bios attached. The number + * of r10_bios is recored in bio->bi_phys_segments just as with + * the read case. 
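The write path below shortens or skips writes that would touch a known bad range. The interval logic can be shown in isolation; clamp_around_badblock() is a hypothetical helper that only loosely follows how first_bad/bad_sectors are used, not the kernel's is_badblock():

/* Returns 0 if the write may not start at dev_sector on this device,
 * 1 if it may proceed (possibly with *max_sectors reduced so it ends
 * before the bad range).
 */
static int clamp_around_badblock(long long dev_sector, int *max_sectors,
                                 long long first_bad, int bad_sectors)
{
    long long bad_end = first_bad + bad_sectors;

    if (bad_end <= dev_sector || first_bad >= dev_sector + *max_sectors)
        return 1;                         /* no overlap: write it all */
    if (first_bad <= dev_sector)
        return 0;                         /* starts inside the bad range */
    *max_sectors = (int)(first_bad - dev_sector);   /* stop before it */
    return 1;
}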
*/ + + r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); - retry_write: +retry_write: blocked_rdev = NULL; rcu_read_lock(); + max_sectors = r10_bio->sectors; + for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; - mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); + struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); + struct md_rdev *rrdev = rcu_dereference( + conf->mirrors[d].replacement); + if (rdev == rrdev) + rrdev = NULL; if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { atomic_inc(&rdev->nr_pending); blocked_rdev = rdev; break; } - if (rdev && !test_bit(Faulty, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - r10_bio->devs[i].bio = bio; - } else { - r10_bio->devs[i].bio = NULL; + if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } + if (rdev && (test_bit(Faulty, &rdev->flags) + || test_bit(Unmerged, &rdev->flags))) + rdev = NULL; + if (rrdev && (test_bit(Faulty, &rrdev->flags) + || test_bit(Unmerged, &rrdev->flags))) + rrdev = NULL; + + r10_bio->devs[i].bio = NULL; + r10_bio->devs[i].repl_bio = NULL; + + if (!rdev && !rrdev) { set_bit(R10BIO_Degraded, &r10_bio->state); + continue; + } + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + sector_t dev_sector = r10_bio->devs[i].addr; + int bad_sectors; + int is_bad; + + is_bad = is_badblock(rdev, dev_sector, + max_sectors, + &first_bad, &bad_sectors); + if (is_bad < 0) { + /* Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } + if (is_bad && first_bad <= dev_sector) { + /* Cannot write here at all */ + bad_sectors -= (dev_sector - first_bad); + if (bad_sectors < max_sectors) + /* Mustn't write more than bad_sectors + * to other devices yet + */ + max_sectors = bad_sectors; + /* We don't set R10BIO_Degraded as that + * only applies if the disk is missing, + * so it might be re-added, and we want to + * know to recover this chunk. + * In this case the device is here, and the + * fact that this chunk is not in-sync is + * recorded in the bad block log. + */ + continue; + } + if (is_bad) { + int good_sectors = first_bad - dev_sector; + if (good_sectors < max_sectors) + max_sectors = good_sectors; + } + } + if (rdev) { + r10_bio->devs[i].bio = bio; + atomic_inc(&rdev->nr_pending); + } + if (rrdev) { + r10_bio->devs[i].repl_bio = bio; + atomic_inc(&rrdev->nr_pending); } } rcu_read_unlock(); @@ -916,90 +1409,265 @@ static int make_request(struct request_queue *q, struct bio * bio) int j; int d; - for (j = 0; j < i; j++) + for (j = 0; j < i; j++) { if (r10_bio->devs[j].bio) { d = r10_bio->devs[j].devnum; rdev_dec_pending(conf->mirrors[d].rdev, mddev); } + if (r10_bio->devs[j].repl_bio) { + struct md_rdev *rdev; + d = r10_bio->devs[j].devnum; + rdev = conf->mirrors[d].replacement; + if (!rdev) { + /* Race with remove_disk */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + rdev_dec_pending(rdev, mddev); + } + } allow_barrier(conf); md_wait_for_blocked_rdev(blocked_rdev, mddev); wait_barrier(conf); goto retry_write; } - atomic_set(&r10_bio->remaining, 0); + if (max_sectors < r10_bio->sectors) { + /* We are splitting this into multiple parts, so + * we need to prepare for allocating another r10_bio. 
+ */ + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + } + sectors_handled = r10_bio->sector + max_sectors - + bio->bi_iter.bi_sector; + + atomic_set(&r10_bio->remaining, 1); + bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); - bio_list_init(&bl); for (i = 0; i < conf->copies; i++) { struct bio *mbio; int d = r10_bio->devs[i].devnum; - if (!r10_bio->devs[i].bio) - continue; + if (r10_bio->devs[i].bio) { + struct md_rdev *rdev = conf->mirrors[d].rdev; + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, + max_sectors); + r10_bio->devs[i].bio = mbio; + + mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ + choose_data_offset(r10_bio, + rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = + WRITE | do_sync | do_fua | do_discard | do_same; + mbio->bi_private = r10_bio; + + atomic_inc(&r10_bio->remaining); - mbio = bio_clone(bio, GFP_NOIO); - r10_bio->devs[i].bio = mbio; + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; + spin_lock_irqsave(&conf->device_lock, flags); + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!plug) + md_wakeup_thread(mddev->thread); + } - mbio->bi_sector = r10_bio->devs[i].addr+ - conf->mirrors[d].rdev->data_offset; - mbio->bi_bdev = conf->mirrors[d].rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync; - mbio->bi_private = r10_bio; + if (r10_bio->devs[i].repl_bio) { + struct md_rdev *rdev = conf->mirrors[d].replacement; + if (rdev == NULL) { + /* Replacement just got moved to main 'rdev' */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, + max_sectors); + r10_bio->devs[i].repl_bio = mbio; + + mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr + + choose_data_offset( + r10_bio, rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = + WRITE | do_sync | do_fua | do_discard | do_same; + mbio->bi_private = r10_bio; - atomic_inc(&r10_bio->remaining); - bio_list_add(&bl, mbio); + atomic_inc(&r10_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); + } } - if (unlikely(!atomic_read(&r10_bio->remaining))) { - /* the array is dead */ - md_write_end(mddev); - raid_end_bio_io(r10_bio); - return 0; + /* Don't remove the bias on 'remaining' (one_write_done) until + * after checking if we need to go around again. + */ + + if (sectors_handled < bio_sectors(bio)) { + one_write_done(r10_bio); + /* We need another r10_bio. It has already been counted + * in bio->bi_phys_segments. 
+ */ + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = bio_sectors(bio) - sectors_handled; + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + r10_bio->state = 0; + goto retry_write; } + one_write_done(r10_bio); +} - bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_merge(&conf->pending_bio_list, &bl); - blk_plug_device(mddev->queue); - spin_unlock_irqrestore(&conf->device_lock, flags); +static void make_request(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); + int chunk_sects = chunk_mask + 1; - /* In case raid10d snuck in to freeze_array */ - wake_up(&conf->wait_barrier); + struct bio *split; - if (do_sync) - md_wakeup_thread(mddev->thread); + if (unlikely(bio->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bio); + return; + } - return 0; + md_write_start(mddev, bio); + + + do { + + /* + * If this request crosses a chunk boundary, we need to split + * it. + */ + if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + + bio_sectors(bio) > chunk_sects + && (conf->geo.near_copies < conf->geo.raid_disks + || conf->prev.near_copies < + conf->prev.raid_disks))) { + split = bio_split(bio, chunk_sects - + (bio->bi_iter.bi_sector & + (chunk_sects - 1)), + GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } + + __make_request(mddev, split); + } while (split != bio); + + /* In case raid10d snuck in to freeze_array */ + wake_up(&conf->wait_barrier); } -static void status(struct seq_file *seq, mddev_t *mddev) +static void status(struct seq_file *seq, struct mddev *mddev) { - conf_t *conf = mddev_to_conf(mddev); + struct r10conf *conf = mddev->private; int i; - if (conf->near_copies < conf->raid_disks) - seq_printf(seq, " %dK chunks", mddev->chunk_size/1024); - if (conf->near_copies > 1) - seq_printf(seq, " %d near-copies", conf->near_copies); - if (conf->far_copies > 1) { - if (conf->far_offset) - seq_printf(seq, " %d offset-copies", conf->far_copies); + if (conf->geo.near_copies < conf->geo.raid_disks) + seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); + if (conf->geo.near_copies > 1) + seq_printf(seq, " %d near-copies", conf->geo.near_copies); + if (conf->geo.far_copies > 1) { + if (conf->geo.far_offset) + seq_printf(seq, " %d offset-copies", conf->geo.far_copies); else - seq_printf(seq, " %d far-copies", conf->far_copies); + seq_printf(seq, " %d far-copies", conf->geo.far_copies); } - seq_printf(seq, " [%d/%d] [", conf->raid_disks, - conf->raid_disks - mddev->degraded); - for (i = 0; i < conf->raid_disks; i++) + seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, + conf->geo.raid_disks - mddev->degraded); + for (i = 0; i < conf->geo.raid_disks; i++) seq_printf(seq, "%s", conf->mirrors[i].rdev && test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); seq_printf(seq, "]"); } -static void error(mddev_t *mddev, mdk_rdev_t *rdev) +/* check if there are enough drives for + * every block to appear on atleast one. + * Don't consider the device numbered 'ignore' + * as we might be about to remove it. 
+ */ +static int _enough(struct r10conf *conf, int previous, int ignore) +{ + int first = 0; + int has_enough = 0; + int disks, ncopies; + if (previous) { + disks = conf->prev.raid_disks; + ncopies = conf->prev.near_copies; + } else { + disks = conf->geo.raid_disks; + ncopies = conf->geo.near_copies; + } + + rcu_read_lock(); + do { + int n = conf->copies; + int cnt = 0; + int this = first; + while (n--) { + struct md_rdev *rdev; + if (this != ignore && + (rdev = rcu_dereference(conf->mirrors[this].rdev)) && + test_bit(In_sync, &rdev->flags)) + cnt++; + this = (this+1) % disks; + } + if (cnt == 0) + goto out; + first = (first + ncopies) % disks; + } while (first != 0); + has_enough = 1; +out: + rcu_read_unlock(); + return has_enough; +} + +static int enough(struct r10conf *conf, int ignore) +{ + /* when calling 'enough', both 'prev' and 'geo' must + * be stable. + * This is ensured if ->reconfig_mutex or ->device_lock + * is held. + */ + return _enough(conf, 0, ignore) && + _enough(conf, 1, ignore); +} + +static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; - conf_t *conf = mddev_to_conf(mddev); + struct r10conf *conf = mddev->private; + unsigned long flags; /* * If it is not operational, then we have already marked it as dead @@ -1007,58 +1675,58 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) * next level up know. * else mark the drive as failed */ + spin_lock_irqsave(&conf->device_lock, flags); if (test_bit(In_sync, &rdev->flags) - && conf->raid_disks-mddev->degraded == 1) + && !enough(conf, rdev->raid_disk)) { /* * Don't fail the drive, just return an IO error. - * The test should really be more sophisticated than - * "working_disks == 1", but it isn't critical, and - * can wait until we do more sophisticated "is the drive - * really dead" tests... */ + spin_unlock_irqrestore(&conf->device_lock, flags); return; + } if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; - spin_unlock_irqrestore(&conf->device_lock, flags); - /* + /* * if recovery is running, make sure it aborts. 
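The _enough() test introduced above can be pictured as walking the drives in consecutive groups and requiring a working, in-sync member in each group, optionally ignoring one disk that is about to be removed. A user-space sketch for the common case where every block has exactly 'ncopies' consecutive copies (enough_copies() and the in_sync[] array are illustrative stand-ins, not kernel structures):

static int enough_copies(const int *in_sync, int disks, int ncopies,
                         int ignore)
{
    int first = 0;

    do {
        int cnt = 0;
        int this = first;

        for (int n = 0; n < ncopies; n++) {
            if (this != ignore && in_sync[this])
                cnt++;
            this = (this + 1) % disks;
        }
        if (cnt == 0)
            return 0;           /* a whole copy-set has failed */
        first = (first + ncopies) % disks;
    } while (first != 0);

    return 1;
}

For example, with disks = 4 and ncopies = 2 the groups are {0,1} and {2,3}: losing disks 0 and 2 still passes, while losing disks 0 and 1 (both members of one group) does not.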
*/ set_bit(MD_RECOVERY_INTR, &mddev->recovery); } + set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" - "raid10: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); + spin_unlock_irqrestore(&conf->device_lock, flags); + printk(KERN_ALERT + "md/raid10:%s: Disk failure on %s, disabling device.\n" + "md/raid10:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->geo.raid_disks - mddev->degraded); } -static void print_conf(conf_t *conf) +static void print_conf(struct r10conf *conf) { int i; - mirror_info_t *tmp; + struct raid10_info *tmp; - printk("RAID10 conf printout:\n"); + printk(KERN_DEBUG "RAID10 conf printout:\n"); if (!conf) { - printk("(!conf)\n"); + printk(KERN_DEBUG "(!conf)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, - conf->raid_disks); + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, + conf->geo.raid_disks); - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->geo.raid_disks; i++) { char b[BDEVNAME_SIZE]; tmp = conf->mirrors + i; if (tmp->rdev) - printk(" disk %d, wo:%d, o:%d, dev:%s\n", + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev,b)); } } -static void close_sync(conf_t *conf) +static void close_sync(struct r10conf *conf) { wait_barrier(conf); allow_barrier(conf); @@ -1067,137 +1735,195 @@ static void close_sync(conf_t *conf) conf->r10buf_pool = NULL; } -/* check if there are enough drives for - * every block to appear on atleast one - */ -static int enough(conf_t *conf) -{ - int first = 0; - - do { - int n = conf->copies; - int cnt = 0; - while (n--) { - if (conf->mirrors[first].rdev) - cnt++; - first = (first+1) % conf->raid_disks; - } - if (cnt == 0) - return 0; - } while (first != 0); - return 1; -} - -static int raid10_spare_active(mddev_t *mddev) +static int raid10_spare_active(struct mddev *mddev) { int i; - conf_t *conf = mddev->private; - mirror_info_t *tmp; + struct r10conf *conf = mddev->private; + struct raid10_info *tmp; + int count = 0; + unsigned long flags; /* * Find all non-in_sync disks within the RAID10 configuration * and mark them in_sync */ - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; i < conf->geo.raid_disks; i++) { tmp = conf->mirrors + i; - if (tmp->rdev - && !test_bit(Faulty, &tmp->rdev->flags) - && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded--; - spin_unlock_irqrestore(&conf->device_lock, flags); + if (tmp->replacement + && tmp->replacement->recovery_offset == MaxSector + && !test_bit(Faulty, &tmp->replacement->flags) + && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { + /* Replacement has just become active */ + if (!tmp->rdev + || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) + count++; + if (tmp->rdev) { + /* Replaced device not technically faulty, + * but we need to be sure it gets removed + * and never re-added. 
+ */ + set_bit(Faulty, &tmp->rdev->flags); + sysfs_notify_dirent_safe( + tmp->rdev->sysfs_state); + } + sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); + } else if (tmp->rdev + && tmp->rdev->recovery_offset == MaxSector + && !test_bit(Faulty, &tmp->rdev->flags) + && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { + count++; + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded -= count; + spin_unlock_irqrestore(&conf->device_lock, flags); print_conf(conf); - return 0; + return count; } -static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) { - conf_t *conf = mddev->private; + struct r10conf *conf = mddev->private; int err = -EEXIST; int mirror; - mirror_info_t *p; int first = 0; - int last = mddev->raid_disks - 1; + int last = conf->geo.raid_disks - 1; + struct request_queue *q = bdev_get_queue(rdev->bdev); if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ return -EBUSY; - if (!enough(conf)) + if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) return -EINVAL; - if (rdev->raid_disk) + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; - if (rdev->saved_raid_disk >= 0 && - rdev->saved_raid_disk >= first && + if (q->merge_bvec_fn) { + set_bit(Unmerged, &rdev->flags); + mddev->merge_check_needed = 1; + } + + if (rdev->saved_raid_disk >= first && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else mirror = first; - for ( ; mirror <= last ; mirror++) - if ( !(p=conf->mirrors+mirror)->rdev) { - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); - - p->head_position = 0; + for ( ; mirror <= last ; mirror++) { + struct raid10_info *p = &conf->mirrors[mirror]; + if (p->recovery_disabled == mddev->recovery_disabled) + continue; + if (p->rdev) { + if (!test_bit(WantReplacement, &p->rdev->flags) || + p->replacement != NULL) + continue; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); rdev->raid_disk = mirror; err = 0; - if (rdev->saved_raid_disk != mirror) - conf->fullsync = 1; - rcu_assign_pointer(p->rdev, rdev); + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); break; } + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + p->head_position = 0; + p->recovery_disabled = mddev->recovery_disabled - 1; + rdev->raid_disk = mirror; + err = 0; + if (rdev->saved_raid_disk != mirror) + conf->fullsync = 1; + rcu_assign_pointer(p->rdev, rdev); + break; + } + if (err == 0 && test_bit(Unmerged, &rdev->flags)) { + /* Some requests might not have seen this new + * merge_bvec_fn. We must wait for them to complete + * before merging the device fully. + * First we make sure any code which has tested + * our function has submitted the request, then + * we wait for all outstanding requests to complete. 
+ */ + synchronize_sched(); + freeze_array(conf, 0); + unfreeze_array(conf); + clear_bit(Unmerged, &rdev->flags); + } + md_integrity_add_rdev(rdev, mddev); + if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + print_conf(conf); return err; } -static int raid10_remove_disk(mddev_t *mddev, int number) +static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) { - conf_t *conf = mddev->private; + struct r10conf *conf = mddev->private; int err = 0; - mdk_rdev_t *rdev; - mirror_info_t *p = conf->mirrors+ number; + int number = rdev->raid_disk; + struct md_rdev **rdevp; + struct raid10_info *p = conf->mirrors + number; print_conf(conf); - rdev = p->rdev; - if (rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - err = -EBUSY; - goto abort; - } - /* Only remove faulty devices in recovery - * is not possible. - */ - if (!test_bit(Faulty, &rdev->flags) && - enough(conf)) { - err = -EBUSY; - goto abort; - } - p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - } + if (rdev == p->rdev) + rdevp = &p->rdev; + else if (rdev == p->replacement) + rdevp = &p->replacement; + else + return 0; + + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; } + /* Only remove faulty devices if recovery + * is not possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != p->recovery_disabled && + (!p->replacement || p->replacement == rdev) && + number < conf->geo.raid_disks && + enough(conf, -1)) { + err = -EBUSY; + goto abort; + } + *rdevp = NULL; + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + goto abort; + } else if (p->replacement) { + /* We must have just cleared 'rdev' */ + p->rdev = p->replacement; + clear_bit(Replacement, &p->replacement->flags); + smp_mb(); /* Make sure other CPUs may see both as identical + * but will never see neither -- if they are careful. + */ + p->replacement = NULL; + clear_bit(WantReplacement, &rdev->flags); + } else + /* We might have just remove the Replacement as faulty + * Clear the flag just in case + */ + clear_bit(WantReplacement, &rdev->flags); + + err = md_integrity_register(mddev); + abort: print_conf(conf); @@ -1207,30 +1933,29 @@ abort: static void end_sync_read(struct bio *bio, int error) { - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); - conf_t *conf = mddev_to_conf(r10_bio->mddev); - int i,d; + struct r10bio *r10_bio = bio->bi_private; + struct r10conf *conf = r10_bio->mddev->private; + int d; - for (i=0; i<conf->copies; i++) - if (r10_bio->devs[i].bio == bio) - break; - BUG_ON(i == conf->copies); - update_head_pos(i, r10_bio); - d = r10_bio->devs[i].devnum; + if (bio == r10_bio->master_bio) { + /* this is a reshape read */ + d = r10_bio->read_slot; /* really the read dev */ + } else + d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); if (test_bit(BIO_UPTODATE, &bio->bi_flags)) set_bit(R10BIO_Uptodate, &r10_bio->state); - else { + else + /* The write handler will notice the lack of + * R10BIO_Uptodate and record any errors etc + */ atomic_add(r10_bio->sectors, &conf->mirrors[d].rdev->corrected_errors); - if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) - md_error(r10_bio->mddev, - conf->mirrors[d].rdev); - } /* for reconstruct, we always reschedule after a read. 
* for resync, only after all reads */ + rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); if (test_bit(R10BIO_IsRecover, &r10_bio->state) || atomic_dec_and_test(&r10_bio->remaining)) { /* we have read all the blocks, @@ -1238,40 +1963,73 @@ static void end_sync_read(struct bio *bio, int error) */ reschedule_retry(r10_bio); } - rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); } -static void end_sync_write(struct bio *bio, int error) +static void end_sync_request(struct r10bio *r10_bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); - mddev_t *mddev = r10_bio->mddev; - conf_t *conf = mddev_to_conf(mddev); - int i,d; - - for (i = 0; i < conf->copies; i++) - if (r10_bio->devs[i].bio == bio) - break; - d = r10_bio->devs[i].devnum; - - if (!uptodate) - md_error(mddev, conf->mirrors[d].rdev); - - update_head_pos(i, r10_bio); + struct mddev *mddev = r10_bio->mddev; while (atomic_dec_and_test(&r10_bio->remaining)) { if (r10_bio->master_bio == NULL) { /* the primary of several recovery bios */ - md_done_sync(mddev, r10_bio->sectors, 1); - put_buf(r10_bio); + sector_t s = r10_bio->sectors; + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else + put_buf(r10_bio); + md_done_sync(mddev, s, 1); break; } else { - r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio; - put_buf(r10_bio); + struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + reschedule_retry(r10_bio); + else + put_buf(r10_bio); r10_bio = r10_bio2; } } - rdev_dec_pending(conf->mirrors[d].rdev, mddev); +} + +static void end_sync_write(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct r10bio *r10_bio = bio->bi_private; + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + int d; + sector_t first_bad; + int bad_sectors; + int slot; + int repl; + struct md_rdev *rdev = NULL; + + d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[d].replacement; + else + rdev = conf->mirrors[d].rdev; + + if (!uptodate) { + if (repl) + md_error(mddev, rdev); + else { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + set_bit(R10BIO_WriteError, &r10_bio->state); + } + } else if (is_badblock(rdev, + r10_bio->devs[slot].addr, + r10_bio->sectors, + &first_bad, &bad_sectors)) + set_bit(R10BIO_MadeGood, &r10_bio->state); + + rdev_dec_pending(rdev, mddev); + + end_sync_request(r10_bio); } /* @@ -1290,11 +2048,12 @@ static void end_sync_write(struct bio *bio, int error) * We check if all blocks are in-sync and only write to blocks that * aren't in sync */ -static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) +static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) { - conf_t *conf = mddev_to_conf(mddev); + struct r10conf *conf = mddev->private; int i, first; struct bio *tbio, *fbio; + int vcnt; atomic_set(&r10_bio->remaining, 1); @@ -1309,10 +2068,10 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) first = i; fbio = r10_bio->devs[i].bio; + vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); /* now find blocks with errors */ for (i=0 ; i < conf->copies ; i++) { int j, d; - int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); tbio 
= r10_bio->devs[i].bio; @@ -1325,35 +2084,36 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) * both 'first' and 'i', so we just compare them. * All vec entries are PAGE_SIZE; */ - for (j = 0; j < vcnt; j++) + int sectors = r10_bio->sectors; + for (j = 0; j < vcnt; j++) { + int len = PAGE_SIZE; + if (sectors < (len / 512)) + len = sectors * 512; if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), page_address(tbio->bi_io_vec[j].bv_page), - PAGE_SIZE)) + len)) break; + sectors -= len/512; + } if (j == vcnt) continue; - mddev->resync_mismatches += r10_bio->sectors; + atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); + if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + /* Don't fix anything. */ + continue; } - if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) - /* Don't fix anything. */ - continue; - /* Ok, we need to write this bio + /* Ok, we need to write this bio, either to correct an + * inconsistency or to correct an unreadable block. * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these */ + bio_reset(tbio); + tbio->bi_vcnt = vcnt; - tbio->bi_size = r10_bio->sectors << 9; - tbio->bi_idx = 0; - tbio->bi_phys_segments = 0; - tbio->bi_hw_segments = 0; - tbio->bi_hw_front_size = 0; - tbio->bi_hw_back_size = 0; - tbio->bi_flags &= ~(BIO_POOL_MASK - 1); - tbio->bi_flags |= 1 << BIO_UPTODATE; - tbio->bi_next = NULL; + tbio->bi_iter.bi_size = r10_bio->sectors << 9; tbio->bi_rw = WRITE; tbio->bi_private = r10_bio; - tbio->bi_sector = r10_bio->devs[i].addr; + tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; for (j=0; j < vcnt ; j++) { tbio->bi_io_vec[j].bv_offset = 0; @@ -1368,13 +2128,35 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio) d = r10_bio->devs[i].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); + md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); - tbio->bi_sector += conf->mirrors[d].rdev->data_offset; + tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; tbio->bi_bdev = conf->mirrors[d].rdev->bdev; generic_make_request(tbio); } + /* Now write out to any replacement devices + * that are active + */ + for (i = 0; i < conf->copies; i++) { + int j, d; + + tbio = r10_bio->devs[i].repl_bio; + if (!tbio || !tbio->bi_end_io) + continue; + if (r10_bio->devs[i].bio->bi_end_io != end_sync_write + && r10_bio->devs[i].bio != fbio) + for (j = 0; j < vcnt; j++) + memcpy(page_address(tbio->bi_io_vec[j].bv_page), + page_address(fbio->bi_io_vec[j].bv_page), + PAGE_SIZE); + d = r10_bio->devs[i].devnum; + atomic_inc(&r10_bio->remaining); + md_sync_acct(conf->mirrors[d].replacement->bdev, + bio_sectors(tbio)); + generic_make_request(tbio); + } + done: if (atomic_dec_and_test(&r10_bio->remaining)) { md_done_sync(mddev, r10_bio->sectors, 1); @@ -1392,34 +2174,189 @@ done: * The second for writing. * */ +static void fix_recovery_read_error(struct r10bio *r10_bio) +{ + /* We got a read error during recovery. + * We repeat the read in smaller page-sized sections. + * If a read succeeds, write it to the new device or record + * a bad block if we cannot. + * If a read fails, record a bad block on both old and + * new devices. 
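The resync comparison above walks the buffers one page at a time and only compares the valid part of the final, possibly partial, page. A standalone sketch of that loop (pages_differ(), the page arrays and page_size are user-space stand-ins):

#include <string.h>

/* Returns 1 if the two page sets differ anywhere within 'sectors'. */
static int pages_differ(char *const *a, char *const *b,
                        int sectors, int page_size)
{
    int npages = (sectors + (page_size / 512) - 1) / (page_size / 512);

    for (int i = 0; i < npages; i++) {
        int len = page_size;

        if (sectors < len / 512)
            len = sectors * 512;        /* final partial page */
        if (memcmp(a[i], b[i], len) != 0)
            return 1;                   /* mismatch: rewrite this copy */
        sectors -= len / 512;
    }
    return 0;
}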
+ */ + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + struct bio *bio = r10_bio->devs[0].bio; + sector_t sect = 0; + int sectors = r10_bio->sectors; + int idx = 0; + int dr = r10_bio->devs[0].devnum; + int dw = r10_bio->devs[1].devnum; + + while (sectors) { + int s = sectors; + struct md_rdev *rdev; + sector_t addr; + int ok; + + if (s > (PAGE_SIZE>>9)) + s = PAGE_SIZE >> 9; + + rdev = conf->mirrors[dr].rdev; + addr = r10_bio->devs[0].addr + sect, + ok = sync_page_io(rdev, + addr, + s << 9, + bio->bi_io_vec[idx].bv_page, + READ, false); + if (ok) { + rdev = conf->mirrors[dw].rdev; + addr = r10_bio->devs[1].addr + sect; + ok = sync_page_io(rdev, + addr, + s << 9, + bio->bi_io_vec[idx].bv_page, + WRITE, false); + if (!ok) { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, + &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } + } + if (!ok) { + /* We don't worry if we cannot set a bad block - + * it really is bad so there is no loss in not + * recording it yet + */ + rdev_set_badblocks(rdev, addr, s, 0); + + if (rdev != conf->mirrors[dw].rdev) { + /* need bad block on destination too */ + struct md_rdev *rdev2 = conf->mirrors[dw].rdev; + addr = r10_bio->devs[1].addr + sect; + ok = rdev_set_badblocks(rdev2, addr, s, 0); + if (!ok) { + /* just abort the recovery */ + printk(KERN_NOTICE + "md/raid10:%s: recovery aborted" + " due to read error\n", + mdname(mddev)); + + conf->mirrors[dw].recovery_disabled + = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, + &mddev->recovery); + break; + } + } + } -static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) + sectors -= s; + sect += s; + idx++; + } +} + +static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) { - conf_t *conf = mddev_to_conf(mddev); - int i, d; - struct bio *bio, *wbio; + struct r10conf *conf = mddev->private; + int d; + struct bio *wbio, *wbio2; + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { + fix_recovery_read_error(r10_bio); + end_sync_request(r10_bio); + return; + } - /* move the pages across to the second bio + /* + * share the pages with the first bio * and submit the write request */ - bio = r10_bio->devs[0].bio; + d = r10_bio->devs[1].devnum; wbio = r10_bio->devs[1].bio; - for (i=0; i < wbio->bi_vcnt; i++) { - struct page *p = bio->bi_io_vec[i].bv_page; - bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page; - wbio->bi_io_vec[i].bv_page = p; + wbio2 = r10_bio->devs[1].repl_bio; + /* Need to test wbio2->bi_end_io before we call + * generic_make_request as if the former is NULL, + * the latter is free to free wbio2. + */ + if (wbio2 && !wbio2->bi_end_io) + wbio2 = NULL; + if (wbio->bi_end_io) { + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); + generic_make_request(wbio); } - d = r10_bio->devs[1].devnum; + if (wbio2) { + atomic_inc(&conf->mirrors[d].replacement->nr_pending); + md_sync_acct(conf->mirrors[d].replacement->bdev, + bio_sectors(wbio2)); + generic_make_request(wbio2); + } +} - atomic_inc(&conf->mirrors[d].rdev->nr_pending); - md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); - if (test_bit(R10BIO_Uptodate, &r10_bio->state)) - generic_make_request(wbio); + +/* + * Used by fix_read_error() to decay the per rdev read_errors. + * We halve the read error count for every hour that has elapsed + * since the last recorded read error. 
+ * + */ +static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) +{ + struct timespec cur_time_mon; + unsigned long hours_since_last; + unsigned int read_errors = atomic_read(&rdev->read_errors); + + ktime_get_ts(&cur_time_mon); + + if (rdev->last_read_error.tv_sec == 0 && + rdev->last_read_error.tv_nsec == 0) { + /* first time we've seen a read error */ + rdev->last_read_error = cur_time_mon; + return; + } + + hours_since_last = (cur_time_mon.tv_sec - + rdev->last_read_error.tv_sec) / 3600; + + rdev->last_read_error = cur_time_mon; + + /* + * if hours_since_last is > the number of bits in read_errors + * just set read errors to 0. We do this to avoid + * overflowing the shift of read_errors by hours_since_last. + */ + if (hours_since_last >= 8 * sizeof(read_errors)) + atomic_set(&rdev->read_errors, 0); else - bio_endio(wbio, -EIO); + atomic_set(&rdev->read_errors, read_errors >> hours_since_last); } +static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, + int sectors, struct page *page, int rw) +{ + sector_t first_bad; + int bad_sectors; + + if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) + && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) + return -1; + if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) + /* success */ + return 1; + if (rw == WRITE) { + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } + /* need to record an error - either for the block or the device */ + if (!rdev_set_badblocks(rdev, sector, sectors, 0)) + md_error(rdev->mddev, rdev); + return 0; +} /* * This is a kernel thread which: @@ -1429,11 +2366,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) * 3. Performs writes following reads for array synchronising. */ -static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) +static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) { int sect = 0; /* Offset from r10_bio->sector */ int sectors = r10_bio->sectors; - mdk_rdev_t*rdev; + struct md_rdev*rdev; + int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + int d = r10_bio->devs[r10_bio->read_slot].devnum; + + /* still own a reference to this rdev, so it cannot + * have been cleared recently. 
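The decay rule implemented above halves the per-device read error count once for every elapsed hour, and simply forgets the history when the elapsed time is so large that the shift would overflow. The arithmetic in isolation (decay_read_errors() is a hypothetical helper; plain integers stand in for the atomic counter and timespec bookkeeping):

static unsigned int decay_read_errors(unsigned int read_errors,
                                      unsigned long hours_since_last)
{
    if (hours_since_last >= 8 * sizeof(read_errors))
        return 0;               /* shift would overflow: reset */
    return read_errors >> hours_since_last;
}

So a device that had accumulated 40 read errors but has been quiet for three hours is treated as having 40 >> 3 = 5.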
+ */ + rdev = conf->mirrors[d].rdev; + + if (test_bit(Faulty, &rdev->flags)) + /* drive has already been failed, just ignore any + more fix_read_error() attempts */ + return; + + check_decay_read_errors(mddev, rdev); + atomic_inc(&rdev->read_errors); + if (atomic_read(&rdev->read_errors) > max_read_errors) { + char b[BDEVNAME_SIZE]; + bdevname(rdev->bdev, b); + + printk(KERN_NOTICE + "md/raid10:%s: %s: Raid device exceeded " + "read_error threshold [cur %d:max %d]\n", + mdname(mddev), b, + atomic_read(&rdev->read_errors), max_read_errors); + printk(KERN_NOTICE + "md/raid10:%s: %s: Failing raid device\n", + mdname(mddev), b); + md_error(mddev, conf->mirrors[d].rdev); + r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; + return; + } + while(sectors) { int s = sectors; int sl = r10_bio->read_slot; @@ -1445,17 +2414,23 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_lock(); do { - int d = r10_bio->devs[sl].devnum; + sector_t first_bad; + int bad_sectors; + + d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && - test_bit(In_sync, &rdev->flags)) { + !test_bit(Unmerged, &rdev->flags) && + test_bit(In_sync, &rdev->flags) && + is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, + &first_bad, &bad_sectors) == 0) { atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - success = sync_page_io(rdev->bdev, + success = sync_page_io(rdev, r10_bio->devs[sl].addr + - sect + rdev->data_offset, + sect, s<<9, - conf->tmppage, READ); + conf->tmppage, READ, false); rdev_dec_pending(rdev, mddev); rcu_read_lock(); if (success) @@ -1468,9 +2443,22 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) rcu_read_unlock(); if (!success) { - /* Cannot read from anywhere -- bye bye array */ + /* Cannot read from anywhere, just mark the block + * as bad on the first device to discourage future + * reads. 
+ */ int dn = r10_bio->devs[r10_bio->read_slot].devnum; - md_error(mddev, conf->mirrors[dn].rdev); + rdev = conf->mirrors[dn].rdev; + + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[r10_bio->read_slot].addr + + sect, + s, 0)) { + md_error(mddev, rdev); + r10_bio->devs[r10_bio->read_slot].bio + = IO_BLOCKED; + } break; } @@ -1478,59 +2466,94 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) /* write it back and re-read */ rcu_read_lock(); while (sl != r10_bio->read_slot) { - int d; + char b[BDEVNAME_SIZE]; + if (sl==0) sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - atomic_add(s, &rdev->corrected_errors); - if (sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, WRITE) - == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); + if (!rdev || + test_bit(Unmerged, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + continue; + + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s, conf->tmppage, WRITE) + == 0) { + /* Well, this device is dead */ + printk(KERN_NOTICE + "md/raid10:%s: read correction " + "write failed" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + + choose_data_offset(r10_bio, + rdev)), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "md/raid10:%s: %s: failing " + "drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); } + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); } sl = start; while (sl != r10_bio->read_slot) { - int d; + char b[BDEVNAME_SIZE]; + if (sl==0) sl = conf->copies; sl--; d = r10_bio->devs[sl].devnum; rdev = rcu_dereference(conf->mirrors[d].rdev); - if (rdev && - test_bit(In_sync, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - if (sync_page_io(rdev->bdev, - r10_bio->devs[sl].addr + - sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) - /* Well, this device is dead */ - md_error(mddev, rdev); - else - printk(KERN_INFO - "raid10:%s: read error corrected" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)(sect+ - rdev->data_offset), - bdevname(rdev->bdev, b)); + if (!rdev || + !test_bit(In_sync, &rdev->flags)) + continue; - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + switch (r10_sync_page_io(rdev, + r10_bio->devs[sl].addr + + sect, + s, conf->tmppage, + READ)) { + case 0: + /* Well, this device is dead */ + printk(KERN_NOTICE + "md/raid10:%s: unable to read back " + "corrected sectors" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + + choose_data_offset(r10_bio, rdev)), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "md/raid10:%s: %s: failing " + "drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); + break; + case 1: + printk(KERN_INFO + "md/raid10:%s: read error corrected" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + + choose_data_offset(r10_bio, rdev)), + bdevname(rdev->bdev, b)); + atomic_add(s, &rdev->corrected_errors); } + + rdev_dec_pending(rdev, mddev); + rcu_read_lock(); } rcu_read_unlock(); @@ -1539,101 +2562,309 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) } } -static void 
raid10d(mddev_t *mddev) +static int narrow_write_error(struct r10bio *r10_bio, int i) +{ + struct bio *bio = r10_bio->master_bio; + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; + /* bio has the data to be written to slot 'i' where + * we just recently had a write error. + * We repeatedly clone the bio and trim down to one block, + * then try the write. Where the write fails we record + * a bad block. + * It is conceivable that the bio doesn't exactly align with + * blocks. We must handle this. + * + * We currently own a reference to the rdev. + */ + + int block_sectors; + sector_t sector; + int sectors; + int sect_to_write = r10_bio->sectors; + int ok = 1; + + if (rdev->badblocks.shift < 0) + return 0; + + block_sectors = 1 << rdev->badblocks.shift; + sector = r10_bio->sector; + sectors = ((r10_bio->sector + block_sectors) + & ~(sector_t)(block_sectors - 1)) + - sector; + + while (sect_to_write) { + struct bio *wbio; + if (sectors > sect_to_write) + sectors = sect_to_write; + /* Write at 'sector' for 'sectors' */ + wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); + wbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ + choose_data_offset(r10_bio, rdev) + + (sector - r10_bio->sector)); + wbio->bi_bdev = rdev->bdev; + if (submit_bio_wait(WRITE, wbio) == 0) + /* Failure! */ + ok = rdev_set_badblocks(rdev, sector, + sectors, 0) + && ok; + + bio_put(wbio); + sect_to_write -= sectors; + sector += sectors; + sectors = block_sectors; + } + return ok; +} + +static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) { - r10bio_t *r10_bio; + int slot = r10_bio->read_slot; struct bio *bio; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev = r10_bio->devs[slot].rdev; + char b[BDEVNAME_SIZE]; + unsigned long do_sync; + int max_sectors; + + /* we got a read error. Maybe the drive is bad. Maybe just + * the block and we can fix it. + * We freeze all other IO, and try reading the block from + * other devices. When we find one, we re-write + * and check it that fixes the read error. + * This is all done synchronously while the array is + * frozen. 
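/*
 * Illustrative aside, not part of the patch: narrow_write_error() above
 * retries a failed write in badblock-sized pieces, trimming the first piece
 * so that every piece ends on a (1 << badblocks.shift) boundary.  A
 * userspace sketch of just that splitting arithmetic (shift and sector
 * values are made up):
 */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	int shift = 3;				/* hypothetical badblocks.shift: 8-sector blocks */
	sector_t block_sectors = 1ULL << shift;
	sector_t sector = 1005;			/* start of the failed write */
	sector_t sect_to_write = 20;		/* length of the failed write */

	/* first piece runs only up to the next block boundary */
	sector_t sectors = ((sector + block_sectors) & ~(block_sectors - 1)) - sector;

	while (sect_to_write) {
		if (sectors > sect_to_write)
			sectors = sect_to_write;
		printf("write %llu sectors at %llu\n", sectors, sector);
		sect_to_write -= sectors;
		sector += sectors;
		sectors = block_sectors;	/* later pieces are whole blocks */
	}
	return 0;
}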
+ */ + bio = r10_bio->devs[slot].bio; + bdevname(bio->bi_bdev, b); + bio_put(bio); + r10_bio->devs[slot].bio = NULL; + + if (mddev->ro == 0) { + freeze_array(conf, 1); + fix_read_error(conf, mddev, r10_bio); + unfreeze_array(conf); + } else + r10_bio->devs[slot].bio = IO_BLOCKED; + + rdev_dec_pending(rdev, mddev); + +read_more: + rdev = read_balance(conf, r10_bio, &max_sectors); + if (rdev == NULL) { + printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" + " read error for block %llu\n", + mdname(mddev), b, + (unsigned long long)r10_bio->sector); + raid_end_bio_io(r10_bio); + return; + } + + do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); + slot = r10_bio->read_slot; + printk_ratelimited( + KERN_ERR + "md/raid10:%s: %s: redirecting " + "sector %llu to another mirror\n", + mdname(mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); + bio = bio_clone_mddev(r10_bio->master_bio, + GFP_NOIO, mddev); + bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); + r10_bio->devs[slot].bio = bio; + r10_bio->devs[slot].rdev = rdev; + bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + + choose_data_offset(r10_bio, rdev); + bio->bi_bdev = rdev->bdev; + bio->bi_rw = READ | do_sync; + bio->bi_private = r10_bio; + bio->bi_end_io = raid10_end_read_request; + if (max_sectors < r10_bio->sectors) { + /* Drat - have to split this up more */ + struct bio *mbio = r10_bio->master_bio; + int sectors_handled = + r10_bio->sector + max_sectors + - mbio->bi_iter.bi_sector; + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (mbio->bi_phys_segments == 0) + mbio->bi_phys_segments = 2; + else + mbio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + generic_make_request(bio); + + r10_bio = mempool_alloc(conf->r10bio_pool, + GFP_NOIO); + r10_bio->master_bio = mbio; + r10_bio->sectors = bio_sectors(mbio) - sectors_handled; + r10_bio->state = 0; + set_bit(R10BIO_ReadError, + &r10_bio->state); + r10_bio->mddev = mddev; + r10_bio->sector = mbio->bi_iter.bi_sector + + sectors_handled; + + goto read_more; + } else + generic_make_request(bio); +} + +static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) +{ + /* Some sort of write request has finished and it + * succeeded in writing where we thought there was a + * bad block. So forget the bad block. + * Or possibly if failed and we need to record + * a bad block. 
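/*
 * Illustrative aside, not part of the patch: when read_balance() can only
 * service part of a request (max_sectors < r10_bio->sectors), the
 * handle_read_error() path above splits it and accounts for how much of the
 * master bio has been handled so far.  A userspace sketch of that
 * bookkeeping with made-up numbers:
 */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	sector_t master_sector = 1000;	/* master bio start */
	sector_t master_len = 256;	/* master bio length in sectors */
	sector_t r10_sector = 1064;	/* start of the piece being retried */
	sector_t max_sectors = 64;	/* what the chosen mirror can service */

	sector_t sectors_handled = r10_sector + max_sectors - master_sector;

	printf("this piece: %llu sectors at %llu\n", max_sectors, r10_sector);
	printf("next piece: %llu sectors at %llu\n",
	       master_len - sectors_handled, master_sector + sectors_handled);
	return 0;
}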
+ */ + int m; + struct md_rdev *rdev; + + if (test_bit(R10BIO_IsSync, &r10_bio->state) || + test_bit(R10BIO_IsRecover, &r10_bio->state)) { + for (m = 0; m < conf->copies; m++) { + int dev = r10_bio->devs[m].devnum; + rdev = conf->mirrors[dev].rdev; + if (r10_bio->devs[m].bio == NULL) + continue; + if (test_bit(BIO_UPTODATE, + &r10_bio->devs[m].bio->bi_flags)) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); + } else { + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0)) + md_error(conf->mddev, rdev); + } + rdev = conf->mirrors[dev].replacement; + if (r10_bio->devs[m].repl_bio == NULL) + continue; + if (test_bit(BIO_UPTODATE, + &r10_bio->devs[m].repl_bio->bi_flags)) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); + } else { + if (!rdev_set_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0)) + md_error(conf->mddev, rdev); + } + } + put_buf(r10_bio); + } else { + for (m = 0; m < conf->copies; m++) { + int dev = r10_bio->devs[m].devnum; + struct bio *bio = r10_bio->devs[m].bio; + rdev = conf->mirrors[dev].rdev; + if (bio == IO_MADE_GOOD) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); + rdev_dec_pending(rdev, conf->mddev); + } else if (bio != NULL && + !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + if (!narrow_write_error(r10_bio, m)) { + md_error(conf->mddev, rdev); + set_bit(R10BIO_Degraded, + &r10_bio->state); + } + rdev_dec_pending(rdev, conf->mddev); + } + bio = r10_bio->devs[m].repl_bio; + rdev = conf->mirrors[dev].replacement; + if (rdev && bio == IO_MADE_GOOD) { + rdev_clear_badblocks( + rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); + rdev_dec_pending(rdev, conf->mddev); + } + } + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } +} + +static void raid10d(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct r10bio *r10_bio; unsigned long flags; - conf_t *conf = mddev_to_conf(mddev); + struct r10conf *conf = mddev->private; struct list_head *head = &conf->retry_list; - int unplug=0; - mdk_rdev_t *rdev; + struct blk_plug plug; md_check_recovery(mddev); + blk_start_plug(&plug); for (;;) { - char b[BDEVNAME_SIZE]; - unplug += flush_pending_writes(conf); + flush_pending_writes(conf); spin_lock_irqsave(&conf->device_lock, flags); if (list_empty(head)) { spin_unlock_irqrestore(&conf->device_lock, flags); break; } - r10_bio = list_entry(head->prev, r10bio_t, retry_list); + r10_bio = list_entry(head->prev, struct r10bio, retry_list); list_del(head->prev); conf->nr_queued--; spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r10_bio->mddev; - conf = mddev_to_conf(mddev); - if (test_bit(R10BIO_IsSync, &r10_bio->state)) { + conf = mddev->private; + if (test_bit(R10BIO_MadeGood, &r10_bio->state) || + test_bit(R10BIO_WriteError, &r10_bio->state)) + handle_write_completed(conf, r10_bio); + else if (test_bit(R10BIO_IsReshape, &r10_bio->state)) + reshape_request_write(mddev, r10_bio); + else if (test_bit(R10BIO_IsSync, &r10_bio->state)) sync_request_write(mddev, r10_bio); - unplug = 1; - } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) { + else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) recovery_request_write(mddev, r10_bio); - unplug = 1; - } else { - int mirror; - /* we got a read error. Maybe the drive is bad. Maybe just - * the block and we can fix it. - * We freeze all other IO, and try reading the block from - * other devices. 
When we find one, we re-write - * and check it that fixes the read error. - * This is all done synchronously while the array is - * frozen. + else if (test_bit(R10BIO_ReadError, &r10_bio->state)) + handle_read_error(mddev, r10_bio); + else { + /* just a partial read to be scheduled from a + * separate context */ - if (mddev->ro == 0) { - freeze_array(conf); - fix_read_error(conf, mddev, r10_bio); - unfreeze_array(conf); - } - - bio = r10_bio->devs[r10_bio->read_slot].bio; - r10_bio->devs[r10_bio->read_slot].bio = - mddev->ro ? IO_BLOCKED : NULL; - mirror = read_balance(conf, r10_bio); - if (mirror == -1) { - printk(KERN_ALERT "raid10: %s: unrecoverable I/O" - " read error for block %llu\n", - bdevname(bio->bi_bdev,b), - (unsigned long long)r10_bio->sector); - raid_end_bio_io(r10_bio); - bio_put(bio); - } else { - const int do_sync = bio_sync(r10_bio->master_bio); - bio_put(bio); - rdev = conf->mirrors[mirror].rdev; - if (printk_ratelimit()) - printk(KERN_ERR "raid10: %s: redirecting sector %llu to" - " another mirror\n", - bdevname(rdev->bdev,b), - (unsigned long long)r10_bio->sector); - bio = bio_clone(r10_bio->master_bio, GFP_NOIO); - r10_bio->devs[r10_bio->read_slot].bio = bio; - bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr - + rdev->data_offset; - bio->bi_bdev = rdev->bdev; - bio->bi_rw = READ | do_sync; - bio->bi_private = r10_bio; - bio->bi_end_io = raid10_end_read_request; - unplug = 1; - generic_make_request(bio); - } + int slot = r10_bio->read_slot; + generic_make_request(r10_bio->devs[slot].bio); } + + cond_resched(); + if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) + md_check_recovery(mddev); } - if (unplug) - unplug_slaves(mddev); + blk_finish_plug(&plug); } -static int init_resync(conf_t *conf) +static int init_resync(struct r10conf *conf) { int buffs; + int i; buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; BUG_ON(conf->r10buf_pool); + conf->have_replacement = 0; + for (i = 0; i < conf->geo.raid_disks; i++) + if (conf->mirrors[i].replacement) + conf->have_replacement = 1; conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); if (!conf->r10buf_pool) return -ENOMEM; @@ -1673,27 +2904,43 @@ static int init_resync(conf_t *conf) * */ -static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) +static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, + int *skipped, int go_faster) { - conf_t *conf = mddev_to_conf(mddev); - r10bio_t *r10_bio; + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; struct bio *biolist = NULL, *bio; sector_t max_sector, nr_sectors; - int disk; int i; int max_sync; - int sync_blocks; - + sector_t sync_blocks; sector_t sectors_skipped = 0; int chunks_skipped = 0; + sector_t chunk_mask = conf->geo.chunk_mask; if (!conf->r10buf_pool) if (init_resync(conf)) return 0; + /* + * Allow skipping a full rebuild for incremental assembly + * of a clean array, like RAID1 does. 
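/*
 * Illustrative aside, not part of the patch: when chunks are meaningful
 * (near_copies < raid_disks), sync_request() clamps each pass to the chunk
 * containing sector_nr using the '(sector_nr | chunk_mask) + 1' test that
 * appears a little further down.  A userspace sketch with a made-up
 * 64-sector chunk:
 */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	sector_t chunk_mask = 64 - 1;	/* 64 sectors per chunk (hypothetical) */
	sector_t sector_nr = 200;
	sector_t max_sector = 100000;

	if (max_sector > (sector_nr | chunk_mask))
		max_sector = (sector_nr | chunk_mask) + 1;

	/* 200 lies in the chunk [192, 256), so this pass stops at 256 */
	printf("sync from %llu up to %llu\n", sector_nr, max_sector);
	return 0;
}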
+ */ + if (mddev->bitmap == NULL && + mddev->recovery_cp == MaxSector && + mddev->reshape_position == MaxSector && + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + conf->fullsync == 0) { + *skipped = 1; + return mddev->dev_sectors - sector_nr; + } + skipped: - max_sector = mddev->size << 1; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + max_sector = mddev->dev_sectors; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { /* If we aborted, we need to abort the @@ -1705,25 +2952,47 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i * we need to convert that to several * virtual addresses. */ + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + end_reshape(conf); + return 0; + } + if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) bitmap_end_sync(mddev->bitmap, mddev->curr_resync, &sync_blocks, 1); - else for (i=0; i<conf->raid_disks; i++) { + else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); bitmap_end_sync(mddev->bitmap, sect, &sync_blocks, 1); } - } else /* completed sync */ + } else { + /* completed sync */ + if ((!mddev->bitmap || conf->fullsync) + && conf->have_replacement + && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* Completed a full sync so the replacements + * are now fully recovered. + */ + for (i = 0; i < conf->geo.raid_disks; i++) + if (conf->mirrors[i].replacement) + conf->mirrors[i].replacement + ->recovery_offset + = MaxSector; + } conf->fullsync = 0; - + } bitmap_close_sync(mddev->bitmap); close_sync(conf); *skipped = 1; return sectors_skipped; } - if (chunks_skipped >= conf->raid_disks) { + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return reshape_request(mddev, sector_nr, skipped); + + if (chunks_skipped >= conf->geo.raid_disks) { /* if there has been nothing to do on any drive, * then there is nothing to do at all.. */ @@ -1737,9 +3006,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* make sure whole request will fit in a chunk - if chunks * are meaningful */ - if (conf->near_copies < conf->raid_disks && - max_sector > (sector_nr | conf->chunk_mask)) - max_sector = (sector_nr | conf->chunk_mask) + 1; + if (conf->geo.near_copies < conf->geo.raid_disks && + max_sector > (sector_nr | chunk_mask)) + max_sector = (sector_nr | chunk_mask) + 1; /* * If there is non-resync activity waiting for us then * put in a delay to throttle resync. @@ -1747,8 +3016,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (!go_faster && conf->nr_waiting) msleep_interruptible(1000); - bitmap_cond_end_sync(mddev->bitmap, sector_nr); - /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that * have bi_end_io, bi_sector, bi_bdev set, @@ -1767,114 +3034,220 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* recovery... 
the complicated one */ - int i, j, k; + int j; r10_bio = NULL; - for (i=0 ; i<conf->raid_disks; i++) - if (conf->mirrors[i].rdev && - !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { - int still_degraded = 0; - /* want to reconstruct this device */ - r10bio_t *rb2 = r10_bio; - sector_t sect = raid10_find_virt(conf, sector_nr, i); - int must_sync; - /* Unless we are doing a full sync, we only need - * to recover the block if it is set in the bitmap + for (i = 0 ; i < conf->geo.raid_disks; i++) { + int still_degraded; + struct r10bio *rb2; + sector_t sect; + int must_sync; + int any_working; + struct raid10_info *mirror = &conf->mirrors[i]; + + if ((mirror->rdev == NULL || + test_bit(In_sync, &mirror->rdev->flags)) + && + (mirror->replacement == NULL || + test_bit(Faulty, + &mirror->replacement->flags))) + continue; + + still_degraded = 0; + /* want to reconstruct this device */ + rb2 = r10_bio; + sect = raid10_find_virt(conf, sector_nr, i); + if (sect >= mddev->resync_max_sectors) { + /* last stripe is not complete - don't + * try to recover this sector. */ - must_sync = bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, 1); - if (sync_blocks < max_sync) - max_sync = sync_blocks; - if (!must_sync && - !conf->fullsync) { - /* yep, skip the sync_blocks here, but don't assume - * that there will never be anything to do here - */ - chunks_skipped = -1; - continue; - } + continue; + } + /* Unless we are doing a full sync, or a replacement + * we only need to recover the block if it is set in + * the bitmap + */ + must_sync = bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, 1); + if (sync_blocks < max_sync) + max_sync = sync_blocks; + if (!must_sync && + mirror->replacement == NULL && + !conf->fullsync) { + /* yep, skip the sync_blocks here, but don't assume + * that there will never be anything to do here + */ + chunks_skipped = -1; + continue; + } - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); - raise_barrier(conf, rb2 != NULL); - atomic_set(&r10_bio->remaining, 0); + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + raise_barrier(conf, rb2 != NULL); + atomic_set(&r10_bio->remaining, 0); - r10_bio->master_bio = (struct bio*)rb2; - if (rb2) - atomic_inc(&rb2->remaining); - r10_bio->mddev = mddev; - set_bit(R10BIO_IsRecover, &r10_bio->state); - r10_bio->sector = sect; - - raid10_find_phys(conf, r10_bio); - /* Need to check if this section will still be - * degraded - */ - for (j=0; j<conf->copies;j++) { - int d = r10_bio->devs[j].devnum; - if (conf->mirrors[d].rdev == NULL || - test_bit(Faulty, &conf->mirrors[d].rdev->flags)) { - still_degraded = 1; - break; + r10_bio->master_bio = (struct bio*)rb2; + if (rb2) + atomic_inc(&rb2->remaining); + r10_bio->mddev = mddev; + set_bit(R10BIO_IsRecover, &r10_bio->state); + r10_bio->sector = sect; + + raid10_find_phys(conf, r10_bio); + + /* Need to check if the array will still be + * degraded + */ + for (j = 0; j < conf->geo.raid_disks; j++) + if (conf->mirrors[j].rdev == NULL || + test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { + still_degraded = 1; + break; + } + + must_sync = bitmap_start_sync(mddev->bitmap, sect, + &sync_blocks, still_degraded); + + any_working = 0; + for (j=0; j<conf->copies;j++) { + int k; + int d = r10_bio->devs[j].devnum; + sector_t from_addr, to_addr; + struct md_rdev *rdev; + sector_t sector, first_bad; + int bad_sectors; + if (!conf->mirrors[d].rdev || + !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) + continue; + /* This is where we read from */ + any_working = 1; + rdev = 
conf->mirrors[d].rdev; + sector = r10_bio->devs[j].addr; + + if (is_badblock(rdev, sector, max_sync, + &first_bad, &bad_sectors)) { + if (first_bad > sector) + max_sync = first_bad - sector; + else { + bad_sectors -= (sector + - first_bad); + if (max_sync > bad_sectors) + max_sync = bad_sectors; + continue; } } - must_sync = bitmap_start_sync(mddev->bitmap, sect, - &sync_blocks, still_degraded); - - for (j=0; j<conf->copies;j++) { - int d = r10_bio->devs[j].devnum; - if (conf->mirrors[d].rdev && - test_bit(In_sync, &conf->mirrors[d].rdev->flags)) { - /* This is where we read from */ - bio = r10_bio->devs[0].bio; - bio->bi_next = biolist; - biolist = bio; - bio->bi_private = r10_bio; - bio->bi_end_io = end_sync_read; - bio->bi_rw = READ; - bio->bi_sector = r10_bio->devs[j].addr + - conf->mirrors[d].rdev->data_offset; - bio->bi_bdev = conf->mirrors[d].rdev->bdev; - atomic_inc(&conf->mirrors[d].rdev->nr_pending); - atomic_inc(&r10_bio->remaining); - /* and we write to 'i' */ - - for (k=0; k<conf->copies; k++) - if (r10_bio->devs[k].devnum == i) - break; - BUG_ON(k == conf->copies); - bio = r10_bio->devs[1].bio; - bio->bi_next = biolist; - biolist = bio; - bio->bi_private = r10_bio; - bio->bi_end_io = end_sync_write; - bio->bi_rw = WRITE; - bio->bi_sector = r10_bio->devs[k].addr + - conf->mirrors[i].rdev->data_offset; - bio->bi_bdev = conf->mirrors[i].rdev->bdev; - - r10_bio->devs[0].devnum = d; - r10_bio->devs[1].devnum = i; + bio = r10_bio->devs[0].bio; + bio_reset(bio); + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_read; + bio->bi_rw = READ; + from_addr = r10_bio->devs[j].addr; + bio->bi_iter.bi_sector = from_addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + atomic_inc(&rdev->nr_pending); + /* and we write to 'i' (if not in_sync) */ + for (k=0; k<conf->copies; k++) + if (r10_bio->devs[k].devnum == i) break; - } + BUG_ON(k == conf->copies); + to_addr = r10_bio->devs[k].addr; + r10_bio->devs[0].devnum = d; + r10_bio->devs[0].addr = from_addr; + r10_bio->devs[1].devnum = i; + r10_bio->devs[1].addr = to_addr; + + rdev = mirror->rdev; + if (!test_bit(In_sync, &rdev->flags)) { + bio = r10_bio->devs[1].bio; + bio_reset(bio); + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = WRITE; + bio->bi_iter.bi_sector = to_addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + atomic_inc(&r10_bio->remaining); + } else + r10_bio->devs[1].bio->bi_end_io = NULL; + + /* and maybe write to replacement */ + bio = r10_bio->devs[1].repl_bio; + if (bio) + bio->bi_end_io = NULL; + rdev = mirror->replacement; + /* Note: if rdev != NULL, then bio + * cannot be NULL as r10buf_pool_alloc will + * have allocated it. + * So the second test here is pointless. + * But it keeps semantic-checkers happy, and + * this comment keeps human reviewers + * happy. 
+ */ + if (rdev == NULL || bio == NULL || + test_bit(Faulty, &rdev->flags)) + break; + bio_reset(bio); + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = WRITE; + bio->bi_iter.bi_sector = to_addr + + rdev->data_offset; + bio->bi_bdev = rdev->bdev; + atomic_inc(&r10_bio->remaining); + break; + } + if (j == conf->copies) { + /* Cannot recover, so abort the recovery or + * record a bad block */ + if (any_working) { + /* problem is that there are bad blocks + * on other device(s) + */ + int k; + for (k = 0; k < conf->copies; k++) + if (r10_bio->devs[k].devnum == i) + break; + if (!test_bit(In_sync, + &mirror->rdev->flags) + && !rdev_set_badblocks( + mirror->rdev, + r10_bio->devs[k].addr, + max_sync, 0)) + any_working = 0; + if (mirror->replacement && + !rdev_set_badblocks( + mirror->replacement, + r10_bio->devs[k].addr, + max_sync, 0)) + any_working = 0; } - if (j == conf->copies) { - /* Cannot recover, so abort the recovery */ - put_buf(r10_bio); - if (rb2) - atomic_dec(&rb2->remaining); - r10_bio = rb2; + if (!any_working) { if (!test_and_set_bit(MD_RECOVERY_INTR, &mddev->recovery)) - printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", + printk(KERN_INFO "md/raid10:%s: insufficient " + "working devices for recovery.\n", mdname(mddev)); - break; + mirror->recovery_disabled + = mddev->recovery_disabled; } + put_buf(r10_bio); + if (rb2) + atomic_dec(&rb2->remaining); + r10_bio = rb2; + break; } + } if (biolist == NULL) { while (r10_bio) { - r10bio_t *rb2 = r10_bio; - r10_bio = (r10bio_t*) rb2->master_bio; + struct r10bio *rb2 = r10_bio; + r10_bio = (struct r10bio*) rb2->master_bio; rb2->master_bio = NULL; put_buf(rb2); } @@ -1884,9 +3257,12 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i /* resync. 
Schedule a read for every block at this virt offset */ int count = 0; + bitmap_cond_end_sync(mddev->bitmap, sector_nr); + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && - !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, + &mddev->recovery)) { /* We can skip this block */ *skipped = 1; return sync_blocks + sectors_skipped; @@ -1904,16 +3280,35 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i r10_bio->sector = sector_nr; set_bit(R10BIO_IsSync, &r10_bio->state); raid10_find_phys(conf, r10_bio); - r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; + r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; - for (i=0; i<conf->copies; i++) { + for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; + sector_t first_bad, sector; + int bad_sectors; + + if (r10_bio->devs[i].repl_bio) + r10_bio->devs[i].repl_bio->bi_end_io = NULL; + bio = r10_bio->devs[i].bio; - bio->bi_end_io = NULL; + bio_reset(bio); clear_bit(BIO_UPTODATE, &bio->bi_flags); if (conf->mirrors[d].rdev == NULL || test_bit(Faulty, &conf->mirrors[d].rdev->flags)) continue; + sector = r10_bio->devs[i].addr; + if (is_badblock(conf->mirrors[d].rdev, + sector, max_sync, + &first_bad, &bad_sectors)) { + if (first_bad > sector) + max_sync = first_bad - sector; + else { + bad_sectors -= (sector - first_bad); + if (max_sync > bad_sectors) + max_sync = bad_sectors; + continue; + } + } atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); bio->bi_next = biolist; @@ -1921,17 +3316,45 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio->bi_rw = READ; - bio->bi_sector = r10_bio->devs[i].addr + + bio->bi_iter.bi_sector = sector + conf->mirrors[d].rdev->data_offset; bio->bi_bdev = conf->mirrors[d].rdev->bdev; count++; + + if (conf->mirrors[d].replacement == NULL || + test_bit(Faulty, + &conf->mirrors[d].replacement->flags)) + continue; + + /* Need to set up for writing to the replacement */ + bio = r10_bio->devs[i].repl_bio; + bio_reset(bio); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + + sector = r10_bio->devs[i].addr; + atomic_inc(&conf->mirrors[d].rdev->nr_pending); + bio->bi_next = biolist; + biolist = bio; + bio->bi_private = r10_bio; + bio->bi_end_io = end_sync_write; + bio->bi_rw = WRITE; + bio->bi_iter.bi_sector = sector + + conf->mirrors[d].replacement->data_offset; + bio->bi_bdev = conf->mirrors[d].replacement->bdev; + count++; } if (count < 2) { for (i=0; i<conf->copies; i++) { int d = r10_bio->devs[i].devnum; if (r10_bio->devs[i].bio->bi_end_io) - rdev_dec_pending(conf->mirrors[d].rdev, mddev); + rdev_dec_pending(conf->mirrors[d].rdev, + mddev); + if (r10_bio->devs[i].repl_bio && + r10_bio->devs[i].repl_bio->bi_end_io) + rdev_dec_pending( + conf->mirrors[d].replacement, + mddev); } put_buf(r10_bio); biolist = NULL; @@ -1939,44 +3362,33 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i } } - for (bio = biolist; bio ; bio=bio->bi_next) { - - bio->bi_flags &= ~(BIO_POOL_MASK - 1); - if (bio->bi_end_io) - bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_vcnt = 0; - bio->bi_idx = 0; - bio->bi_phys_segments = 0; - bio->bi_hw_segments = 0; - bio->bi_size = 0; - } - nr_sectors = 0; if (sector_nr + max_sync < max_sector) max_sector = sector_nr + max_sync; do { struct page *page; int len = PAGE_SIZE; - disk = 0; if 
(sector_nr + (len>>9) > max_sector) len = (max_sector - sector_nr) << 9; if (len == 0) break; for (bio= biolist ; bio ; bio=bio->bi_next) { + struct bio *bio2; page = bio->bi_io_vec[bio->bi_vcnt].bv_page; - if (bio_add_page(bio, page, len, 0) == 0) { - /* stop here */ - struct bio *bio2; - bio->bi_io_vec[bio->bi_vcnt].bv_page = page; - for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { - /* remove last page from this bio */ - bio2->bi_vcnt--; - bio2->bi_size -= len; - bio2->bi_flags &= ~(1<< BIO_SEG_VALID); - } - goto bio_full; + if (bio_add_page(bio, page, len, 0)) + continue; + + /* stop here */ + bio->bi_io_vec[bio->bi_vcnt].bv_page = page; + for (bio2 = biolist; + bio2 && bio2 != bio; + bio2 = bio2->bi_next) { + /* remove last page from this bio */ + bio2->bi_vcnt--; + bio2->bi_iter.bi_size -= len; + bio2->bi_flags &= ~(1<< BIO_SEG_VALID); } - disk = i; + goto bio_full; } nr_sectors += len>>9; sector_nr += len>>9; @@ -1994,6 +3406,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (bio->bi_end_io == end_sync_read) { md_sync_acct(bio->bi_bdev, nr_sectors); + set_bit(BIO_UPTODATE, &bio->bi_flags); generic_make_request(bio); } } @@ -2007,191 +3420,392 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return sectors_skipped + nr_sectors; giveup: /* There is nowhere to write, so all non-sync - * drives must be failed, so try the next chunk... + * drives must be failed or in resync, all drives + * have a bad block, so try the next chunk... */ - { - sector_t sec = max_sector - sector_nr; - sectors_skipped += sec; + if (sector_nr + max_sync < max_sector) + max_sector = sector_nr + max_sync; + + sectors_skipped += (max_sector - sector_nr); chunks_skipped ++; sector_nr = max_sector; goto skipped; +} + +static sector_t +raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + sector_t size; + struct r10conf *conf = mddev->private; + + if (!raid_disks) + raid_disks = min(conf->geo.raid_disks, + conf->prev.raid_disks); + if (!sectors) + sectors = conf->dev_sectors; + + size = sectors >> conf->geo.chunk_shift; + sector_div(size, conf->geo.far_copies); + size = size * raid_disks; + sector_div(size, conf->geo.near_copies); + + return size << conf->geo.chunk_shift; +} + +static void calc_sectors(struct r10conf *conf, sector_t size) +{ + /* Calculate the number of sectors-per-device that will + * actually be used, and set conf->dev_sectors and + * conf->stride + */ + + size = size >> conf->geo.chunk_shift; + sector_div(size, conf->geo.far_copies); + size = size * conf->geo.raid_disks; + sector_div(size, conf->geo.near_copies); + /* 'size' is now the number of chunks in the array */ + /* calculate "used chunks per device" */ + size = size * conf->copies; + + /* We need to round up when dividing by raid_disks to + * get the stride size. 
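/*
 * Illustrative aside, not part of the patch: a userspace sketch of the
 * arithmetic in calc_sectors(), which converts the sectors available on
 * each device into the per-device sectors actually used by counting
 * chunks, scaling by the copy counts, and rounding the chunks-per-device
 * figure up.  The geometry below (near=2, far=1, 4 disks, 128-sector
 * chunks) is made up:
 */
#include <stdio.h>

typedef unsigned long long sector_t;

#define DIV_ROUND_UP_ULL(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int chunk_shift = 7;		/* 128-sector (64KiB) chunks */
	int near_copies = 2, far_copies = 1, raid_disks = 4;
	int copies = near_copies * far_copies;
	sector_t size = 1000000;	/* sectors available on each device */

	size >>= chunk_shift;		/* whole chunks per device */
	size /= far_copies;
	size *= raid_disks;
	size /= near_copies;		/* chunks in the array */

	size *= copies;			/* chunk copies to place */
	size = DIV_ROUND_UP_ULL(size, raid_disks);	/* chunks used per device */

	printf("dev_sectors = %llu\n", size << chunk_shift);
	return 0;
}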
+ */ + size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); + + conf->dev_sectors = size << conf->geo.chunk_shift; + + if (conf->geo.far_offset) + conf->geo.stride = 1 << conf->geo.chunk_shift; + else { + sector_div(size, conf->geo.far_copies); + conf->geo.stride = size << conf->geo.chunk_shift; } } -static int run(mddev_t *mddev) +enum geo_type {geo_new, geo_old, geo_start}; +static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) { - conf_t *conf; - int i, disk_idx; - mirror_info_t *disk; - mdk_rdev_t *rdev; - struct list_head *tmp; int nc, fc, fo; - sector_t stride, size; - - if (mddev->chunk_size == 0) { - printk(KERN_ERR "md/raid10: non-zero chunk size required.\n"); - return -EINVAL; + int layout, chunk, disks; + switch (new) { + case geo_old: + layout = mddev->layout; + chunk = mddev->chunk_sectors; + disks = mddev->raid_disks - mddev->delta_disks; + break; + case geo_new: + layout = mddev->new_layout; + chunk = mddev->new_chunk_sectors; + disks = mddev->raid_disks; + break; + default: /* avoid 'may be unused' warnings */ + case geo_start: /* new when starting reshape - raid_disks not + * updated yet. */ + layout = mddev->new_layout; + chunk = mddev->new_chunk_sectors; + disks = mddev->raid_disks + mddev->delta_disks; + break; } + if (layout >> 18) + return -1; + if (chunk < (PAGE_SIZE >> 9) || + !is_power_of_2(chunk)) + return -2; + nc = layout & 255; + fc = (layout >> 8) & 255; + fo = layout & (1<<16); + geo->raid_disks = disks; + geo->near_copies = nc; + geo->far_copies = fc; + geo->far_offset = fo; + geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks; + geo->chunk_mask = chunk - 1; + geo->chunk_shift = ffz(~chunk); + return nc*fc; +} + +static struct r10conf *setup_conf(struct mddev *mddev) +{ + struct r10conf *conf = NULL; + int err = -EINVAL; + struct geom geo; + int copies; - nc = mddev->layout & 255; - fc = (mddev->layout >> 8) & 255; - fo = mddev->layout & (1<<16); - if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || - (mddev->layout >> 17)) { - printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", - mdname(mddev), mddev->layout); + copies = setup_geo(&geo, mddev, geo_new); + + if (copies == -2) { + printk(KERN_ERR "md/raid10:%s: chunk size must be " + "at least PAGE_SIZE(%ld) and be a power of 2.\n", + mdname(mddev), PAGE_SIZE); goto out; } - /* - * copy the already verified devices into our private RAID10 - * bookkeeping area. 
[whatever we allocate in run(), - * should be freed in stop()] - */ - conf = kzalloc(sizeof(conf_t), GFP_KERNEL); - mddev->private = conf; - if (!conf) { - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", - mdname(mddev)); + + if (copies < 2 || copies > mddev->raid_disks) { + printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", + mdname(mddev), mddev->new_layout); goto out; } - conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, - GFP_KERNEL); - if (!conf->mirrors) { - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", - mdname(mddev)); - goto out_free_conf; - } + + err = -ENOMEM; + conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); + if (!conf) + goto out; + + /* FIXME calc properly */ + conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks + + max(0,-mddev->delta_disks)), + GFP_KERNEL); + if (!conf->mirrors) + goto out; conf->tmppage = alloc_page(GFP_KERNEL); if (!conf->tmppage) - goto out_free_conf; + goto out; + + conf->geo = geo; + conf->copies = copies; + conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, + r10bio_pool_free, conf); + if (!conf->r10bio_pool) + goto out; + + calc_sectors(conf, mddev->dev_sectors); + if (mddev->reshape_position == MaxSector) { + conf->prev = conf->geo; + conf->reshape_progress = MaxSector; + } else { + if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { + err = -EINVAL; + goto out; + } + conf->reshape_progress = mddev->reshape_position; + if (conf->prev.far_offset) + conf->prev.stride = 1 << conf->prev.chunk_shift; + else + /* far_copies must be 1 */ + conf->prev.stride = conf->dev_sectors; + } + spin_lock_init(&conf->device_lock); + INIT_LIST_HEAD(&conf->retry_list); + + spin_lock_init(&conf->resync_lock); + init_waitqueue_head(&conf->wait_barrier); + + conf->thread = md_register_thread(raid10d, mddev, "raid10"); + if (!conf->thread) + goto out; conf->mddev = mddev; - conf->raid_disks = mddev->raid_disks; - conf->near_copies = nc; - conf->far_copies = fc; - conf->copies = nc*fc; - conf->far_offset = fo; - conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; - conf->chunk_shift = ffz(~mddev->chunk_size) - 9; - size = mddev->size >> (conf->chunk_shift-1); - sector_div(size, fc); - size = size * conf->raid_disks; - sector_div(size, nc); - /* 'size' is now the number of chunks in the array */ - /* calculate "used chunks per device" in 'stride' */ - stride = size * conf->copies; + return conf; - /* We need to round up when dividing by raid_disks to - * get the stride size. 
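/*
 * Illustrative aside, not part of the patch: setup_geo() above unpacks the
 * raid10 'layout' word into near/far copy counts and the far-offset flag.
 * A userspace sketch of that decoding for two example layout values
 * (0x102 is the near=2 layout set up by raid10_takeover_raid0() below;
 * 0x10201 is decoded here purely as an example of the offset bit):
 */
#include <stdio.h>

static void decode_layout(int layout)
{
	int nc, fc, fo;

	if (layout >> 18) {		/* bits 18 and above are not understood */
		printf("0x%x: unsupported layout\n", layout);
		return;
	}
	nc = layout & 255;		/* near copies */
	fc = (layout >> 8) & 255;	/* far copies */
	fo = layout & (1 << 16);	/* far-offset flag */
	printf("0x%x: near=%d far=%d offset=%s total copies=%d\n",
	       layout, nc, fc, fo ? "yes" : "no", nc * fc);
}

int main(void)
{
	decode_layout(0x102);		/* 2 near copies, 1 far copy */
	decode_layout(0x10201);		/* 1 near copy, 2 far copies, offset mode */
	return 0;
}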
- */ - stride += conf->raid_disks - 1; - sector_div(stride, conf->raid_disks); - mddev->size = stride << (conf->chunk_shift-1); + out: + if (err == -ENOMEM) + printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", + mdname(mddev)); + if (conf) { + if (conf->r10bio_pool) + mempool_destroy(conf->r10bio_pool); + kfree(conf->mirrors); + safe_put_page(conf->tmppage); + kfree(conf); + } + return ERR_PTR(err); +} - if (fo) - stride = 1; - else - sector_div(stride, fc); - conf->stride = stride << conf->chunk_shift; +static int run(struct mddev *mddev) +{ + struct r10conf *conf; + int i, disk_idx, chunk_size; + struct raid10_info *disk; + struct md_rdev *rdev; + sector_t size; + sector_t min_offset_diff = 0; + int first = 1; + bool discard_supported = false; + + if (mddev->private == NULL) { + conf = setup_conf(mddev); + if (IS_ERR(conf)) + return PTR_ERR(conf); + mddev->private = conf; + } + conf = mddev->private; + if (!conf) + goto out; - conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, - r10bio_pool_free, conf); - if (!conf->r10bio_pool) { - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", - mdname(mddev)); - goto out_free_conf; + mddev->thread = conf->thread; + conf->thread = NULL; + + chunk_size = mddev->chunk_sectors << 9; + if (mddev->queue) { + blk_queue_max_discard_sectors(mddev->queue, + mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, 0); + blk_queue_io_min(mddev->queue, chunk_size); + if (conf->geo.raid_disks % conf->geo.near_copies) + blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); + else + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->geo.raid_disks / conf->geo.near_copies)); } - spin_lock_init(&conf->device_lock); - mddev->queue->queue_lock = &conf->device_lock; + rdev_for_each(rdev, mddev) { + long long diff; + struct request_queue *q; - rdev_for_each(rdev, tmp, mddev) { disk_idx = rdev->raid_disk; - if (disk_idx >= mddev->raid_disks - || disk_idx < 0) + if (disk_idx < 0) + continue; + if (disk_idx >= conf->geo.raid_disks && + disk_idx >= conf->prev.raid_disks) continue; disk = conf->mirrors + disk_idx; - disk->rdev = rdev; - - blk_queue_stack_limits(mddev->queue, - rdev->bdev->bd_disk->queue); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. 
- */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + if (test_bit(Replacement, &rdev->flags)) { + if (disk->replacement) + goto out_free_conf; + disk->replacement = rdev; + } else { + if (disk->rdev) + goto out_free_conf; + disk->rdev = rdev; + } + q = bdev_get_queue(rdev->bdev); + if (q->merge_bvec_fn) + mddev->merge_check_needed = 1; + diff = (rdev->new_data_offset - rdev->data_offset); + if (!mddev->reshape_backwards) + diff = -diff; + if (diff < 0) + diff = 0; + if (first || diff < min_offset_diff) + min_offset_diff = diff; + + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); disk->head_position = 0; - } - INIT_LIST_HEAD(&conf->retry_list); - spin_lock_init(&conf->resync_lock); - init_waitqueue_head(&conf->wait_barrier); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; + } + if (mddev->queue) { + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + } /* need to check that every block has at least one working mirror */ - if (!enough(conf)) { - printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", + if (!enough(conf, -1)) { + printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", mdname(mddev)); goto out_free_conf; } + if (conf->reshape_progress != MaxSector) { + /* must ensure that shape change is supported */ + if (conf->geo.far_copies != 1 && + conf->geo.far_offset == 0) + goto out_free_conf; + if (conf->prev.far_copies != 1 && + conf->prev.far_offset == 0) + goto out_free_conf; + } + mddev->degraded = 0; - for (i = 0; i < conf->raid_disks; i++) { + for (i = 0; + i < conf->geo.raid_disks + || i < conf->prev.raid_disks; + i++) { disk = conf->mirrors + i; + if (!disk->rdev && disk->replacement) { + /* The replacement is all we have - use it */ + disk->rdev = disk->replacement; + disk->replacement = NULL; + clear_bit(Replacement, &disk->rdev->flags); + } + if (!disk->rdev || !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; mddev->degraded++; - if (disk->rdev) + if (disk->rdev && + disk->rdev->saved_raid_disk < 0) conf->fullsync = 1; } + disk->recovery_disabled = mddev->recovery_disabled - 1; } - - mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10"); - if (!mddev->thread) { - printk(KERN_ERR - "raid10: couldn't allocate thread for %s\n", + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "md/raid10:%s: not clean" + " -- starting background reconstruction\n", mdname(mddev)); - goto out_free_conf; - } - printk(KERN_INFO - "raid10: raid set %s active with %d out of %d devices\n", - mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks); + "md/raid10:%s: active with %d out of %d devices\n", + mdname(mddev), conf->geo.raid_disks - mddev->degraded, + conf->geo.raid_disks); /* * Ok, everything is just fine now */ - mddev->array_sectors = size << conf->chunk_shift; - mddev->resync_max_sectors = size << conf->chunk_shift; + mddev->dev_sectors = conf->dev_sectors; + size = raid10_size(mddev, 0, 0); + md_set_array_sectors(mddev, size); + mddev->resync_max_sectors = size; + + if (mddev->queue) { + int stripe = conf->geo.raid_disks * + ((mddev->chunk_sectors << 9) / PAGE_SIZE); + mddev->queue->backing_dev_info.congested_fn = raid10_congested; + mddev->queue->backing_dev_info.congested_data = mddev; + + /* Calculate max read-ahead size. 
+ * We need to readahead at least twice a whole stripe.... + * maybe... + */ + stripe /= conf->geo.near_copies; + if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); + } - mddev->queue->unplug_fn = raid10_unplug; - mddev->queue->backing_dev_info.congested_fn = raid10_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - /* Calculate max read-ahead size. - * We need to readahead at least twice a whole stripe.... - * maybe... - */ - { - int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE); - stripe /= conf->near_copies; - if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) - mddev->queue->backing_dev_info.ra_pages = 2* stripe; + if (md_integrity_register(mddev)) + goto out_free_conf; + + if (conf->reshape_progress != MaxSector) { + unsigned long before_length, after_length; + + before_length = ((1 << conf->prev.chunk_shift) * + conf->prev.far_copies); + after_length = ((1 << conf->geo.chunk_shift) * + conf->geo.far_copies); + + if (max(before_length, after_length) > min_offset_diff) { + /* This cannot work */ + printk("md/raid10: offset difference not enough to continue reshape\n"); + goto out_free_conf; + } + conf->offset_diff = min_offset_diff; + + conf->reshape_safe = conf->reshape_progress; + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "reshape"); } - if (conf->near_copies < mddev->raid_disks) - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); return 0; out_free_conf: + md_unregister_thread(&mddev->thread); if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); safe_put_page(conf->tmppage); @@ -2202,24 +3816,30 @@ out: return -EIO; } -static int stop(mddev_t *mddev) +static int stop(struct mddev *mddev) { - conf_t *conf = mddev_to_conf(mddev); + struct r10conf *conf = mddev->private; + + raise_barrier(conf, 0); + lower_barrier(conf); + + md_unregister_thread(&mddev->thread); + if (mddev->queue) + /* the unplug fn references 'conf'*/ + blk_sync_queue(mddev->queue); - md_unregister_thread(mddev->thread); - mddev->thread = NULL; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); + safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf); mddev->private = NULL; return 0; } -static void raid10_quiesce(mddev_t *mddev, int state) +static void raid10_quiesce(struct mddev *mddev, int state) { - conf_t *conf = mddev_to_conf(mddev); + struct r10conf *conf = mddev->private; switch(state) { case 1: @@ -2229,16 +3849,866 @@ static void raid10_quiesce(mddev_t *mddev, int state) lower_barrier(conf); break; } - if (mddev->thread) { - if (mddev->bitmap) - mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; +} + +static int raid10_resize(struct mddev *mddev, sector_t sectors) +{ + /* Resize of 'far' arrays is not supported. + * For 'near' and 'offset' arrays we can set the + * number of sectors used to be an appropriate multiple + * of the chunk size. + * For 'offset', this is far_copies*chunksize. + * For 'near' the multiplier is the LCM of + * near_copies and raid_disks. + * So if far_copies > 1 && !far_offset, fail. + * Else find LCM(raid_disks, near_copy)*far_copies and + * multiply by chunk_size. Then round to this number. 
+ * This is mostly done by raid10_size() + */ + struct r10conf *conf = mddev->private; + sector_t oldsize, size; + + if (mddev->reshape_position != MaxSector) + return -EBUSY; + + if (conf->geo.far_copies > 1 && !conf->geo.far_offset) + return -EINVAL; + + oldsize = raid10_size(mddev, 0, 0); + size = raid10_size(mddev, sectors, 0); + if (mddev->external_size && + mddev->array_sectors > size) + return -EINVAL; + if (mddev->bitmap) { + int ret = bitmap_resize(mddev->bitmap, size, 0, 0); + if (ret) + return ret; + } + md_set_array_sectors(mddev, size); + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > oldsize) { + mddev->recovery_cp = oldsize; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + calc_sectors(conf, sectors); + mddev->dev_sectors = conf->dev_sectors; + mddev->resync_max_sectors = size; + return 0; +} + +static void *raid10_takeover_raid0(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct r10conf *conf; + + if (mddev->degraded > 0) { + printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + + /* Set new parameters */ + mddev->new_level = 10; + /* new layout: far_copies = 1, near_copies = 2 */ + mddev->new_layout = (1<<8) + 2; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->delta_disks = mddev->raid_disks; + mddev->raid_disks *= 2; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + conf = setup_conf(mddev); + if (!IS_ERR(conf)) { + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0) + rdev->new_raid_disk = rdev->raid_disk * 2; + conf->barrier = 1; + } + + return conf; +} + +static void *raid10_takeover(struct mddev *mddev) +{ + struct r0conf *raid0_conf; + + /* raid10 can take over: + * raid0 - providing it has only two drives + */ + if (mddev->level == 0) { + /* for raid0 takeover only one zone is supported */ + raid0_conf = mddev->private; + if (raid0_conf->nr_strip_zones > 1) { + printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" + " with more than one zone.\n", + mdname(mddev)); + return ERR_PTR(-EINVAL); + } + return raid10_takeover_raid0(mddev); + } + return ERR_PTR(-EINVAL); +} + +static int raid10_check_reshape(struct mddev *mddev) +{ + /* Called when there is a request to change + * - layout (to ->new_layout) + * - chunk size (to ->new_chunk_sectors) + * - raid_disks (by delta_disks) + * or when trying to restart a reshape that was ongoing. + * + * We need to validate the request and possibly allocate + * space if that might be an issue later. + * + * Currently we reject any reshape of a 'far' mode array, + * allow chunk size to change if new is generally acceptable, + * allow raid_disks to increase, and allow + * a switch between 'near' mode and 'offset' mode. 
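/*
 * Illustrative aside, not part of the patch: the resize rules described
 * above come down to raid10_size(), which turns per-device sectors into an
 * array size that is automatically a suitable multiple of the chunk size.
 * A userspace sketch of that computation with a made-up near=2 geometry
 * over 5 disks (the helper name is invented for the sketch):
 */
#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t raid10_size_sketch(sector_t dev_sectors, int raid_disks,
				   int near_copies, int far_copies,
				   int chunk_shift)
{
	sector_t size = dev_sectors >> chunk_shift;	/* whole chunks per device */

	size /= far_copies;
	size *= raid_disks;
	size /= near_copies;				/* data chunks in the array */
	return size << chunk_shift;
}

int main(void)
{
	/* 5 disks of 1000000 sectors each, near=2, far=1, 128-sector chunks */
	printf("array sectors = %llu\n",
	       raid10_size_sketch(1000000, 5, 2, 1, 7));
	return 0;
}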
+ */ + struct r10conf *conf = mddev->private; + struct geom geo; + + if (conf->geo.far_copies != 1 && !conf->geo.far_offset) + return -EINVAL; + + if (setup_geo(&geo, mddev, geo_start) != conf->copies) + /* mustn't change number of copies */ + return -EINVAL; + if (geo.far_copies > 1 && !geo.far_offset) + /* Cannot switch to 'far' mode */ + return -EINVAL; + + if (mddev->array_sectors & geo.chunk_mask) + /* not factor of array size */ + return -EINVAL; + + if (!enough(conf, -1)) + return -EINVAL; + + kfree(conf->mirrors_new); + conf->mirrors_new = NULL; + if (mddev->delta_disks > 0) { + /* allocate new 'mirrors' list */ + conf->mirrors_new = kzalloc( + sizeof(struct raid10_info) + *(mddev->raid_disks + + mddev->delta_disks), + GFP_KERNEL); + if (!conf->mirrors_new) + return -ENOMEM; + } + return 0; +} + +/* + * Need to check if array has failed when deciding whether to: + * - start an array + * - remove non-faulty devices + * - add a spare + * - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + */ +static int calc_degraded(struct r10conf *conf) +{ + int degraded, degraded2; + int i; + + rcu_read_lock(); + degraded = 0; + /* 'prev' section first */ + for (i = 0; i < conf->prev.raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (!test_bit(In_sync, &rdev->flags)) + /* When we can reduce the number of devices in + * an array, this might not contribute to + * 'degraded'. It does now. + */ + degraded++; + } + rcu_read_unlock(); + if (conf->geo.raid_disks == conf->prev.raid_disks) + return degraded; + rcu_read_lock(); + degraded2 = 0; + for (i = 0; i < conf->geo.raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded2++; + else if (!test_bit(In_sync, &rdev->flags)) { + /* If reshape is increasing the number of devices, + * this section has already been recovered, so + * it doesn't contribute to degraded. + * else it does. + */ + if (conf->geo.raid_disks <= conf->prev.raid_disks) + degraded2++; + } + } + rcu_read_unlock(); + if (degraded2 > degraded) + return degraded2; + return degraded; +} + +static int raid10_start_reshape(struct mddev *mddev) +{ + /* A 'reshape' has been requested. This commits + * the various 'new' fields and sets MD_RECOVER_RESHAPE + * This also checks if there are enough spares and adds them + * to the array. + * We currently require enough spares to make the final + * array non-degraded. We also require that the difference + * between old and new data_offset - on each device - is + * enough that we never risk over-writing. 
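/*
 * Illustrative aside, not part of the patch: the "data_offset difference
 * must be enough" rule described above is enforced (both at assembly time
 * in run() and in raid10_start_reshape() below) by comparing the smallest
 * per-device offset change against one chunk's worth of far copies in both
 * the old and the new geometry.  A userspace sketch with made-up numbers:
 */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	int prev_chunk_shift = 7, prev_far_copies = 1;	/* old geometry */
	int new_chunk_shift = 8, new_far_copies = 1;	/* new geometry */
	sector_t min_offset_diff = 512;			/* smallest data_offset change seen */

	sector_t before_length = (sector_t)(1 << prev_chunk_shift) * prev_far_copies;
	sector_t after_length  = (sector_t)(1 << new_chunk_shift) * new_far_copies;
	sector_t need = before_length > after_length ? before_length : after_length;

	if (need > min_offset_diff)
		printf("reshape refused: need %llu sectors of slack, have %llu\n",
		       need, min_offset_diff);
	else
		printf("reshape allowed: need %llu sectors of slack, have %llu\n",
		       need, min_offset_diff);
	return 0;
}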
+ */ + + unsigned long before_length, after_length; + sector_t min_offset_diff = 0; + int first = 1; + struct geom new; + struct r10conf *conf = mddev->private; + struct md_rdev *rdev; + int spares = 0; + int ret; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + + if (setup_geo(&new, mddev, geo_start) != conf->copies) + return -EINVAL; + + before_length = ((1 << conf->prev.chunk_shift) * + conf->prev.far_copies); + after_length = ((1 << conf->geo.chunk_shift) * + conf->geo.far_copies); + + rdev_for_each(rdev, mddev) { + if (!test_bit(In_sync, &rdev->flags) + && !test_bit(Faulty, &rdev->flags)) + spares++; + if (rdev->raid_disk >= 0) { + long long diff = (rdev->new_data_offset + - rdev->data_offset); + if (!mddev->reshape_backwards) + diff = -diff; + if (diff < 0) + diff = 0; + if (first || diff < min_offset_diff) + min_offset_diff = diff; + } + } + + if (max(before_length, after_length) > min_offset_diff) + return -EINVAL; + + if (spares < mddev->delta_disks) + return -EINVAL; + + conf->offset_diff = min_offset_diff; + spin_lock_irq(&conf->device_lock); + if (conf->mirrors_new) { + memcpy(conf->mirrors_new, conf->mirrors, + sizeof(struct raid10_info)*conf->prev.raid_disks); + smp_mb(); + kfree(conf->mirrors_old); /* FIXME and elsewhere */ + conf->mirrors_old = conf->mirrors; + conf->mirrors = conf->mirrors_new; + conf->mirrors_new = NULL; + } + setup_geo(&conf->geo, mddev, geo_start); + smp_mb(); + if (mddev->reshape_backwards) { + sector_t size = raid10_size(mddev, 0, 0); + if (size < mddev->array_sectors) { + spin_unlock_irq(&conf->device_lock); + printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n", + mdname(mddev)); + return -EINVAL; + } + mddev->resync_max_sectors = size; + conf->reshape_progress = size; + } else + conf->reshape_progress = 0; + spin_unlock_irq(&conf->device_lock); + + if (mddev->delta_disks && mddev->bitmap) { + ret = bitmap_resize(mddev->bitmap, + raid10_size(mddev, 0, + conf->geo.raid_disks), + 0, 0); + if (ret) + goto abort; + } + if (mddev->delta_disks > 0) { + rdev_for_each(rdev, mddev) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid10_add_disk(mddev, rdev) == 0) { + if (rdev->raid_disk >= + conf->prev.raid_disks) + set_bit(In_sync, &rdev->flags); + else + rdev->recovery_offset = 0; + + if (sysfs_link_rdev(mddev, rdev)) + /* Failure here is OK */; + } + } else if (rdev->raid_disk >= conf->prev.raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ + set_bit(In_sync, &rdev->flags); + } + } + /* When a reshape changes the number of devices, + * ->degraded is measured against the larger of the + * pre and post numbers. 
+ */ + spin_lock_irq(&conf->device_lock); + mddev->degraded = calc_degraded(conf); + spin_unlock_irq(&conf->device_lock); + mddev->raid_disks = conf->geo.raid_disks; + mddev->reshape_position = conf->reshape_progress; + set_bit(MD_CHANGE_DEVS, &mddev->flags); + + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "reshape"); + if (!mddev->sync_thread) { + ret = -EAGAIN; + goto abort; + } + conf->reshape_checkpoint = jiffies; + md_wakeup_thread(mddev->sync_thread); + md_new_event(mddev); + return 0; + +abort: + mddev->recovery = 0; + spin_lock_irq(&conf->device_lock); + conf->geo = conf->prev; + mddev->raid_disks = conf->geo.raid_disks; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; + smp_wmb(); + conf->reshape_progress = MaxSector; + mddev->reshape_position = MaxSector; + spin_unlock_irq(&conf->device_lock); + return ret; +} + +/* Calculate the last device-address that could contain + * any block from the chunk that includes the array-address 's' + * and report the next address. + * i.e. the address returned will be chunk-aligned and after + * any data that is in the chunk containing 's'. + */ +static sector_t last_dev_address(sector_t s, struct geom *geo) +{ + s = (s | geo->chunk_mask) + 1; + s >>= geo->chunk_shift; + s *= geo->near_copies; + s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); + s *= geo->far_copies; + s <<= geo->chunk_shift; + return s; +} + +/* Calculate the first device-address that could contain + * any block from the chunk that includes the array-address 's'. + * This too will be the start of a chunk + */ +static sector_t first_dev_address(sector_t s, struct geom *geo) +{ + s >>= geo->chunk_shift; + s *= geo->near_copies; + sector_div(s, geo->raid_disks); + s *= geo->far_copies; + s <<= geo->chunk_shift; + return s; +} + +static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, + int *skipped) +{ + /* We simply copy at most one chunk (smallest of old and new) + * at a time, possibly less if that exceeds RESYNC_PAGES, + * or we hit a bad block or something. + * This might mean we pause for normal IO in the middle of + * a chunk, but that is not a problem was mddev->reshape_position + * can record any location. + * + * If we will want to write to a location that isn't + * yet recorded as 'safe' (i.e. in metadata on disk) then + * we need to flush all reshape requests and update the metadata. + * + * When reshaping forwards (e.g. to more devices), we interpret + * 'safe' as the earliest block which might not have been copied + * down yet. We divide this by previous stripe size and multiply + * by previous stripe length to get lowest device offset that we + * cannot write to yet. + * We interpret 'sector_nr' as an address that we want to write to. + * From this we use last_device_address() to find where we might + * write to, and first_device_address on the 'safe' position. + * If this 'next' write position is after the 'safe' position, + * we must update the metadata to increase the 'safe' position. + * + * When reshaping backwards, we round in the opposite direction + * and perform the reverse test: next write position must not be + * less than current safe position. 
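/*
 * Illustrative aside, not part of the patch: the 'next'/'safe' positions
 * described in this comment come from first_dev_address() and
 * last_dev_address() above, which map an array address to the range of
 * device addresses its chunk can occupy.  A userspace sketch with a
 * made-up near=2 geometry over 5 disks and 128-sector chunks:
 */
#include <stdio.h>

typedef unsigned long long sector_t;

struct geom_sketch {
	int raid_disks, near_copies, far_copies, chunk_shift;
	sector_t chunk_mask;
};

static sector_t last_dev_address_sketch(sector_t s, const struct geom_sketch *g)
{
	s = (s | g->chunk_mask) + 1;
	s >>= g->chunk_shift;
	s *= g->near_copies;
	s = (s + g->raid_disks - 1) / g->raid_disks;	/* round up */
	s *= g->far_copies;
	return s << g->chunk_shift;
}

static sector_t first_dev_address_sketch(sector_t s, const struct geom_sketch *g)
{
	s >>= g->chunk_shift;
	s *= g->near_copies;
	s /= g->raid_disks;				/* round down */
	s *= g->far_copies;
	return s << g->chunk_shift;
}

int main(void)
{
	struct geom_sketch g = { 5, 2, 1, 7, 127 };
	sector_t s = 10000;	/* an array address inside the chunk being copied */

	printf("device range for that chunk: [%llu, %llu)\n",
	       first_dev_address_sketch(s, &g),
	       last_dev_address_sketch(s, &g));
	return 0;
}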
+ * + * In all this the minimum difference in data offsets + * (conf->offset_diff - always positive) allows a bit of slack, + * so next can be after 'safe', but not by more than offset_disk + * + * We need to prepare all the bios here before we start any IO + * to ensure the size we choose is acceptable to all devices. + * The means one for each copy for write-out and an extra one for + * read-in. + * We store the read-in bio in ->master_bio and the others in + * ->devs[x].bio and ->devs[x].repl_bio. + */ + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; + sector_t next, safe, last; + int max_sectors; + int nr_sectors; + int s; + struct md_rdev *rdev; + int need_flush = 0; + struct bio *blist; + struct bio *bio, *read_bio; + int sectors_done = 0; + + if (sector_nr == 0) { + /* If restarting in the middle, skip the initial sectors */ + if (mddev->reshape_backwards && + conf->reshape_progress < raid10_size(mddev, 0, 0)) { + sector_nr = (raid10_size(mddev, 0, 0) + - conf->reshape_progress); + } else if (!mddev->reshape_backwards && + conf->reshape_progress > 0) + sector_nr = conf->reshape_progress; + if (sector_nr) { + mddev->curr_resync_completed = sector_nr; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + *skipped = 1; + return sector_nr; + } + } + + /* We don't use sector_nr to track where we are up to + * as that doesn't work well for ->reshape_backwards. + * So just use ->reshape_progress. + */ + if (mddev->reshape_backwards) { + /* 'next' is the earliest device address that we might + * write to for this chunk in the new layout + */ + next = first_dev_address(conf->reshape_progress - 1, + &conf->geo); + + /* 'safe' is the last device address that we might read from + * in the old layout after a restart + */ + safe = last_dev_address(conf->reshape_safe - 1, + &conf->prev); + + if (next + conf->offset_diff < safe) + need_flush = 1; + + last = conf->reshape_progress - 1; + sector_nr = last & ~(sector_t)(conf->geo.chunk_mask + & conf->prev.chunk_mask); + if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) + sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; + } else { + /* 'next' is after the last device address that we + * might write to for this chunk in the new layout + */ + next = last_dev_address(conf->reshape_progress, &conf->geo); + + /* 'safe' is the earliest device address that we might + * read from in the old layout after a restart + */ + safe = first_dev_address(conf->reshape_safe, &conf->prev); + + /* Need to update metadata if 'next' might be beyond 'safe' + * as that would possibly corrupt data + */ + if (next > safe + conf->offset_diff) + need_flush = 1; + + sector_nr = conf->reshape_progress; + last = sector_nr | (conf->geo.chunk_mask + & conf->prev.chunk_mask); + + if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) + last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; + } + + if (need_flush || + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { + /* Need to update reshape_position in metadata */ + wait_barrier(conf); + mddev->reshape_position = conf->reshape_progress; + if (mddev->reshape_backwards) + mddev->curr_resync_completed = raid10_size(mddev, 0, 0) + - conf->reshape_progress; else - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev->curr_resync_completed = conf->reshape_progress; + conf->reshape_checkpoint = jiffies; + set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->flags == 0 || + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + 
allow_barrier(conf); + return sectors_done; + } + conf->reshape_safe = mddev->reshape_position; + allow_barrier(conf); + } + +read_more: + /* Now schedule reads for blocks from sector_nr to last */ + r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + raise_barrier(conf, sectors_done != 0); + atomic_set(&r10_bio->remaining, 0); + r10_bio->mddev = mddev; + r10_bio->sector = sector_nr; + set_bit(R10BIO_IsReshape, &r10_bio->state); + r10_bio->sectors = last - sector_nr + 1; + rdev = read_balance(conf, r10_bio, &max_sectors); + BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); + + if (!rdev) { + /* Cannot read from here, so need to record bad blocks + * on all the target devices. + */ + // FIXME + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return sectors_done; + } + + read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); + + read_bio->bi_bdev = rdev->bdev; + read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr + + rdev->data_offset); + read_bio->bi_private = r10_bio; + read_bio->bi_end_io = end_sync_read; + read_bio->bi_rw = READ; + read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); + read_bio->bi_flags |= 1 << BIO_UPTODATE; + read_bio->bi_vcnt = 0; + read_bio->bi_iter.bi_size = 0; + r10_bio->master_bio = read_bio; + r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; + + /* Now find the locations in the new layout */ + __raid10_find_phys(&conf->geo, r10_bio); + + blist = read_bio; + read_bio->bi_next = NULL; + + for (s = 0; s < conf->copies*2; s++) { + struct bio *b; + int d = r10_bio->devs[s/2].devnum; + struct md_rdev *rdev2; + if (s&1) { + rdev2 = conf->mirrors[d].replacement; + b = r10_bio->devs[s/2].repl_bio; + } else { + rdev2 = conf->mirrors[d].rdev; + b = r10_bio->devs[s/2].bio; + } + if (!rdev2 || test_bit(Faulty, &rdev2->flags)) + continue; + + bio_reset(b); + b->bi_bdev = rdev2->bdev; + b->bi_iter.bi_sector = r10_bio->devs[s/2].addr + + rdev2->new_data_offset; + b->bi_private = r10_bio; + b->bi_end_io = end_reshape_write; + b->bi_rw = WRITE; + b->bi_next = blist; + blist = b; + } + + /* Now add as many pages as possible to all of these bios. 
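+ * We stop as soon as any bio cannot take another page, so every copy covers exactly the same sectors.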
*/ + + nr_sectors = 0; + for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { + struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; + int len = (max_sectors - s) << 9; + if (len > PAGE_SIZE) + len = PAGE_SIZE; + for (bio = blist; bio ; bio = bio->bi_next) { + struct bio *bio2; + if (bio_add_page(bio, page, len, 0)) + continue; + + /* Didn't fit, must stop */ + for (bio2 = blist; + bio2 && bio2 != bio; + bio2 = bio2->bi_next) { + /* Remove last page from this bio */ + bio2->bi_vcnt--; + bio2->bi_iter.bi_size -= len; + bio2->bi_flags &= ~(1<<BIO_SEG_VALID); + } + goto bio_full; + } + sector_nr += len >> 9; + nr_sectors += len >> 9; } +bio_full: + r10_bio->sectors = nr_sectors; + + /* Now submit the read */ + md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); + atomic_inc(&r10_bio->remaining); + read_bio->bi_next = NULL; + generic_make_request(read_bio); + sector_nr += nr_sectors; + sectors_done += nr_sectors; + if (sector_nr <= last) + goto read_more; + + /* Now that we have done the whole section we can + * update reshape_progress + */ + if (mddev->reshape_backwards) + conf->reshape_progress -= sectors_done; + else + conf->reshape_progress += sectors_done; + + return sectors_done; } -static struct mdk_personality raid10_personality = +static void end_reshape_request(struct r10bio *r10_bio); +static int handle_reshape_read_error(struct mddev *mddev, + struct r10bio *r10_bio); +static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) +{ + /* Reshape read completed. Hopefully we have a block + * to write out. + * If we got a read error then we do sync 1-page reads from + * elsewhere until we find the data - or give up. + */ + struct r10conf *conf = mddev->private; + int s; + + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + if (handle_reshape_read_error(mddev, r10_bio) < 0) { + /* Reshape has been aborted */ + md_done_sync(mddev, r10_bio->sectors, 0); + return; + } + + /* We definitely have the data in the pages, schedule the + * writes. 
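+ * One write is issued per copy (and per active replacement); r10_bio->remaining counts them so the buffer is only released after the last one completes.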
+ */ + atomic_set(&r10_bio->remaining, 1); + for (s = 0; s < conf->copies*2; s++) { + struct bio *b; + int d = r10_bio->devs[s/2].devnum; + struct md_rdev *rdev; + if (s&1) { + rdev = conf->mirrors[d].replacement; + b = r10_bio->devs[s/2].repl_bio; + } else { + rdev = conf->mirrors[d].rdev; + b = r10_bio->devs[s/2].bio; + } + if (!rdev || test_bit(Faulty, &rdev->flags)) + continue; + atomic_inc(&rdev->nr_pending); + md_sync_acct(b->bi_bdev, r10_bio->sectors); + atomic_inc(&r10_bio->remaining); + b->bi_next = NULL; + generic_make_request(b); + } + end_reshape_request(r10_bio); +} + +static void end_reshape(struct r10conf *conf) +{ + if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) + return; + + spin_lock_irq(&conf->device_lock); + conf->prev = conf->geo; + md_finish_reshape(conf->mddev); + smp_wmb(); + conf->reshape_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); + + /* read-ahead size must cover two whole stripes, which is + * 2 * (datadisks) * chunksize where 'n' is the number of raid devices + */ + if (conf->mddev->queue) { + int stripe = conf->geo.raid_disks * + ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); + stripe /= conf->geo.near_copies; + if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + } + conf->fullsync = 0; +} + + +static int handle_reshape_read_error(struct mddev *mddev, + struct r10bio *r10_bio) +{ + /* Use sync reads to get the blocks from somewhere else */ + int sectors = r10_bio->sectors; + struct r10conf *conf = mddev->private; + struct { + struct r10bio r10_bio; + struct r10dev devs[conf->copies]; + } on_stack; + struct r10bio *r10b = &on_stack.r10_bio; + int slot = 0; + int idx = 0; + struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; + + r10b->sector = r10_bio->sector; + __raid10_find_phys(&conf->prev, r10b); + + while (sectors) { + int s = sectors; + int success = 0; + int first_slot = slot; + + if (s > (PAGE_SIZE >> 9)) + s = PAGE_SIZE >> 9; + + while (!success) { + int d = r10b->devs[slot].devnum; + struct md_rdev *rdev = conf->mirrors[d].rdev; + sector_t addr; + if (rdev == NULL || + test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + goto failed; + + addr = r10b->devs[slot].addr + idx * PAGE_SIZE; + success = sync_page_io(rdev, + addr, + s << 9, + bvec[idx].bv_page, + READ, false); + if (success) + break; + failed: + slot++; + if (slot >= conf->copies) + slot = 0; + if (slot == first_slot) + break; + } + if (!success) { + /* couldn't read this block, must give up */ + set_bit(MD_RECOVERY_INTR, + &mddev->recovery); + return -EIO; + } + sectors -= s; + idx++; + } + return 0; +} + +static void end_reshape_write(struct bio *bio, int error) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct r10bio *r10_bio = bio->bi_private; + struct mddev *mddev = r10_bio->mddev; + struct r10conf *conf = mddev->private; + int d; + int slot; + int repl; + struct md_rdev *rdev = NULL; + + d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); + if (repl) + rdev = conf->mirrors[d].replacement; + if (!rdev) { + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + + if (!uptodate) { + /* FIXME should record badblock */ + md_error(mddev, rdev); + } + + rdev_dec_pending(rdev, mddev); + end_reshape_request(r10_bio); +} + +static void end_reshape_request(struct r10bio *r10_bio) +{ + if (!atomic_dec_and_test(&r10_bio->remaining)) + return; + md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); + bio_put(r10_bio->master_bio); + put_buf(r10_bio); +} + +static void 
raid10_finish_reshape(struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + return; + + if (mddev->delta_disks > 0) { + sector_t size = raid10_size(mddev, 0, 0); + md_set_array_sectors(mddev, size); + if (mddev->recovery_cp > mddev->resync_max_sectors) { + mddev->recovery_cp = mddev->resync_max_sectors; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } + mddev->resync_max_sectors = size; + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } else { + int d; + for (d = conf->geo.raid_disks ; + d < conf->geo.raid_disks - mddev->delta_disks; + d++) { + struct md_rdev *rdev = conf->mirrors[d].rdev; + if (rdev) + clear_bit(In_sync, &rdev->flags); + rdev = conf->mirrors[d].replacement; + if (rdev) + clear_bit(In_sync, &rdev->flags); + } + } + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = 1 << conf->geo.chunk_shift; + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; +} + +static struct md_personality raid10_personality = { .name = "raid10", .level = 10, @@ -2253,6 +4723,12 @@ static struct mdk_personality raid10_personality = .spare_active = raid10_spare_active, .sync_request = sync_request, .quiesce = raid10_quiesce, + .size = raid10_size, + .resize = raid10_resize, + .takeover = raid10_takeover, + .check_reshape = raid10_check_reshape, + .start_reshape = raid10_start_reshape, + .finish_reshape = raid10_finish_reshape, }; static int __init raid_init(void) @@ -2268,6 +4744,9 @@ static void raid_exit(void) module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); MODULE_ALIAS("md-personality-9"); /* RAID10 */ MODULE_ALIAS("md-raid10"); MODULE_ALIAS("md-level-10"); + +module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h new file mode 100644 index 00000000000..157d69e83ff --- /dev/null +++ b/drivers/md/raid10.h @@ -0,0 +1,156 @@ +#ifndef _RAID10_H +#define _RAID10_H + +struct raid10_info { + struct md_rdev *rdev, *replacement; + sector_t head_position; + int recovery_disabled; /* matches + * mddev->recovery_disabled + * when we shouldn't try + * recovering this device. + */ +}; + +struct r10conf { + struct mddev *mddev; + struct raid10_info *mirrors; + struct raid10_info *mirrors_new, *mirrors_old; + spinlock_t device_lock; + + /* geometry */ + struct geom { + int raid_disks; + int near_copies; /* number of copies laid out + * raid0 style */ + int far_copies; /* number of copies laid out + * at large strides across drives + */ + int far_offset; /* far_copies are offset by 1 + * stripe instead of many + */ + sector_t stride; /* distance between far copies. + * This is size / far_copies unless + * far_offset, in which case it is + * 1 stripe. + */ + int far_set_size; /* The number of devices in a set, + * where a 'set' are devices that + * contain far/offset copies of + * each other. + */ + int chunk_shift; /* shift from chunks to sectors */ + sector_t chunk_mask; + } prev, geo; + int copies; /* near_copies * far_copies. 
+ * must be <= raid_disks + */ + + sector_t dev_sectors; /* temp copy of + * mddev->dev_sectors */ + sector_t reshape_progress; + sector_t reshape_safe; + unsigned long reshape_checkpoint; + sector_t offset_diff; + + struct list_head retry_list; + /* queue pending writes and submit them on unplug */ + struct bio_list pending_bio_list; + int pending_count; + + spinlock_t resync_lock; + int nr_pending; + int nr_waiting; + int nr_queued; + int barrier; + sector_t next_resync; + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + int have_replacement; /* There is at least one + * replacement device. + */ + wait_queue_head_t wait_barrier; + + mempool_t *r10bio_pool; + mempool_t *r10buf_pool; + struct page *tmppage; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct md_thread *thread; +}; + +/* + * this is our 'private' RAID10 bio. + * + * it contains information about what kind of IO operations were started + * for this RAID10 operation, and about their status: + */ + +struct r10bio { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + sector_t sector; /* virtual sector number */ + int sectors; + unsigned long state; + struct mddev *mddev; + /* + * original bio going to /dev/mdx + */ + struct bio *master_bio; + /* + * if the IO is in READ direction, then this is where we read + */ + int read_slot; + + struct list_head retry_list; + /* + * if the IO is in WRITE direction, then multiple bios are used, + * one for each copy. + * When resyncing we also use one for each copy. + * When reconstructing, we use 2 bios, one for read, one for write. + * We choose the number when they are allocated. + * We sometimes need an extra bio to write to the replacement. + */ + struct r10dev { + struct bio *bio; + union { + struct bio *repl_bio; /* used for resync and + * writes */ + struct md_rdev *rdev; /* used for reads + * (read_slot >= 0) */ + }; + sector_t addr; + int devnum; + } devs[0]; +}; + +/* bits for r10bio.state */ +enum r10bio_state { + R10BIO_Uptodate, + R10BIO_IsSync, + R10BIO_IsRecover, + R10BIO_IsReshape, + R10BIO_Degraded, +/* Set ReadError on bios that experience a read error + * so that raid10d knows what to do with them. + */ + R10BIO_ReadError, +/* If a write for this request means we can clear some + * known-bad-block records, we set this flag. + */ + R10BIO_MadeGood, + R10BIO_WriteError, +/* During a reshape we might be performing IO on the + * 'previous' part of the array, in which case this + * flag is set + */ + R10BIO_Previous, +}; + +extern int md_raid10_congested(struct mddev *mddev, int bits); + +#endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 224de022e7c..6234b2e8458 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -27,12 +27,12 @@ * * We group bitmap updates into batches. Each batch has a number. * We may write out several batches at once, but that isn't very important. - * conf->bm_write is the number of the last batch successfully written. - * conf->bm_flush is the number of the last batch that was closed to + * conf->seq_write is the number of the last batch successfully written. + * conf->seq_flush is the number of the last batch that was closed to * new additions. * When we discover that we will need to write to any block in a stripe * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq - * the number of the batch it will be in. 
This is bm_flush+1. + * the number of the batch it will be in. This is seq_flush+1. * When we are ready to do a write, if that batch hasn't been written yet, * we plug the array and queue the stripe for later. * When an unplug happens, we increment bm_flush, thus closing the current @@ -43,17 +43,28 @@ * miss any bits. */ +#include <linux/blkdev.h> +#include <linux/kthread.h> +#include <linux/raid/pq.h> +#include <linux/async_tx.h> #include <linux/module.h> +#include <linux/async.h> +#include <linux/seq_file.h> +#include <linux/cpu.h> #include <linux/slab.h> -#include <linux/highmem.h> -#include <linux/bitops.h> -#include <linux/kthread.h> -#include <asm/atomic.h> -#include "raid6.h" +#include <linux/ratelimit.h> +#include <linux/nodemask.h> +#include <trace/events/block.h> -#include <linux/raid/bitmap.h> -#include <linux/async_tx.h> +#include "md.h" +#include "raid5.h" +#include "raid0.h" +#include "bitmap.h" +#define cpu_to_group(cpu) cpu_to_node(cpu) +#define ANY_GROUP NUMA_NO_NODE + +static struct workqueue_struct *raid5_wq; /* * Stripe cache */ @@ -66,8 +77,49 @@ #define BYPASS_THRESHOLD 1 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) #define HASH_MASK (NR_HASH - 1) +#define MAX_STRIPE_BATCH 8 + +static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) +{ + int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; + return &conf->stripe_hashtbl[hash]; +} + +static inline int stripe_hash_locks_hash(sector_t sect) +{ + return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; +} + +static inline void lock_device_hash_lock(struct r5conf *conf, int hash) +{ + spin_lock_irq(conf->hash_locks + hash); + spin_lock(&conf->device_lock); +} + +static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) +{ + spin_unlock(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); +} -#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])) +static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) +{ + int i; + local_irq_disable(); + spin_lock(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); + spin_lock(&conf->device_lock); +} + +static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) +{ + int i; + spin_unlock(&conf->device_lock); + for (i = NR_STRIPE_HASH_LOCKS; i; i--) + spin_unlock(conf->hash_locks + i - 1); + local_irq_enable(); +} /* bio's attached to a stripe+device for I/O are linked together in bi_sector * order without overlap. There may be several bio's per stripe+device, and @@ -75,38 +127,97 @@ * When walking this list for a particular stripe+device, we must never proceed * beyond a bio that extends past this device, as the next bio might no longer * be valid. - * This macro is used to determine the 'next' bio in the list, given the sector + * This function is used to determine the 'next' bio in the list, given the sector * of the current stripe+device */ -#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? 
(bio)->bi_next : NULL) +static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) +{ + int sectors = bio_sectors(bio); + if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) + return bio->bi_next; + else + return NULL; +} + /* - * The following can be used to debug the driver + * We maintain a biased count of active stripes in the bottom 16 bits of + * bi_phys_segments, and a count of processed stripes in the upper 16 bits */ -#define RAID5_PARANOIA 1 -#if RAID5_PARANOIA && defined(CONFIG_SMP) -# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) -#else -# define CHECK_DEVLOCK() -#endif +static inline int raid5_bi_processed_stripes(struct bio *bio) +{ + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + return (atomic_read(segments) >> 16) & 0xffff; +} -#ifdef DEBUG -#define inline -#define __inline__ -#endif +static inline int raid5_dec_bi_active_stripes(struct bio *bio) +{ + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + return atomic_sub_return(1, segments) & 0xffff; +} -#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) +static inline void raid5_inc_bi_active_stripes(struct bio *bio) +{ + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + atomic_inc(segments); +} -#if !RAID6_USE_EMPTY_ZERO_PAGE -/* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -#endif +static inline void raid5_set_bi_processed_stripes(struct bio *bio, + unsigned int cnt) +{ + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + int old, new; + + do { + old = atomic_read(segments); + new = (old & 0xffff) | (cnt << 16); + } while (atomic_cmpxchg(segments, old, new) != old); +} + +static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) +{ + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + atomic_set(segments, cnt); +} +/* Find first data disk in a raid6 stripe */ +static inline int raid6_d0(struct stripe_head *sh) +{ + if (sh->ddf_layout) + /* ddf always start from first device */ + return 0; + /* md starts just after Q block */ + if (sh->qd_idx == sh->disks - 1) + return 0; + else + return sh->qd_idx + 1; +} static inline int raid6_next_disk(int disk, int raid_disks) { disk++; return (disk < raid_disks) ? disk : 0; } +/* When walking through the disks in a raid5, starting at raid6_d0, + * We need to map each disk to a 'slot', where the data disks are slot + * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk + * is raid_disks-1. This help does that mapping. 
+ */ +static int raid6_idx_to_slot(int idx, struct stripe_head *sh, + int *count, int syndrome_disks) +{ + int slot = *count; + + if (sh->ddf_layout) + (*count)++; + if (idx == sh->pd_idx) + return syndrome_disks; + if (idx == sh->qd_idx) + return syndrome_disks + 1; + if (!sh->ddf_layout) + (*count)++; + return slot; +} + static void return_io(struct bio *return_bi) { struct bio *bi = return_bi; @@ -114,13 +225,15 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; - bi->bi_size = 0; + bi->bi_iter.bi_size = 0; + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); bi = return_bi; } } -static void print_raid5_conf (raid5_conf_t *conf); +static void print_raid5_conf (struct r5conf *conf); static int stripe_operations_active(struct stripe_head *sh) { @@ -129,49 +242,203 @@ static int stripe_operations_active(struct stripe_head *sh) test_bit(STRIPE_COMPUTE_RUN, &sh->state); } -static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) +static void raid5_wakeup_stripe_thread(struct stripe_head *sh) { - if (atomic_dec_and_test(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); - BUG_ON(atomic_read(&conf->active_stripes)==0); - if (test_bit(STRIPE_HANDLE, &sh->state)) { - if (test_bit(STRIPE_DELAYED, &sh->state)) { - list_add_tail(&sh->lru, &conf->delayed_list); - blk_plug_device(conf->mddev->queue); - } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && - sh->bm_seq - conf->seq_write > 0) { - list_add_tail(&sh->lru, &conf->bitmap_list); - blk_plug_device(conf->mddev->queue); - } else { - clear_bit(STRIPE_BIT_DELAY, &sh->state); + struct r5conf *conf = sh->raid_conf; + struct r5worker_group *group; + int thread_cnt; + int i, cpu = sh->cpu; + + if (!cpu_online(cpu)) { + cpu = cpumask_any(cpu_online_mask); + sh->cpu = cpu; + } + + if (list_empty(&sh->lru)) { + struct r5worker_group *group; + group = conf->worker_groups + cpu_to_group(cpu); + list_add_tail(&sh->lru, &group->handle_list); + group->stripes_cnt++; + sh->group = group; + } + + if (conf->worker_cnt_per_group == 0) { + md_wakeup_thread(conf->mddev->thread); + return; + } + + group = conf->worker_groups + cpu_to_group(sh->cpu); + + group->workers[0].working = true; + /* at least one worker should run to avoid race */ + queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); + + thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; + /* wakeup more workers */ + for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { + if (group->workers[i].working == false) { + group->workers[i].working = true; + queue_work_on(sh->cpu, raid5_wq, + &group->workers[i].work); + thread_cnt--; + } + } +} + +static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) +{ + BUG_ON(!list_empty(&sh->lru)); + BUG_ON(atomic_read(&conf->active_stripes)==0); + if (test_bit(STRIPE_HANDLE, &sh->state)) { + if (test_bit(STRIPE_DELAYED, &sh->state) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + list_add_tail(&sh->lru, &conf->delayed_list); + if (atomic_read(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + sh->bm_seq - conf->seq_write > 0) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { + clear_bit(STRIPE_DELAYED, &sh->state); + clear_bit(STRIPE_BIT_DELAY, &sh->state); + if (conf->worker_cnt_per_group == 0) { list_add_tail(&sh->lru, &conf->handle_list); + } else { + raid5_wakeup_stripe_thread(sh); + return; } - 
md_wakeup_thread(conf->mddev->thread); - } else { - BUG_ON(stripe_operations_active(sh)); - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } - atomic_dec(&conf->active_stripes); - if (!test_bit(STRIPE_EXPANDING, &sh->state)) { - list_add_tail(&sh->lru, &conf->inactive_list); - wake_up(&conf->wait_for_stripe); - if (conf->retry_read_aligned) - md_wakeup_thread(conf->mddev->thread); - } } + md_wakeup_thread(conf->mddev->thread); + } else { + BUG_ON(stripe_operations_active(sh)); + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + if (atomic_dec_return(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + atomic_dec(&conf->active_stripes); + if (!test_bit(STRIPE_EXPANDING, &sh->state)) + list_add_tail(&sh->lru, temp_inactive_list); + } +} + +static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) +{ + if (atomic_dec_and_test(&sh->count)) + do_release_stripe(conf, sh, temp_inactive_list); +} + +/* + * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list + * + * Be careful: Only one task can add/delete stripes from temp_inactive_list at + * given time. Adding stripes only takes device lock, while deleting stripes + * only takes hash lock. + */ +static void release_inactive_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list, + int hash) +{ + int size; + bool do_wakeup = false; + unsigned long flags; + + if (hash == NR_STRIPE_HASH_LOCKS) { + size = NR_STRIPE_HASH_LOCKS; + hash = NR_STRIPE_HASH_LOCKS - 1; + } else + size = 1; + while (size) { + struct list_head *list = &temp_inactive_list[size - 1]; + + /* + * We don't hold any lock here yet, get_active_stripe() might + * remove stripes from the list + */ + if (!list_empty_careful(list)) { + spin_lock_irqsave(conf->hash_locks + hash, flags); + if (list_empty(conf->inactive_list + hash) && + !list_empty(list)) + atomic_dec(&conf->empty_inactive_list_nr); + list_splice_tail_init(list, conf->inactive_list + hash); + do_wakeup = true; + spin_unlock_irqrestore(conf->hash_locks + hash, flags); + } + size--; + hash--; + } + + if (do_wakeup) { + wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); + } +} + +/* should hold conf->device_lock already */ +static int release_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list) +{ + struct stripe_head *sh; + int count = 0; + struct llist_node *head; + + head = llist_del_all(&conf->released_stripes); + head = llist_reverse_order(head); + while (head) { + int hash; + + sh = llist_entry(head, struct stripe_head, release_list); + head = llist_next(head); + /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ + smp_mb(); + clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); + /* + * Don't worry the bit is set here, because if the bit is set + * again, the count is always > 1. This is true for + * STRIPE_ON_UNPLUG_LIST bit too. 
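+ * (Stripes are only added to released_stripes when their last reference is being dropped.)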
+ */ + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); + count++; } + + return count; } + static void release_stripe(struct stripe_head *sh) { - raid5_conf_t *conf = sh->raid_conf; + struct r5conf *conf = sh->raid_conf; unsigned long flags; + struct list_head list; + int hash; + bool wakeup; - spin_lock_irqsave(&conf->device_lock, flags); - __release_stripe(conf, sh); - spin_unlock_irqrestore(&conf->device_lock, flags); + /* Avoid release_list until the last reference. + */ + if (atomic_add_unless(&sh->count, -1, 1)) + return; + + if (unlikely(!conf->mddev->thread) || + test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) + goto slow_path; + wakeup = llist_add(&sh->release_list, &conf->released_stripes); + if (wakeup) + md_wakeup_thread(conf->mddev->thread); + return; +slow_path: + local_irq_save(flags); + /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ + if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { + INIT_LIST_HEAD(&list); + hash = sh->hash_lock_index; + do_release_stripe(conf, sh, &list); + spin_unlock(&conf->device_lock); + release_inactive_stripe_list(conf, &list, hash); + } + local_irq_restore(flags); } static inline void remove_hash(struct stripe_head *sh) @@ -182,42 +449,45 @@ static inline void remove_hash(struct stripe_head *sh) hlist_del_init(&sh->hash); } -static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) +static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) { struct hlist_head *hp = stripe_hash(conf, sh->sector); pr_debug("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); - CHECK_DEVLOCK(); hlist_add_head(&sh->hash, hp); } /* find an idle stripe, make sure it is unhashed, and return it. */ -static struct stripe_head *get_free_stripe(raid5_conf_t *conf) +static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh = NULL; struct list_head *first; - CHECK_DEVLOCK(); - if (list_empty(&conf->inactive_list)) + if (list_empty(conf->inactive_list + hash)) goto out; - first = conf->inactive_list.next; + first = (conf->inactive_list + hash)->next; sh = list_entry(first, struct stripe_head, lru); list_del_init(first); remove_hash(sh); atomic_inc(&conf->active_stripes); + BUG_ON(hash != sh->hash_lock_index); + if (list_empty(conf->inactive_list + hash)) + atomic_inc(&conf->empty_inactive_list_nr); out: return sh; } -static void shrink_buffers(struct stripe_head *sh, int num) +static void shrink_buffers(struct stripe_head *sh) { struct page *p; int i; + int num = sh->raid_conf->pool_size; - for (i=0; i<num ; i++) { + for (i = 0; i < num ; i++) { + WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); p = sh->dev[i].page; if (!p) continue; @@ -226,43 +496,48 @@ static void shrink_buffers(struct stripe_head *sh, int num) } } -static int grow_buffers(struct stripe_head *sh, int num) +static int grow_buffers(struct stripe_head *sh) { int i; + int num = sh->raid_conf->pool_size; - for (i=0; i<num; i++) { + for (i = 0; i < num; i++) { struct page *page; if (!(page = alloc_page(GFP_KERNEL))) { return 1; } sh->dev[i].page = page; + sh->dev[i].orig_page = page; } return 0; } -static void raid5_build_block (struct stripe_head *sh, int i); +static void raid5_build_block(struct stripe_head *sh, int i, int previous); +static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, + struct stripe_head *sh); -static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) +static void init_stripe(struct 
stripe_head *sh, sector_t sector, int previous) { - raid5_conf_t *conf = sh->raid_conf; - int i; + struct r5conf *conf = sh->raid_conf; + int i, seq; BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); BUG_ON(stripe_operations_active(sh)); - CHECK_DEVLOCK(); pr_debug("init_stripe called, stripe %llu\n", (unsigned long long)sh->sector); remove_hash(sh); - +retry: + seq = read_seqcount_begin(&conf->gen_lock); + sh->generation = conf->generation - previous; + sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; - sh->pd_idx = pd_idx; + stripe_set_idx(sector, conf, previous, sh); sh->state = 0; - sh->disks = disks; for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -273,84 +548,190 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int (unsigned long long)sh->sector, i, dev->toread, dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); - BUG(); + WARN_ON(1); } dev->flags = 0; - raid5_build_block(sh, i); + raid5_build_block(sh, i, previous); } + if (read_seqcount_retry(&conf->gen_lock, seq)) + goto retry; insert_hash(conf, sh); + sh->cpu = smp_processor_id(); } -static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) +static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, + short generation) { struct stripe_head *sh; - struct hlist_node *hn; - CHECK_DEVLOCK(); pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); - hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) - if (sh->sector == sector && sh->disks == disks) + hlist_for_each_entry(sh, stripe_hash(conf, sector), hash) + if (sh->sector == sector && sh->generation == generation) return sh; pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); return NULL; } -static void unplug_slaves(mddev_t *mddev); -static void raid5_unplug_device(struct request_queue *q); +/* + * Need to check if array has failed when deciding whether to: + * - start an array + * - remove non-faulty devices + * - add a spare + * - allow a reshape + * This determination is simple when no reshape is happening. + * However if there is a reshape, we need to carefully check + * both the before and after sections. + * This is because some failed devices may only affect one + * of the two sections, and some non-in_sync devices may + * be insync in the section most affected by failed devices. + */ +static int calc_degraded(struct r5conf *conf) +{ + int degraded, degraded2; + int i; + + rcu_read_lock(); + degraded = 0; + for (i = 0; i < conf->previous_raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = rcu_dereference(conf->disks[i].replacement); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If the reshape increases the number of devices, + * this is being recovered by the reshape, so + * this 'previous' section is not in_sync. + * If the number of devices is being reduced however, + * the device can only be part of the array if + * we are reverting a reshape, so this section will + * be in-sync. 
+ */ + if (conf->raid_disks >= conf->previous_raid_disks) + degraded++; + } + rcu_read_unlock(); + if (conf->raid_disks == conf->previous_raid_disks) + return degraded; + rcu_read_lock(); + degraded2 = 0; + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = rcu_dereference(conf->disks[i].replacement); + if (!rdev || test_bit(Faulty, &rdev->flags)) + degraded2++; + else if (test_bit(In_sync, &rdev->flags)) + ; + else + /* not in-sync or faulty. + * If reshape increases the number of devices, this + * section has already been recovered, else it + * almost certainly hasn't. + */ + if (conf->raid_disks <= conf->previous_raid_disks) + degraded2++; + } + rcu_read_unlock(); + if (degraded2 > degraded) + return degraded2; + return degraded; +} + +static int has_failed(struct r5conf *conf) +{ + int degraded; + + if (conf->mddev->reshape_position == MaxSector) + return conf->mddev->degraded > conf->max_degraded; + + degraded = calc_degraded(conf); + if (degraded > conf->max_degraded) + return 1; + return 0; +} -static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, - int pd_idx, int noblock) +static struct stripe_head * +get_active_stripe(struct r5conf *conf, sector_t sector, + int previous, int noblock, int noquiesce) { struct stripe_head *sh; + int hash = stripe_hash_locks_hash(sector); pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); - spin_lock_irq(&conf->device_lock); + spin_lock_irq(conf->hash_locks + hash); do { wait_event_lock_irq(conf->wait_for_stripe, - conf->quiesce == 0, - conf->device_lock, /* nothing */); - sh = __find_stripe(conf, sector, disks); + conf->quiesce == 0 || noquiesce, + *(conf->hash_locks + hash)); + sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!conf->inactive_blocked) - sh = get_free_stripe(conf); + sh = get_free_stripe(conf, hash); if (noblock && sh == NULL) break; if (!sh) { conf->inactive_blocked = 1; - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list) && - (atomic_read(&conf->active_stripes) - < (conf->max_nr_stripes *3/4) - || !conf->inactive_blocked), - conf->device_lock, - raid5_unplug_device(conf->mddev->queue) - ); + wait_event_lock_irq( + conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash) && + (atomic_read(&conf->active_stripes) + < (conf->max_nr_stripes * 3 / 4) + || !conf->inactive_blocked), + *(conf->hash_locks + hash)); conf->inactive_blocked = 0; - } else - init_stripe(sh, sector, pd_idx, disks); - } else { - if (atomic_read(&sh->count)) { - BUG_ON(!list_empty(&sh->lru)); } else { + init_stripe(sh, sector, previous); + atomic_inc(&sh->count); + } + } else if (!atomic_inc_not_zero(&sh->count)) { + spin_lock(&conf->device_lock); + if (!atomic_read(&sh->count)) { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); - if (list_empty(&sh->lru) && - !test_bit(STRIPE_EXPANDING, &sh->state)) - BUG(); + BUG_ON(list_empty(&sh->lru) && + !test_bit(STRIPE_EXPANDING, &sh->state)); list_del_init(&sh->lru); + if (sh->group) { + sh->group->stripes_cnt--; + sh->group = NULL; + } } + atomic_inc(&sh->count); + spin_unlock(&conf->device_lock); } } while (sh == NULL); - if (sh) - atomic_inc(&sh->count); - - spin_unlock_irq(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); return sh; } +/* Determine if 'data_offset' or 'new_data_offset' should be used + * in this stripe_head. 
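+ * Stripes from the previous generation keep using the old data_offset; stripes of the current generation, created while the reshape is active, use new_data_offset.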
+ */ +static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) +{ + sector_t progress = conf->reshape_progress; + /* Need a memory barrier to make sure we see the value + * of conf->generation, or ->data_offset that was set before + * reshape_progress was updated. + */ + smp_rmb(); + if (progress == MaxSector) + return 0; + if (sh->generation == conf->generation - 1) + return 0; + /* We are in a reshape, and this is a new-generation stripe, + * so use new_data_offset. + */ + return 1; +} + static void raid5_end_read_request(struct bio *bi, int error); static void @@ -358,66 +739,200 @@ raid5_end_write_request(struct bio *bi, int error); static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { - raid5_conf_t *conf = sh->raid_conf; + struct r5conf *conf = sh->raid_conf; int i, disks = sh->disks; might_sleep(); for (i = disks; i--; ) { int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + int replace_only = 0; + struct bio *bi, *rbi; + struct md_rdev *rdev, *rrdev = NULL; + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { + if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) + rw = WRITE_FUA; + else + rw = WRITE; + if (test_bit(R5_Discard, &sh->dev[i].flags)) + rw |= REQ_DISCARD; + } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; - else + else if (test_and_clear_bit(R5_WantReplace, + &sh->dev[i].flags)) { + rw = WRITE; + replace_only = 1; + } else continue; + if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags)) + rw |= REQ_SYNC; bi = &sh->dev[i].req; - - bi->bi_rw = rw; - if (rw == WRITE) - bi->bi_end_io = raid5_end_write_request; - else - bi->bi_end_io = raid5_end_read_request; + rbi = &sh->dev[i].rreq; /* For writing to replacement */ rcu_read_lock(); + rrdev = rcu_dereference(conf->disks[i].replacement); + smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ rdev = rcu_dereference(conf->disks[i].rdev); + if (!rdev) { + rdev = rrdev; + rrdev = NULL; + } + if (rw & WRITE) { + if (replace_only) + rdev = NULL; + if (rdev == rrdev) + /* We raced and saw duplicates */ + rrdev = NULL; + } else { + if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev) + rdev = rrdev; + rrdev = NULL; + } + if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) atomic_inc(&rdev->nr_pending); + if (rrdev && test_bit(Faulty, &rrdev->flags)) + rrdev = NULL; + if (rrdev) + atomic_inc(&rrdev->nr_pending); rcu_read_unlock(); + /* We have already checked bad blocks for reads. Now + * need to check for writes. We never accept write errors + * on the replacement, so we don't to check rrdev. + */ + while ((rw & WRITE) && rdev && + test_bit(WriteErrorSeen, &rdev->flags)) { + sector_t first_bad; + int bad_sectors; + int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + &first_bad, &bad_sectors); + if (!bad) + break; + + if (bad < 0) { + set_bit(BlockedBadBlocks, &rdev->flags); + if (!conf->mddev->external && + conf->mddev->flags) { + /* It is very unlikely, but we might + * still need to write out the + * bad block log - better give it + * a chance*/ + md_check_recovery(conf->mddev); + } + /* + * Because md_wait_for_blocked_rdev + * will dec nr_pending, we must + * increment it first. 
+ */ + atomic_inc(&rdev->nr_pending); + md_wait_for_blocked_rdev(rdev, conf->mddev); + } else { + /* Acknowledged bad block - skip the write */ + rdev_dec_pending(rdev, conf->mddev); + rdev = NULL; + } + } + if (rdev) { - if (s->syncing || s->expanding || s->expanded) + if (s->syncing || s->expanding || s->expanded + || s->replacing) md_sync_acct(rdev->bdev, STRIPE_SECTORS); set_bit(STRIPE_IO_STARTED, &sh->state); + bio_reset(bi); bi->bi_bdev = rdev->bdev; + bi->bi_rw = rw; + bi->bi_end_io = (rw & WRITE) + ? raid5_end_write_request + : raid5_end_read_request; + bi->bi_private = sh; + pr_debug("%s: for %llu schedule op %ld on disc %d\n", __func__, (unsigned long long)sh->sector, bi->bi_rw, i); atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; + if (use_new_offset(conf, sh)) + bi->bi_iter.bi_sector = (sh->sector + + rdev->new_data_offset); + else + bi->bi_iter.bi_sector = (sh->sector + + rdev->data_offset); + if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + bi->bi_rw |= REQ_NOMERGE; + + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; - bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; - if (rw == WRITE && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, - &rdev->corrected_errors); + bi->bi_iter.bi_size = STRIPE_SIZE; + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (rw & REQ_DISCARD) + bi->bi_vcnt = 0; + if (rrdev) + set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); + + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), + bi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); generic_make_request(bi); - } else { - if (rw == WRITE) + } + if (rrdev) { + if (s->syncing || s->expanding || s->expanded + || s->replacing) + md_sync_acct(rrdev->bdev, STRIPE_SECTORS); + + set_bit(STRIPE_IO_STARTED, &sh->state); + + bio_reset(rbi); + rbi->bi_bdev = rrdev->bdev; + rbi->bi_rw = rw; + BUG_ON(!(rw & WRITE)); + rbi->bi_end_io = raid5_end_write_request; + rbi->bi_private = sh; + + pr_debug("%s: for %llu schedule op %ld on " + "replacement disc %d\n", + __func__, (unsigned long long)sh->sector, + rbi->bi_rw, i); + atomic_inc(&sh->count); + if (use_new_offset(conf, sh)) + rbi->bi_iter.bi_sector = (sh->sector + + rrdev->new_data_offset); + else + rbi->bi_iter.bi_sector = (sh->sector + + rrdev->data_offset); + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].rvec.bv_page = sh->dev[i].page; + rbi->bi_vcnt = 1; + rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; + rbi->bi_io_vec[0].bv_offset = 0; + rbi->bi_iter.bi_size = STRIPE_SIZE; + /* + * If this is discard request, set bi_vcnt 0. 
We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (rw & REQ_DISCARD) + rbi->bi_vcnt = 0; + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), + rbi, disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); + generic_make_request(rbi); + } + if (!rdev && !rrdev) { + if (rw & WRITE) set_bit(STRIPE_DEGRADED, &sh->state); pr_debug("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); @@ -428,20 +943,28 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) } static struct dma_async_tx_descriptor * -async_copy_data(int frombio, struct bio *bio, struct page *page, - sector_t sector, struct dma_async_tx_descriptor *tx) +async_copy_data(int frombio, struct bio *bio, struct page **page, + sector_t sector, struct dma_async_tx_descriptor *tx, + struct stripe_head *sh) { - struct bio_vec *bvl; + struct bio_vec bvl; + struct bvec_iter iter; struct page *bio_page; - int i; int page_offset; + struct async_submit_ctl submit; + enum async_tx_flags flags = 0; - if (bio->bi_sector >= sector) - page_offset = (signed)(bio->bi_sector - sector) * 512; + if (bio->bi_iter.bi_sector >= sector) + page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; else - page_offset = (signed)(sector - bio->bi_sector) * -512; - bio_for_each_segment(bvl, bio, i) { - int len = bio_iovec_idx(bio, i)->bv_len; + page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512; + + if (frombio) + flags |= ASYNC_TX_FENCE; + init_async_submit(&submit, flags, tx, NULL, NULL, NULL); + + bio_for_each_segment(bvl, bio, iter) { + int len = bvl.bv_len; int clen; int b_offset = 0; @@ -457,19 +980,23 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, clen = len; if (clen > 0) { - b_offset += bio_iovec_idx(bio, i)->bv_offset; - bio_page = bio_iovec_idx(bio, i)->bv_page; - if (frombio) - tx = async_memcpy(page, bio_page, page_offset, - b_offset, clen, - ASYNC_TX_DEP_ACK, - tx, NULL, NULL); - else - tx = async_memcpy(bio_page, page, b_offset, - page_offset, clen, - ASYNC_TX_DEP_ACK, - tx, NULL, NULL); + b_offset += bvl.bv_offset; + bio_page = bvl.bv_page; + if (frombio) { + if (sh->raid_conf->skip_copy && + b_offset == 0 && page_offset == 0 && + clen == STRIPE_SIZE) + *page = bio_page; + else + tx = async_memcpy(*page, bio_page, page_offset, + b_offset, clen, &submit); + } else + tx = async_memcpy(bio_page, *page, b_offset, + page_offset, clen, &submit); } + /* chain the operations */ + submit.depend_tx = tx; + if (clen < len) /* hit end of page */ break; page_offset += len; @@ -482,14 +1009,12 @@ static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; struct bio *return_bi = NULL; - raid5_conf_t *conf = sh->raid_conf; int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); /* clear completed biofills */ - spin_lock_irq(&conf->device_lock); for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -504,10 +1029,10 @@ static void ops_complete_biofill(void *stripe_head_ref) BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; - while (rbi && rbi->bi_sector < + while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (--rbi->bi_phys_segments == 0) { + if (!raid5_dec_bi_active_stripes(rbi)) { rbi->bi_next = return_bi; return_bi = rbi; } @@ -515,7 +1040,6 @@ static void ops_complete_biofill(void *stripe_head_ref) } } } - spin_unlock_irq(&conf->device_lock); 
clear_bit(STRIPE_BIOFILL_RUN, &sh->state); return_io(return_bi); @@ -527,7 +1051,7 @@ static void ops_complete_biofill(void *stripe_head_ref) static void ops_run_biofill(struct stripe_head *sh) { struct dma_async_tx_descriptor *tx = NULL; - raid5_conf_t *conf = sh->raid_conf; + struct async_submit_ctl submit; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -537,36 +1061,48 @@ static void ops_run_biofill(struct stripe_head *sh) struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi; - spin_lock_irq(&conf->device_lock); + spin_lock_irq(&sh->stripe_lock); dev->read = rbi = dev->toread; dev->toread = NULL; - spin_unlock_irq(&conf->device_lock); - while (rbi && rbi->bi_sector < + spin_unlock_irq(&sh->stripe_lock); + while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { - tx = async_copy_data(0, rbi, dev->page, - dev->sector, tx); + tx = async_copy_data(0, rbi, &dev->page, + dev->sector, tx, sh); rbi = r5_next_bio(rbi, dev->sector); } } } atomic_inc(&sh->count); - async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, - ops_complete_biofill, sh); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); + async_trigger_callback(&submit); } -static void ops_complete_compute5(void *stripe_head_ref) +static void mark_target_uptodate(struct stripe_head *sh, int target) { - struct stripe_head *sh = stripe_head_ref; - int target = sh->ops.target; - struct r5dev *tgt = &sh->dev[target]; + struct r5dev *tgt; - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); + if (target < 0) + return; + tgt = &sh->dev[target]; set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); +} + +static void ops_complete_compute(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + + pr_debug("%s: stripe %llu\n", __func__, + (unsigned long long)sh->sector); + + /* mark the computed target(s) as uptodate */ + mark_target_uptodate(sh, sh->ops.target); + mark_target_uptodate(sh, sh->ops.target2); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); if (sh->check_state == check_state_compute_run) sh->check_state = check_state_compute_result; @@ -574,16 +1110,24 @@ static void ops_complete_compute5(void *stripe_head_ref) release_stripe(sh); } -static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) +/* return a pointer to the address conversion region of the scribble buffer */ +static addr_conv_t *to_addr_conv(struct stripe_head *sh, + struct raid5_percpu *percpu) +{ + return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); +} + +static struct dma_async_tx_descriptor * +ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) { - /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page *xor_srcs[disks]; + struct page **xor_srcs = percpu->scribble; int target = sh->ops.target; struct r5dev *tgt = &sh->dev[target]; struct page *xor_dest = tgt->page; int count = 0; struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; int i; pr_debug("%s: stripe %llu block: %d\n", @@ -596,17 +1140,213 @@ static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, + ops_complete_compute, sh, to_addr_conv(sh, percpu)); if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - 0, NULL, ops_complete_compute5, sh); + tx = 
async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - ASYNC_TX_XOR_ZERO_DST, NULL, - ops_complete_compute5, sh); + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); return tx; } +/* set_syndrome_sources - populate source buffers for gen_syndrome + * @srcs - (struct page *) array of size sh->disks + * @sh - stripe_head to parse + * + * Populates srcs in proper layout order for the stripe and returns the + * 'count' of sources to be used in a call to async_gen_syndrome. The P + * destination buffer is recorded in srcs[count] and the Q destination + * is recorded in srcs[count+1]]. + */ +static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) +{ + int disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); + int d0_idx = raid6_d0(sh); + int count; + int i; + + for (i = 0; i < disks; i++) + srcs[i] = NULL; + + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + srcs[slot] = sh->dev[i].page; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + + return syndrome_disks; +} + +static struct dma_async_tx_descriptor * +ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int disks = sh->disks; + struct page **blocks = percpu->scribble; + int target; + int qd_idx = sh->qd_idx; + struct dma_async_tx_descriptor *tx; + struct async_submit_ctl submit; + struct r5dev *tgt; + struct page *dest; + int i; + int count; + + if (sh->ops.target < 0) + target = sh->ops.target2; + else if (sh->ops.target2 < 0) + target = sh->ops.target; + else + /* we should only have one valid target */ + BUG(); + BUG_ON(target < 0); + pr_debug("%s: stripe %llu block: %d\n", + __func__, (unsigned long long)sh->sector, target); + + tgt = &sh->dev[target]; + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + dest = tgt->page; + + atomic_inc(&sh->count); + + if (target == qd_idx) { + count = set_syndrome_sources(blocks, sh); + blocks[count] = NULL; /* regenerating p is not necessary */ + BUG_ON(blocks[count+1] != dest); /* q should already be set */ + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + } else { + /* Compute any data- or p-drive using XOR */ + count = 0; + for (i = disks; i-- ; ) { + if (i == target || i == qd_idx) + continue; + blocks[count++] = sh->dev[i].page; + } + + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, + NULL, ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); + } + + return tx; +} + +static struct dma_async_tx_descriptor * +ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) +{ + int i, count, disks = sh->disks; + int syndrome_disks = sh->ddf_layout ? 
disks : disks-2; + int d0_idx = raid6_d0(sh); + int faila = -1, failb = -1; + int target = sh->ops.target; + int target2 = sh->ops.target2; + struct r5dev *tgt = &sh->dev[target]; + struct r5dev *tgt2 = &sh->dev[target2]; + struct dma_async_tx_descriptor *tx; + struct page **blocks = percpu->scribble; + struct async_submit_ctl submit; + + pr_debug("%s: stripe %llu block1: %d block2: %d\n", + __func__, (unsigned long long)sh->sector, target, target2); + BUG_ON(target < 0 || target2 < 0); + BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); + BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); + + /* we need to open-code set_syndrome_sources to handle the + * slot number conversion for 'faila' and 'failb' + */ + for (i = 0; i < disks ; i++) + blocks[i] = NULL; + count = 0; + i = d0_idx; + do { + int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); + + blocks[slot] = sh->dev[i].page; + + if (i == target) + faila = slot; + if (i == target2) + failb = slot; + i = raid6_next_disk(i, disks); + } while (i != d0_idx); + + BUG_ON(faila == failb); + if (failb < faila) + swap(faila, failb); + pr_debug("%s: stripe: %llu faila: %d failb: %d\n", + __func__, (unsigned long long)sh->sector, faila, failb); + + atomic_inc(&sh->count); + + if (failb == syndrome_disks+1) { + /* Q disk is one of the missing disks */ + if (faila == syndrome_disks) { + /* Missing P+Q, just recompute */ + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + return async_gen_syndrome(blocks, 0, syndrome_disks+2, + STRIPE_SIZE, &submit); + } else { + struct page *dest; + int data_target; + int qd_idx = sh->qd_idx; + + /* Missing D+Q: recompute D from P, then recompute Q */ + if (target == qd_idx) + data_target = target2; + else + data_target = target; + + count = 0; + for (i = disks; i-- ; ) { + if (i == data_target || i == qd_idx) + continue; + blocks[count++] = sh->dev[i].page; + } + dest = sh->dev[data_target].page; + init_async_submit(&submit, + ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, + NULL, NULL, NULL, + to_addr_conv(sh, percpu)); + tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, + &submit); + + count = set_syndrome_sources(blocks, sh); + init_async_submit(&submit, ASYNC_TX_FENCE, tx, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + return async_gen_syndrome(blocks, 0, count+2, + STRIPE_SIZE, &submit); + } + } else { + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, + ops_complete_compute, sh, + to_addr_conv(sh, percpu)); + if (failb == syndrome_disks) { + /* We're missing D+P. */ + return async_raid6_datap_recov(syndrome_disks+2, + STRIPE_SIZE, faila, + blocks, &submit); + } else { + /* We're missing D+D. 
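+ * Recover both missing data blocks from P and Q in a single pass.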
*/ + return async_raid6_2data_recov(syndrome_disks+2, + STRIPE_SIZE, faila, failb, + blocks, &submit); + } + } +} + + static void ops_complete_prexor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; @@ -616,12 +1356,13 @@ static void ops_complete_prexor(void *stripe_head_ref) } static struct dma_async_tx_descriptor * -ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) { - /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page *xor_srcs[disks]; + struct page **xor_srcs = percpu->scribble; int count = 0, pd_idx = sh->pd_idx, i; + struct async_submit_ctl submit; /* existing parity data subtracted */ struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; @@ -636,9 +1377,9 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) xor_srcs[count++] = dev->page; } - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - ASYNC_TX_DEP_ACK | ASYNC_TX_XOR_DROP_DST, tx, - ops_complete_prexor, sh); + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, + ops_complete_prexor, sh, to_addr_conv(sh, percpu)); + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); return tx; } @@ -659,17 +1400,31 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; - spin_lock(&sh->lock); + spin_lock_irq(&sh->stripe_lock); chosen = dev->towrite; dev->towrite = NULL; BUG_ON(dev->written); wbi = dev->written = chosen; - spin_unlock(&sh->lock); + spin_unlock_irq(&sh->stripe_lock); + WARN_ON(dev->page != dev->orig_page); - while (wbi && wbi->bi_sector < + while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); + if (wbi->bi_rw & REQ_FUA) + set_bit(R5_WantFUA, &dev->flags); + if (wbi->bi_rw & REQ_SYNC) + set_bit(R5_SyncIO, &dev->flags); + if (wbi->bi_rw & REQ_DISCARD) + set_bit(R5_Discard, &dev->flags); + else { + tx = async_copy_data(1, wbi, &dev->page, + dev->sector, tx, sh); + if (dev->page != dev->orig_page) { + set_bit(R5_SkipCopy, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + clear_bit(R5_OVERWRITE, &dev->flags); + } + } wbi = r5_next_bio(wbi, dev->sector); } } @@ -678,18 +1433,35 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) return tx; } -static void ops_complete_postxor(void *stripe_head_ref) +static void ops_complete_reconstruct(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int disks = sh->disks, i, pd_idx = sh->pd_idx; + int disks = sh->disks; + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + int i; + bool fua = false, sync = false, discard = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); for (i = disks; i--; ) { + fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); + sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); + discard |= test_bit(R5_Discard, &sh->dev[i].flags); + } + + for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->written || i == pd_idx) - set_bit(R5_UPTODATE, &dev->flags); + + if (dev->written || i == pd_idx || i == qd_idx) { + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) + set_bit(R5_UPTODATE, &dev->flags); + if (fua) + set_bit(R5_WantFUA, &dev->flags); + if (sync) + set_bit(R5_SyncIO, &dev->flags); + } } if (sh->reconstruct_state == 
reconstruct_state_drain_run) @@ -706,12 +1478,12 @@ static void ops_complete_postxor(void *stripe_head_ref) } static void -ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) +ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) { - /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page *xor_srcs[disks]; - + struct page **xor_srcs = percpu->scribble; + struct async_submit_ctl submit; int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; int prexor = 0; @@ -720,6 +1492,18 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (pd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + set_bit(R5_Discard, &sh->dev[pd_idx].flags); + ops_complete_reconstruct(sh); + return; + } /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ @@ -745,18 +1529,50 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST * for the synchronous xor case */ - flags = ASYNC_TX_DEP_ACK | ASYNC_TX_ACK | + flags = ASYNC_TX_ACK | (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); atomic_inc(&sh->count); - if (unlikely(count == 1)) { - flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - flags, tx, ops_complete_postxor, sh); - } else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - flags, tx, ops_complete_postxor, sh); + init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, + to_addr_conv(sh, percpu)); + if (unlikely(count == 1)) + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + else + tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); +} + +static void +ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, + struct dma_async_tx_descriptor *tx) +{ + struct async_submit_ctl submit; + struct page **blocks = percpu->scribble; + int count, i; + + pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + + for (i = 0; i < sh->disks; i++) { + if (sh->pd_idx == i || sh->qd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + ops_complete_reconstruct(sh); + return; + } + + count = set_syndrome_sources(blocks, sh); + + atomic_inc(&sh->count); + + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, + sh, to_addr_conv(sh, percpu)); + async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); } static void ops_complete_check(void *stripe_head_ref) @@ -771,63 +1587,115 @@ static void ops_complete_check(void *stripe_head_ref) release_stripe(sh); } -static void ops_run_check(struct stripe_head *sh) +static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) { - /* kernel stack size limits the total number of disks */ int disks = sh->disks; - struct page *xor_srcs[disks]; + int pd_idx = sh->pd_idx; + int qd_idx = sh->qd_idx; + struct page *xor_dest; + struct page **xor_srcs = percpu->scribble; struct dma_async_tx_descriptor *tx; - - int count = 0, pd_idx = 
sh->pd_idx, i; - struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; + struct async_submit_ctl submit; + int count; + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + count = 0; + xor_dest = sh->dev[pd_idx].page; + xor_srcs[count++] = xor_dest; for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (i != pd_idx) - xor_srcs[count++] = dev->page; + if (i == pd_idx || i == qd_idx) + continue; + xor_srcs[count++] = sh->dev[i].page; } - tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); + init_async_submit(&submit, 0, NULL, NULL, NULL, + to_addr_conv(sh, percpu)); + tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + &sh->ops.zero_sum_result, &submit); atomic_inc(&sh->count); - tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, - ops_complete_check, sh); + init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); + tx = async_trigger_callback(&submit); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) +static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) +{ + struct page **srcs = percpu->scribble; + struct async_submit_ctl submit; + int count; + + pr_debug("%s: stripe %llu checkp: %d\n", __func__, + (unsigned long long)sh->sector, checkp); + + count = set_syndrome_sources(srcs, sh); + if (!checkp) + srcs[count] = NULL; + + atomic_inc(&sh->count); + init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, + sh, to_addr_conv(sh, percpu)); + async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, + &sh->ops.zero_sum_result, percpu->spare_page, &submit); +} + +static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; + struct r5conf *conf = sh->raid_conf; + int level = conf->level; + struct raid5_percpu *percpu; + unsigned long cpu; + cpu = get_cpu(); + percpu = per_cpu_ptr(conf->percpu, cpu); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; } if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { - tx = ops_run_compute5(sh); - /* terminate the chain if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) + if (level < 6) + tx = ops_run_compute5(sh, percpu); + else { + if (sh->ops.target2 < 0 || sh->ops.target < 0) + tx = ops_run_compute6_1(sh, percpu); + else + tx = ops_run_compute6_2(sh, percpu); + } + /* terminate the chain if reconstruct is not set to be run */ + if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) async_tx_ack(tx); } if (test_bit(STRIPE_OP_PREXOR, &ops_request)) - tx = ops_run_prexor(sh, tx); + tx = ops_run_prexor(sh, percpu, tx); if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { tx = ops_run_biodrain(sh, tx); overlap_clear++; } - if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) - ops_run_postxor(sh, tx); + if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { + if (level < 6) + ops_run_reconstruct5(sh, percpu, tx); + else + ops_run_reconstruct6(sh, percpu, tx); + } - if (test_bit(STRIPE_OP_CHECK, &ops_request)) - ops_run_check(sh); + if (test_bit(STRIPE_OP_CHECK, &ops_request)) { + if (sh->check_state == check_state_run) + ops_run_check_p(sh, percpu); + else if (sh->check_state == check_state_run_q) + ops_run_check_pq(sh, percpu, 0); + else if (sh->check_state == check_state_run_pq) + ops_run_check_pq(sh, percpu, 1); + else + BUG(); + } if (overlap_clear) for 
(i = disks; i--; ) { @@ -835,24 +1703,26 @@ static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } + put_cpu(); } -static int grow_one_stripe(raid5_conf_t *conf) +static int grow_one_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh; - sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); + sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); if (!sh) return 0; - memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); + sh->raid_conf = conf; - spin_lock_init(&sh->lock); - if (grow_buffers(sh, conf->raid_disks)) { - shrink_buffers(sh, conf->raid_disks); + spin_lock_init(&sh->stripe_lock); + + if (grow_buffers(sh)) { + shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); return 0; } - sh->disks = conf->raid_disks; + sh->hash_lock_index = hash; /* we just created an active stripe so... */ atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); @@ -861,13 +1731,20 @@ static int grow_one_stripe(raid5_conf_t *conf) return 1; } -static int grow_stripes(raid5_conf_t *conf, int num) +static int grow_stripes(struct r5conf *conf, int num) { struct kmem_cache *sc; - int devs = conf->raid_disks; + int devs = max(conf->raid_disks, conf->previous_raid_disks); + int hash; + + if (conf->mddev->gendisk) + sprintf(conf->cache_name[0], + "raid%d-%s", conf->level, mdname(conf->mddev)); + else + sprintf(conf->cache_name[0], + "raid%d-%p", conf->level, conf->mddev); + sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); - sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev)); - sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); conf->active_name = 0; sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), @@ -876,14 +1753,39 @@ static int grow_stripes(raid5_conf_t *conf, int num) return 1; conf->slab_cache = sc; conf->pool_size = devs; - while (num--) - if (!grow_one_stripe(conf)) + hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; + while (num--) { + if (!grow_one_stripe(conf, hash)) return 1; + conf->max_nr_stripes++; + hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; + } return 0; } -#ifdef CONFIG_MD_RAID5_RESHAPE -static int resize_stripes(raid5_conf_t *conf, int newsize) +/** + * scribble_len - return the required size of the scribble region + * @num - total number of disks in the array + * + * The size must be enough to contain: + * 1/ a struct page pointer for each device in the array +2 + * 2/ room to convert each entry in (1) to its corresponding dma + * (dma_map_page()) or page (page_address()) address. + * + * Note: the +2 is for the destination buffers of the ddf/raid6 case where we + * calculate over all devices (not just the data blocks), using zeros in place + * of the P and Q blocks. + */ +static size_t scribble_len(int num) +{ + size_t len; + + len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); + + return len; +} + +static int resize_stripes(struct r5conf *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. * New slots in each stripe get 'page' set to a new page. @@ -891,7 +1793,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) * This happens in stages: * 1/ create a new kmem_cache and allocate the required number of * stripe_heads. - * 2/ gather all the old stripe_heads and tranfer the pages across + * 2/ gather all the old stripe_heads and transfer the pages across * to the new stripe_heads. 
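A minimal sketch of the two-region layout that the scribble_len() comment above describes, with hypothetical accessors standing in for the driver's own to_addr_conv() helper; the real layout is defined by scribble_len() and to_addr_conv() in this file.

/* One allocation, two views: the source/destination page list handed to the
 * async_tx calls, followed by the scratch area those calls convert addresses
 * into. Illustrative only.
 */
static struct page **scribble_page_list(void *scribble)
{
	return scribble;		/* (num + 2) struct page pointers */
}

static void *scribble_addr_conv_area(void *scribble, int num)
{
	return (char *)scribble + sizeof(struct page *) * (num + 2);
}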
This will have the side effect of * freezing the array as once all stripe_heads have been collected, * no IO will be possible. Old stripe heads are freed once their @@ -911,9 +1813,11 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; + unsigned long cpu; int err; struct kmem_cache *sc; int i; + int hash, cnt; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ @@ -930,14 +1834,12 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) return -ENOMEM; for (i = conf->max_nr_stripes; i; i--) { - nsh = kmem_cache_alloc(sc, GFP_KERNEL); + nsh = kmem_cache_zalloc(sc, GFP_KERNEL); if (!nsh) break; - memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); - nsh->raid_conf = conf; - spin_lock_init(&nsh->lock); + spin_lock_init(&nsh->stripe_lock); list_add(&nsh->lru, &newstripes); } @@ -955,28 +1857,38 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) * OK, we have enough stripes, start collecting inactive * stripes and copying them over */ + hash = 0; + cnt = 0; list_for_each_entry(nsh, &newstripes, lru) { - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list), - conf->device_lock, - unplug_slaves(conf->mddev) - ); - osh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); + lock_device_hash_lock(conf, hash); + wait_event_cmd(conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash), + unlock_device_hash_lock(conf, hash), + lock_device_hash_lock(conf, hash)); + osh = get_free_stripe(conf, hash); + unlock_device_hash_lock(conf, hash); atomic_set(&nsh->count, 1); - for(i=0; i<conf->pool_size; i++) + for(i=0; i<conf->pool_size; i++) { nsh->dev[i].page = osh->dev[i].page; + nsh->dev[i].orig_page = osh->dev[i].page; + } for( ; i<newsize; i++) nsh->dev[i].page = NULL; + nsh->hash_lock_index = hash; kmem_cache_free(conf->slab_cache, osh); + cnt++; + if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + + !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { + hash++; + cnt = 0; + } } kmem_cache_destroy(conf->slab_cache); /* Step 3. * At this point, we are holding all the stripes so the array * is completely stalled, so now is a good time to resize - * conf->disks. 
+ * conf->disks and the scribble region */ ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); if (ndisks) { @@ -987,14 +1899,35 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) } else err = -ENOMEM; + get_online_cpus(); + conf->scribble_len = scribble_len(newsize); + for_each_present_cpu(cpu) { + struct raid5_percpu *percpu; + void *scribble; + + percpu = per_cpu_ptr(conf->percpu, cpu); + scribble = kmalloc(conf->scribble_len, GFP_NOIO); + + if (scribble) { + kfree(percpu->scribble); + percpu->scribble = scribble; + } else { + err = -ENOMEM; + break; + } + } + put_online_cpus(); + /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); list_del_init(&nsh->lru); + for (i=conf->raid_disks; i < newsize; i++) if (nsh->dev[i].page == NULL) { struct page *p = alloc_page(GFP_NOIO); nsh->dev[i].page = p; + nsh->dev[i].orig_page = p; if (!p) err = -ENOMEM; } @@ -1007,28 +1940,29 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) conf->pool_size = newsize; return err; } -#endif -static int drop_one_stripe(raid5_conf_t *conf) +static int drop_one_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh; - spin_lock_irq(&conf->device_lock); - sh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); + spin_lock_irq(conf->hash_locks + hash); + sh = get_free_stripe(conf, hash); + spin_unlock_irq(conf->hash_locks + hash); if (!sh) return 0; BUG_ON(atomic_read(&sh->count)); - shrink_buffers(sh, conf->pool_size); + shrink_buffers(sh); kmem_cache_free(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); return 1; } -static void shrink_stripes(raid5_conf_t *conf) +static void shrink_stripes(struct r5conf *conf) { - while (drop_one_stripe(conf)) - ; + int hash; + for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) + while (drop_one_stripe(conf, hash)) + ; if (conf->slab_cache) kmem_cache_destroy(conf->slab_cache); @@ -1037,13 +1971,13 @@ static void shrink_stripes(raid5_conf_t *conf) static void raid5_end_read_request(struct bio * bi, int error) { - struct stripe_head *sh = bi->bi_private; - raid5_conf_t *conf = sh->raid_conf; + struct stripe_head *sh = bi->bi_private; + struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); char b[BDEVNAME_SIZE]; - mdk_rdev_t *rdev; - + struct md_rdev *rdev = NULL; + sector_t s; for (i=0 ; i<disks; i++) if (bi == &sh->dev[i].req) @@ -1056,78 +1990,137 @@ static void raid5_end_read_request(struct bio * bi, int error) BUG(); return; } + if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) + /* If replacement finished while this request was outstanding, + * 'replacement' might be NULL already. + * In that case it moved down to 'rdev'. + * rdev is not removed until all requests are finished. + */ + rdev = conf->disks[i].replacement; + if (!rdev) + rdev = conf->disks[i].rdev; + if (use_new_offset(conf, sh)) + s = sh->sector + rdev->new_data_offset; + else + s = sh->sector + rdev->data_offset; if (uptodate) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - rdev = conf->disks[i].rdev; - printk_rl(KERN_INFO "raid5:%s: read error corrected" - " (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector - + rdev->data_offset), - bdevname(rdev->bdev, b)); + /* Note that this cannot happen on a + * replacement device. 
We just fail those on + * any error + */ + printk_ratelimited( + KERN_INFO + "md/raid:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)s, + bdevname(rdev->bdev, b)); + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); - } - if (atomic_read(&conf->disks[i].rdev->read_errors)) - atomic_set(&conf->disks[i].rdev->read_errors, 0); + } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + + if (atomic_read(&rdev->read_errors)) + atomic_set(&rdev->read_errors, 0); } else { - const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); + const char *bdn = bdevname(rdev->bdev, b); int retry = 0; - rdev = conf->disks[i].rdev; + int set_bad = 0; clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); - if (conf->mddev->degraded) - printk_rl(KERN_WARNING - "raid5:%s: read error not correctable " - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); - else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) + if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error on replacement device " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)s, + bdn); + else if (conf->mddev->degraded >= conf->max_degraded) { + set_bad = 1; + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)s, + bdn); + } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { /* Oh, no!!! */ - printk_rl(KERN_WARNING - "raid5:%s: read error NOT corrected!! " - "(sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector - + rdev->data_offset), - bdn); - else if (atomic_read(&rdev->read_errors) + set_bad = 1; + printk_ratelimited( + KERN_WARNING + "md/raid:%s: read error NOT corrected!! 
" + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)s, + bdn); + } else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING - "raid5:%s: Too many read errors, failing device %s.\n", + "md/raid:%s: Too many read errors, failing device %s.\n", mdname(conf->mddev), bdn); else retry = 1; + if (set_bad && test_bit(In_sync, &rdev->flags) + && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + retry = 1; if (retry) - set_bit(R5_ReadError, &sh->dev[i].flags); + if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { + set_bit(R5_ReadError, &sh->dev[i].flags); + clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + } else + set_bit(R5_ReadNoMerge, &sh->dev[i].flags); else { clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); - md_error(conf->mddev, rdev); + if (!(set_bad + && test_bit(In_sync, &rdev->flags) + && rdev_set_badblocks( + rdev, sh->sector, STRIPE_SECTORS, 0))) + md_error(conf->mddev, rdev); } } - rdev_dec_pending(conf->disks[i].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } -static void raid5_end_write_request (struct bio *bi, int error) +static void raid5_end_write_request(struct bio *bi, int error) { - struct stripe_head *sh = bi->bi_private; - raid5_conf_t *conf = sh->raid_conf; + struct stripe_head *sh = bi->bi_private; + struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; + struct md_rdev *uninitialized_var(rdev); int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); + sector_t first_bad; + int bad_sectors; + int replacement = 0; - for (i=0 ; i<disks; i++) - if (bi == &sh->dev[i].req) + for (i = 0 ; i < disks; i++) { + if (bi == &sh->dev[i].req) { + rdev = conf->disks[i].rdev; break; - + } + if (bi == &sh->dev[i].rreq) { + rdev = conf->disks[i].replacement; + if (rdev) + replacement = 1; + else + /* rdev was removed and 'replacement' + * replaced it. rdev is not removed + * until all requests are finished. + */ + rdev = conf->disks[i].rdev; + break; + } + } pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), uptodate); @@ -1136,77 +2129,107 @@ static void raid5_end_write_request (struct bio *bi, int error) return; } - if (!uptodate) - md_error(conf->mddev, conf->disks[i].rdev); + if (replacement) { + if (!uptodate) + md_error(conf->mddev, rdev); + else if (is_badblock(rdev, sh->sector, + STRIPE_SECTORS, + &first_bad, &bad_sectors)) + set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); + } else { + if (!uptodate) { + set_bit(STRIPE_DEGRADED, &sh->state); + set_bit(WriteErrorSeen, &rdev->flags); + set_bit(R5_WriteError, &sh->dev[i].flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) + set_bit(MD_RECOVERY_NEEDED, + &rdev->mddev->recovery); + } else if (is_badblock(rdev, sh->sector, + STRIPE_SECTORS, + &first_bad, &bad_sectors)) { + set_bit(R5_MadeGood, &sh->dev[i].flags); + if (test_bit(R5_ReadError, &sh->dev[i].flags)) + /* That was a successful write so make + * sure it looks like we already did + * a re-write. 
+ */ + set_bit(R5_ReWrite, &sh->dev[i].flags); + } + } + rdev_dec_pending(rdev, conf->mddev); - rdev_dec_pending(conf->disks[i].rdev, conf->mddev); - - clear_bit(R5_LOCKED, &sh->dev[i].flags); + if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) + clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); -static sector_t compute_blocknr(struct stripe_head *sh, int i); - -static void raid5_build_block (struct stripe_head *sh, int i) +static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; bio_init(&dev->req); dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; - dev->req.bi_max_vecs++; - dev->vec.bv_page = dev->page; - dev->vec.bv_len = STRIPE_SIZE; - dev->vec.bv_offset = 0; - - dev->req.bi_sector = sh->sector; + dev->req.bi_max_vecs = 1; dev->req.bi_private = sh; + bio_init(&dev->rreq); + dev->rreq.bi_io_vec = &dev->rvec; + dev->rreq.bi_max_vecs = 1; + dev->rreq.bi_private = sh; + dev->flags = 0; - dev->sector = compute_blocknr(sh, i); + dev->sector = compute_blocknr(sh, i, previous); } -static void error(mddev_t *mddev, mdk_rdev_t *rdev) +static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - pr_debug("raid5: error called\n"); + struct r5conf *conf = mddev->private; + unsigned long flags; + pr_debug("raid456: error called\n"); - if (!test_bit(Faulty, &rdev->flags)) { - set_bit(MD_CHANGE_DEVS, &mddev->flags); - if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded++; - spin_unlock_irqrestore(&conf->device_lock, flags); - /* - * if recovery was running, make sure it aborts. - */ - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - } - set_bit(Faulty, &rdev->flags); - printk (KERN_ALERT - "raid5: Disk failure on %s, disabling device.\n" - "raid5: Operation continuing on %d devices.\n", - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); - } + spin_lock_irqsave(&conf->device_lock, flags); + clear_bit(In_sync, &rdev->flags); + mddev->degraded = calc_degraded(conf); + spin_unlock_irqrestore(&conf->device_lock, flags); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + + set_bit(Blocked, &rdev->flags); + set_bit(Faulty, &rdev->flags); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + printk(KERN_ALERT + "md/raid:%s: Disk failure on %s, disabling device.\n" + "md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), + bdevname(rdev->bdev, b), + mdname(mddev), + conf->raid_disks - mddev->degraded); } /* * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. */ -static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, - unsigned int data_disks, unsigned int * dd_idx, - unsigned int * pd_idx, raid5_conf_t *conf) +static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh) { - long stripe; - unsigned long chunk_number; + sector_t stripe, stripe2; + sector_t chunk_number; unsigned int chunk_offset; + int pd_idx, qd_idx; + int ddf_layout = 0; sector_t new_sector; - int sectors_per_chunk = conf->chunk_size >> 9; + int algorithm = previous ? conf->prev_algo + : conf->algorithm; + int sectors_per_chunk = previous ? conf->prev_chunk_sectors + : conf->chunk_sectors; + int raid_disks = previous ? 
conf->previous_raid_disks + : conf->raid_disks; + int data_disks = raid_disks - conf->max_degraded; /* First compute the information on this sector */ @@ -1215,83 +2238,176 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, */ chunk_offset = sector_div(r_sector, sectors_per_chunk); chunk_number = r_sector; - BUG_ON(r_sector != chunk_number); /* * Compute the stripe number */ - stripe = chunk_number / data_disks; - - /* - * Compute the data disk and parity disk indexes inside the stripe - */ - *dd_idx = chunk_number % data_disks; - + stripe = chunk_number; + *dd_idx = sector_div(stripe, data_disks); + stripe2 = stripe; /* * Select the parity disk based on the user selected algorithm. */ + pd_idx = qd_idx = -1; switch(conf->level) { case 4: - *pd_idx = data_disks; + pd_idx = data_disks; break; case 5: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: - *pd_idx = data_disks - stripe % raid_disks; - if (*dd_idx >= *pd_idx) + pd_idx = data_disks - sector_div(stripe2, raid_disks); + if (*dd_idx >= pd_idx) (*dd_idx)++; break; case ALGORITHM_RIGHT_ASYMMETRIC: - *pd_idx = stripe % raid_disks; - if (*dd_idx >= *pd_idx) + pd_idx = sector_div(stripe2, raid_disks); + if (*dd_idx >= pd_idx) (*dd_idx)++; break; case ALGORITHM_LEFT_SYMMETRIC: - *pd_idx = data_disks - stripe % raid_disks; - *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + pd_idx = data_disks - sector_div(stripe2, raid_disks); + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: - *pd_idx = stripe % raid_disks; - *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + pd_idx = sector_div(stripe2, raid_disks); + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + break; + case ALGORITHM_PARITY_0: + pd_idx = 0; + (*dd_idx)++; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; break; default: - printk(KERN_ERR "raid5: unsupported algorithm %d\n", - conf->algorithm); + BUG(); } break; case 6: - /**** FIX THIS ****/ - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: - *pd_idx = raid_disks - 1 - (stripe % raid_disks); - if (*pd_idx == raid_disks-1) - (*dd_idx)++; /* Q D D D P */ - else if (*dd_idx >= *pd_idx) + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_RIGHT_ASYMMETRIC: - *pd_idx = stripe % raid_disks; - if (*pd_idx == raid_disks-1) - (*dd_idx)++; /* Q D D D P */ - else if (*dd_idx >= *pd_idx) + pd_idx = sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) (*dd_idx) += 2; /* D D P Q D */ break; case ALGORITHM_LEFT_SYMMETRIC: - *pd_idx = raid_disks - 1 - (stripe % raid_disks); - *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; break; case ALGORITHM_RIGHT_SYMMETRIC: - *pd_idx = stripe % raid_disks; - *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks; + pd_idx = sector_div(stripe2, raid_disks); + qd_idx = (pd_idx + 1) % raid_disks; + *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; + break; + + case ALGORITHM_PARITY_0: + pd_idx = 0; + qd_idx = 1; + (*dd_idx) += 2; + break; + case ALGORITHM_PARITY_N: + pd_idx = data_disks; + qd_idx = data_disks + 1; break; + + case 
ALGORITHM_ROTATING_ZERO_RESTART: + /* Exactly the same as RIGHT_ASYMMETRIC, but or + * of blocks for computing Q is different. + */ + pd_idx = sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; + break; + + case ALGORITHM_ROTATING_N_RESTART: + /* Same a left_asymmetric, by first stripe is + * D D D P Q rather than + * Q D D D P + */ + stripe2 += 1; + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = pd_idx + 1; + if (pd_idx == raid_disks-1) { + (*dd_idx)++; /* Q D D D P */ + qd_idx = 0; + } else if (*dd_idx >= pd_idx) + (*dd_idx) += 2; /* D D P Q D */ + ddf_layout = 1; + break; + + case ALGORITHM_ROTATING_N_CONTINUE: + /* Same as left_symmetric but Q is before P */ + pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); + qd_idx = (pd_idx + raid_disks - 1) % raid_disks; + *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; + ddf_layout = 1; + break; + + case ALGORITHM_LEFT_ASYMMETRIC_6: + /* RAID5 left_asymmetric, with Q on last device */ + pd_idx = data_disks - sector_div(stripe2, raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_ASYMMETRIC_6: + pd_idx = sector_div(stripe2, raid_disks-1); + if (*dd_idx >= pd_idx) + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_LEFT_SYMMETRIC_6: + pd_idx = data_disks - sector_div(stripe2, raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_RIGHT_SYMMETRIC_6: + pd_idx = sector_div(stripe2, raid_disks-1); + *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); + qd_idx = raid_disks - 1; + break; + + case ALGORITHM_PARITY_0_6: + pd_idx = 0; + (*dd_idx)++; + qd_idx = raid_disks - 1; + break; + default: - printk (KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + BUG(); } break; } + if (sh) { + sh->pd_idx = pd_idx; + sh->qd_idx = qd_idx; + sh->ddf_layout = ddf_layout; + } /* * Finally, compute the new sector number */ @@ -1300,29 +2416,33 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks, } -static sector_t compute_blocknr(struct stripe_head *sh, int i) +static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) { - raid5_conf_t *conf = sh->raid_conf; + struct r5conf *conf = sh->raid_conf; int raid_disks = sh->disks; int data_disks = raid_disks - conf->max_degraded; sector_t new_sector = sh->sector, check; - int sectors_per_chunk = conf->chunk_size >> 9; + int sectors_per_chunk = previous ? conf->prev_chunk_sectors + : conf->chunk_sectors; + int algorithm = previous ? 
conf->prev_algo + : conf->algorithm; sector_t stripe; int chunk_offset; - int chunk_number, dummy1, dummy2, dd_idx = i; + sector_t chunk_number; + int dummy1, dd_idx = i; sector_t r_sector; + struct stripe_head sh2; chunk_offset = sector_div(new_sector, sectors_per_chunk); stripe = new_sector; - BUG_ON(new_sector != stripe); if (i == sh->pd_idx) return 0; switch(conf->level) { case 4: break; case 5: - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: if (i > sh->pd_idx) @@ -1334,19 +2454,25 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i += raid_disks; i -= (sh->pd_idx + 1); break; + case ALGORITHM_PARITY_0: + i -= 1; + break; + case ALGORITHM_PARITY_N: + break; default: - printk(KERN_ERR "raid5: unsupported algorithm %d\n", - conf->algorithm); + BUG(); } break; case 6: - if (i == raid6_next_disk(sh->pd_idx, raid_disks)) + if (i == sh->qd_idx) return 0; /* It is the Q disk */ - switch (conf->algorithm) { + switch (algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: - if (sh->pd_idx == raid_disks-1) - i--; /* Q D D D P */ + case ALGORITHM_ROTATING_ZERO_RESTART: + case ALGORITHM_ROTATING_N_RESTART: + if (sh->pd_idx == raid_disks-1) + i--; /* Q D D D P */ else if (i > sh->pd_idx) i -= 2; /* D D P Q D */ break; @@ -1361,282 +2487,66 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i) i -= (sh->pd_idx + 2); } break; + case ALGORITHM_PARITY_0: + i -= 2; + break; + case ALGORITHM_PARITY_N: + break; + case ALGORITHM_ROTATING_N_CONTINUE: + /* Like left_symmetric, but P is before Q */ + if (sh->pd_idx == 0) + i--; /* P D D D Q */ + else { + /* D D Q P D */ + if (i < sh->pd_idx) + i += raid_disks; + i -= (sh->pd_idx + 1); + } + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + if (i > sh->pd_idx) + i--; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (i < sh->pd_idx) + i += data_disks + 1; + i -= (sh->pd_idx + 1); + break; + case ALGORITHM_PARITY_0_6: + i -= 1; + break; default: - printk (KERN_CRIT "raid6: unsupported algorithm %d\n", - conf->algorithm); + BUG(); } break; } chunk_number = stripe * data_disks + i; - r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; - - check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); - if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { - printk(KERN_ERR "compute_blocknr: map not correct\n"); + r_sector = chunk_number * sectors_per_chunk + chunk_offset; + + check = raid5_compute_sector(conf, r_sector, + previous, &dummy1, &sh2); + if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx + || sh2.qd_idx != sh->qd_idx) { + printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", + mdname(conf->mddev)); return 0; } return r_sector; } - -/* - * Copy data between a page in the stripe cache, and one or more bion - * The page could align with the middle of the bio, or there could be - * several bion, each with several bio_vecs, which cover part of the page - * Multiple bion are linked together on bi_next. There may be extras - * at the end of this list. We ignore them. 
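As a worked example of the layout switch in raid5_compute_sector() and its inverse compute_blocknr(), here is a stand-alone toy model of the ALGORITHM_LEFT_SYMMETRIC case; plain modulo is used where the driver needs sector_div() for 64-bit stripe numbers.

#include <stdio.h>

static void left_symmetric(unsigned long stripe, int raid_disks,
			   int dd_in, int *pd_idx, int *dd_idx)
{
	int data_disks = raid_disks - 1;

	*pd_idx = data_disks - stripe % raid_disks;	/* parity rotates one disk per stripe */
	*dd_idx = (*pd_idx + 1 + dd_in) % raid_disks;	/* data chunks follow the parity chunk */
}

int main(void)
{
	unsigned long stripe;
	int pd, dd;

	for (stripe = 0; stripe < 4; stripe++) {
		left_symmetric(stripe, 4, 0, &pd, &dd);
		printf("stripe %lu: parity on disk %d, first data chunk on disk %d\n",
		       stripe, pd, dd);
	}
	return 0;
}

For a 4-disk RAID-5 this prints parity on disks 3, 2, 1, 0 for stripes 0-3, i.e. the parity chunk rotating "left" one disk per stripe with the data chunks wrapping around after it.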
- */ -static void copy_data(int frombio, struct bio *bio, - struct page *page, - sector_t sector) -{ - char *pa = page_address(page); - struct bio_vec *bvl; - int i; - int page_offset; - - if (bio->bi_sector >= sector) - page_offset = (signed)(bio->bi_sector - sector) * 512; - else - page_offset = (signed)(sector - bio->bi_sector) * -512; - bio_for_each_segment(bvl, bio, i) { - int len = bio_iovec_idx(bio,i)->bv_len; - int clen; - int b_offset = 0; - - if (page_offset < 0) { - b_offset = -page_offset; - page_offset += b_offset; - len -= b_offset; - } - - if (len > 0 && page_offset + len > STRIPE_SIZE) - clen = STRIPE_SIZE - page_offset; - else clen = len; - - if (clen > 0) { - char *ba = __bio_kmap_atomic(bio, i, KM_USER0); - if (frombio) - memcpy(pa+page_offset, ba+b_offset, clen); - else - memcpy(ba+b_offset, pa+page_offset, clen); - __bio_kunmap_atomic(ba, KM_USER0); - } - if (clen < len) /* hit end of page */ - break; - page_offset += len; - } -} - -#define check_xor() do { \ - if (count == MAX_XOR_BLOCKS) { \ - xor_blocks(count, STRIPE_SIZE, dest, ptr);\ - count = 0; \ - } \ - } while(0) - -static void compute_parity6(struct stripe_head *sh, int method) -{ - raid6_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; - struct bio *chosen; - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; - - qd_idx = raid6_next_disk(pd_idx, disks); - d0_idx = raid6_next_disk(qd_idx, disks); - - pr_debug("compute_parity, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); - - switch(method) { - case READ_MODIFY_WRITE: - BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */ - case RECONSTRUCT_WRITE: - for (i= disks; i-- ;) - if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) { - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - } - break; - case CHECK_PARITY: - BUG(); /* Not implemented yet */ - } - - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); - } - - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); - } - -// switch(method) { -// case RECONSTRUCT_WRITE: -// case CHECK_PARITY: -// case UPDATE_PARITY: - /* Note that unlike RAID-5, the ordering of the disks matters greatly. */ - /* FIX: Is this ordering of drives even remotely optimal? 
*/ - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("block %d/%d not uptodate on parity calc\n", i,count); - i = raid6_next_disk(i, disks); - } while ( i != d0_idx ); -// break; -// } - - raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs); - - switch(method) { - case RECONSTRUCT_WRITE: - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); - set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); - set_bit(R5_LOCKED, &sh->dev[qd_idx].flags); - break; - case UPDATE_PARITY: - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags); - break; - } -} - - -/* Compute one missing block */ -static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) -{ - int i, count, disks = sh->disks; - void *ptr[MAX_XOR_BLOCKS], *dest, *p; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); - - pr_debug("compute_block_1, stripe %llu, idx %d\n", - (unsigned long long)sh->sector, dd_idx); - - if ( dd_idx == qd_idx ) { - /* We're actually computing the Q drive */ - compute_parity6(sh, UPDATE_PARITY); - } else { - dest = page_address(sh->dev[dd_idx].page); - if (!nozero) memset(dest, 0, STRIPE_SIZE); - count = 0; - for (i = disks ; i--; ) { - if (i == dd_idx || i == qd_idx) - continue; - p = page_address(sh->dev[i].page); - if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) - ptr[count++] = p; - else - printk("compute_block() %d, stripe %llu, %d" - " not present\n", dd_idx, - (unsigned long long)sh->sector, i); - - check_xor(); - } - if (count) - xor_blocks(count, STRIPE_SIZE, dest, ptr); - if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); - else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); - } -} - -/* Compute two missing blocks */ -static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) -{ - int i, count, disks = sh->disks; - int pd_idx = sh->pd_idx; - int qd_idx = raid6_next_disk(pd_idx, disks); - int d0_idx = raid6_next_disk(qd_idx, disks); - int faila, failb; - - /* faila and failb are disk numbers relative to d0_idx */ - /* pd_idx become disks-2 and qd_idx become disks-1 */ - faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx; - failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx; - - BUG_ON(faila == failb); - if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; } - - pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n", - (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb); - - if ( failb == disks-1 ) { - /* Q disk is one of the missing disks */ - if ( faila == disks-2 ) { - /* Missing P+Q, just recompute */ - compute_parity6(sh, UPDATE_PARITY); - return; - } else { - /* We're missing D+Q; recompute D from P */ - compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0); - compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */ - return; - } - } - - /* We're missing D+P or D+D; build pointer table */ - { - /**** FIX THIS: This could be very bad if disks is close to 256 ****/ - void *ptrs[disks]; - - count = 0; - i = d0_idx; - do { - ptrs[count++] = page_address(sh->dev[i].page); - i = raid6_next_disk(i, disks); - if (i != dd_idx1 && i != dd_idx2 && - !test_bit(R5_UPTODATE, &sh->dev[i].flags)) - printk("compute_2 with missing block %d/%d\n", count, i); - } while ( i != d0_idx ); - - if ( failb == disks-2 ) { - /* We're missing D+P. 
*/ - raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs); - } else { - /* We're missing D+D. */ - raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs); - } - - /* Both the above update both missing blocks */ - set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags); - set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags); - } -} - static void -schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, +schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; + struct r5conf *conf = sh->raid_conf; + int level = conf->level; if (rcw) { - /* if we are not expanding this is a proper write request, and - * there will be bios with new data to be drained into the - * stripe cache - */ - if (!expand) { - sh->reconstruct_state = reconstruct_state_drain_run; - set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - } else - sh->reconstruct_state = reconstruct_state_run; - - set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1649,18 +2559,29 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } - if (s->locked + 1 == disks) + /* if we are not expanding this is a proper write request, and + * there will be bios with new data to be drained into the + * stripe cache + */ + if (!expand) { + if (!s->locked) + /* False alarm, nothing to do */ + return; + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; + + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); + + if (s->locked + conf->max_degraded == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) - atomic_inc(&sh->raid_conf->pending_full_writes); + atomic_inc(&conf->pending_full_writes); } else { + BUG_ON(level == 6); BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - sh->reconstruct_state = reconstruct_state_prexor_drain_run; - set_bit(STRIPE_OP_PREXOR, &s->ops_request); - set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); - set_bit(STRIPE_OP_POSTXOR, &s->ops_request); - for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (i == pd_idx) @@ -1675,15 +2596,31 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } } + if (!s->locked) + /* False alarm - nothing to do */ + return; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); } - /* keep the parity disk locked while asynchronous operations + /* keep the parity disk(s) locked while asynchronous operations * are in flight */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); s->locked++; + if (level == 6) { + int qd_idx = sh->qd_idx; + struct r5dev *dev = &sh->dev[qd_idx]; + + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + s->locked++; + } + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, s->locked, s->ops_request); @@ -1697,95 +2634,95 @@ schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) { struct bio **bip; - raid5_conf_t *conf = sh->raid_conf; + struct r5conf *conf = sh->raid_conf; int firstwrite=0; - 
pr_debug("adding bh b#%llu to stripe s#%llu\n", - (unsigned long long)bi->bi_sector, + pr_debug("adding bi b#%llu to stripe s#%llu\n", + (unsigned long long)bi->bi_iter.bi_sector, (unsigned long long)sh->sector); - - spin_lock(&sh->lock); - spin_lock_irq(&conf->device_lock); + /* + * If several bio share a stripe. The bio bi_phys_segments acts as a + * reference count to avoid race. The reference count should already be + * increased before this function is called (for example, in + * make_request()), so other bio sharing this stripe will not free the + * stripe. If a stripe is owned by one stripe, the stripe lock will + * protect it. + */ + spin_lock_irq(&sh->stripe_lock); if (forwrite) { bip = &sh->dev[dd_idx].towrite; - if (*bip == NULL && sh->dev[dd_idx].written == NULL) + if (*bip == NULL) firstwrite = 1; } else bip = &sh->dev[dd_idx].toread; - while (*bip && (*bip)->bi_sector < bi->bi_sector) { - if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { + if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) goto overlap; bip = & (*bip)->bi_next; } - if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) + if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) goto overlap; BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); if (*bip) bi->bi_next = *bip; *bip = bi; - bi->bi_phys_segments ++; - spin_unlock_irq(&conf->device_lock); - spin_unlock(&sh->lock); - - pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)bi->bi_sector, - (unsigned long long)sh->sector, dd_idx); - - if (conf->mddev->bitmap && firstwrite) { - bitmap_startwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0); - sh->bm_seq = conf->seq_flush+1; - set_bit(STRIPE_BIT_DELAY, &sh->state); - } + raid5_inc_bi_active_stripes(bi); if (forwrite) { /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; for (bi=sh->dev[dd_idx].towrite; sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && - bi && bi->bi_sector <= sector; + bi && bi->bi_iter.bi_sector <= sector; bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { - if (bi->bi_sector + (bi->bi_size>>9) >= sector) - sector = bi->bi_sector + (bi->bi_size>>9); + if (bio_end_sector(bi) >= sector) + sector = bio_end_sector(bi); } if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } + + pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", + (unsigned long long)(*bip)->bi_iter.bi_sector, + (unsigned long long)sh->sector, dd_idx); + spin_unlock_irq(&sh->stripe_lock); + + if (conf->mddev->bitmap && firstwrite) { + bitmap_startwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0); + sh->bm_seq = conf->seq_flush+1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } return 1; overlap: set_bit(R5_Overlap, &sh->dev[dd_idx].flags); - spin_unlock_irq(&conf->device_lock); - spin_unlock(&sh->lock); + spin_unlock_irq(&sh->stripe_lock); return 0; } -static void end_reshape(raid5_conf_t *conf); +static void end_reshape(struct r5conf *conf); -static int page_is_zero(struct page *p) +static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, + struct stripe_head *sh) { - char *a = page_address(p); - return ((*(u32*)a) == 0 && - memcmp(a, a+4, STRIPE_SIZE-4)==0); -} - -static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) -{ - int sectors_per_chunk = conf->chunk_size >> 9; - int pd_idx, dd_idx; + int sectors_per_chunk = + previous ? 
conf->prev_chunk_sectors : conf->chunk_sectors; + int dd_idx; int chunk_offset = sector_div(stripe, sectors_per_chunk); + int disks = previous ? conf->previous_raid_disks : conf->raid_disks; - raid5_compute_sector(stripe * (disks - conf->max_degraded) + raid5_compute_sector(conf, + stripe * (disks - conf->max_degraded) *sectors_per_chunk + chunk_offset, - disks, disks - conf->max_degraded, - &dd_idx, &pd_idx, conf); - return pd_idx; + previous, + &dd_idx, sh); } static void -handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, +handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, struct bio **return_bi) { @@ -1795,46 +2732,63 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, int bitmap_end = 0; if (test_bit(R5_ReadError, &sh->dev[i].flags)) { - mdk_rdev_t *rdev; + struct md_rdev *rdev; rcu_read_lock(); rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && test_bit(In_sync, &rdev->flags)) - /* multiple read failures in one stripe */ - md_error(conf->mddev, rdev); + atomic_inc(&rdev->nr_pending); + else + rdev = NULL; rcu_read_unlock(); + if (rdev) { + if (!rdev_set_badblocks( + rdev, + sh->sector, + STRIPE_SECTORS, 0)) + md_error(conf->mddev, rdev); + rdev_dec_pending(rdev, conf->mddev); + } } - spin_lock_irq(&conf->device_lock); + spin_lock_irq(&sh->stripe_lock); /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; - if (bi) { - s->to_write--; + spin_unlock_irq(&sh->stripe_lock); + if (bi) bitmap_end = 1; - } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; } bi = nextbi; } + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); + bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].page = sh->dev[i].orig_page; + } + if (bi) bitmap_end = 1; - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); bi->bi_next = *return_bi; *return_bi = bi; @@ -1848,27 +2802,31 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && (!test_bit(R5_Insync, &sh->dev[i].flags) || test_bit(R5_ReadError, &sh->dev[i].flags))) { + spin_lock_irq(&sh->stripe_lock); bi = sh->dev[i].toread; sh->dev[i].toread = NULL; + spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - if (bi) s->to_read--; - while (bi && bi->bi_sector < + while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); - if (--bi->bi_phys_segments == 0) { + if (!raid5_dec_bi_active_stripes(bi)) { bi->bi_next = *return_bi; *return_bi = bi; } bi = nextbi; } } - 
spin_unlock_irq(&conf->device_lock); if (bitmap_end) bitmap_endwrite(conf->mddev->bitmap, sh->sector, STRIPE_SECTORS, 0, 0); + /* If we were in the middle of a write the parity block might + * still be locked - so just clear all R5_LOCKED flags + */ + clear_bit(R5_LOCKED, &sh->dev[i].flags); } if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) @@ -1876,17 +2834,80 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, md_wakeup_thread(conf->mddev->thread); } -/* fetch_block5 - checks the given member device to see if its data needs +static void +handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s) +{ + int abort = 0; + int i; + + clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + wake_up(&conf->wait_for_overlap); + s->syncing = 0; + s->replacing = 0; + /* There is nothing more to do for sync/check/repair. + * Don't even need to abort as that is handled elsewhere + * if needed, and not always wanted e.g. if there is a known + * bad block here. + * For recover/replace we need to record a bad block on all + * non-sync devices, or abort the recovery + */ + if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { + /* During recovery devices cannot be removed, so + * locking and refcounting of rdevs is not needed + */ + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = conf->disks[i].rdev; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + rdev = conf->disks[i].replacement; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + } + if (abort) + conf->recovery_disabled = + conf->mddev->recovery_disabled; + } + md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); +} + +static int want_replace(struct stripe_head *sh, int disk_idx) +{ + struct md_rdev *rdev; + int rv = 0; + /* Doing recovery so rcu locking not required */ + rdev = sh->raid_conf->disks[disk_idx].replacement; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && (rdev->recovery_offset <= sh->sector + || rdev->mddev->recovery_cp <= sh->sector)) + rv = 1; + + return rv; +} + +/* fetch_block - checks the given member device to see if its data needs * to be read or computed to satisfy a request. * * Returns 1 when no more member devices need to be checked, otherwise returns - * 0 to tell the loop in handle_stripe_fill5 to continue + * 0 to tell the loop in handle_stripe_fill to continue */ -static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, - int disk_idx, int disks) +static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; - struct r5dev *failed_dev = &sh->dev[s->failed_num]; + struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], + &sh->dev[s->failed_num[1]] }; /* is the data in this block needed, and can we get it? 
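Why does fetch_block() treat uptodate == disks - 1 as the trigger for a compute? With every other block (parity included) already in the stripe cache, the missing one is just their XOR. A stand-alone sketch of that single-block reconstruction — illustrative only; the driver schedules it through STRIPE_OP_COMPUTE_BLK and async_xor():

#include <stddef.h>
#include <stdint.h>

/* Recover one missing block as the XOR of all the others (parity included).
 * 'srcs' holds the disks - 1 blocks that are already up to date.
 */
static void compute_missing_block(uint8_t *dest, uint8_t *const srcs[],
				  int nr_srcs, size_t len)
{
	size_t off;

	for (off = 0; off < len; off++) {
		uint8_t x = 0;
		int i;

		for (i = 0; i < nr_srcs; i++)
			x ^= srcs[i][off];
		dest[off] = x;
	}
}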
*/ if (!test_bit(R5_LOCKED, &dev->flags) && @@ -1894,34 +2915,73 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || s->syncing || s->expanding || - (s->failed && - (failed_dev->toread || - (failed_dev->towrite && - !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { - /* We would like to get this block, possibly by computing it, + (s->replacing && want_replace(sh, disk_idx)) || + (s->failed >= 1 && fdev[0]->toread) || + (s->failed >= 2 && fdev[1]->toread) || + (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && + !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || + (sh->raid_conf->level == 6 && s->failed && s->to_write && + s->to_write < sh->raid_conf->raid_disks - 2 && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { + /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ + BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); + BUG_ON(test_bit(R5_Wantread, &dev->flags)); if ((s->uptodate == disks - 1) && - (s->failed && disk_idx == s->failed_num)) { + (s->failed && (disk_idx == s->failed_num[0] || + disk_idx == s->failed_num[1]))) { + /* have disk failed, and we're requested to fetch it; + * do compute it + */ + pr_debug("Computing stripe %llu block %d\n", + (unsigned long long)sh->sector, disk_idx); set_bit(STRIPE_COMPUTE_RUN, &sh->state); set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; + sh->ops.target2 = -1; /* no 2nd target */ s->req_compute = 1; /* Careful: from this point on 'uptodate' is in the eye - * of raid5_run_ops which services 'compute' operations + * of raid_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will * be R5_UPTODATE by the time it is needed for a * subsequent operation. */ s->uptodate++; - return 1; /* uptodate + compute == disks */ + return 1; + } else if (s->uptodate == disks-2 && s->failed >= 2) { + /* Computing 2-failure is *very* expensive; only + * do it if failed >= 2 + */ + int other; + for (other = disks; other--; ) { + if (other == disk_idx) + continue; + if (!test_bit(R5_UPTODATE, + &sh->dev[other].flags)) + break; + } + BUG_ON(other < 0); + pr_debug("Computing stripe %llu blocks %d,%d\n", + (unsigned long long)sh->sector, + disk_idx, other); + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); + set_bit(R5_Wantcompute, &sh->dev[other].flags); + sh->ops.target = disk_idx; + sh->ops.target2 = other; + s->uptodate += 2; + s->req_compute = 1; + return 1; } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; - pr_debug("Reading block %d (sync=%d)\n", disk_idx, - s->syncing); + pr_debug("Reading block %d (sync=%d)\n", + disk_idx, s->syncing); } } @@ -1929,10 +2989,11 @@ static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, } /** - * handle_stripe_fill5 - read or compute data to satisfy pending requests. + * handle_stripe_fill - read or compute data to satisfy pending requests. 
*/ -static void handle_stripe_fill5(struct stripe_head *sh, - struct stripe_head_state *s, int disks) +static void handle_stripe_fill(struct stripe_head *sh, + struct stripe_head_state *s, + int disks) { int i; @@ -1943,125 +3004,115 @@ static void handle_stripe_fill5(struct stripe_head *sh, if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && !sh->reconstruct_state) for (i = disks; i--; ) - if (fetch_block5(sh, s, i, disks)) + if (fetch_block(sh, s, i, disks)) break; set_bit(STRIPE_HANDLE, &sh->state); } -static void handle_stripe_fill6(struct stripe_head *sh, - struct stripe_head_state *s, struct r6_state *r6s, - int disks) -{ - int i; - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || (dev->towrite && - !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->failed >= 1 && - (sh->dev[r6s->failed_num[0]].toread || - s->to_write)) || - (s->failed >= 2 && - (sh->dev[r6s->failed_num[1]].toread || - s->to_write)))) { - /* we would like to get this block, possibly - * by computing it, but we might not be able to - */ - if ((s->uptodate == disks - 1) && - (s->failed && (i == r6s->failed_num[0] || - i == r6s->failed_num[1]))) { - pr_debug("Computing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - compute_block_1(sh, i, 0); - s->uptodate++; - } else if ( s->uptodate == disks-2 && s->failed >= 2 ) { - /* Computing 2-failure is *very* expensive; only - * do it if failed >= 2 - */ - int other; - for (other = disks; other--; ) { - if (other == i) - continue; - if (!test_bit(R5_UPTODATE, - &sh->dev[other].flags)) - break; - } - BUG_ON(other < 0); - pr_debug("Computing stripe %llu blocks %d,%d\n", - (unsigned long long)sh->sector, - i, other); - compute_block_2(sh, i, other); - s->uptodate += 2; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - pr_debug("Reading block %d (sync=%d)\n", - i, s->syncing); - } - } - } - set_bit(STRIPE_HANDLE, &sh->state); -} - /* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * never LOCKED, so we don't need to test 'failed' directly. 
*/ -static void handle_stripe_clean_event(raid5_conf_t *conf, +static void handle_stripe_clean_event(struct r5conf *conf, struct stripe_head *sh, int disks, struct bio **return_bi) { int i; struct r5dev *dev; + int discard_pending = 0; for (i = disks; i--; ) if (sh->dev[i].written) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Discard, &dev->flags) || + test_bit(R5_SkipCopy, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; - int bitmap_end = 0; pr_debug("Return write for disc %d\n", i); - spin_lock_irq(&conf->device_lock); + if (test_and_clear_bit(R5_Discard, &dev->flags)) + clear_bit(R5_UPTODATE, &dev->flags); + if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { + WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); + dev->page = dev->orig_page; + } wbi = dev->written; dev->written = NULL; - while (wbi && wbi->bi_sector < + while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); - if (--wbi->bi_phys_segments == 0) { + if (!raid5_dec_bi_active_stripes(wbi)) { md_write_end(conf->mddev); wbi->bi_next = *return_bi; *return_bi = wbi; } wbi = wbi2; } - if (dev->towrite == NULL) - bitmap_end = 1; - spin_unlock_irq(&conf->device_lock); - if (bitmap_end) - bitmap_endwrite(conf->mddev->bitmap, - sh->sector, - STRIPE_SECTORS, + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, !test_bit(STRIPE_DEGRADED, &sh->state), - 0); - } + 0); + } else if (test_bit(R5_Discard, &dev->flags)) + discard_pending = 1; + WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); + WARN_ON(dev->page != dev->orig_page); } + if (!discard_pending && + test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { + clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + if (sh->qd_idx >= 0) { + clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); + } + /* now that discard is done we can proceed with any sync */ + clear_bit(STRIPE_DISCARD, &sh->state); + /* + * SCSI discard will change some bio fields and the stripe has + * no updated data, so remove it from hash list and the stripe + * will be reinitialized + */ + spin_lock_irq(&conf->device_lock); + remove_hash(sh); + spin_unlock_irq(&conf->device_lock); + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) + set_bit(STRIPE_HANDLE, &sh->state); + + } if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) md_wakeup_thread(conf->mddev->thread); } -static void handle_stripe_dirtying5(raid5_conf_t *conf, - struct stripe_head *sh, struct stripe_head_state *s, int disks) +static void handle_stripe_dirtying(struct r5conf *conf, + struct stripe_head *sh, + struct stripe_head_state *s, + int disks) { int rmw = 0, rcw = 0, i; - for (i = disks; i--; ) { + sector_t recovery_cp = conf->mddev->recovery_cp; + + /* RAID6 requires 'rcw' in current implementation. + * Otherwise, check whether resync is now happening or should start. + * If yes, then the array is dirty (after unclean shutdown or + * initial creation), so parity in some stripes might be inconsistent. + * In this case, we need to always do reconstruct-write, to ensure + * that in case of drive failure or read-error correction, we + * generate correct data from the parity. 
+ */ + if (conf->max_degraded == 2 || + (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { + /* Calculate the real rcw later - for now make it + * look like rcw is cheaper + */ + rcw = 1; rmw = 2; + pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->max_degraded, (unsigned long long)recovery_cp, + (unsigned long long)sh->sector); + } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && @@ -2078,7 +3129,8 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; + if (test_bit(R5_Insync, &dev->flags)) + rcw++; else rcw += 2*disks; } @@ -2086,8 +3138,12 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, pr_debug("for sector %llu, rmw=%d rcw=%d\n", (unsigned long long)sh->sector, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); - if (rmw < rcw && rmw > 0) + if (rmw < rcw && rmw > 0) { /* prefer read-modify-write, but need to get some data */ + if (conf->mddev->queue) + blk_add_trace_msg(conf->mddev->queue, + "raid5 rmw %llu %d", + (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if ((dev->towrite || i == sh->pd_idx) && @@ -2095,10 +3151,10 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old block " - "%d for r-m-w\n", i); + if (test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { + pr_debug("Read_old block %d for r-m-w\n", + i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; @@ -2108,146 +3164,56 @@ static void handle_stripe_dirtying5(raid5_conf_t *conf, } } } - if (rcw <= rmw && rcw > 0) + } + if (rcw <= rmw && rcw > 0) { /* want reconstruct write, but need to get some data */ + int qread =0; + rcw = 0; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (!test_bit(R5_OVERWRITE, &dev->flags) && - i != sh->pd_idx && + i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags)) && - test_bit(R5_Insync, &dev->flags)) { - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + test_bit(R5_Wantcompute, &dev->flags))) { + rcw++; + if (test_bit(R5_Insync, &dev->flags) && + test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { pr_debug("Read_old block " "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; + qread++; } else { set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } } } + if (rcw && conf->mddev->queue) + blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", + (unsigned long long)sh->sector, + rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); + } /* now if nothing is locked, and if we have enough data, * we can start a write request */ /* since handle_stripe can be called at any time we need to handle the * case where a compute block operation has been submitted and then a - * subsequent call wants to start a write request. raid5_run_ops only - * handles the case where compute block and postxor are requested + * subsequent call wants to start a write request. 
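The cost comparison that follows — count how many old blocks a read-modify-write would have to pre-read versus a reconstruct-write, pick the cheaper, and force reconstruct-write whenever parity cannot yet be trusted — can be modelled in a few lines. This is a simplified sketch (choose_rmw and struct blk are invented names), not the kernel's accounting, which additionally penalises out-of-sync devices:

#include <stdio.h>

struct blk {
	int towrite;    /* new data queued for this block */
	int overwrite;  /* the write covers the whole block */
	int uptodate;   /* old contents already cached */
};

/* Count pre-reads for each strategy; return 1 for rmw, 0 for rcw. */
static int choose_rmw(const struct blk *b, int disks, int pd_idx,
		      int force_rcw)
{
	int rmw = 0, rcw = 0, i;

	if (force_rcw)          /* e.g. RAID-6, or parity not yet trusted */
		return 0;

	for (i = 0; i < disks; i++) {
		/* rmw reads old data of the written blocks plus old parity */
		if ((b[i].towrite || i == pd_idx) && !b[i].uptodate)
			rmw++;
		/* rcw reads every block neither overwritten nor cached */
		if (!b[i].overwrite && i != pd_idx && !b[i].uptodate)
			rcw++;
	}
	printf("rmw=%d rcw=%d\n", rmw, rcw);
	return rmw < rcw && rmw > 0;
}

int main(void)
{
	/* 5-disk stripe, one small write to disk 0, parity on disk 4 */
	struct blk b[5] = { { .towrite = 1 } };

	printf("use rmw: %d\n", choose_rmw(b, 5, 4, 0));
	return 0;
}
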
raid_run_ops only + * handles the case where compute block and reconstruct are requested * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. */ if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && !test_bit(STRIPE_BIT_DELAY, &sh->state))) - schedule_reconstruction5(sh, s, rcw == 0, 0); -} - -static void handle_stripe_dirtying6(raid5_conf_t *conf, - struct stripe_head *sh, struct stripe_head_state *s, - struct r6_state *r6s, int disks) -{ - int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i; - int qd_idx = r6s->qd_idx; - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - /* Would I have to read this buffer for reconstruct_write */ - if (!test_bit(R5_OVERWRITE, &dev->flags) - && i != pd_idx && i != qd_idx - && (!test_bit(R5_LOCKED, &dev->flags) - ) && - !test_bit(R5_UPTODATE, &dev->flags)) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; - else { - pr_debug("raid6: must_compute: " - "disk %d flags=%#lx\n", i, dev->flags); - must_compute++; - } - } - } - pr_debug("for sector %llu, rcw=%d, must_compute=%d\n", - (unsigned long long)sh->sector, rcw, must_compute); - set_bit(STRIPE_HANDLE, &sh->state); - - if (rcw > 0) - /* want reconstruct write, but need to get some data */ - for (i = disks; i--; ) { - struct r5dev *dev = &sh->dev[i]; - if (!test_bit(R5_OVERWRITE, &dev->flags) - && !(s->failed == 0 && (i == pd_idx || i == qd_idx)) - && !test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - test_bit(R5_Insync, &dev->flags)) { - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - s->locked++; - } else { - pr_debug("Request delayed stripe %llu " - "block %d for Reconstruct\n", - (unsigned long long)sh->sector, i); - set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } - } - } - /* now if nothing is locked, and if we have enough data, we can start a - * write request - */ - if (s->locked == 0 && rcw == 0 && - !test_bit(STRIPE_BIT_DELAY, &sh->state)) { - if (must_compute > 0) { - /* We have failed blocks and need to compute them */ - switch (s->failed) { - case 0: - BUG(); - case 1: - compute_block_1(sh, r6s->failed_num[0], 0); - break; - case 2: - compute_block_2(sh, r6s->failed_num[0], - r6s->failed_num[1]); - break; - default: /* This request should have been failed? 
*/ - BUG(); - } - } - - pr_debug("Computing parity for stripe %llu\n", - (unsigned long long)sh->sector); - compute_parity6(sh, RECONSTRUCT_WRITE); - /* now every locked buffer is ready to be written */ - for (i = disks; i--; ) - if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { - pr_debug("Writing stripe %llu block %d\n", - (unsigned long long)sh->sector, i); - s->locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - } - if (s->locked == disks) - if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) - atomic_inc(&conf->pending_full_writes); - /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ - set_bit(STRIPE_INSYNC, &sh->state); - - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } - } + schedule_reconstruction(sh, s, rcw == 0, 0); } -static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, +static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { struct r5dev *dev = NULL; @@ -2265,7 +3231,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, s->uptodate--; break; } - dev = &sh->dev[s->failed_num]; + dev = &sh->dev[s->failed_num[0]]; /* fall through */ case check_state_compute_result: sh->check_state = check_state_idle; @@ -2302,13 +3268,13 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, * we are done. Otherwise update the mismatch count and repair * parity if !MD_RECOVERY_CHECK */ - if (sh->ops.zero_sum_result == 0) + if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) /* parity is correct (on disc, * not in buffer any more) */ set_bit(STRIPE_INSYNC, &sh->state); else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -2319,6 +3285,7 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, set_bit(R5_Wantcompute, &sh->dev[sh->pd_idx].flags); sh->ops.target = sh->pd_idx; + sh->ops.target2 = -1; s->uptodate++; } } @@ -2334,88 +3301,94 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, } -static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, - struct stripe_head_state *s, - struct r6_state *r6s, struct page *tmp_page, - int disks) +static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, + int disks) { - int update_p = 0, update_q = 0; - struct r5dev *dev; int pd_idx = sh->pd_idx; - int qd_idx = r6s->qd_idx; + int qd_idx = sh->qd_idx; + struct r5dev *dev; set_bit(STRIPE_HANDLE, &sh->state); BUG_ON(s->failed > 2); - BUG_ON(s->uptodate < disks); + /* Want to check and possibly repair P and Q. * However there could be one 'failed' device, in which * case we can only check one of them, possibly using the * other to generate missing data */ - /* If !tmp_page, we cannot do the calculations, - * but as we have set STRIPE_HANDLE, we will soon be called - * by stripe_handle with a tmp_page - just wait until then. 
- */ - if (tmp_page) { - if (s->failed == r6s->q_failed) { - /* The only possible failed device holds 'Q', so it + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are < 2 failures */ + if (s->failed == s->q_failed) { + /* The only possible failed device holds Q, so it * makes sense to check P (If anything else were failed, * we would have used P to recreate it). */ - compute_block_1(sh, pd_idx, 1); - if (!page_is_zero(sh->dev[pd_idx].page)) { - compute_block_1(sh, pd_idx, 0); - update_p = 1; - } + sh->check_state = check_state_run; } - if (!r6s->q_failed && s->failed < 2) { - /* q is not failed, and we didn't use it to generate + if (!s->q_failed && s->failed < 2) { + /* Q is not failed, and we didn't use it to generate * anything, so it makes sense to check it */ - memcpy(page_address(tmp_page), - page_address(sh->dev[qd_idx].page), - STRIPE_SIZE); - compute_parity6(sh, UPDATE_PARITY); - if (memcmp(page_address(tmp_page), - page_address(sh->dev[qd_idx].page), - STRIPE_SIZE) != 0) { - clear_bit(STRIPE_INSYNC, &sh->state); - update_q = 1; - } + if (sh->check_state == check_state_run) + sh->check_state = check_state_run_pq; + else + sh->check_state = check_state_run_q; } - if (update_p || update_q) { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - update_p = update_q = 0; + + /* discard potentially stale zero_sum_result */ + sh->ops.zero_sum_result = 0; + + if (sh->check_state == check_state_run) { + /* async_xor_zero_sum destroys the contents of P */ + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + s->uptodate--; } + if (sh->check_state >= check_state_run && + sh->check_state <= check_state_run_pq) { + /* async_syndrome_zero_sum preserves P and Q, so + * no need to mark them !uptodate here + */ + set_bit(STRIPE_OP_CHECK, &s->ops_request); + break; + } + + /* we have 2-disk failure */ + BUG_ON(s->failed != 2); + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; /* now write out any block on a failed drive, - * or P or Q if they need it + * or P or Q if they were recomputed */ - + BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ if (s->failed == 2) { - dev = &sh->dev[r6s->failed_num[1]]; + dev = &sh->dev[s->failed_num[1]]; s->locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } if (s->failed >= 1) { - dev = &sh->dev[r6s->failed_num[0]]; + dev = &sh->dev[s->failed_num[0]]; s->locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } - - if (update_p) { + if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { dev = &sh->dev[pd_idx]; s->locked++; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } - if (update_q) { + if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { dev = &sh->dev[qd_idx]; s->locked++; set_bit(R5_LOCKED, &dev->flags); @@ -2424,11 +3397,74 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, clear_bit(STRIPE_DEGRADED, &sh->state); set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + case check_state_run_q: + case check_state_run_pq: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* handle a successful check operation, if parity is correct + * we are done. 
Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) { + /* both parities are correct */ + if (!s->failed) + set_bit(STRIPE_INSYNC, &sh->state); + else { + /* in contrast to the raid5 case we can validate + * parity, but still have a failure to write + * back + */ + sh->check_state = check_state_compute_result; + /* Returning at this point means that we may go + * off and bring p and/or q uptodate again so + * we make sure to check zero_sum_result again + * to verify if p or q need writeback + */ + } + } else { + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + int *target = &sh->ops.target; + + sh->ops.target = -1; + sh->ops.target2 = -1; + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { + set_bit(R5_Wantcompute, + &sh->dev[pd_idx].flags); + *target = pd_idx; + target = &sh->ops.target2; + s->uptodate++; + } + if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { + set_bit(R5_Wantcompute, + &sh->dev[qd_idx].flags); + *target = qd_idx; + s->uptodate++; + } + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); } } -static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, - struct r6_state *r6s) +static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) { int i; @@ -2438,17 +3474,15 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, struct dma_async_tx_descriptor *tx = NULL; clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i = 0; i < sh->disks; i++) - if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) { - int dd_idx, pd_idx, j; + if (i != sh->pd_idx && i != sh->qd_idx) { + int dd_idx, j; struct stripe_head *sh2; + struct async_submit_ctl submit; - sector_t bn = compute_blocknr(sh, i); - sector_t s = raid5_compute_sector(bn, conf->raid_disks, - conf->raid_disks - - conf->max_degraded, &dd_idx, - &pd_idx, conf); - sh2 = get_active_stripe(conf, s, conf->raid_disks, - pd_idx, 1); + sector_t bn = compute_blocknr(sh, i, 1); + sector_t s = raid5_compute_sector(conf, bn, 0, + &dd_idx, NULL); + sh2 = get_active_stripe(conf, s, 0, 1, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. 
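The zero_sum_result convention used in this RAID-6 check path packs the P and Q verification outcomes into a single bitmask, so one check operation can report that P, Q, both, or neither need rewriting. A rough stand-alone illustration of that pattern; the flag names and values below are made up for the example:

#include <stdio.h>

#define SUM_CHECK_P 0x1   /* xor of data blocks did not match P */
#define SUM_CHECK_Q 0x2   /* syndrome of data blocks did not match Q */

/* Return a bitmask describing which parity blocks need to be rewritten. */
static unsigned check_parity(const unsigned char *p, const unsigned char *q,
			     const unsigned char *expect_p,
			     const unsigned char *expect_q, int len)
{
	unsigned result = 0;
	int i;

	for (i = 0; i < len; i++) {
		if (p[i] != expect_p[i])
			result |= SUM_CHECK_P;
		if (q[i] != expect_q[i])
			result |= SUM_CHECK_Q;
	}
	return result;
}

int main(void)
{
	unsigned char p[4] = { 1, 2, 3, 4 }, good_p[4] = { 1, 2, 3, 4 };
	unsigned char q[4] = { 9, 9, 9, 9 }, good_q[4] = { 8, 9, 9, 9 };
	unsigned res = check_parity(p, q, good_p, good_q, 4);

	if (res & SUM_CHECK_P)
		printf("P needs repair\n");
	if (res & SUM_CHECK_Q)
		printf("Q needs repair\n");   /* expected for this input */
	if (!res)
		printf("stripe is in sync\n");
	return 0;
}
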
When later blocks @@ -2463,16 +3497,16 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, } /* place all the copies on one channel */ + init_async_submit(&submit, 0, tx, NULL, NULL, NULL); tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, STRIPE_SIZE, - ASYNC_TX_DEP_ACK, tx, NULL, NULL); + sh->dev[i].page, 0, 0, STRIPE_SIZE, + &submit); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j = 0; j < conf->raid_disks; j++) if (j != sh2->pd_idx && - (!r6s || j != raid6_next_disk(sh2->pd_idx, - sh2->disks)) && + j != sh2->qd_idx && !test_bit(R5_Expanded, &sh2->dev[j].flags)) break; if (j == conf->raid_disks) { @@ -2483,66 +3517,52 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, } /* done submitting copies, wait for them to complete */ - if (tx) { - async_tx_ack(tx); - dma_wait_for_async_tx(tx); - } + async_tx_quiesce(&tx); } - /* * handle_stripe - do things to a stripe. * - * We lock the stripe and then examine the state of various bits - * to see what needs to be done. + * We lock the stripe by setting STRIPE_ACTIVE and then examine the + * state of various bits to see what needs to be done. * Possible results: - * return some read request which now have data - * return some write requests which are safely on disc + * return some read requests which now have data + * return some write requests which are safely on storage * schedule a read on some buffers * schedule a write of some buffers * return confirmation of parity correctness * - * buffers are taken off read_list or write_list, and bh_cache buffers - * get BH_Lock set before the stripe lock is released. - * */ -static bool handle_stripe5(struct stripe_head *sh) +static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) { - raid5_conf_t *conf = sh->raid_conf; - int disks = sh->disks, i; - struct bio *return_bi = NULL; - struct stripe_head_state s; + struct r5conf *conf = sh->raid_conf; + int disks = sh->disks; struct r5dev *dev; - mdk_rdev_t *blocked_rdev = NULL; - int prexor; - - memset(&s, 0, sizeof(s)); - pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " - "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, sh->check_state, - sh->reconstruct_state); + int i; + int do_recovery = 0; - spin_lock(&sh->lock); - clear_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); + memset(s, 0, sizeof(*s)); - s.syncing = test_bit(STRIPE_SYNCING, &sh->state); - s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); + s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); + s->failed_num[0] = -1; + s->failed_num[1] = -1; /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { - mdk_rdev_t *rdev; - struct r5dev *dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); + struct md_rdev *rdev; + sector_t first_bad; + int bad_sectors; + int is_bad = 0; - pr_debug("check %d: state 0x%lx toread %p read %p write %p " - "written %p\n", i, dev->flags, dev->toread, dev->read, - dev->towrite, dev->written); + dev = &sh->dev[i]; - /* maybe we can request a biofill operation + pr_debug("check %d: state 0x%lx read %p write %p written %p\n", + i, dev->flags, + dev->toread, dev->towrite, dev->written); + /* maybe we can reply to a read * * new wantfill requests are only 
permitted while * ops_complete_biofill is guaranteed to be inactive @@ -2552,50 +3572,201 @@ static bool handle_stripe5(struct stripe_head *sh) set_bit(R5_Wantfill, &dev->flags); /* now count some things */ - if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; - if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; - if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; + if (test_bit(R5_LOCKED, &dev->flags)) + s->locked++; + if (test_bit(R5_UPTODATE, &dev->flags)) + s->uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) { + s->compute++; + BUG_ON(s->compute > 2); + } if (test_bit(R5_Wantfill, &dev->flags)) - s.to_fill++; + s->to_fill++; else if (dev->toread) - s.to_read++; + s->to_read++; if (dev->towrite) { - s.to_write++; + s->to_write++; if (!test_bit(R5_OVERWRITE, &dev->flags)) - s.non_overwrite++; + s->non_overwrite++; } if (dev->written) - s.written++; - rdev = rcu_dereference(conf->disks[i].rdev); - if (blocked_rdev == NULL && - rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - blocked_rdev = rdev; - atomic_inc(&rdev->nr_pending); + s->written++; + /* Prefer to use the replacement for reads, but only + * if it is recovered enough and has no bad blocks. + */ + rdev = rcu_dereference(conf->disks[i].replacement); + if (rdev && !test_bit(Faulty, &rdev->flags) && + rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && + !is_badblock(rdev, sh->sector, STRIPE_SECTORS, + &first_bad, &bad_sectors)) + set_bit(R5_ReadRepl, &dev->flags); + else { + if (rdev) + set_bit(R5_NeedReplace, &dev->flags); + rdev = rcu_dereference(conf->disks[i].rdev); + clear_bit(R5_ReadRepl, &dev->flags); + } + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; + if (rdev) { + is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + &first_bad, &bad_sectors); + if (s->blocked_rdev == NULL + && (test_bit(Blocked, &rdev->flags) + || is_bad < 0)) { + if (is_bad < 0) + set_bit(BlockedBadBlocks, + &rdev->flags); + s->blocked_rdev = rdev; + atomic_inc(&rdev->nr_pending); + } + } + clear_bit(R5_Insync, &dev->flags); + if (!rdev) + /* Not in-sync */; + else if (is_bad) { + /* also not in-sync */ + if (!test_bit(WriteErrorSeen, &rdev->flags) && + test_bit(R5_UPTODATE, &dev->flags)) { + /* treat as in-sync, but with a read error + * which we can now try to correct + */ + set_bit(R5_Insync, &dev->flags); + set_bit(R5_ReadError, &dev->flags); + } + } else if (test_bit(In_sync, &rdev->flags)) + set_bit(R5_Insync, &dev->flags); + else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + /* in sync if before recovery_offset */ + set_bit(R5_Insync, &dev->flags); + else if (test_bit(R5_UPTODATE, &dev->flags) && + test_bit(R5_Expanded, &dev->flags)) + /* If we've reshaped into here, we assume it is Insync. + * We will shortly update recovery_offset to make + * it official. 
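The read-source preference stated above — use the replacement device only if it has been recovered past this stripe and has no bad block covering it, otherwise fall back to the original device — reduces to a small predicate. A schematic sketch with invented struct and helper names (struct dev, pick_read_dev, has_bad_block):

#include <stdio.h>
#include <stdint.h>

struct dev {
	int present;
	int faulty;
	uint64_t recovery_offset;   /* sectors recovered so far */
};

/* Hypothetical bad-block query: 1 if [sector, sector+len) hits a bad range. */
static int has_bad_block(const struct dev *d, uint64_t sector, uint64_t len)
{
	(void)d; (void)sector; (void)len;
	return 0;                   /* pretend the device is clean */
}

/*
 * Pick the device to read a stripe chunk from: prefer the replacement
 * if it already holds valid data for this range, otherwise the original.
 */
static const struct dev *pick_read_dev(const struct dev *orig,
				       const struct dev *repl,
				       uint64_t sector, uint64_t sectors)
{
	if (repl->present && !repl->faulty &&
	    repl->recovery_offset >= sector + sectors &&
	    !has_bad_block(repl, sector, sectors))
		return repl;
	return orig;
}

int main(void)
{
	struct dev orig = { .present = 1 };
	struct dev repl = { .present = 1, .recovery_offset = 1024 };

	printf("read from %s\n",
	       pick_read_dev(&orig, &repl, 512, 8) == &repl ?
	       "replacement" : "original");
	return 0;
}
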
+ */ + set_bit(R5_Insync, &dev->flags); + + if (test_bit(R5_WriteError, &dev->flags)) { + /* This flag does not apply to '.replacement' + * only to .rdev, so make sure to check that*/ + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].rdev); + if (rdev2 == rdev) + clear_bit(R5_Insync, &dev->flags); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev2->nr_pending); + } else + clear_bit(R5_WriteError, &dev->flags); + } + if (test_bit(R5_MadeGood, &dev->flags)) { + /* This flag does not apply to '.replacement' + * only to .rdev, so make sure to check that*/ + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].rdev); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev2->nr_pending); + } else + clear_bit(R5_MadeGood, &dev->flags); + } + if (test_bit(R5_MadeGoodRepl, &dev->flags)) { + struct md_rdev *rdev2 = rcu_dereference( + conf->disks[i].replacement); + if (rdev2 && !test_bit(Faulty, &rdev2->flags)) { + s->handle_bad_blocks = 1; + atomic_inc(&rdev2->nr_pending); + } else + clear_bit(R5_MadeGoodRepl, &dev->flags); } - if (!rdev || !test_bit(In_sync, &rdev->flags)) { + if (!test_bit(R5_Insync, &dev->flags)) { /* The ReadError flag will just be confusing now */ clear_bit(R5_ReadError, &dev->flags); clear_bit(R5_ReWrite, &dev->flags); } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { - s.failed++; - s.failed_num = i; - } else - set_bit(R5_Insync, &dev->flags); + if (test_bit(R5_ReadError, &dev->flags)) + clear_bit(R5_Insync, &dev->flags); + if (!test_bit(R5_Insync, &dev->flags)) { + if (s->failed < 2) + s->failed_num[s->failed] = i; + s->failed++; + if (rdev && !test_bit(Faulty, &rdev->flags)) + do_recovery = 1; + } + } + if (test_bit(STRIPE_SYNCING, &sh->state)) { + /* If there is a failed device being replaced, + * we must be recovering. + * else if we are after recovery_cp, we must be syncing + * else if MD_RECOVERY_REQUESTED is set, we also are syncing. + * else we can only be replacing + * sync and recovery both need to read all devices, and so + * use the same flag. 
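The syncing-versus-replacing classification just above can be restated as a small decision function: any failed-device recovery, any stripe beyond the clean checkpoint, or an explicit check request means a real sync; otherwise the full-stripe read is only feeding a replacement copy. A simplified model, with classify and its parameters invented for the example:

#include <stdio.h>

enum sync_kind { NOT_SYNCING, RESYNC_OR_CHECK, REPLACE_ONLY };

/*
 * Classify why a stripe is being read in full: recovery of a failed
 * device, resync past the clean checkpoint, an explicitly requested
 * check, or - if none of those apply - a pure replacement copy.
 */
static enum sync_kind classify(int sync_requested, int recovering_failed_dev,
			       unsigned long long stripe_sector,
			       unsigned long long recovery_cp,
			       int check_requested)
{
	if (!sync_requested)
		return NOT_SYNCING;
	if (recovering_failed_dev ||
	    stripe_sector >= recovery_cp ||
	    check_requested)
		return RESYNC_OR_CHECK;
	return REPLACE_ONLY;
}

int main(void)
{
	/* array is clean (recovery_cp at maximum), no failed device, no
	 * check requested: touching every device can only mean replacement */
	printf("%d (2 == replace only)\n",
	       classify(1, 0, 4096, ~0ULL, 0));
	return 0;
}
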
+ */ + if (do_recovery || + sh->sector >= conf->mddev->recovery_cp || + test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) + s->syncing = 1; + else + s->replacing = 1; } rcu_read_unlock(); +} + +static void handle_stripe(struct stripe_head *sh) +{ + struct stripe_head_state s; + struct r5conf *conf = sh->raid_conf; + int i; + int prexor; + int disks = sh->disks; + struct r5dev *pdev, *qdev; + + clear_bit(STRIPE_HANDLE, &sh->state); + if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { + /* already being handled, ensure it gets handled + * again when current action finishes */ + set_bit(STRIPE_HANDLE, &sh->state); + return; + } + + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + spin_lock(&sh->stripe_lock); + /* Cannot process 'sync' concurrently with 'discard' */ + if (!test_bit(STRIPE_DISCARD, &sh->state) && + test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + set_bit(STRIPE_SYNCING, &sh->state); + clear_bit(STRIPE_INSYNC, &sh->state); + clear_bit(STRIPE_REPLACED, &sh->state); + } + spin_unlock(&sh->stripe_lock); + } + clear_bit(STRIPE_DELAYED, &sh->state); - if (unlikely(blocked_rdev)) { + pr_debug("handling stripe %llu, state=%#lx cnt=%d, " + "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", + (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->qd_idx, + sh->check_state, sh->reconstruct_state); + + analyse_stripe(sh, &s); + + if (s.handle_bad_blocks) { + set_bit(STRIPE_HANDLE, &sh->state); + goto finish; + } + + if (unlikely(s.blocked_rdev)) { if (s.syncing || s.expanding || s.expanded || - s.to_write || s.written) { + s.replacing || s.to_write || s.written) { set_bit(STRIPE_HANDLE, &sh->state); - goto unlock; + goto finish; } /* There is nothing for the blocked_rdev to block */ - rdev_dec_pending(blocked_rdev, conf->mddev); - blocked_rdev = NULL; + rdev_dec_pending(s.blocked_rdev, conf->mddev); + s.blocked_rdev = NULL; } if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { @@ -2604,39 +3775,21 @@ static bool handle_stripe5(struct stripe_head *sh) } pr_debug("locked=%d uptodate=%d to_read=%d" - " to_write=%d failed=%d failed_num=%d\n", - s.locked, s.uptodate, s.to_read, s.to_write, - s.failed, s.failed_num); - /* check if the array has lost two devices and, if so, some requests might - * need to be failed + " to_write=%d failed=%d failed_num=%d,%d\n", + s.locked, s.uptodate, s.to_read, s.to_write, s.failed, + s.failed_num[0], s.failed_num[1]); + /* check if the array has lost more than max_degraded devices and, + * if so, some requests might need to be failed. 
*/ - if (s.failed > 1 && s.to_read+s.to_write+s.written) - handle_failed_stripe(conf, sh, &s, disks, &return_bi); - if (s.failed > 1 && s.syncing) { - md_done_sync(conf->mddev, STRIPE_SECTORS,0); - clear_bit(STRIPE_SYNCING, &sh->state); - s.syncing = 0; + if (s.failed > conf->max_degraded) { + sh->check_state = 0; + sh->reconstruct_state = 0; + if (s.to_read+s.to_write+s.written) + handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); + if (s.syncing + s.replacing) + handle_failed_sync(conf, sh, &s); } - /* might be able to return some write requests if the parity block - * is safe, or on a failed drive - */ - dev = &sh->dev[sh->pd_idx]; - if ( s.written && - ((test_bit(R5_Insync, &dev->flags) && - !test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) || - (s.failed == 1 && s.failed_num == sh->pd_idx))) - handle_stripe_clean_event(conf, sh, disks, &return_bi); - - /* Now we might consider reading some blocks, either to check/generate - * parity, or to satisfy requests - * or to load a block that is being partially written. - */ - if (s.to_read || s.non_overwrite || - (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) - handle_stripe_fill5(sh, &s, disks); - /* Now we check to see if any write operations have recently * completed */ @@ -2650,28 +3803,64 @@ static bool handle_stripe5(struct stripe_head *sh) /* All the 'written' buffers and the parity block are ready to * be written back to disk */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); + BUG_ON(sh->qd_idx >= 0 && + !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); for (i = disks; i--; ) { - dev = &sh->dev[i]; + struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_LOCKED, &dev->flags) && - (i == sh->pd_idx || dev->written)) { + (i == sh->pd_idx || i == sh->qd_idx || + dev->written)) { pr_debug("Writing block %d\n", i); set_bit(R5_Wantwrite, &dev->flags); if (prexor) continue; if (!test_bit(R5_Insync, &dev->flags) || - (i == sh->pd_idx && s.failed == 0)) + ((i == sh->pd_idx || i == sh->qd_idx) && + s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); } } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + s.dec_preread_active = 1; } + /* + * might be able to return some write requests if the parity blocks + * are safe, or on a failed drive + */ + pdev = &sh->dev[sh->pd_idx]; + s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); + qdev = &sh->dev[sh->qd_idx]; + s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) + || conf->level < 6; + + if (s.written && + (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) + && !test_bit(R5_LOCKED, &pdev->flags) + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && + (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) + && !test_bit(R5_LOCKED, &qdev->flags) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) + handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + + /* Now we might consider reading some blocks, either to check/generate + * parity, or to satisfy 
requests + * or to load a block that is being partially written. + */ + if (s.to_read || s.non_overwrite + || (conf->level == 6 && s.to_write && s.failed) + || (s.syncing && (s.uptodate + s.compute < disks)) + || s.replacing + || s.expanding) + handle_stripe_fill(sh, &s, disks); + /* Now to consider new write requests and what else, if anything * should be read. We do not handle new writes when: * 1/ A 'write' operation (copy+xor) is already in flight. @@ -2679,7 +3868,7 @@ static bool handle_stripe5(struct stripe_head *sh) * block. */ if (s.to_write && !sh->reconstruct_state && !sh->check_state) - handle_stripe_dirtying5(conf, sh, &s, disks); + handle_stripe_dirtying(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough @@ -2689,251 +3878,43 @@ static bool handle_stripe5(struct stripe_head *sh) if (sh->check_state || (s.syncing && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && - !test_bit(STRIPE_INSYNC, &sh->state))) - handle_parity_checks5(conf, sh, &s, disks); - - if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS,1); - clear_bit(STRIPE_SYNCING, &sh->state); - } - - /* If the failed drive is just a ReadError, then we might need to progress - * the repair/check process - */ - if (s.failed == 1 && !conf->mddev->ro && - test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) - && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) - && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) - ) { - dev = &sh->dev[s.failed_num]; - if (!test_bit(R5_ReWrite, &dev->flags)) { - set_bit(R5_Wantwrite, &dev->flags); - set_bit(R5_ReWrite, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } else { - /* let's read it back */ - set_bit(R5_Wantread, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } - } - - /* Finish reconstruct operations initiated by the expansion process */ - if (sh->reconstruct_state == reconstruct_state_result) { - sh->reconstruct_state = reconstruct_state_idle; - clear_bit(STRIPE_EXPANDING, &sh->state); - for (i = conf->raid_disks; i--; ) { - set_bit(R5_Wantwrite, &sh->dev[i].flags); - set_bit(R5_LOCKED, &sh->dev[i].flags); - s.locked++; - } - } - - if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && - !sh->reconstruct_state) { - /* Need to write out all blocks after computing parity */ - sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, - conf->raid_disks); - schedule_reconstruction5(sh, &s, 1, 1); - } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { - clear_bit(STRIPE_EXPAND_READY, &sh->state); - atomic_dec(&conf->reshape_stripes); - wake_up(&conf->wait_for_overlap); - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + !test_bit(STRIPE_INSYNC, &sh->state))) { + if (conf->level == 6) + handle_parity_checks6(conf, sh, &s, disks); + else + handle_parity_checks5(conf, sh, &s, disks); } - if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) - handle_stripe_expansion(conf, sh, NULL); - - unlock: - spin_unlock(&sh->lock); - - /* wait for this device to become unblocked */ - if (unlikely(blocked_rdev)) - md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - - if (s.ops_request) - raid5_run_ops(sh, s.ops_request); - - ops_run_io(sh, &s); - - return_io(return_bi); - - return blocked_rdev == NULL; -} - -static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) -{ - raid6_conf_t *conf = 
sh->raid_conf; - int disks = sh->disks; - struct bio *return_bi = NULL; - int i, pd_idx = sh->pd_idx; - struct stripe_head_state s; - struct r6_state r6s; - struct r5dev *dev, *pdev, *qdev; - mdk_rdev_t *blocked_rdev = NULL; - - r6s.qd_idx = raid6_next_disk(pd_idx, disks); - pr_debug("handling stripe %llu, state=%#lx cnt=%d, " - "pd_idx=%d, qd_idx=%d\n", - (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), pd_idx, r6s.qd_idx); - memset(&s, 0, sizeof(s)); - - spin_lock(&sh->lock); - clear_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - - s.syncing = test_bit(STRIPE_SYNCING, &sh->state); - s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); - s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ - - rcu_read_lock(); - for (i=disks; i--; ) { - mdk_rdev_t *rdev; - dev = &sh->dev[i]; - clear_bit(R5_Insync, &dev->flags); - - pr_debug("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read */ - if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { - struct bio *rbi, *rbi2; - pr_debug("Return read for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - rbi = dev->toread; - dev->toread = NULL; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { - copy_data(0, rbi, dev->page, dev->sector); - rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { - rbi->bi_next = return_bi; - return_bi = rbi; - } - spin_unlock_irq(&conf->device_lock); - rbi = rbi2; + if ((s.replacing || s.syncing) && s.locked == 0 + && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) + && !test_bit(STRIPE_REPLACED, &sh->state)) { + /* Write out to replacement devices where possible */ + for (i = 0; i < conf->raid_disks; i++) + if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) { + WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags)); + set_bit(R5_WantReplace, &sh->dev[i].flags); + set_bit(R5_LOCKED, &sh->dev[i].flags); + s.locked++; } - } - - /* now count some things */ - if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; - if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; - - - if (dev->toread) - s.to_read++; - if (dev->towrite) { - s.to_write++; - if (!test_bit(R5_OVERWRITE, &dev->flags)) - s.non_overwrite++; - } - if (dev->written) - s.written++; - rdev = rcu_dereference(conf->disks[i].rdev); - if (blocked_rdev == NULL && - rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - blocked_rdev = rdev; - atomic_inc(&rdev->nr_pending); - } - if (!rdev || !test_bit(In_sync, &rdev->flags)) { - /* The ReadError flag will just be confusing now */ - clear_bit(R5_ReadError, &dev->flags); - clear_bit(R5_ReWrite, &dev->flags); - } - if (!rdev || !test_bit(In_sync, &rdev->flags) - || test_bit(R5_ReadError, &dev->flags)) { - if (s.failed < 2) - r6s.failed_num[s.failed] = i; - s.failed++; - } else - set_bit(R5_Insync, &dev->flags); - } - rcu_read_unlock(); - - if (unlikely(blocked_rdev)) { - if (s.syncing || s.expanding || s.expanded || - s.to_write || s.written) { - set_bit(STRIPE_HANDLE, &sh->state); - goto unlock; - } - /* There is nothing for the blocked_rdev to block */ - rdev_dec_pending(blocked_rdev, conf->mddev); - blocked_rdev = NULL; - } - - pr_debug("locked=%d uptodate=%d to_read=%d" - " to_write=%d failed=%d failed_num=%d,%d\n", - s.locked, 
s.uptodate, s.to_read, s.to_write, s.failed, - r6s.failed_num[0], r6s.failed_num[1]); - /* check if the array has lost >2 devices and, if so, some requests - * might need to be failed - */ - if (s.failed > 2 && s.to_read+s.to_write+s.written) - handle_failed_stripe(conf, sh, &s, disks, &return_bi); - if (s.failed > 2 && s.syncing) { - md_done_sync(conf->mddev, STRIPE_SECTORS,0); - clear_bit(STRIPE_SYNCING, &sh->state); - s.syncing = 0; + if (s.replacing) + set_bit(STRIPE_INSYNC, &sh->state); + set_bit(STRIPE_REPLACED, &sh->state); } - - /* - * might be able to return some write requests if the parity blocks - * are safe, or on a failed drive - */ - pdev = &sh->dev[pd_idx]; - r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) - || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); - qdev = &sh->dev[r6s.qd_idx]; - r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx) - || (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx); - - if ( s.written && - ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) - && !test_bit(R5_LOCKED, &pdev->flags) - && test_bit(R5_UPTODATE, &pdev->flags)))) && - ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) - && !test_bit(R5_LOCKED, &qdev->flags) - && test_bit(R5_UPTODATE, &qdev->flags))))) - handle_stripe_clean_event(conf, sh, disks, &return_bi); - - /* Now we might consider reading some blocks, either to check/generate - * parity, or to satisfy requests - * or to load a block that is being partially written. - */ - if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || - (s.syncing && (s.uptodate < disks)) || s.expanding) - handle_stripe_fill6(sh, &s, &r6s, disks); - - /* now to consider writing and what else, if anything should be read */ - if (s.to_write) - handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); - - /* maybe we need to check and possibly fix the parity for this stripe - * Any reads will already have been scheduled, so we just see if enough - * data is available - */ - if (s.syncing && s.locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) - handle_parity_checks6(conf, sh, &s, &r6s, tmp_page, disks); - - if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS,1); + if ((s.syncing || s.replacing) && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && + test_bit(STRIPE_INSYNC, &sh->state)) { + md_done_sync(conf->mddev, STRIPE_SECTORS, 1); clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + wake_up(&conf->wait_for_overlap); } /* If the failed drives are just a ReadError, then we might need * to progress the repair/check process */ - if (s.failed <= 2 && !conf->mddev->ro) + if (s.failed <= conf->max_degraded && !conf->mddev->ro) for (i = 0; i < s.failed; i++) { - dev = &sh->dev[r6s.failed_num[i]]; + struct r5dev *dev = &sh->dev[s.failed_num[i]]; if (test_bit(R5_ReadError, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags) @@ -2942,27 +3923,52 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + s.locked++; } } } - if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { - /* Need to write out all blocks after computing P&Q */ - sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, - 
conf->raid_disks); - compute_parity6(sh, RECONSTRUCT_WRITE); - for (i = conf->raid_disks ; i-- ; ) { + + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + struct stripe_head *sh_src + = get_active_stripe(conf, sh->sector, 1, 1, 1); + if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { + /* sh cannot be written until sh_src has been read. + * so arrange for sh to be delayed a little + */ + set_bit(STRIPE_DELAYED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, + &sh_src->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe(sh_src); + goto finish; + } + if (sh_src) + release_stripe(sh_src); + + sh->reconstruct_state = reconstruct_state_idle; + clear_bit(STRIPE_EXPANDING, &sh->state); + for (i = conf->raid_disks; i--; ) { + set_bit(R5_Wantwrite, &sh->dev[i].flags); set_bit(R5_LOCKED, &sh->dev[i].flags); s.locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); } - clear_bit(STRIPE_EXPANDING, &sh->state); - } else if (s.expanded) { + } + + if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !sh->reconstruct_state) { + /* Need to write out all blocks after computing parity */ + sh->disks = conf->raid_disks; + stripe_set_idx(sh->sector, conf, 0, sh); + schedule_reconstruction(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -2971,34 +3977,74 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (s.expanding && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) - handle_stripe_expansion(conf, sh, &r6s); - - unlock: - spin_unlock(&sh->lock); + handle_stripe_expansion(conf, sh); +finish: /* wait for this device to become unblocked */ - if (unlikely(blocked_rdev)) - md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); + if (unlikely(s.blocked_rdev)) { + if (conf->mddev->external) + md_wait_for_blocked_rdev(s.blocked_rdev, + conf->mddev); + else + /* Internal metadata will immediately + * be written by raid5d, so we don't + * need to wait here. 
+ */ + rdev_dec_pending(s.blocked_rdev, + conf->mddev); + } - ops_run_io(sh, &s); + if (s.handle_bad_blocks) + for (i = disks; i--; ) { + struct md_rdev *rdev; + struct r5dev *dev = &sh->dev[i]; + if (test_and_clear_bit(R5_WriteError, &dev->flags)) { + /* We own a safe reference to the rdev */ + rdev = conf->disks[i].rdev; + if (!rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + md_error(conf->mddev, rdev); + rdev_dec_pending(rdev, conf->mddev); + } + if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { + rdev = conf->disks[i].rdev; + rdev_clear_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0); + rdev_dec_pending(rdev, conf->mddev); + } + if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { + rdev = conf->disks[i].replacement; + if (!rdev) + /* rdev have been moved down */ + rdev = conf->disks[i].rdev; + rdev_clear_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0); + rdev_dec_pending(rdev, conf->mddev); + } + } - return_io(return_bi); + if (s.ops_request) + raid_run_ops(sh, s.ops_request); - return blocked_rdev == NULL; -} + ops_run_io(sh, &s); -/* returns true if the stripe was handled */ -static bool handle_stripe(struct stripe_head *sh, struct page *tmp_page) -{ - if (sh->raid_conf->level == 6) - return handle_stripe6(sh, tmp_page); - else - return handle_stripe5(sh); -} + if (s.dec_preread_active) { + /* We delay this until after ops_run_io so that if make_request + * is waiting on a flush, it won't continue until the writes + * have actually been submitted. + */ + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + return_io(s.return_bi); + clear_bit_unlock(STRIPE_ACTIVE, &sh->state); +} -static void raid5_activate_delayed(raid5_conf_t *conf) +static void raid5_activate_delayed(struct r5conf *conf) { if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { while (!list_empty(&conf->delayed_list)) { @@ -3010,12 +4056,13 @@ static void raid5_activate_delayed(raid5_conf_t *conf) if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); list_add_tail(&sh->lru, &conf->hold_list); + raid5_wakeup_stripe_thread(sh); } - } else - blk_plug_device(conf->mddev->queue); + } } -static void activate_bit_delay(raid5_conf_t *conf) +static void activate_bit_delay(struct r5conf *conf, + struct list_head *temp_inactive_list) { /* device_lock is held */ struct list_head head; @@ -3023,71 +4070,40 @@ static void activate_bit_delay(raid5_conf_t *conf) list_del_init(&conf->bitmap_list); while (!list_empty(&head)) { struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + int hash; list_del_init(&sh->lru); atomic_inc(&sh->count); - __release_stripe(conf, sh); - } -} - -static void unplug_slaves(mddev_t *mddev) -{ - raid5_conf_t *conf = mddev_to_conf(mddev); - int i; - - rcu_read_lock(); - for (i=0; i<mddev->raid_disks; i++) { - mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { - struct request_queue *r_queue = bdev_get_queue(rdev->bdev); - - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - blk_unplug(r_queue); - - rdev_dec_pending(rdev, mddev); - rcu_read_lock(); - } - } - rcu_read_unlock(); -} - -static void raid5_unplug_device(struct request_queue *q) -{ - mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned long flags; - - spin_lock_irqsave(&conf->device_lock, flags); - - if (blk_remove_plug(q)) 
{ - conf->seq_flush++; - raid5_activate_delayed(conf); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); } - md_wakeup_thread(mddev->thread); - - spin_unlock_irqrestore(&conf->device_lock, flags); - - unplug_slaves(mddev); } -static int raid5_congested(void *data, int bits) +int md_raid5_congested(struct mddev *mddev, int bits) { - mddev_t *mddev = data; - raid5_conf_t *conf = mddev_to_conf(mddev); + struct r5conf *conf = mddev->private; /* No difference between reads and writes. Just check * how busy the stripe_cache is */ + if (conf->inactive_blocked) return 1; if (conf->quiesce) return 1; - if (list_empty_careful(&conf->inactive_list)) + if (atomic_read(&conf->empty_inactive_list_nr)) return 1; return 0; } +EXPORT_SYMBOL_GPL(md_raid5_congested); + +static int raid5_congested(void *data, int bits) +{ + struct mddev *mddev = data; + + return mddev_congested(mddev, bits) || + md_raid5_congested(mddev, bits); +} /* We want read requests to align with chunks where possible, * but write requests don't need to. @@ -3096,15 +4112,17 @@ static int raid5_mergeable_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - mddev_t *mddev = q->queuedata; + struct mddev *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; - unsigned int chunk_sectors = mddev->chunk_size >> 9; + unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int bio_sectors = bvm->bi_size >> 9; if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ + if (mddev->new_chunk_sectors < mddev->chunk_sectors) + chunk_sectors = mddev->new_chunk_sectors; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; if (max <= biovec->bv_len && bio_sectors == 0) @@ -3114,12 +4132,14 @@ static int raid5_mergeable_bvec(struct request_queue *q, } -static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) +static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); - unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); + unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int bio_sectors = bio_sectors(bio); + if (mddev->new_chunk_sectors < mddev->chunk_sectors) + chunk_sectors = mddev->new_chunk_sectors; return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -3128,7 +4148,7 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) * later sampled by raid5d. 
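The retry path referred to here is simply an intrusive LIFO: bios are pushed onto a singly linked list chained through bi_next and popped later by raid5d. A minimal user-space analogue, where fake_bio stands in for struct bio; the kernel does this under conf->device_lock, which the sketch omits:

#include <stdio.h>
#include <stddef.h>

struct fake_bio {
	int id;
	struct fake_bio *next;   /* plays the role of bi_next */
};

/* Push in O(1): the new entry becomes the head of the list. */
static void retry_push(struct fake_bio **head, struct fake_bio *bi)
{
	bi->next = *head;
	*head = bi;
}

/* Pop in O(1): take the most recently added entry, clearing its link. */
static struct fake_bio *retry_pop(struct fake_bio **head)
{
	struct fake_bio *bi = *head;

	if (bi) {
		*head = bi->next;
		bi->next = NULL;
	}
	return bi;
}

int main(void)
{
	struct fake_bio a = { .id = 1 }, b = { .id = 2 };
	struct fake_bio *head = NULL, *bi;

	retry_push(&head, &a);
	retry_push(&head, &b);
	while ((bi = retry_pop(&head)) != NULL)
		printf("retrying bio %d\n", bi->id);   /* prints 2 then 1 */
	return 0;
}
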
*/ -static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) +static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) { unsigned long flags; @@ -3142,7 +4162,7 @@ static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) } -static struct bio *remove_bio_from_retry(raid5_conf_t *conf) +static struct bio *remove_bio_from_retry(struct r5conf *conf) { struct bio *bi; @@ -3155,8 +4175,11 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf) if(bi) { conf->retry_read_aligned_list = bi->bi_next; bi->bi_next = NULL; - bi->bi_phys_segments = 1; /* biased count of active stripes */ - bi->bi_hw_segments = 0; /* count of processed stripes */ + /* + * this sets the active strip count to 1 and the processed + * strip count to zero (upper 8 bits) + */ + raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ } return bi; @@ -3172,21 +4195,23 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf) static void raid5_align_endio(struct bio *bi, int error) { struct bio* raid_bi = bi->bi_private; - mddev_t *mddev; - raid5_conf_t *conf; + struct mddev *mddev; + struct r5conf *conf; int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); - mdk_rdev_t *rdev; + struct md_rdev *rdev; bio_put(bi); - mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; - conf = mddev_to_conf(mddev); rdev = (void*)raid_bi->bi_next; raid_bi->bi_next = NULL; + mddev = rdev->mddev; + conf = mddev->private; rdev_dec_pending(rdev, conf->mddev); if (!error && uptodate) { + trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), + raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); @@ -3203,11 +4228,10 @@ static int bio_fits_rdev(struct bio *bi) { struct request_queue *q = bdev_get_queue(bi->bi_bdev); - if ((bi->bi_size>>9) > q->max_sectors) + if (bio_sectors(bi) > queue_max_sectors(q)) return 0; blk_recount_segments(q, bi); - if (bi->bi_phys_segments > q->max_phys_segments || - bi->bi_hw_segments > q->max_hw_segments) + if (bi->bi_phys_segments > queue_max_segments(q)) return 0; if (q->merge_bvec_fn) @@ -3220,24 +4244,22 @@ static int bio_fits_rdev(struct bio *bi) } -static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) +static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) { - mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); - const unsigned int raid_disks = conf->raid_disks; - const unsigned int data_disks = raid_disks - conf->max_degraded; - unsigned int dd_idx, pd_idx; + struct r5conf *conf = mddev->private; + int dd_idx; struct bio* align_bi; - mdk_rdev_t *rdev; + struct md_rdev *rdev; + sector_t end_sector; if (!in_chunk_boundary(mddev, raid_bio)) { pr_debug("chunk_aligned_read : non aligned\n"); return 0; } /* - * use bio_clone to make a copy of the bio + * use bio_clone_mddev to make a copy of the bio */ - align_bi = bio_clone(raid_bio, GFP_NOIO); + align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); if (!align_bi) return 0; /* @@ -3249,37 +4271,56 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) /* * compute position */ - align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector, - raid_disks, - data_disks, - &dd_idx, - &pd_idx, - conf); + align_bi->bi_iter.bi_sector = + raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector, + 0, &dd_idx, NULL); + end_sector = bio_end_sector(align_bi); rcu_read_lock(); - rdev = rcu_dereference(conf->disks[dd_idx].rdev); - if (rdev && test_bit(In_sync, &rdev->flags)) { 
+ rdev = rcu_dereference(conf->disks[dd_idx].replacement); + if (!rdev || test_bit(Faulty, &rdev->flags) || + rdev->recovery_offset < end_sector) { + rdev = rcu_dereference(conf->disks[dd_idx].rdev); + if (rdev && + (test_bit(Faulty, &rdev->flags) || + !(test_bit(In_sync, &rdev->flags) || + rdev->recovery_offset >= end_sector))) + rdev = NULL; + } + if (rdev) { + sector_t first_bad; + int bad_sectors; + atomic_inc(&rdev->nr_pending); rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); - align_bi->bi_sector += rdev->data_offset; - if (!bio_fits_rdev(align_bi)) { - /* too big in some way */ + if (!bio_fits_rdev(align_bi) || + is_badblock(rdev, align_bi->bi_iter.bi_sector, + bio_sectors(align_bi), + &first_bad, &bad_sectors)) { + /* too big in some way, or has a known bad block */ bio_put(align_bi); rdev_dec_pending(rdev, mddev); return 0; } + /* No reshape active, so we can trust rdev->data_offset */ + align_bi->bi_iter.bi_sector += rdev->data_offset; + spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, - conf->device_lock, /* nothing */); + conf->device_lock); atomic_inc(&conf->active_aligned_reads); spin_unlock_irq(&conf->device_lock); + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev), + align_bi, disk_devt(mddev->gendisk), + raid_bio->bi_iter.bi_sector); generic_make_request(align_bi); return 1; } else { @@ -3299,18 +4340,35 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) * head of the hold_list has changed, i.e. the head was promoted to the * handle_list. */ -static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) +static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) { - struct stripe_head *sh; + struct stripe_head *sh = NULL, *tmp; + struct list_head *handle_list = NULL; + struct r5worker_group *wg = NULL; + + if (conf->worker_cnt_per_group == 0) { + handle_list = &conf->handle_list; + } else if (group != ANY_GROUP) { + handle_list = &conf->worker_groups[group].handle_list; + wg = &conf->worker_groups[group]; + } else { + int i; + for (i = 0; i < conf->group_cnt; i++) { + handle_list = &conf->worker_groups[i].handle_list; + wg = &conf->worker_groups[i]; + if (!list_empty(handle_list)) + break; + } + } pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", __func__, - list_empty(&conf->handle_list) ? "empty" : "busy", + list_empty(handle_list) ? "empty" : "busy", list_empty(&conf->hold_list) ? 
"empty" : "busy", atomic_read(&conf->pending_full_writes), conf->bypass_count); - if (!list_empty(&conf->handle_list)) { - sh = list_entry(conf->handle_list.next, typeof(*sh), lru); + if (!list_empty(handle_list)) { + sh = list_entry(handle_list->next, typeof(*sh), lru); if (list_empty(&conf->hold_list)) conf->bypass_count = 0; @@ -3328,92 +4386,284 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) ((conf->bypass_threshold && conf->bypass_count > conf->bypass_threshold) || atomic_read(&conf->pending_full_writes) == 0)) { - sh = list_entry(conf->hold_list.next, - typeof(*sh), lru); - conf->bypass_count -= conf->bypass_threshold; - if (conf->bypass_count < 0) - conf->bypass_count = 0; - } else + + list_for_each_entry(tmp, &conf->hold_list, lru) { + if (conf->worker_cnt_per_group == 0 || + group == ANY_GROUP || + !cpu_online(tmp->cpu) || + cpu_to_group(tmp->cpu) == group) { + sh = tmp; + break; + } + } + + if (sh) { + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } + wg = NULL; + } + + if (!sh) return NULL; + if (wg) { + wg->stripes_cnt--; + sh->group = NULL; + } list_del_init(&sh->lru); - atomic_inc(&sh->count); - BUG_ON(atomic_read(&sh->count) != 1); + BUG_ON(atomic_inc_return(&sh->count) != 1); return sh; } -static int make_request(struct request_queue *q, struct bio * bi) +struct raid5_plug_cb { + struct blk_plug_cb cb; + struct list_head list; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; +}; + +static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) +{ + struct raid5_plug_cb *cb = container_of( + blk_cb, struct raid5_plug_cb, cb); + struct stripe_head *sh; + struct mddev *mddev = cb->cb.data; + struct r5conf *conf = mddev->private; + int cnt = 0; + int hash; + + if (cb->list.next && !list_empty(&cb->list)) { + spin_lock_irq(&conf->device_lock); + while (!list_empty(&cb->list)) { + sh = list_first_entry(&cb->list, struct stripe_head, lru); + list_del_init(&sh->lru); + /* + * avoid race release_stripe_plug() sees + * STRIPE_ON_UNPLUG_LIST clear but the stripe + * is still in our list + */ + smp_mb__before_atomic(); + clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); + /* + * STRIPE_ON_RELEASE_LIST could be set here. 
In that + * case, the count is always > 1 here + */ + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); + cnt++; + } + spin_unlock_irq(&conf->device_lock); + } + release_inactive_stripe_list(conf, cb->temp_inactive_list, + NR_STRIPE_HASH_LOCKS); + if (mddev->queue) + trace_block_unplug(mddev->queue, cnt, !from_schedule); + kfree(cb); +} + +static void release_stripe_plug(struct mddev *mddev, + struct stripe_head *sh) { - mddev_t *mddev = q->queuedata; - raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned int dd_idx, pd_idx; + struct blk_plug_cb *blk_cb = blk_check_plugged( + raid5_unplug, mddev, + sizeof(struct raid5_plug_cb)); + struct raid5_plug_cb *cb; + + if (!blk_cb) { + release_stripe(sh); + return; + } + + cb = container_of(blk_cb, struct raid5_plug_cb, cb); + + if (cb->list.next == NULL) { + int i; + INIT_LIST_HEAD(&cb->list); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(cb->temp_inactive_list + i); + } + + if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) + list_add_tail(&sh->lru, &cb->list); + else + release_stripe(sh); +} + +static void make_discard_request(struct mddev *mddev, struct bio *bi) +{ + struct r5conf *conf = mddev->private; + sector_t logical_sector, last_sector; + struct stripe_head *sh; + int remaining; + int stripe_sectors; + + if (mddev->reshape_position != MaxSector) + /* Skip discard while reshape is happening */ + return; + + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); + + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + + stripe_sectors = conf->chunk_sectors * + (conf->raid_disks - conf->max_degraded); + logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, + stripe_sectors); + sector_div(last_sector, stripe_sectors); + + logical_sector *= conf->chunk_sectors; + last_sector *= conf->chunk_sectors; + + for (; logical_sector < last_sector; + logical_sector += STRIPE_SECTORS) { + DEFINE_WAIT(w); + int d; + again: + sh = get_active_stripe(conf, logical_sector, 0, 0, 0); + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); + set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); + if (test_bit(STRIPE_SYNCING, &sh->state)) { + release_stripe(sh); + schedule(); + goto again; + } + clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); + spin_lock_irq(&sh->stripe_lock); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + if (sh->dev[d].towrite || sh->dev[d].toread) { + set_bit(R5_Overlap, &sh->dev[d].flags); + spin_unlock_irq(&sh->stripe_lock); + release_stripe(sh); + schedule(); + goto again; + } + } + set_bit(STRIPE_DISCARD, &sh->state); + finish_wait(&conf->wait_for_overlap, &w); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + sh->dev[d].towrite = bi; + set_bit(R5_OVERWRITE, &sh->dev[d].flags); + raid5_inc_bi_active_stripes(bi); + } + spin_unlock_irq(&sh->stripe_lock); + if (conf->mddev->bitmap) { + for (d = 0; + d < conf->raid_disks - conf->max_degraded; + d++) + bitmap_startwrite(mddev->bitmap, + sh->sector, + STRIPE_SECTORS, + 0); + sh->bm_seq = conf->seq_flush + 1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe_plug(mddev, sh); + } + + remaining = 
raid5_dec_bi_active_stripes(bi); + if (remaining == 0) { + md_write_end(mddev); + bio_endio(bi, 0); + } +} + +static void make_request(struct mddev *mddev, struct bio * bi) +{ + struct r5conf *conf = mddev->private; + int dd_idx; sector_t new_sector; sector_t logical_sector, last_sector; struct stripe_head *sh; const int rw = bio_data_dir(bi); int remaining; + DEFINE_WAIT(w); + bool do_prepare; - if (unlikely(bio_barrier(bi))) { - bio_endio(bi, -EOPNOTSUPP); - return 0; + if (unlikely(bi->bi_rw & REQ_FLUSH)) { + md_flush_request(mddev, bi); + return; } md_write_start(mddev, bi); - disk_stat_inc(mddev->gendisk, ios[rw]); - disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi)); - if (rw == READ && mddev->reshape_position == MaxSector && - chunk_aligned_read(q,bi)) - return 0; + chunk_aligned_read(mddev,bi)) + return; + + if (unlikely(bi->bi_rw & REQ_DISCARD)) { + make_discard_request(mddev, bi); + return; + } - logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_sector + (bi->bi_size>>9); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bio_end_sector(bi); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { - DEFINE_WAIT(w); - int disks, data_disks; + int previous; + int seq; + do_prepare = false; retry: - prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - if (likely(conf->expand_progress == MaxSector)) - disks = conf->raid_disks; - else { - /* spinlock is needed as expand_progress may be + seq = read_seqcount_begin(&conf->gen_lock); + previous = 0; + if (do_prepare) + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); + if (unlikely(conf->reshape_progress != MaxSector)) { + /* spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be * possible to see a half-updated value - * Ofcourse expand_progress could change after + * Of course reshape_progress could change after * the lock is dropped, so once we get a reference * to the stripe that we think it is, we will have * to check again. */ spin_lock_irq(&conf->device_lock); - disks = conf->raid_disks; - if (logical_sector >= conf->expand_progress) - disks = conf->previous_raid_disks; - else { - if (logical_sector >= conf->expand_lo) { + if (mddev->reshape_backwards + ? logical_sector < conf->reshape_progress + : logical_sector >= conf->reshape_progress) { + previous = 1; + } else { + if (mddev->reshape_backwards + ? 
logical_sector < conf->reshape_safe + : logical_sector >= conf->reshape_safe) { spin_unlock_irq(&conf->device_lock); schedule(); + do_prepare = true; goto retry; } } spin_unlock_irq(&conf->device_lock); } - data_disks = disks - conf->max_degraded; - new_sector = raid5_compute_sector(logical_sector, disks, data_disks, - &dd_idx, &pd_idx, conf); - pr_debug("raid5: make_request, sector %llu logical %llu\n", - (unsigned long long)new_sector, + new_sector = raid5_compute_sector(conf, logical_sector, + previous, + &dd_idx, NULL); + pr_debug("raid456: make_request, sector %llu logical %llu\n", + (unsigned long long)new_sector, (unsigned long long)logical_sector); - sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK)); + sh = get_active_stripe(conf, new_sector, previous, + (bi->bi_rw&RWA_MASK), 0); if (sh) { - if (unlikely(conf->expand_progress != MaxSector)) { + if (unlikely(previous)) { /* expansion might have moved on while waiting for a * stripe, so we must do the range check again. * Expansion could still move past after this @@ -3424,63 +4674,87 @@ static int make_request(struct request_queue *q, struct bio * bi) */ int must_retry = 0; spin_lock_irq(&conf->device_lock); - if (logical_sector < conf->expand_progress && - disks == conf->previous_raid_disks) + if (mddev->reshape_backwards + ? logical_sector >= conf->reshape_progress + : logical_sector < conf->reshape_progress) /* mismatch, need to try again */ must_retry = 1; spin_unlock_irq(&conf->device_lock); if (must_retry) { release_stripe(sh); + schedule(); + do_prepare = true; goto retry; } } - /* FIXME what if we get a false positive because these - * are being updated. - */ - if (logical_sector >= mddev->suspend_lo && + if (read_seqcount_retry(&conf->gen_lock, seq)) { + /* Might have got the wrong stripe_head + * by accident + */ + release_stripe(sh); + goto retry; + } + + if (rw == WRITE && + logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { release_stripe(sh); - schedule(); + /* As the suspend_* range is controlled by + * userspace, we want an interruptible + * wait. + */ + flush_signals(current); + prepare_to_wait(&conf->wait_for_overlap, + &w, TASK_INTERRUPTIBLE); + if (logical_sector >= mddev->suspend_lo && + logical_sector < mddev->suspend_hi) { + schedule(); + do_prepare = true; + } goto retry; } if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { + !add_stripe_bio(sh, bi, dd_idx, rw)) { /* Stripe is busy expanding or * add failed due to overlap. 
Flush everything * and wait a while */ - raid5_unplug_device(mddev->queue); + md_wakeup_thread(mddev->thread); release_stripe(sh); schedule(); + do_prepare = true; goto retry; } - finish_wait(&conf->wait_for_overlap, &w); set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); - release_stripe(sh); + if ((bi->bi_rw & REQ_SYNC) && + !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ clear_bit(BIO_UPTODATE, &bi->bi_flags); - finish_wait(&conf->wait_for_overlap, &w); break; } - } - spin_lock_irq(&conf->device_lock); - remaining = --bi->bi_phys_segments; - spin_unlock_irq(&conf->device_lock); + finish_wait(&conf->wait_for_overlap, &w); + + remaining = raid5_dec_bi_active_stripes(bi); if (remaining == 0) { if ( rw == WRITE ) md_write_end(mddev); + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); } - return 0; } -static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) +static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); + +static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped) { /* reshaping is quite different to recovery/resync so it is * handled quite separately ... here. @@ -3491,63 +4765,140 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * As the reads complete, handle_stripe will copy the data * into the destination stripe and release that stripe. */ - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + struct r5conf *conf = mddev->private; struct stripe_head *sh; - int pd_idx; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; int new_data_disks = conf->raid_disks - conf->max_degraded; int i; int dd_idx; - sector_t writepos, safepos, gap; - - if (sector_nr == 0 && - conf->expand_progress != 0) { - /* restarting in the middle, skip the initial sectors */ - sector_nr = conf->expand_progress; + sector_t writepos, readpos, safepos; + sector_t stripe_addr; + int reshape_sectors; + struct list_head stripes; + + if (sector_nr == 0) { + /* If restarting in the middle, skip the initial sectors */ + if (mddev->reshape_backwards && + conf->reshape_progress < raid5_size(mddev, 0, 0)) { + sector_nr = raid5_size(mddev, 0, 0) + - conf->reshape_progress; + } else if (!mddev->reshape_backwards && + conf->reshape_progress > 0) + sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); - *skipped = 1; - return sector_nr; + if (sector_nr) { + mddev->curr_resync_completed = sector_nr; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + *skipped = 1; + return sector_nr; + } } - /* we update the metadata when there is more than 3Meg - * in the block range (that is rather arbitrary, should - * probably be time based) or when the data about to be - * copied would over-write the source of the data at - * the front of the range. - * i.e. one new_stripe forward from expand_progress new_maps - * to after where expand_lo old_maps to + /* We need to process a full chunk at a time. 
+ * If old and new chunk sizes differ, we need to process the + * largest of these */ - writepos = conf->expand_progress + - conf->chunk_size/512*(new_data_disks); + if (mddev->new_chunk_sectors > mddev->chunk_sectors) + reshape_sectors = mddev->new_chunk_sectors; + else + reshape_sectors = mddev->chunk_sectors; + + /* We update the metadata at least every 10 seconds, or when + * the data about to be copied would over-write the source of + * the data at the front of the range. i.e. one new_stripe + * along from reshape_progress new_maps to after where + * reshape_safe old_maps to + */ + writepos = conf->reshape_progress; sector_div(writepos, new_data_disks); - safepos = conf->expand_lo; + readpos = conf->reshape_progress; + sector_div(readpos, data_disks); + safepos = conf->reshape_safe; sector_div(safepos, data_disks); - gap = conf->expand_progress - conf->expand_lo; + if (mddev->reshape_backwards) { + writepos -= min_t(sector_t, reshape_sectors, writepos); + readpos += reshape_sectors; + safepos += reshape_sectors; + } else { + writepos += reshape_sectors; + readpos -= min_t(sector_t, reshape_sectors, readpos); + safepos -= min_t(sector_t, reshape_sectors, safepos); + } + + /* Having calculated the 'writepos' possibly use it + * to set 'stripe_addr' which is where we will write to. + */ + if (mddev->reshape_backwards) { + BUG_ON(conf->reshape_progress == 0); + stripe_addr = writepos; + BUG_ON((mddev->dev_sectors & + ~((sector_t)reshape_sectors - 1)) + - reshape_sectors - stripe_addr + != sector_nr); + } else { + BUG_ON(writepos != sector_nr + reshape_sectors); + stripe_addr = sector_nr; + } + + /* 'writepos' is the most advanced device address we might write. + * 'readpos' is the least advanced device address we might read. + * 'safepos' is the least address recorded in the metadata as having + * been reshaped. + * If there is a min_offset_diff, these are adjusted either by + * increasing the safepos/readpos if diff is negative, or + * increasing writepos if diff is positive. + * If 'readpos' is then behind 'writepos', there is no way that we can + * ensure safety in the face of a crash - that must be done by userspace + * making a backup of the data. So in that case there is no particular + * rush to update metadata. + * Otherwise if 'safepos' is behind 'writepos', then we really need to + * update the metadata to advance 'safepos' to match 'readpos' so that + * we can be safe in the event of a crash. + * So we insist on updating metadata if safepos is behind writepos and + * readpos is beyond writepos. + * In any case, update the metadata every 10 seconds. + * Maybe that number should be configurable, but I'm not sure it is + * worth it.... maybe it could be a multiple of safemode_delay??? + */ + if (conf->min_offset_diff < 0) { + safepos += -conf->min_offset_diff; + readpos += -conf->min_offset_diff; + } else + writepos += conf->min_offset_diff; - if (writepos >= safepos || - gap > (new_data_disks)*3000*2 /*3Meg*/) { + if ((mddev->reshape_backwards + ? (safepos > writepos && readpos < writepos) + : (safepos < writepos && readpos > writepos)) || + time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... 
*/ wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes)==0); - mddev->reshape_position = conf->expand_progress; + atomic_read(&conf->reshape_stripes)==0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + return 0; + mddev->reshape_position = conf->reshape_progress; + mddev->curr_resync_completed = sector_nr; + conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || - kthread_should_stop()); + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + return 0; spin_lock_irq(&conf->device_lock); - conf->expand_lo = mddev->reshape_position; + conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { + INIT_LIST_HEAD(&stripes); + for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; - int skipped = 0; - pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); - sh = get_active_stripe(conf, sector_nr+i, - conf->raid_disks, pd_idx, 0); + int skipped_disk = 0; + sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3558,25 +4909,28 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped if (j == sh->pd_idx) continue; if (conf->level == 6 && - j == raid6_next_disk(sh->pd_idx, sh->disks)) + j == sh->qd_idx) continue; - s = compute_blocknr(sh, j); - if (s < mddev->array_sectors) { - skipped = 1; + s = compute_blocknr(sh, j, 0); + if (s < raid5_size(mddev, 0, 0)) { + skipped_disk = 1; continue; } memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); set_bit(R5_Expanded, &sh->dev[j].flags); set_bit(R5_UPTODATE, &sh->dev[j].flags); } - if (!skipped) { + if (!skipped_disk) { set_bit(STRIPE_EXPAND_READY, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); } - release_stripe(sh); + list_add(&sh->lru, &stripes); } spin_lock_irq(&conf->device_lock); - conf->expand_progress = (sector_nr + i) * new_data_disks; + if (mddev->reshape_backwards) + conf->reshape_progress -= reshape_sectors * new_data_disks; + else + conf->reshape_progress += reshape_sectors * new_data_disks; spin_unlock_irq(&conf->device_lock); /* Ok, those stripe are ready. We can start scheduling * reads on the source stripes. @@ -3584,63 +4938,74 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * block on the destination stripes. 
*/ first_sector = - raid5_compute_sector(sector_nr*(new_data_disks), - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); + raid5_compute_sector(conf, stripe_addr*(new_data_disks), + 1, &dd_idx, NULL); last_sector = - raid5_compute_sector((sector_nr+conf->chunk_size/512) - *(new_data_disks) -1, - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); - if (last_sector >= (mddev->size<<1)) - last_sector = (mddev->size<<1)-1; + raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) + * new_data_disks - 1), + 1, &dd_idx, NULL); + if (last_sector >= mddev->dev_sectors) + last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - pd_idx = stripe_to_pdidx(first_sector, conf, - conf->previous_raid_disks); - sh = get_active_stripe(conf, first_sector, - conf->previous_raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, first_sector, 1, 0, 1); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); first_sector += STRIPE_SECTORS; } + /* Now that the sources are clearly marked, we can release + * the destination stripes + */ + while (!list_empty(&stripes)) { + sh = list_entry(stripes.next, struct stripe_head, lru); + list_del_init(&sh->lru); + release_stripe(sh); + } /* If this takes us to the resync_max point where we have to pause, * then we need to write out the superblock. */ - sector_nr += conf->chunk_size>>9; - if (sector_nr >= mddev->resync_max) { + sector_nr += reshape_sectors; + if ((sector_nr - mddev->curr_resync_completed) * 2 + >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes) == 0); - mddev->reshape_position = conf->expand_progress; + atomic_read(&conf->reshape_stripes) == 0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + goto ret; + mddev->reshape_position = conf->reshape_progress; + mddev->curr_resync_completed = sector_nr; + conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) - || kthread_should_stop()); + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto ret; spin_lock_irq(&conf->device_lock); - conf->expand_lo = mddev->reshape_position; + conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - return conf->chunk_size>>9; +ret: + return reshape_sectors; } /* FIXME go_faster isn't used */ -static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) +static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + struct r5conf *conf = mddev->private; struct stripe_head *sh; - int pd_idx; - int raid_disks = conf->raid_disks; - sector_t max_sector = mddev->size << 1; - int sync_blocks; + sector_t max_sector = mddev->dev_sectors; + sector_t sync_blocks; int still_degraded = 0; int i; if (sector_nr >= max_sector) { /* just being told to finish up .. 
nothing much to do */ - unplug_slaves(mddev); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { end_reshape(conf); return 0; @@ -3656,6 +5021,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski return 0; } + /* Allow raid5_quiesce to complete */ + wait_event(conf->wait_for_overlap, conf->quiesce != 2); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); @@ -3671,26 +5039,25 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski */ if (mddev->degraded >= conf->max_degraded && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - sector_t rv = (mddev->size << 1) - sector_nr; + sector_t rv = mddev->dev_sectors - sector_nr; *skipped = 1; return rv; } - if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && - !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { + if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + !conf->fullsync && + !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + sync_blocks >= STRIPE_SECTORS) { /* we can skip this block, and probably more */ sync_blocks /= STRIPE_SECTORS; *skipped = 1; return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); - pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); - sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); + sh = get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); + sh = get_active_stripe(conf, sector_nr, 0, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -3700,26 +5067,21 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski * We don't need to check the 'failed' flag as when that gets set, * recovery aborts. */ - for (i=0; i<mddev->raid_disks; i++) + for (i = 0; i < conf->raid_disks; i++) if (conf->disks[i].rdev == NULL) still_degraded = 1; bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); - spin_lock(&sh->lock); - set_bit(STRIPE_SYNCING, &sh->state); - clear_bit(STRIPE_INSYNC, &sh->state); - spin_unlock(&sh->lock); + set_bit(STRIPE_SYNC_REQUESTED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); - /* wait for any blocked device to be handled */ - while(unlikely(!handle_stripe(sh, NULL))) - ; release_stripe(sh); return STRIPE_SECTORS; } -static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) +static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) { /* We may not be able to submit a whole bio at once as there * may not be enough stripe_heads available. @@ -3732,62 +5094,135 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 
*/ struct stripe_head *sh; - int dd_idx, pd_idx; + int dd_idx; sector_t sector, logical_sector, last_sector; int scnt = 0; int remaining; int handled = 0; - logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - sector = raid5_compute_sector( logical_sector, - conf->raid_disks, - conf->raid_disks - conf->max_degraded, - &dd_idx, - &pd_idx, - conf); - last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); + logical_sector = raid_bio->bi_iter.bi_sector & + ~((sector_t)STRIPE_SECTORS-1); + sector = raid5_compute_sector(conf, logical_sector, + 0, &dd_idx, NULL); + last_sector = bio_end_sector(raid_bio); for (; logical_sector < last_sector; logical_sector += STRIPE_SECTORS, sector += STRIPE_SECTORS, scnt++) { - if (scnt < raid_bio->bi_hw_segments) + if (scnt < raid5_bi_processed_stripes(raid_bio)) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1); + sh = get_active_stripe(conf, sector, 0, 1, 1); if (!sh) { /* failed to get a stripe - must wait */ - raid_bio->bi_hw_segments = scnt; + raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } - set_bit(R5_ReadError, &sh->dev[dd_idx].flags); if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { release_stripe(sh); - raid_bio->bi_hw_segments = scnt; + raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; } - handle_stripe(sh, NULL); + set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); + handle_stripe(sh); release_stripe(sh); handled++; } - spin_lock_irq(&conf->device_lock); - remaining = --raid_bio->bi_phys_segments; - spin_unlock_irq(&conf->device_lock); - if (remaining == 0) + remaining = raid5_dec_bi_active_stripes(raid_bio); + if (remaining == 0) { + trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), + raid_bio, 0); bio_endio(raid_bio, 0); + } if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; } +static int handle_active_stripes(struct r5conf *conf, int group, + struct r5worker *worker, + struct list_head *temp_inactive_list) +{ + struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; + int i, batch_size = 0, hash; + bool release_inactive = false; + + while (batch_size < MAX_STRIPE_BATCH && + (sh = __get_priority_stripe(conf, group)) != NULL) + batch[batch_size++] = sh; + + if (batch_size == 0) { + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + if (!list_empty(temp_inactive_list + i)) + break; + if (i == NR_STRIPE_HASH_LOCKS) + return batch_size; + release_inactive = true; + } + spin_unlock_irq(&conf->device_lock); + + release_inactive_stripe_list(conf, temp_inactive_list, + NR_STRIPE_HASH_LOCKS); + + if (release_inactive) { + spin_lock_irq(&conf->device_lock); + return 0; + } + + for (i = 0; i < batch_size; i++) + handle_stripe(batch[i]); + cond_resched(); + + spin_lock_irq(&conf->device_lock); + for (i = 0; i < batch_size; i++) { + hash = batch[i]->hash_lock_index; + __release_stripe(conf, batch[i], &temp_inactive_list[hash]); + } + return batch_size; +} + +static void raid5_do_work(struct work_struct *work) +{ + struct r5worker *worker = container_of(work, struct r5worker, work); + struct r5worker_group *group = worker->group; + struct r5conf *conf = group->conf; + int group_id = group - conf->worker_groups; + int handled; + struct blk_plug plug; + + pr_debug("+++ raid5worker active\n"); + + blk_start_plug(&plug); + handled = 0; + spin_lock_irq(&conf->device_lock); + while (1) { + int batch_size, released; + + released = 
release_stripe_list(conf, worker->temp_inactive_list); + + batch_size = handle_active_stripes(conf, group_id, worker, + worker->temp_inactive_list); + worker->working = false; + if (!batch_size && !released) + break; + handled += batch_size; + } + pr_debug("%d stripes handled\n", handled); + + spin_unlock_irq(&conf->device_lock); + blk_finish_plug(&plug); + + pr_debug("--- raid5worker inactive\n"); +} /* * This is our raid5 kernel thread. @@ -3796,29 +5231,37 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. */ -static void raid5d(mddev_t *mddev) +static void raid5d(struct md_thread *thread) { - struct stripe_head *sh; - raid5_conf_t *conf = mddev_to_conf(mddev); + struct mddev *mddev = thread->mddev; + struct r5conf *conf = mddev->private; int handled; + struct blk_plug plug; pr_debug("+++ raid5d active\n"); md_check_recovery(mddev); + blk_start_plug(&plug); handled = 0; spin_lock_irq(&conf->device_lock); while (1) { struct bio *bio; + int batch_size, released; - if (conf->seq_flush != conf->seq_write) { - int seq = conf->seq_flush; + released = release_stripe_list(conf, conf->temp_inactive_list); + + if ( + !list_empty(&conf->bitmap_list)) { + /* Now is a good time to flush some bitmap updates */ + conf->seq_flush++; spin_unlock_irq(&conf->device_lock); bitmap_unplug(mddev->bitmap); spin_lock_irq(&conf->device_lock); - conf->seq_write = seq; - activate_bit_delay(conf); + conf->seq_write = conf->seq_flush; + activate_bit_delay(conf, conf->temp_inactive_list); } + raid5_activate_delayed(conf); while ((bio = remove_bio_from_retry(conf))) { int ok; @@ -3830,68 +5273,88 @@ static void raid5d(mddev_t *mddev) handled++; } - sh = __get_priority_stripe(conf); - - if (!sh) + batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, + conf->temp_inactive_list); + if (!batch_size && !released) break; - spin_unlock_irq(&conf->device_lock); - - handled++; - handle_stripe(sh, conf->spare_page); - release_stripe(sh); + handled += batch_size; - spin_lock_irq(&conf->device_lock); + if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { + spin_unlock_irq(&conf->device_lock); + md_check_recovery(mddev); + spin_lock_irq(&conf->device_lock); + } } pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); async_tx_issue_pending_all(); - unplug_slaves(mddev); + blk_finish_plug(&plug); pr_debug("--- raid5d inactive\n"); } static ssize_t -raid5_show_stripe_cache_size(mddev_t *mddev, char *page) +raid5_show_stripe_cache_size(struct mddev *mddev, char *page) { - raid5_conf_t *conf = mddev_to_conf(mddev); + struct r5conf *conf = mddev->private; if (conf) return sprintf(page, "%d\n", conf->max_nr_stripes); else return 0; } -static ssize_t -raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) +int +raid5_set_cache_size(struct mddev *mddev, int size) { - raid5_conf_t *conf = mddev_to_conf(mddev); - unsigned long new; + struct r5conf *conf = mddev->private; int err; + int hash; - if (len >= PAGE_SIZE) - return -EINVAL; - if (!conf) - return -ENODEV; - - if (strict_strtoul(page, 10, &new)) + if (size <= 16 || size > 32768) return -EINVAL; - if (new <= 16 || new > 32768) - return -EINVAL; - while (new < conf->max_nr_stripes) { - if (drop_one_stripe(conf)) + hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; + while (size < conf->max_nr_stripes) { + if (drop_one_stripe(conf, hash)) conf->max_nr_stripes--; else break; + 
hash--; + if (hash < 0) + hash = NR_STRIPE_HASH_LOCKS - 1; } err = md_allow_write(mddev); if (err) return err; - while (new > conf->max_nr_stripes) { - if (grow_one_stripe(conf)) + hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; + while (size > conf->max_nr_stripes) { + if (grow_one_stripe(conf, hash)) conf->max_nr_stripes++; else break; + hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; } + return 0; +} +EXPORT_SYMBOL(raid5_set_cache_size); + +static ssize_t +raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + int err; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + err = raid5_set_cache_size(mddev, new); + if (err) + return err; return len; } @@ -3901,9 +5364,9 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, raid5_store_stripe_cache_size); static ssize_t -raid5_show_preread_threshold(mddev_t *mddev, char *page) +raid5_show_preread_threshold(struct mddev *mddev, char *page) { - raid5_conf_t *conf = mddev_to_conf(mddev); + struct r5conf *conf = mddev->private; if (conf) return sprintf(page, "%d\n", conf->bypass_threshold); else @@ -3911,16 +5374,16 @@ raid5_show_preread_threshold(mddev_t *mddev, char *page) } static ssize_t -raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) +raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) { - raid5_conf_t *conf = mddev_to_conf(mddev); + struct r5conf *conf = mddev->private; unsigned long new; if (len >= PAGE_SIZE) return -EINVAL; if (!conf) return -ENODEV; - if (strict_strtoul(page, 10, &new)) + if (kstrtoul(page, 10, &new)) return -EINVAL; if (new > conf->max_nr_stripes) return -EINVAL; @@ -3935,9 +5398,53 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, raid5_store_preread_threshold); static ssize_t -stripe_cache_active_show(mddev_t *mddev, char *page) +raid5_show_skip_copy(struct mddev *mddev, char *page) { - raid5_conf_t *conf = mddev_to_conf(mddev); + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->skip_copy); + else + return 0; +} + +static ssize_t +raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + new = !!new; + if (new == conf->skip_copy) + return len; + + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + mddev->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + mddev->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; + mddev_resume(mddev); + return len; +} + +static struct md_sysfs_entry +raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, + raid5_show_skip_copy, + raid5_store_skip_copy); + + +static ssize_t +stripe_cache_active_show(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; if (conf) return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); else @@ -3947,10 +5454,79 @@ stripe_cache_active_show(mddev_t *mddev, char *page) static struct md_sysfs_entry raid5_stripecache_active = __ATTR_RO(stripe_cache_active); +static ssize_t +raid5_show_group_thread_cnt(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->worker_cnt_per_group); + else + return 0; +} + +static int 
alloc_thread_groups(struct r5conf *conf, int cnt, + int *group_cnt, + int *worker_cnt_per_group, + struct r5worker_group **worker_groups); +static ssize_t +raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + int err; + struct r5worker_group *new_groups, *old_groups; + int group_cnt, worker_cnt_per_group; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + if (new == conf->worker_cnt_per_group) + return len; + + mddev_suspend(mddev); + + old_groups = conf->worker_groups; + if (old_groups) + flush_workqueue(raid5_wq); + + err = alloc_thread_groups(conf, new, + &group_cnt, &worker_cnt_per_group, + &new_groups); + if (!err) { + spin_lock_irq(&conf->device_lock); + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_groups; + spin_unlock_irq(&conf->device_lock); + + if (old_groups) + kfree(old_groups[0].workers); + kfree(old_groups); + } + + mddev_resume(mddev); + + if (err) + return err; + return len; +} + +static struct md_sysfs_entry +raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, + raid5_show_group_thread_cnt, + raid5_store_group_thread_cnt); + static struct attribute *raid5_attrs[] = { &raid5_stripecache_size.attr, &raid5_stripecache_active.attr, &raid5_preread_bypass_threshold.attr, + &raid5_group_thread_cnt.attr, + &raid5_skip_copy.attr, NULL, }; static struct attribute_group raid5_attrs_group = { @@ -3958,441 +5534,895 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int run(mddev_t *mddev) +static int alloc_thread_groups(struct r5conf *conf, int cnt, + int *group_cnt, + int *worker_cnt_per_group, + struct r5worker_group **worker_groups) +{ + int i, j, k; + ssize_t size; + struct r5worker *workers; + + *worker_cnt_per_group = cnt; + if (cnt == 0) { + *group_cnt = 0; + *worker_groups = NULL; + return 0; + } + *group_cnt = num_possible_nodes(); + size = sizeof(struct r5worker) * cnt; + workers = kzalloc(size * *group_cnt, GFP_NOIO); + *worker_groups = kzalloc(sizeof(struct r5worker_group) * + *group_cnt, GFP_NOIO); + if (!*worker_groups || !workers) { + kfree(workers); + kfree(*worker_groups); + return -ENOMEM; + } + + for (i = 0; i < *group_cnt; i++) { + struct r5worker_group *group; + + group = &(*worker_groups)[i]; + INIT_LIST_HEAD(&group->handle_list); + group->conf = conf; + group->workers = workers + i * cnt; + + for (j = 0; j < cnt; j++) { + struct r5worker *worker = group->workers + j; + worker->group = group; + INIT_WORK(&worker->work, raid5_do_work); + + for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) + INIT_LIST_HEAD(worker->temp_inactive_list + k); + } + } + + return 0; +} + +static void free_thread_groups(struct r5conf *conf) +{ + if (conf->worker_groups) + kfree(conf->worker_groups[0].workers); + kfree(conf->worker_groups); + conf->worker_groups = NULL; +} + +static sector_t +raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + struct r5conf *conf = mddev->private; + + if (!sectors) + sectors = mddev->dev_sectors; + if (!raid_disks) + /* size is defined by the smallest of previous and new size */ + raid_disks = min(conf->raid_disks, conf->previous_raid_disks); + + sectors &= ~((sector_t)mddev->chunk_sectors - 1); + sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); + return sectors * (raid_disks - conf->max_degraded); +} + +static void free_scratch_buffer(struct r5conf 
*conf, struct raid5_percpu *percpu) +{ + safe_put_page(percpu->spare_page); + kfree(percpu->scribble); + percpu->spare_page = NULL; + percpu->scribble = NULL; +} + +static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu) +{ + if (conf->level == 6 && !percpu->spare_page) + percpu->spare_page = alloc_page(GFP_KERNEL); + if (!percpu->scribble) + percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); + + if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) { + free_scratch_buffer(conf, percpu); + return -ENOMEM; + } + + return 0; +} + +static void raid5_free_percpu(struct r5conf *conf) +{ + unsigned long cpu; + + if (!conf->percpu) + return; + +#ifdef CONFIG_HOTPLUG_CPU + unregister_cpu_notifier(&conf->cpu_notify); +#endif + + get_online_cpus(); + for_each_possible_cpu(cpu) + free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); + put_online_cpus(); + + free_percpu(conf->percpu); +} + +static void free_conf(struct r5conf *conf) +{ + free_thread_groups(conf); + shrink_stripes(conf); + raid5_free_percpu(conf); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); + long cpu = (long)hcpu; + struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (alloc_scratch_buffer(conf, percpu)) { + pr_err("%s: failed memory allocation for cpu%ld\n", + __func__, cpu); + return notifier_from_errno(-ENOMEM); + } + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); + break; + default: + break; + } + return NOTIFY_OK; +} +#endif + +static int raid5_alloc_percpu(struct r5conf *conf) +{ + unsigned long cpu; + int err = 0; + + conf->percpu = alloc_percpu(struct raid5_percpu); + if (!conf->percpu) + return -ENOMEM; + +#ifdef CONFIG_HOTPLUG_CPU + conf->cpu_notify.notifier_call = raid456_cpu_notify; + conf->cpu_notify.priority = 0; + err = register_cpu_notifier(&conf->cpu_notify); + if (err) + return err; +#endif + + get_online_cpus(); + for_each_present_cpu(cpu) { + err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); + if (err) { + pr_err("%s: failed memory allocation for cpu%ld\n", + __func__, cpu); + break; + } + } + put_online_cpus(); + + return err; +} + +static struct r5conf *setup_conf(struct mddev *mddev) { - raid5_conf_t *conf; - int raid_disk, memory; - mdk_rdev_t *rdev; + struct r5conf *conf; + int raid_disk, memory, max_disks; + struct md_rdev *rdev; struct disk_info *disk; - struct list_head *tmp; + char pers_name[6]; + int i; + int group_cnt, worker_cnt_per_group; + struct r5worker_group *new_group; + + if (mddev->new_level != 5 + && mddev->new_level != 4 + && mddev->new_level != 6) { + printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", + mdname(mddev), mddev->new_level); + return ERR_PTR(-EIO); + } + if ((mddev->new_level == 5 + && !algorithm_valid_raid5(mddev->new_layout)) || + (mddev->new_level == 6 + && !algorithm_valid_raid6(mddev->new_layout))) { + printk(KERN_ERR "md/raid:%s: layout %d not supported\n", + mdname(mddev), mddev->new_layout); + return ERR_PTR(-EIO); + } + if (mddev->new_level == 6 && mddev->raid_disks < 4) { + printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", + mdname(mddev), mddev->raid_disks); + return ERR_PTR(-EINVAL); + } + 
+ if (!mddev->new_chunk_sectors || + (mddev->new_chunk_sectors << 9) % PAGE_SIZE || + !is_power_of_2(mddev->new_chunk_sectors)) { + printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", + mdname(mddev), mddev->new_chunk_sectors << 9); + return ERR_PTR(-EINVAL); + } + + conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); + if (conf == NULL) + goto abort; + /* Don't enable multi-threading by default */ + if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, + &new_group)) { + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_group; + } else + goto abort; + spin_lock_init(&conf->device_lock); + seqcount_init(&conf->gen_lock); + init_waitqueue_head(&conf->wait_for_stripe); + init_waitqueue_head(&conf->wait_for_overlap); + INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->hold_list); + INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->bitmap_list); + init_llist_head(&conf->released_stripes); + atomic_set(&conf->active_stripes, 0); + atomic_set(&conf->preread_active_stripes, 0); + atomic_set(&conf->active_aligned_reads, 0); + conf->bypass_threshold = BYPASS_THRESHOLD; + conf->recovery_disabled = mddev->recovery_disabled - 1; + + conf->raid_disks = mddev->raid_disks; + if (mddev->reshape_position == MaxSector) + conf->previous_raid_disks = mddev->raid_disks; + else + conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + max_disks = max(conf->raid_disks, conf->previous_raid_disks); + conf->scribble_len = scribble_len(max_disks); + + conf->disks = kzalloc(max_disks * sizeof(struct disk_info), + GFP_KERNEL); + if (!conf->disks) + goto abort; + + conf->mddev = mddev; + + if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) + goto abort; + + /* We init hash_locks[0] separately so that it can be used + * as the reference lock in the spin_lock_nest_lock() call + * in lock_all_device_hash_locks_irq in order to convince + * lockdep that we know what we are doing.
+ */ + spin_lock_init(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_init(conf->hash_locks + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->inactive_list + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->temp_inactive_list + i); + + conf->level = mddev->new_level; + if (raid5_alloc_percpu(conf) != 0) + goto abort; + + pr_debug("raid456: run(%s) called.\n", mdname(mddev)); + + rdev_for_each(rdev, mddev) { + raid_disk = rdev->raid_disk; + if (raid_disk >= max_disks + || raid_disk < 0) + continue; + disk = conf->disks + raid_disk; + + if (test_bit(Replacement, &rdev->flags)) { + if (disk->replacement) + goto abort; + disk->replacement = rdev; + } else { + if (disk->rdev) + goto abort; + disk->rdev = rdev; + } + + if (test_bit(In_sync, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + printk(KERN_INFO "md/raid:%s: device %s operational as raid" + " disk %d\n", + mdname(mddev), bdevname(rdev->bdev, b), raid_disk); + } else if (rdev->saved_raid_disk != raid_disk) + /* Cannot rely on bitmap to complete recovery */ + conf->fullsync = 1; + } + + conf->chunk_sectors = mddev->new_chunk_sectors; + conf->level = mddev->new_level; + if (conf->level == 6) + conf->max_degraded = 2; + else + conf->max_degraded = 1; + conf->algorithm = mddev->new_layout; + conf->reshape_progress = mddev->reshape_position; + if (conf->reshape_progress != MaxSector) { + conf->prev_chunk_sectors = mddev->chunk_sectors; + conf->prev_algo = mddev->layout; + } + + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; + atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); + if (grow_stripes(conf, NR_STRIPES)) { + printk(KERN_ERR + "md/raid:%s: couldn't allocate %dkB for buffers\n", + mdname(mddev), memory); + goto abort; + } else + printk(KERN_INFO "md/raid:%s: allocated %dkB\n", + mdname(mddev), memory); + + sprintf(pers_name, "raid%d", mddev->new_level); + conf->thread = md_register_thread(raid5d, mddev, pers_name); + if (!conf->thread) { + printk(KERN_ERR + "md/raid:%s: couldn't allocate thread.\n", + mdname(mddev)); + goto abort; + } + + return conf; + + abort: + if (conf) { + free_conf(conf); + return ERR_PTR(-EIO); + } else + return ERR_PTR(-ENOMEM); +} + + +static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) +{ + switch (algo) { + case ALGORITHM_PARITY_0: + if (raid_disk < max_degraded) + return 1; + break; + case ALGORITHM_PARITY_N: + if (raid_disk >= raid_disks - max_degraded) + return 1; + break; + case ALGORITHM_PARITY_0_6: + if (raid_disk == 0 || + raid_disk == raid_disks - 1) + return 1; + break; + case ALGORITHM_LEFT_ASYMMETRIC_6: + case ALGORITHM_RIGHT_ASYMMETRIC_6: + case ALGORITHM_LEFT_SYMMETRIC_6: + case ALGORITHM_RIGHT_SYMMETRIC_6: + if (raid_disk == raid_disks - 1) + return 1; + } + return 0; +} + +static int run(struct mddev *mddev) +{ + struct r5conf *conf; int working_disks = 0; + int dirty_parity_disks = 0; + struct md_rdev *rdev; + sector_t reshape_offset = 0; + int i; + long long min_offset_diff = 0; + int first = 1; + + if (mddev->recovery_cp != MaxSector) + printk(KERN_NOTICE "md/raid:%s: not clean" + " -- starting background reconstruction\n", + mdname(mddev)); - if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { - printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", - mdname(mddev), mddev->level); - return -EIO; + rdev_for_each(rdev, mddev) { + long long diff; + if (rdev->raid_disk < 0) + 
continue; + diff = (rdev->new_data_offset - rdev->data_offset); + if (first) { + min_offset_diff = diff; + first = 0; + } else if (mddev->reshape_backwards && + diff < min_offset_diff) + min_offset_diff = diff; + else if (!mddev->reshape_backwards && + diff > min_offset_diff) + min_offset_diff = diff; } if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. - * Currently only disks can change, it must - * increase, and we must be past the point where - * a stripe over-writes itself + * Difficulties arise if the stripe we would write to + * next is at or after the stripe we would read from next. + * For a reshape that changes the number of devices, this + * is only possible for a very short time, and mdadm makes + * sure that time appears to have passed before assembling + * the array. So we fail if that time hasn't passed. + * For a reshape that keeps the number of devices the same + * mdadm must be monitoring the reshape and keeping the + * critical areas read-only and backed up. It will start + * the array in read-only mode, so we check for that. */ sector_t here_new, here_old; int old_disks; - int max_degraded = (mddev->level == 5 ? 1 : 2); + int max_degraded = (mddev->level == 6 ? 2 : 1); - if (mddev->new_level != mddev->level || - mddev->new_layout != mddev->layout || - mddev->new_chunk != mddev->chunk_size) { - printk(KERN_ERR "raid5: %s: unsupported reshape " + if (mddev->new_level != mddev->level) { + printk(KERN_ERR "md/raid:%s: unsupported reshape " "required - aborting.\n", mdname(mddev)); return -EINVAL; } - if (mddev->delta_disks <= 0) { - printk(KERN_ERR "raid5: %s: unsupported reshape " - "(reduce disks) required - aborting.\n", - mdname(mddev)); - return -EINVAL; - } old_disks = mddev->raid_disks - mddev->delta_disks; /* reshape_position must be on a new-stripe boundary, and one * further up in new geometry must map after here in old * geometry. */ here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->chunk_size>>9)* + if (sector_div(here_new, mddev->new_chunk_sectors * (mddev->raid_disks - max_degraded))) { - printk(KERN_ERR "raid5: reshape_position not " - "on a stripe boundary\n"); + printk(KERN_ERR "md/raid:%s: reshape_position not " + "on a stripe boundary\n", mdname(mddev)); return -EINVAL; } + reshape_offset = here_new * mddev->new_chunk_sectors; /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; - sector_div(here_old, (mddev->chunk_size>>9)* + sector_div(here_old, mddev->chunk_sectors * (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ - if (here_new >= here_old) { + if (mddev->delta_disks == 0) { + if ((here_new * mddev->new_chunk_sectors != + here_old * mddev->chunk_sectors)) { + printk(KERN_ERR "md/raid:%s: reshape position is" + " confused - aborting\n", mdname(mddev)); + return -EINVAL; + } + /* We cannot be sure it is safe to start an in-place + * reshape. It is only safe if user-space is monitoring + * and taking constant backups. + * mdadm always starts a situation like this in + * readonly mode so it can take control before + * allowing any writes. So just check for that. + */ + if (abs(min_offset_diff) >= mddev->chunk_sectors && + abs(min_offset_diff) >= mddev->new_chunk_sectors) + /* not really in-place - so OK */; + else if (mddev->ro == 0) { + printk(KERN_ERR "md/raid:%s: in-place reshape " + "must be started in read-only mode " + "- aborting\n", + mdname(mddev)); + return -EINVAL; + } + } else if (mddev->reshape_backwards + ?
(here_new * mddev->new_chunk_sectors + min_offset_diff <= + here_old * mddev->chunk_sectors) + : (here_new * mddev->new_chunk_sectors >= + here_old * mddev->chunk_sectors + (-min_offset_diff))) { /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "raid5: reshape_position too early for " - "auto-recovery - aborting.\n"); + printk(KERN_ERR "md/raid:%s: reshape_position too early for " + "auto-recovery - aborting.\n", + mdname(mddev)); return -EINVAL; } - printk(KERN_INFO "raid5: reshape will continue\n"); + printk(KERN_INFO "md/raid:%s: reshape will continue\n", + mdname(mddev)); /* OK, we should be able to continue; */ - } - - - mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); - if ((conf = mddev->private) == NULL) - goto abort; - if (mddev->reshape_position == MaxSector) { - conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; } else { - conf->raid_disks = mddev->raid_disks; - conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + BUG_ON(mddev->level != mddev->new_level); + BUG_ON(mddev->layout != mddev->new_layout); + BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); + BUG_ON(mddev->delta_disks != 0); } - conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), - GFP_KERNEL); - if (!conf->disks) - goto abort; - - conf->mddev = mddev; - - if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) - goto abort; + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; - if (mddev->level == 6) { - conf->spare_page = alloc_page(GFP_KERNEL); - if (!conf->spare_page) - goto abort; - } - spin_lock_init(&conf->device_lock); - mddev->queue->queue_lock = &conf->device_lock; - init_waitqueue_head(&conf->wait_for_stripe); - init_waitqueue_head(&conf->wait_for_overlap); - INIT_LIST_HEAD(&conf->handle_list); - INIT_LIST_HEAD(&conf->hold_list); - INIT_LIST_HEAD(&conf->delayed_list); - INIT_LIST_HEAD(&conf->bitmap_list); - INIT_LIST_HEAD(&conf->inactive_list); - atomic_set(&conf->active_stripes, 0); - atomic_set(&conf->preread_active_stripes, 0); - atomic_set(&conf->active_aligned_reads, 0); - conf->bypass_threshold = BYPASS_THRESHOLD; + if (IS_ERR(conf)) + return PTR_ERR(conf); - pr_debug("raid5: run(%s) called.\n", mdname(mddev)); + conf->min_offset_diff = min_offset_diff; + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; - rdev_for_each(rdev, tmp, mddev) { - raid_disk = rdev->raid_disk; - if (raid_disk >= conf->raid_disks - || raid_disk < 0) + for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; + i++) { + rdev = conf->disks[i].rdev; + if (!rdev && conf->disks[i].replacement) { + /* The replacement is all we have yet */ + rdev = conf->disks[i].replacement; + conf->disks[i].replacement = NULL; + clear_bit(Replacement, &rdev->flags); + conf->disks[i].rdev = rdev; + } + if (!rdev) continue; - disk = conf->disks + raid_disk; - - disk->rdev = rdev; - + if (conf->disks[i].replacement && + conf->reshape_progress != MaxSector) { + /* replacements and reshape simply do not mix. */ + printk(KERN_ERR "md: cannot handle concurrent " + "replacement and reshape.\n"); + goto abort; + } if (test_bit(In_sync, &rdev->flags)) { - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "raid5: device %s operational as raid" - " disk %d\n", bdevname(rdev->bdev,b), - raid_disk); working_disks++; - } else - /* Cannot rely on bitmap to complete recovery */ - conf->fullsync = 1; + continue; + } + /* This disc is not fully in-sync. 
However if it + * just stored parity (beyond the recovery_offset), + * when we don't need to be concerned about the + * array being dirty. + * When reshape goes 'backwards', we never have + * partially completed devices, so we only need + * to worry about reshape going forwards. + */ + /* Hack because v0.91 doesn't store recovery_offset properly. */ + if (mddev->major_version == 0 && + mddev->minor_version > 90) + rdev->recovery_offset = reshape_offset; + + if (rdev->recovery_offset < reshape_offset) { + /* We need to check old and new layout */ + if (!only_parity(rdev->raid_disk, + conf->algorithm, + conf->raid_disks, + conf->max_degraded)) + continue; + } + if (!only_parity(rdev->raid_disk, + conf->prev_algo, + conf->previous_raid_disks, + conf->max_degraded)) + continue; + dirty_parity_disks++; } /* * 0 for a fully functional array, 1 or 2 for a degraded array. */ - mddev->degraded = conf->raid_disks - working_disks; - conf->mddev = mddev; - conf->chunk_size = mddev->chunk_size; - conf->level = mddev->level; - if (conf->level == 6) - conf->max_degraded = 2; - else - conf->max_degraded = 1; - conf->algorithm = mddev->layout; - conf->max_nr_stripes = NR_STRIPES; - conf->expand_progress = mddev->reshape_position; + mddev->degraded = calc_degraded(conf); - /* device size must be a multiple of chunk size */ - mddev->size &= ~(mddev->chunk_size/1024 -1); - mddev->resync_max_sectors = mddev->size << 1; - - if (conf->level == 6 && conf->raid_disks < 4) { - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", - mdname(mddev), conf->raid_disks); - goto abort; - } - if (!conf->chunk_size || conf->chunk_size % 4) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", - conf->chunk_size, mdname(mddev)); - goto abort; - } - if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR - "raid5: unsupported parity algorithm %d for %s\n", - conf->algorithm, mdname(mddev)); - goto abort; - } - if (mddev->degraded > conf->max_degraded) { - printk(KERN_ERR "raid5: not enough operational devices for %s" + if (has_failed(conf)) { + printk(KERN_ERR "md/raid:%s: not enough operational devices" " (%d/%d failed)\n", mdname(mddev), mddev->degraded, conf->raid_disks); goto abort; } - if (mddev->degraded > 0 && + /* device size must be a multiple of chunk size */ + mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); + mddev->resync_max_sectors = mddev->dev_sectors; + + if (mddev->degraded > dirty_parity_disks && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) printk(KERN_WARNING - "raid5: starting dirty degraded array: %s" - "- data corruption possible.\n", + "md/raid:%s: starting dirty degraded array" + " - data corruption possible.\n", mdname(mddev)); else { printk(KERN_ERR - "raid5: cannot start dirty degraded array for %s\n", + "md/raid:%s: cannot start dirty degraded array.\n", mdname(mddev)); goto abort; } } - { - mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5"); - if (!mddev->thread) { - printk(KERN_ERR - "raid5: couldn't allocate thread for %s\n", - mdname(mddev)); - goto abort; - } - } - memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + - conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { - printk(KERN_ERR - "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(conf); - md_unregister_thread(mddev->thread); - goto abort; - } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", - memory, mdname(mddev)); - if (mddev->degraded == 0) - 
printk("raid5: raid level %d set %s active with %d out of %d" - " devices, algorithm %d\n", conf->level, mdname(mddev), - mddev->raid_disks-mddev->degraded, mddev->raid_disks, - conf->algorithm); + printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" + " devices, algorithm %d\n", mdname(mddev), conf->level, + mddev->raid_disks-mddev->degraded, mddev->raid_disks, + mddev->new_layout); else - printk(KERN_ALERT "raid5: raid level %d set %s active with %d" - " out of %d devices, algorithm %d\n", conf->level, - mdname(mddev), mddev->raid_disks - mddev->degraded, - mddev->raid_disks, conf->algorithm); + printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" + " out of %d devices, algorithm %d\n", + mdname(mddev), conf->level, + mddev->raid_disks - mddev->degraded, + mddev->raid_disks, mddev->new_layout); print_raid5_conf(conf); - if (conf->expand_progress != MaxSector) { - printk("...ok start reshape thread\n"); - conf->expand_lo = conf->expand_progress; + if (conf->reshape_progress != MaxSector) { + conf->reshape_safe = conf->reshape_progress; atomic_set(&conf->reshape_stripes, 0); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "%s_reshape"); + "reshape"); } - /* read-ahead size must cover two whole stripes, which is - * 2 * (datadisks) * chunksize where 'n' is the number of raid devices - */ - { - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - (mddev->chunk_size / PAGE_SIZE); - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - } /* Ok, everything is just fine now */ - if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) + if (mddev->to_remove == &raid5_attrs_group) + mddev->to_remove = NULL; + else if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) printk(KERN_WARNING "raid5: failed to create sysfs attributes for %s\n", mdname(mddev)); + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + + if (mddev->queue) { + int chunk_size; + bool discard_supported = true; + /* read-ahead size must cover two whole stripes, which + * is 2 * (datadisks) * chunksize where 'n' is the + * number of raid devices + */ + int data_disks = conf->previous_raid_disks - conf->max_degraded; + int stripe = data_disks * + ((mddev->chunk_sectors << 9) / PAGE_SIZE); + if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - mddev->queue->unplug_fn = raid5_unplug_device; - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = raid5_congested; + blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); - mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - - conf->max_degraded); + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->queue->backing_dev_info.congested_fn = raid5_congested; + + chunk_size = mddev->chunk_sectors << 9; + blk_queue_io_min(mddev->queue, chunk_size); + blk_queue_io_opt(mddev->queue, chunk_size * + (conf->raid_disks - conf->max_degraded)); + mddev->queue->limits.raid_partial_stripes_expensive = 1; + /* + * We can only discard a whole stripe. 
It doesn't make sense to + * discard data disk but write parity disk + */ + stripe = stripe * PAGE_SIZE; + /* Round up to power of 2, as discard handling + * currently assumes that */ + while ((stripe-1) & stripe) + stripe = (stripe | (stripe-1)) + 1; + mddev->queue->limits.discard_alignment = stripe; + mddev->queue->limits.discard_granularity = stripe; + /* + * unaligned part of discard request will be ignored, so can't + * guarantee discard_zerors_data + */ + mddev->queue->limits.discard_zeroes_data = 0; - blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); + blk_queue_max_write_same_sectors(mddev->queue, 0); + + rdev_for_each(rdev, mddev) { + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->new_data_offset << 9); + /* + * discard_zeroes_data is required, otherwise data + * could be lost. Consider a scenario: discard a stripe + * (the stripe could be inconsistent if + * discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again + * depending on which disks are used to calculate + * parity); the disk is broken; The stripe data of this + * disk is lost. + */ + if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || + !bdev_get_queue(rdev->bdev)-> + limits.discard_zeroes_data) + discard_supported = false; + } + + if (discard_supported && + mddev->queue->limits.max_discard_sectors >= stripe && + mddev->queue->limits.discard_granularity >= stripe) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + } return 0; abort: - if (conf) { - print_raid5_conf(conf); - safe_put_page(conf->spare_page); - kfree(conf->disks); - kfree(conf->stripe_hashtbl); - kfree(conf); - } + md_unregister_thread(&mddev->thread); + print_raid5_conf(conf); + free_conf(conf); mddev->private = NULL; - printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); + printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); return -EIO; } - - -static int stop(mddev_t *mddev) +static int stop(struct mddev *mddev) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + struct r5conf *conf = mddev->private; - md_unregister_thread(mddev->thread); - mddev->thread = NULL; - shrink_stripes(conf); - kfree(conf->stripe_hashtbl); - mddev->queue->backing_dev_info.congested_fn = NULL; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); - kfree(conf->disks); - kfree(conf); + md_unregister_thread(&mddev->thread); + if (mddev->queue) + mddev->queue->backing_dev_info.congested_fn = NULL; + free_conf(conf); mddev->private = NULL; + mddev->to_remove = &raid5_attrs_group; return 0; } -#ifdef DEBUG -static void print_sh (struct seq_file *seq, struct stripe_head *sh) -{ - int i; - - seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", - (unsigned long long)sh->sector, sh->pd_idx, sh->state); - seq_printf(seq, "sh %llu, count %d.\n", - (unsigned long long)sh->sector, atomic_read(&sh->count)); - seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); - for (i = 0; i < sh->disks; i++) { - seq_printf(seq, "(cache%d: %p %ld) ", - i, sh->dev[i].page, sh->dev[i].flags); - } - seq_printf(seq, "\n"); -} - -static void printall (struct seq_file *seq, raid5_conf_t *conf) -{ - struct stripe_head *sh; - struct hlist_node *hn; - int i; - - spin_lock_irq(&conf->device_lock); - for (i = 0; i < NR_HASH; i++) { - hlist_for_each_entry(sh, hn, 
&conf->stripe_hashtbl[i], hash) { - if (sh->raid_conf != conf) - continue; - print_sh(seq, sh); - } - } - spin_unlock_irq(&conf->device_lock); -} -#endif - -static void status (struct seq_file *seq, mddev_t *mddev) +static void status(struct seq_file *seq, struct mddev *mddev) { - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + struct r5conf *conf = mddev->private; int i; - seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); + seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, + mddev->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", conf->disks[i].rdev && test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); seq_printf (seq, "]"); -#ifdef DEBUG - seq_printf (seq, "\n"); - printall(seq, conf); -#endif } -static void print_raid5_conf (raid5_conf_t *conf) +static void print_raid5_conf (struct r5conf *conf) { int i; struct disk_info *tmp; - printk("RAID5 conf printout:\n"); + printk(KERN_DEBUG "RAID conf printout:\n"); if (!conf) { printk("(conf==NULL)\n"); return; } - printk(" --- rd:%d wd:%d\n", conf->raid_disks, - conf->raid_disks - conf->mddev->degraded); + printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, + conf->raid_disks, + conf->raid_disks - conf->mddev->degraded); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; tmp = conf->disks + i; if (tmp->rdev) - printk(" disk %d, o:%d, dev:%s\n", - i, !test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", + i, !test_bit(Faulty, &tmp->rdev->flags), + bdevname(tmp->rdev->bdev, b)); } } -static int raid5_spare_active(mddev_t *mddev) +static int raid5_spare_active(struct mddev *mddev) { int i; - raid5_conf_t *conf = mddev->private; + struct r5conf *conf = mddev->private; struct disk_info *tmp; + int count = 0; + unsigned long flags; for (i = 0; i < conf->raid_disks; i++) { tmp = conf->disks + i; - if (tmp->rdev + if (tmp->replacement + && tmp->replacement->recovery_offset == MaxSector + && !test_bit(Faulty, &tmp->replacement->flags) + && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { + /* Replacement has just become active. */ + if (!tmp->rdev + || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) + count++; + if (tmp->rdev) { + /* Replaced device not technically faulty, + * but we need to be sure it gets removed + * and never re-added. 
+ */ + set_bit(Faulty, &tmp->rdev->flags); + sysfs_notify_dirent_safe( + tmp->rdev->sysfs_state); + } + sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); + } else if (tmp->rdev + && tmp->rdev->recovery_offset == MaxSector && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded--; - spin_unlock_irqrestore(&conf->device_lock, flags); + count++; + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded = calc_degraded(conf); + spin_unlock_irqrestore(&conf->device_lock, flags); print_raid5_conf(conf); - return 0; + return count; } -static int raid5_remove_disk(mddev_t *mddev, int number) +static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) { - raid5_conf_t *conf = mddev->private; + struct r5conf *conf = mddev->private; int err = 0; - mdk_rdev_t *rdev; + int number = rdev->raid_disk; + struct md_rdev **rdevp; struct disk_info *p = conf->disks + number; print_raid5_conf(conf); - rdev = p->rdev; - if (rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - err = -EBUSY; - goto abort; - } - /* Only remove non-faulty devices if recovery - * isn't possible. - */ - if (!test_bit(Faulty, &rdev->flags) && - mddev->degraded <= conf->max_degraded) { - err = -EBUSY; - goto abort; - } - p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - } + if (rdev == p->rdev) + rdevp = &p->rdev; + else if (rdev == p->replacement) + rdevp = &p->replacement; + else + return 0; + + if (number >= conf->raid_disks && + conf->reshape_progress == MaxSector) + clear_bit(In_sync, &rdev->flags); + + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + err = -EBUSY; + goto abort; + } + /* Only remove non-faulty devices if recovery + * isn't possible. 
+ */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->recovery_disabled != conf->recovery_disabled && + !has_failed(conf) && + (!p->replacement || p->replacement == rdev) && + number < conf->raid_disks) { + err = -EBUSY; + goto abort; } + *rdevp = NULL; + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + } else if (p->replacement) { + /* We must have just cleared 'rdev' */ + p->rdev = p->replacement; + clear_bit(Replacement, &p->replacement->flags); + smp_mb(); /* Make sure other CPUs may see both as identical + * but will never see neither - if they are careful + */ + p->replacement = NULL; + clear_bit(WantReplacement, &rdev->flags); + } else + /* We might have just removed the Replacement as faulty- + * clear the bit just in case + */ + clear_bit(WantReplacement, &rdev->flags); abort: print_raid5_conf(conf); return err; } -static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) +static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) { - raid5_conf_t *conf = mddev->private; + struct r5conf *conf = mddev->private; int err = -EEXIST; int disk; struct disk_info *p; int first = 0; int last = conf->raid_disks - 1; - if (mddev->degraded > conf->max_degraded) + if (mddev->recovery_disabled == conf->recovery_disabled) + return -EBUSY; + + if (rdev->saved_raid_disk < 0 && has_failed(conf)) /* no point adding a device */ return -EINVAL; @@ -4406,24 +6436,39 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) if (rdev->saved_raid_disk >= 0 && rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) - disk = rdev->saved_raid_disk; - else - disk = first; - for ( ; disk <= last ; disk++) - if ((p=conf->disks + disk)->rdev == NULL) { + first = rdev->saved_raid_disk; + + for (disk = first; disk <= last; disk++) { + p = conf->disks + disk; + if (p->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; err = 0; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); + goto out; + } + } + for (disk = first; disk <= last; disk++) { + p = conf->disks + disk; + if (test_bit(WantReplacement, &p->rdev->flags) && + p->replacement == NULL) { + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = disk; + err = 0; + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); break; } + } +out: print_raid5_conf(conf); return err; } -static int raid5_resize(mddev_t *mddev, sector_t sectors) +static int raid5_resize(struct mddev *mddev, sector_t sectors) { /* no resync is happening, and there is enough space * on all devices, so we can resize. @@ -4432,37 +6477,32 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) * any io in the removed space completes, but it hardly seems * worth it. 
*/ - raid5_conf_t *conf = mddev_to_conf(mddev); - - sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_sectors = sectors * (mddev->raid_disks - - conf->max_degraded); + sector_t newsize; + sectors &= ~((sector_t)mddev->chunk_sectors - 1); + newsize = raid5_size(mddev, sectors, mddev->raid_disks); + if (mddev->external_size && + mddev->array_sectors > newsize) + return -EINVAL; + if (mddev->bitmap) { + int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0); + if (ret) + return ret; + } + md_set_array_sectors(mddev, newsize); set_capacity(mddev->gendisk, mddev->array_sectors); - mddev->changed = 1; - if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { - mddev->recovery_cp = mddev->size << 1; + revalidate_disk(mddev->gendisk); + if (sectors > mddev->dev_sectors && + mddev->recovery_cp > mddev->dev_sectors) { + mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } - mddev->size = sectors /2; + mddev->dev_sectors = sectors; mddev->resync_max_sectors = sectors; return 0; } -#ifdef CONFIG_MD_RAID5_RESHAPE -static int raid5_check_reshape(mddev_t *mddev) +static int check_stripe_cache(struct mddev *mddev) { - raid5_conf_t *conf = mddev_to_conf(mddev); - int err; - - if (mddev->delta_disks < 0 || - mddev->new_level != mddev->level) - return -EINVAL; /* Cannot shrink array or change level yet */ - if (mddev->delta_disks == 0) - return 0; /* nothing to do */ - if (mddev->bitmap) - /* Cannot grow a bitmap yet */ - return -EBUSY; - /* Can only proceed if there are plenty of stripe_heads. * We need a minimum of one full stripe,, and for sensible progress * it is best to have about 4 times that. @@ -4471,39 +6511,71 @@ static int raid5_check_reshape(mddev_t *mddev) * If the chunk size is greater, user-space should request more * stripe_heads first. */ - if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || - (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { - printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", - (mddev->chunk_size / STRIPE_SIZE)*4); - return -ENOSPC; + struct r5conf *conf = mddev->private; + if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 + > conf->max_nr_stripes || + ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 + > conf->max_nr_stripes) { + printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", + mdname(mddev), + ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) + / STRIPE_SIZE)*4); + return 0; } + return 1; +} - err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); - if (err) - return err; +static int check_reshape(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; - if (mddev->degraded > conf->max_degraded) + if (mddev->delta_disks == 0 && + mddev->new_layout == mddev->layout && + mddev->new_chunk_sectors == mddev->chunk_sectors) + return 0; /* nothing to do */ + if (has_failed(conf)) return -EINVAL; - /* looks like we might be able to manage this */ - return 0; + if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) { + /* We might be able to shrink, but the devices must + * be made bigger first. + * For raid6, 4 is the minimum size. 
+ * Otherwise 2 is the minimum + */ + int min = 2; + if (mddev->level == 6) + min = 4; + if (mddev->raid_disks + mddev->delta_disks < min) + return -EINVAL; + } + + if (!check_stripe_cache(mddev)) + return -ENOSPC; + + return resize_stripes(conf, (conf->previous_raid_disks + + mddev->delta_disks)); } -static int raid5_start_reshape(mddev_t *mddev) +static int raid5_start_reshape(struct mddev *mddev) { - raid5_conf_t *conf = mddev_to_conf(mddev); - mdk_rdev_t *rdev; - struct list_head *rtmp; + struct r5conf *conf = mddev->private; + struct md_rdev *rdev; int spares = 0; - int added_devices = 0; unsigned long flags; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; - rdev_for_each(rdev, rtmp, mddev) - if (rdev->raid_disk < 0 && - !test_bit(Faulty, &rdev->flags)) + if (!check_stripe_cache(mddev)) + return -ENOSPC; + + if (has_failed(conf)) + return -EINVAL; + + rdev_for_each(rdev, mddev) { + if (!test_bit(In_sync, &rdev->flags) + && !test_bit(Faulty, &rdev->flags)) spares++; + } if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) /* Not enough devices even to make a degraded array @@ -4511,41 +6583,83 @@ static int raid5_start_reshape(mddev_t *mddev) */ return -EINVAL; + /* Refuse to reduce size of the array. Any reductions in + * array size must be through explicit setting of array_size + * attribute. + */ + if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) + < mddev->array_sectors) { + printk(KERN_ERR "md/raid:%s: array size must be reduced " + "before number of disks\n", mdname(mddev)); + return -EINVAL; + } + atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); + write_seqcount_begin(&conf->gen_lock); conf->previous_raid_disks = conf->raid_disks; conf->raid_disks += mddev->delta_disks; - conf->expand_progress = 0; - conf->expand_lo = 0; + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->chunk_sectors = mddev->new_chunk_sectors; + conf->prev_algo = conf->algorithm; + conf->algorithm = mddev->new_layout; + conf->generation++; + /* Code that selects data_offset needs to see the generation update + * if reshape_progress has been set - so a memory barrier needed. + */ + smp_mb(); + if (mddev->reshape_backwards) + conf->reshape_progress = raid5_size(mddev, 0, 0); + else + conf->reshape_progress = 0; + conf->reshape_safe = conf->reshape_progress; + write_seqcount_end(&conf->gen_lock); spin_unlock_irq(&conf->device_lock); + /* Now make sure any requests that proceeded on the assumption + * the reshape wasn't running - like Discard or Read - have + * completed. + */ + mddev_suspend(mddev); + mddev_resume(mddev); + /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. + * Don't add devices if we are reducing the number of + * devices in the array. This is because it is not possible + * to correctly record the "partially reconstructed" state of + * such devices during the reshape and confusion could result. 
*/ - rdev_for_each(rdev, rtmp, mddev) - if (rdev->raid_disk < 0 && - !test_bit(Faulty, &rdev->flags)) { - if (raid5_add_disk(mddev, rdev) == 0) { - char nm[20]; + if (mddev->delta_disks >= 0) { + rdev_for_each(rdev, mddev) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid5_add_disk(mddev, rdev) == 0) { + if (rdev->raid_disk + >= conf->previous_raid_disks) + set_bit(In_sync, &rdev->flags); + else + rdev->recovery_offset = 0; + + if (sysfs_link_rdev(mddev, rdev)) + /* Failure here is OK */; + } + } else if (rdev->raid_disk >= conf->previous_raid_disks + && !test_bit(Faulty, &rdev->flags)) { + /* This is a spare that was manually added */ set_bit(In_sync, &rdev->flags); - added_devices++; - rdev->recovery_offset = 0; - sprintf(nm, "rd%d", rdev->raid_disk); - if (sysfs_create_link(&mddev->kobj, - &rdev->kobj, nm)) - printk(KERN_WARNING - "raid5: failed to create " - " link %s for %s\n", - nm, mdname(mddev)); - } else - break; - } + } - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; - spin_unlock_irqrestore(&conf->device_lock, flags); + /* When a reshape changes the number of devices, + * ->degraded is measured against the larger of the + * pre and post number of devices. + */ + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded = calc_degraded(conf); + spin_unlock_irqrestore(&conf->device_lock, flags); + } mddev->raid_disks = conf->raid_disks; - mddev->reshape_position = 0; + mddev->reshape_position = conf->reshape_progress; set_bit(MD_CHANGE_DEVS, &mddev->flags); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -4553,60 +6667,102 @@ static int raid5_start_reshape(mddev_t *mddev) set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "%s_reshape"); + "reshape"); if (!mddev->sync_thread) { mddev->recovery = 0; spin_lock_irq(&conf->device_lock); + write_seqcount_begin(&conf->gen_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; - conf->expand_progress = MaxSector; + mddev->new_chunk_sectors = + conf->chunk_sectors = conf->prev_chunk_sectors; + mddev->new_layout = conf->algorithm = conf->prev_algo; + rdev_for_each(rdev, mddev) + rdev->new_data_offset = rdev->data_offset; + smp_wmb(); + conf->generation --; + conf->reshape_progress = MaxSector; + mddev->reshape_position = MaxSector; + write_seqcount_end(&conf->gen_lock); spin_unlock_irq(&conf->device_lock); return -EAGAIN; } + conf->reshape_checkpoint = jiffies; md_wakeup_thread(mddev->sync_thread); md_new_event(mddev); return 0; } -#endif -static void end_reshape(raid5_conf_t *conf) +/* This is called from the reshape thread and should make any + * changes needed in 'conf' + */ +static void end_reshape(struct r5conf *conf) { - struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_sectors = 2 * conf->mddev->size * - (conf->raid_disks - conf->max_degraded); - set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); - conf->mddev->changed = 1; - - bdev = bdget_disk(conf->mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)conf->mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } + struct md_rdev *rdev; + spin_lock_irq(&conf->device_lock); - conf->expand_progress = MaxSector; + conf->previous_raid_disks = conf->raid_disks; + rdev_for_each(rdev, 
conf->mddev) + rdev->data_offset = rdev->new_data_offset; + smp_wmb(); + conf->reshape_progress = MaxSector; spin_unlock_irq(&conf->device_lock); - conf->mddev->reshape_position = MaxSector; + wake_up(&conf->wait_for_overlap); /* read-ahead size must cover two whole stripes, which is * 2 * (datadisks) * chunksize where 'n' is the number of raid devices */ - { - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - (conf->mddev->chunk_size / PAGE_SIZE); + if (conf->mddev->queue) { + int data_disks = conf->raid_disks - conf->max_degraded; + int stripe = data_disks * ((conf->chunk_sectors << 9) + / PAGE_SIZE); if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; } } } -static void raid5_quiesce(mddev_t *mddev, int state) +/* This is called from the raid5d thread with mddev_lock held. + * It makes config changes to the device. + */ +static void raid5_finish_reshape(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + + if (mddev->delta_disks > 0) { + md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } else { + int d; + spin_lock_irq(&conf->device_lock); + mddev->degraded = calc_degraded(conf); + spin_unlock_irq(&conf->device_lock); + for (d = conf->raid_disks ; + d < conf->raid_disks - mddev->delta_disks; + d++) { + struct md_rdev *rdev = conf->disks[d].rdev; + if (rdev) + clear_bit(In_sync, &rdev->flags); + rdev = conf->disks[d].replacement; + if (rdev) + clear_bit(In_sync, &rdev->flags); + } + } + mddev->layout = conf->algorithm; + mddev->chunk_sectors = conf->chunk_sectors; + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + } +} + +static void raid5_quiesce(struct mddev *mddev, int state) { - raid5_conf_t *conf = mddev_to_conf(mddev); + struct r5conf *conf = mddev->private; switch(state) { case 2: /* resume for a suspend */ @@ -4614,26 +6770,271 @@ static void raid5_quiesce(mddev_t *mddev, int state) break; case 1: /* stop all writes */ - spin_lock_irq(&conf->device_lock); - conf->quiesce = 1; - wait_event_lock_irq(conf->wait_for_stripe, + lock_all_device_hash_locks_irq(conf); + /* '2' tells resync/reshape to pause so that all + * active stripes can drain + */ + conf->quiesce = 2; + wait_event_cmd(conf->wait_for_stripe, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, - conf->device_lock, /* nothing */); - spin_unlock_irq(&conf->device_lock); + unlock_all_device_hash_locks_irq(conf), + lock_all_device_hash_locks_irq(conf)); + conf->quiesce = 1; + unlock_all_device_hash_locks_irq(conf); + /* allow reshape to continue */ + wake_up(&conf->wait_for_overlap); break; case 0: /* re-enable writes */ - spin_lock_irq(&conf->device_lock); + lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_stripe); wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); + unlock_all_device_hash_locks_irq(conf); break; } } -static struct mdk_personality raid6_personality = + +static void *raid45_takeover_raid0(struct mddev *mddev, int level) +{ + struct r0conf *raid0_conf = mddev->private; + sector_t sectors; + + /* for raid0 takeover only one zone is supported */ + if (raid0_conf->nr_strip_zones > 1) { + printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", + mdname(mddev)); + return 
ERR_PTR(-EINVAL); + } + + sectors = raid0_conf->strip_zone[0].zone_end; + sector_div(sectors, raid0_conf->strip_zone[0].nb_dev); + mddev->dev_sectors = sectors; + mddev->new_level = level; + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_chunk_sectors = mddev->chunk_sectors; + mddev->raid_disks += 1; + mddev->delta_disks = 1; + /* make sure it will be not marked as dirty */ + mddev->recovery_cp = MaxSector; + + return setup_conf(mddev); +} + + +static void *raid5_takeover_raid1(struct mddev *mddev) +{ + int chunksect; + + if (mddev->raid_disks != 2 || + mddev->degraded > 1) + return ERR_PTR(-EINVAL); + + /* Should check if there are write-behind devices? */ + + chunksect = 64*2; /* 64K by default */ + + /* The array must be an exact multiple of chunksize */ + while (chunksect && (mddev->array_sectors & (chunksect-1))) + chunksect >>= 1; + + if ((chunksect<<9) < STRIPE_SIZE) + /* array size does not allow a suitable chunk size */ + return ERR_PTR(-EINVAL); + + mddev->new_level = 5; + mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; + mddev->new_chunk_sectors = chunksect; + + return setup_conf(mddev); +} + +static void *raid5_takeover_raid6(struct mddev *mddev) +{ + int new_layout; + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC_6: + new_layout = ALGORITHM_LEFT_ASYMMETRIC; + break; + case ALGORITHM_RIGHT_ASYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC; + break; + case ALGORITHM_LEFT_SYMMETRIC_6: + new_layout = ALGORITHM_LEFT_SYMMETRIC; + break; + case ALGORITHM_RIGHT_SYMMETRIC_6: + new_layout = ALGORITHM_RIGHT_SYMMETRIC; + break; + case ALGORITHM_PARITY_0_6: + new_layout = ALGORITHM_PARITY_0; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 5; + mddev->new_layout = new_layout; + mddev->delta_disks = -1; + mddev->raid_disks -= 1; + return setup_conf(mddev); +} + + +static int raid5_check_reshape(struct mddev *mddev) +{ + /* For a 2-drive array, the layout and chunk size can be changed + * immediately as not restriping is needed. + * For larger arrays we record the new value - after validation + * to be used by a reshape pass. 
+ */ + struct r5conf *conf = mddev->private; + int new_chunk = mddev->new_chunk_sectors; + + if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (!is_power_of_2(new_chunk)) + return -EINVAL; + if (new_chunk < (PAGE_SIZE>>9)) + return -EINVAL; + if (mddev->array_sectors & (new_chunk-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + + if (mddev->raid_disks == 2) { + /* can make the change immediately */ + if (mddev->new_layout >= 0) { + conf->algorithm = mddev->new_layout; + mddev->layout = mddev->new_layout; + } + if (new_chunk > 0) { + conf->chunk_sectors = new_chunk ; + mddev->chunk_sectors = new_chunk; + } + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + } + return check_reshape(mddev); +} + +static int raid6_check_reshape(struct mddev *mddev) +{ + int new_chunk = mddev->new_chunk_sectors; + + if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) + return -EINVAL; + if (new_chunk > 0) { + if (!is_power_of_2(new_chunk)) + return -EINVAL; + if (new_chunk < (PAGE_SIZE >> 9)) + return -EINVAL; + if (mddev->array_sectors & (new_chunk-1)) + /* not factor of array size */ + return -EINVAL; + } + + /* They look valid */ + return check_reshape(mddev); +} + +static void *raid5_takeover(struct mddev *mddev) +{ + /* raid5 can take over: + * raid0 - if there is only one strip zone - make it a raid4 layout + * raid1 - if there are two drives. We need to know the chunk size + * raid4 - trivial - just use a raid4 layout. + * raid6 - Providing it is a *_6 layout + */ + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 5); + if (mddev->level == 1) + return raid5_takeover_raid1(mddev); + if (mddev->level == 4) { + mddev->new_layout = ALGORITHM_PARITY_N; + mddev->new_level = 5; + return setup_conf(mddev); + } + if (mddev->level == 6) + return raid5_takeover_raid6(mddev); + + return ERR_PTR(-EINVAL); +} + +static void *raid4_takeover(struct mddev *mddev) +{ + /* raid4 can take over: + * raid0 - if there is only one strip zone + * raid5 - if layout is right + */ + if (mddev->level == 0) + return raid45_takeover_raid0(mddev, 4); + if (mddev->level == 5 && + mddev->layout == ALGORITHM_PARITY_N) { + mddev->new_layout = 0; + mddev->new_level = 4; + return setup_conf(mddev); + } + return ERR_PTR(-EINVAL); +} + +static struct md_personality raid5_personality; + +static void *raid6_takeover(struct mddev *mddev) +{ + /* Currently can only take over a raid5. We map the + * personality to an equivalent raid6 personality + * with the Q block at the end. 
+ */ + int new_layout; + + if (mddev->pers != &raid5_personality) + return ERR_PTR(-EINVAL); + if (mddev->degraded > 1) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks > 253) + return ERR_PTR(-EINVAL); + if (mddev->raid_disks < 3) + return ERR_PTR(-EINVAL); + + switch (mddev->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + new_layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + new_layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + new_layout = ALGORITHM_PARITY_N; + break; + default: + return ERR_PTR(-EINVAL); + } + mddev->new_level = 6; + mddev->new_layout = new_layout; + mddev->delta_disks = 1; + mddev->raid_disks += 1; + return setup_conf(mddev); +} + + +static struct md_personality raid6_personality = { .name = "raid6", .level = 6, @@ -4648,13 +7049,14 @@ static struct mdk_personality raid6_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, -#ifdef CONFIG_MD_RAID5_RESHAPE - .check_reshape = raid5_check_reshape, + .size = raid5_size, + .check_reshape = raid6_check_reshape, .start_reshape = raid5_start_reshape, -#endif + .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, + .takeover = raid6_takeover, }; -static struct mdk_personality raid5_personality = +static struct md_personality raid5_personality = { .name = "raid5", .level = 5, @@ -4669,14 +7071,15 @@ static struct mdk_personality raid5_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, -#ifdef CONFIG_MD_RAID5_RESHAPE + .size = raid5_size, .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, -#endif + .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, + .takeover = raid5_takeover, }; -static struct mdk_personality raid4_personality = +static struct md_personality raid4_personality = { .name = "raid4", .level = 4, @@ -4691,20 +7094,20 @@ static struct mdk_personality raid4_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, -#ifdef CONFIG_MD_RAID5_RESHAPE + .size = raid5_size, .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, -#endif + .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, + .takeover = raid4_takeover, }; static int __init raid5_init(void) { - int e; - - e = raid6_select_algo(); - if ( e ) - return e; + raid5_wq = alloc_workqueue("raid5wq", + WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); + if (!raid5_wq) + return -ENOMEM; register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); @@ -4716,11 +7119,13 @@ static void raid5_exit(void) unregister_md_personality(&raid6_personality); unregister_md_personality(&raid5_personality); unregister_md_personality(&raid4_personality); + destroy_workqueue(raid5_wq); } module_init(raid5_init); module_exit(raid5_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); MODULE_ALIAS("md-personality-4"); /* RAID5 */ MODULE_ALIAS("md-raid5"); MODULE_ALIAS("md-raid4"); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h new file mode 100644 index 00000000000..bc72cd4be5f --- /dev/null +++ b/drivers/md/raid5.h @@ 
-0,0 +1,566 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include <linux/raid/xor.h> +#include <linux/dmaengine.h> + +/* + * + * Each stripe contains one buffer per device. Each buffer can be in + * one of a number of states stored in "flags". Changes between + * these states happen *almost* exclusively under the protection of the + * STRIPE_ACTIVE flag. Some very specific changes can happen in bi_end_io, and + * these are not protected by STRIPE_ACTIVE. + * + * The flag bits that are used to represent these states are: + * R5_UPTODATE and R5_LOCKED + * + * State Empty == !UPTODATE, !LOCK + * We have no data, and there is no active request + * State Want == !UPTODATE, LOCK + * A read request is being submitted for this block + * State Dirty == UPTODATE, LOCK + * Some new data is in this buffer, and it is being written out + * State Clean == UPTODATE, !LOCK + * We have valid data which is the same as on disc + * + * The possible state transitions are: + * + * Empty -> Want - on read or write to get old data for parity calc + * Empty -> Dirty - on compute_parity to satisfy write/sync request. + * Empty -> Clean - on compute_block when computing a block for failed drive + * Want -> Empty - on failed read + * Want -> Clean - on successful completion of read request + * Dirty -> Clean - on successful completion of write request + * Dirty -> Clean - on failed write + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) + * + * The Want->Empty, Want->Clean, Dirty->Clean, transitions + * all happen in b_end_io at interrupt time. + * Each sets the Uptodate bit before releasing the Lock bit. + * This leaves one multi-stage transition: + * Want->Dirty->Clean + * This is safe because thinking that a Clean buffer is actually dirty + * will at worst delay some action, and the stripe will be scheduled + * for attention after the transition is complete. + * + * There is one possibility that is not covered by these states. That + * is if one drive has failed and there is a spare being rebuilt. We + * can't distinguish between a clean block that has been generated + * from parity calculations, and a clean block that has been + * successfully written to the spare ( or to parity when resyncing). + * To distinguish these states we have a stripe bit STRIPE_INSYNC that + * is set whenever a write is scheduled to the spare, or to the parity + * disc if there is no spare. A sync request clears this bit, and + * when we find it set with no buffers locked, we know the sync is + * complete. + * + * Buffers for the md device that arrive via make_request are attached + * to the appropriate stripe in one of two lists linked on b_reqnext. + * One list (bh_read) for read requests, one (bh_write) for write. + * There should never be more than one buffer on the two lists + * together, but we are not guaranteed of that so we allow for more. + * + * If a buffer is on the read list when the associated cache buffer is + * Uptodate, the data is copied into the read buffer and it's b_end_io + * routine is called. This may happen in the end_request routine only + * if the buffer has just successfully been read. end_request should + * remove the buffers from the list and then set the Uptodate bit on + * the buffer. Other threads may do this only if they first check + * that the Uptodate bit is set. Once they have checked that they may + * take buffers off the read queue. 
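Editor's note (not part of the patch): the four buffer states named at the top of this comment are just the four combinations of the R5_UPTODATE and R5_LOCKED bits. A minimal standalone sketch of that mapping, using plain ints rather than the kernel's test_bit() on dev->flags; the enum and helper names below are illustrative, not kernel symbols:

/* Illustrative only, not a kernel function: map the two flag bits onto
 * the four states described in the comment above. */
enum buf_state { BUF_EMPTY, BUF_WANT, BUF_DIRTY, BUF_CLEAN };

static enum buf_state classify_buffer(int uptodate, int locked)
{
	if (!uptodate)
		return locked ? BUF_WANT : BUF_EMPTY;	/* read pending / no data */
	return locked ? BUF_DIRTY : BUF_CLEAN;		/* write pending / same as on disk */
}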
+ * + * When a buffer on the write list is committed for write it is copied + * into the cache buffer, which is then marked dirty, and moved onto a + * third list, the written list (bh_written). Once both the parity + * block and the cached buffer are successfully written, any buffer on + * a written list can be returned with b_end_io. + * + * The write list and read list both act as fifos. The read list, + * write list and written list are protected by the device_lock. + * The device_lock is only for list manipulations and will only be + * held for a very short time. It can be claimed from interrupts. + * + * + * Stripes in the stripe cache can be on one of two lists (or on + * neither). The "inactive_list" contains stripes which are not + * currently being used for any request. They can freely be reused + * for another stripe. The "handle_list" contains stripes that need + * to be handled in some way. Both of these are fifo queues. Each + * stripe is also (potentially) linked to a hash bucket in the hash + * table so that it can be found by sector number. Stripes that are + * not hashed must be on the inactive_list, and will normally be at + * the front. All stripes start life this way. + * + * The inactive_list, handle_list and hash bucket lists are all protected by the + * device_lock. + * - stripes have a reference counter. If count==0, they are on a list. + * - If a stripe might need handling, STRIPE_HANDLE is set. + * - When refcount reaches zero, then if STRIPE_HANDLE it is put on + * handle_list else inactive_list + * + * This, combined with the fact that STRIPE_HANDLE is only ever + * cleared while a stripe has a non-zero count means that if the + * refcount is 0 and STRIPE_HANDLE is set, then it is on the + * handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then + * the stripe is on inactive_list. + * + * The possible transitions are: + * activate an unhashed/inactive stripe (get_active_stripe()) + * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev + * activate a hashed, possibly active stripe (get_active_stripe()) + * lockdev check-hash if(!cnt++)unlink-stripe unlockdev + * attach a request to an active stripe (add_stripe_bh()) + * lockdev attach-buffer unlockdev + * handle a stripe (handle_stripe()) + * setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ... + * (lockdev check-buffers unlockdev) .. + * change-state .. + * record io/ops needed clearSTRIPE_ACTIVE schedule io/ops + * release an active stripe (release_stripe()) + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev + * + * The refcount counts each thread that has activated the stripe, + * plus raid5d if it is handling it, plus one for each active request + * on a cached buffer, and plus one if the stripe is undergoing stripe + * operations. + * + * The stripe operations are: + * -copying data between the stripe cache and user application buffers + * -computing blocks to save a disk access, or to recover a missing block + * -updating the parity on a write operation (reconstruct write and + * read-modify-write) + * -checking parity correctness + * -running i/o to disk + * These operations are carried out by raid5_run_ops which uses the async_tx + * api to (optionally) offload operations to dedicated hardware engines. + * When requesting an operation handle_stripe sets the pending bit for the + * operation and increments the count. raid5_run_ops is then run whenever + * the count is non-zero.
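Editor's note (not part of the patch): the list/refcount invariant spelled out above (a stripe with count == 0 sits on exactly one list, chosen by STRIPE_HANDLE) reduces to a few lines of logic. A simplified standalone sketch with hypothetical names; the real code path is release_stripe(), which additionally handles the delayed and bitmap lists:

/* Illustrative model only: which list a stripe belongs on after a
 * reference is dropped, per the invariant described above. */
static const char *list_after_release(int *count, int handle_flag)
{
	if (--(*count) > 0)
		return "no list (still referenced)";
	return handle_flag ? "handle_list" : "inactive_list";
}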
+ * There are some critical dependencies between the operations that prevent some + * from being requested while another is in flight. + * 1/ Parity check operations destroy the in cache version of the parity block, + * so we prevent parity dependent operations like writes and compute_blocks + * from starting while a check is in progress. Some dma engines can perform + * the check without damaging the parity block, in these cases the parity + * block is re-marked up to date (assuming the check was successful) and is + * not re-read from disk. + * 2/ When a write operation is requested we immediately lock the affected + * blocks, and mark them as not up to date. This causes new read requests + * to be held off, as well as parity checks and compute block operations. + * 3/ Once a compute block operation has been requested handle_stripe treats + * that block as if it is up to date. raid5_run_ops guaruntees that any + * operation that is dependent on the compute block result is initiated after + * the compute block completes. + */ + +/* + * Operations state - intermediate states that are visible outside of + * STRIPE_ACTIVE. + * In general _idle indicates nothing is running, _run indicates a data + * processing operation is active, and _result means the data processing result + * is stable and can be acted upon. For simple operations like biofill and + * compute that only have an _idle and _run state they are indicated with + * sh->state flags (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN) + */ +/** + * enum check_states - handles syncing / repairing a stripe + * @check_state_idle - check operations are quiesced + * @check_state_run - check operation is running + * @check_state_result - set outside lock when check result is valid + * @check_state_compute_run - check failed and we are repairing + * @check_state_compute_result - set outside lock when compute result is valid + */ +enum check_states { + check_state_idle = 0, + check_state_run, /* xor parity check */ + check_state_run_q, /* q-parity check */ + check_state_run_pq, /* pq dual parity check */ + check_state_check_result, + check_state_compute_run, /* parity repair */ + check_state_compute_result, +}; + +/** + * enum reconstruct_states - handles writing or expanding a stripe + */ +enum reconstruct_states { + reconstruct_state_idle = 0, + reconstruct_state_prexor_drain_run, /* prexor-write */ + reconstruct_state_drain_run, /* write */ + reconstruct_state_run, /* expand */ + reconstruct_state_prexor_drain_result, + reconstruct_state_drain_result, + reconstruct_state_result, +}; + +struct stripe_head { + struct hlist_node hash; + struct list_head lru; /* inactive_list or handle_list */ + struct llist_node release_list; + struct r5conf *raid_conf; + short generation; /* increments with every + * reshape */ + sector_t sector; /* sector of this row */ + short pd_idx; /* parity disk index */ + short qd_idx; /* 'Q' disk index for raid6 */ + short ddf_layout;/* use DDF ordering to calculate Q */ + short hash_lock_index; + unsigned long state; /* state flags */ + atomic_t count; /* nr of active thread/requests */ + int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ + enum check_states check_state; + enum reconstruct_states reconstruct_state; + spinlock_t stripe_lock; + int cpu; + struct r5worker_group *group; + /** + * struct stripe_operations + * @target - STRIPE_OP_COMPUTE_BLK target + * @target2 - 2nd compute target in the raid6 case + * @zero_sum_result - P and Q verification flags + * @request - async 
service request flags for raid_run_ops + */ + struct stripe_operations { + int target, target2; + enum sum_check_flags zero_sum_result; + } ops; + struct r5dev { + /* rreq and rvec are used for the replacement device when + * writing data to both devices. + */ + struct bio req, rreq; + struct bio_vec vec, rvec; + struct page *page, *orig_page; + struct bio *toread, *read, *towrite, *written; + sector_t sector; /* sector of this page */ + unsigned long flags; + } dev[1]; /* allocated with extra space depending of RAID geometry */ +}; + +/* stripe_head_state - collects and tracks the dynamic state of a stripe_head + * for handle_stripe. + */ +struct stripe_head_state { + /* 'syncing' means that we need to read all devices, either + * to check/correct parity, or to reconstruct a missing device. + * 'replacing' means we are replacing one or more drives and + * the source is valid at this point so we don't need to + * read all devices, just the replacement targets. + */ + int syncing, expanding, expanded, replacing; + int locked, uptodate, to_read, to_write, failed, written; + int to_fill, compute, req_compute, non_overwrite; + int failed_num[2]; + int p_failed, q_failed; + int dec_preread_active; + unsigned long ops_request; + + struct bio *return_bi; + struct md_rdev *blocked_rdev; + int handle_bad_blocks; +}; + +/* Flags for struct r5dev.flags */ +enum r5dev_flags { + R5_UPTODATE, /* page contains current data */ + R5_LOCKED, /* IO has been submitted on "req" */ + R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */ + R5_OVERWRITE, /* towrite covers whole page */ +/* and some that are internal to handle_stripe */ + R5_Insync, /* rdev && rdev->in_sync at start */ + R5_Wantread, /* want to schedule a read */ + R5_Wantwrite, + R5_Overlap, /* There is a pending overlapping request + * on this block */ + R5_ReadNoMerge, /* prevent bio from merging in block-layer */ + R5_ReadError, /* seen a read error here recently */ + R5_ReWrite, /* have tried to over-write the readerror */ + + R5_Expanded, /* This block now has post-expand data */ + R5_Wantcompute, /* compute_block in progress treat as + * uptodate + */ + R5_Wantfill, /* dev->toread contains a bio that needs + * filling + */ + R5_Wantdrain, /* dev->towrite needs to be drained */ + R5_WantFUA, /* Write should be FUA */ + R5_SyncIO, /* The IO is sync */ + R5_WriteError, /* got a write error - need to record it */ + R5_MadeGood, /* A bad block has been fixed by writing to it */ + R5_ReadRepl, /* Will/did read from replacement rather than orig */ + R5_MadeGoodRepl,/* A bad block on the replacement device has been + * fixed by writing to it */ + R5_NeedReplace, /* This device has a replacement which is not + * up-to-date at this stripe. */ + R5_WantReplace, /* We need to update the replacement, we have read + * data in, and now is a good time to write it out. 
+ */ + R5_Discard, /* Discard the stripe */ + R5_SkipCopy, /* Don't copy data from bio to stripe cache */ +}; + +/* + * Stripe state + */ +enum { + STRIPE_ACTIVE, + STRIPE_HANDLE, + STRIPE_SYNC_REQUESTED, + STRIPE_SYNCING, + STRIPE_INSYNC, + STRIPE_REPLACED, + STRIPE_PREREAD_ACTIVE, + STRIPE_DELAYED, + STRIPE_DEGRADED, + STRIPE_BIT_DELAY, + STRIPE_EXPANDING, + STRIPE_EXPAND_SOURCE, + STRIPE_EXPAND_READY, + STRIPE_IO_STARTED, /* do not count towards 'bypass_count' */ + STRIPE_FULL_WRITE, /* all blocks are set to be overwritten */ + STRIPE_BIOFILL_RUN, + STRIPE_COMPUTE_RUN, + STRIPE_OPS_REQ_PENDING, + STRIPE_ON_UNPLUG_LIST, + STRIPE_DISCARD, + STRIPE_ON_RELEASE_LIST, +}; + +/* + * Operation request flags + */ +enum { + STRIPE_OP_BIOFILL, + STRIPE_OP_COMPUTE_BLK, + STRIPE_OP_PREXOR, + STRIPE_OP_BIODRAIN, + STRIPE_OP_RECONSTRUCT, + STRIPE_OP_CHECK, +}; +/* + * Plugging: + * + * To improve write throughput, we need to delay the handling of some + * stripes until there has been a chance that several write requests + * for the one stripe have all been collected. + * In particular, any write request that would require pre-reading + * is put on a "delayed" queue until there are no stripes currently + * in a pre-read phase. Further, if the "delayed" queue is empty when + * a stripe is put on it then we "plug" the queue and do not process it + * until an unplug call is made. (the unplug_io_fn() is called). + * + * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add + * it to the count of prereading stripes. + * When write is initiated, or the stripe refcnt == 0 (just in case) we + * clear the PREREAD_ACTIVE flag and decrement the count + * Whenever the 'handle' queue is empty and the device is not plugged, we + * move any strips from delayed to handle and clear the DELAYED flag and set + * PREREAD_ACTIVE. + * In stripe_handle, if we find pre-reading is necessary, we do it if + * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. + * HANDLE gets cleared if stripe_handle leaves nothing locked. + */ + + +struct disk_info { + struct md_rdev *rdev, *replacement; +}; + +/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. + * This is because we sometimes take all the spinlocks + * and creating that much locking depth can cause + * problems. + */ +#define NR_STRIPE_HASH_LOCKS 8 +#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1) + +struct r5worker { + struct work_struct work; + struct r5worker_group *group; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; + bool working; +}; + +struct r5worker_group { + struct list_head handle_list; + struct r5conf *conf; + struct r5worker *workers; + int stripes_cnt; +}; + +struct r5conf { + struct hlist_head *stripe_hashtbl; + /* only protect corresponding hash list and inactive_list */ + spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; + struct mddev *mddev; + int chunk_sectors; + int level, algorithm; + int max_degraded; + int raid_disks; + int max_nr_stripes; + + /* reshape_progress is the leading edge of a 'reshape' + * It has value MaxSector when no reshape is happening + * If delta_disks < 0, it is the last sector we started work on, + * else is it the next sector to work on. + */ + sector_t reshape_progress; + /* reshape_safe is the trailing edge of a reshape. We know that + * before (or after) this address, all reshape has completed. 
+ */ + sector_t reshape_safe; + int previous_raid_disks; + int prev_chunk_sectors; + int prev_algo; + short generation; /* increments with every reshape */ + seqcount_t gen_lock; /* lock against generation changes */ + unsigned long reshape_checkpoint; /* Time we last updated + * metadata */ + long long min_offset_diff; /* minimum difference between + * data_offset and + * new_data_offset across all + * devices. May be negative, + * but is closest to zero. + */ + + struct list_head handle_list; /* stripes needing handling */ + struct list_head hold_list; /* preread ready stripes */ + struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ + struct bio *retry_read_aligned; /* currently retrying aligned bios */ + struct bio *retry_read_aligned_list; /* aligned bios retry list */ + atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t active_aligned_reads; + atomic_t pending_full_writes; /* full write backlog */ + int bypass_count; /* bypassed prereads */ + int bypass_threshold; /* preread nice */ + int skip_copy; /* Don't copy data from bio to stripe cache */ + struct list_head *last_hold; /* detect hold_list promotions */ + + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][32]; + struct kmem_cache *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + int recovery_disabled; + /* per cpu variables */ + struct raid5_percpu { + struct page *spare_page; /* Used when checking P/Q in raid6 */ + void *scribble; /* space for constructing buffer + * lists and performing address + * conversions + */ + } __percpu *percpu; + size_t scribble_len; /* size of scribble region must be + * associated with conf to handle + * cpu hotplug while reshaping + */ +#ifdef CONFIG_HOTPLUG_CPU + struct notifier_block cpu_notify; +#endif + + /* + * Free stripes pool + */ + atomic_t active_stripes; + struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; + atomic_t empty_inactive_list_nr; + struct llist_head released_stripes; + wait_queue_head_t wait_for_stripe; + wait_queue_head_t wait_for_overlap; + int inactive_blocked; /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + int pool_size; /* number of disks in stripeheads in pool */ + spinlock_t device_lock; + struct disk_info *disks; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct md_thread *thread; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; + struct r5worker_group *worker_groups; + int group_cnt; + int worker_cnt_per_group; +}; + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 /* Rotating Parity N with Data Restart */ +#define ALGORITHM_RIGHT_ASYMMETRIC 1 /* Rotating Parity 0 with Data Restart */ +#define ALGORITHM_LEFT_SYMMETRIC 2 /* Rotating Parity N with Data Continuation */ +#define ALGORITHM_RIGHT_SYMMETRIC 3 /* Rotating Parity 0 with Data Continuation */ + +/* Define non-rotating (raid4) algorithms. These allow + * conversion of raid4 to raid5. + */ +#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ +#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. 
*/ + +/* DDF RAID6 layouts differ from md/raid6 layouts in two ways. + * Firstly, the exact positioning of the parity block is slightly + * different between the 'LEFT_*' modes of md and the "_N_*" modes + * of DDF. + * Secondly, or order of datablocks over which the Q syndrome is computed + * is different. + * Consequently we have different layouts for DDF/raid6 than md/raid6. + * These layouts are from the DDFv1.2 spec. + * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but + * leaves RLQ=3 as 'Vendor Specific' + */ + +#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */ +#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ +#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ + + +/* For every RAID5 algorithm we define a RAID6 algorithm + * with exactly the same layout for data and parity, and + * with the Q block always on the last device (N-1). + * This allows trivial conversion from RAID5 to RAID6 + */ +#define ALGORITHM_LEFT_ASYMMETRIC_6 16 +#define ALGORITHM_RIGHT_ASYMMETRIC_6 17 +#define ALGORITHM_LEFT_SYMMETRIC_6 18 +#define ALGORITHM_RIGHT_SYMMETRIC_6 19 +#define ALGORITHM_PARITY_0_6 20 +#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N + +static inline int algorithm_valid_raid5(int layout) +{ + return (layout >= 0) && + (layout <= 5); +} +static inline int algorithm_valid_raid6(int layout) +{ + return (layout >= 0 && layout <= 5) + || + (layout >= 8 && layout <= 10) + || + (layout >= 16 && layout <= 20); +} + +static inline int algorithm_is_DDF(int layout) +{ + return layout >= 8 && layout <= 10; +} + +extern int md_raid5_congested(struct mddev *mddev, int bits); +extern void md_raid5_kick_device(struct r5conf *conf); +extern int raid5_set_cache_size(struct mddev *mddev, int size); +#endif diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h deleted file mode 100644 index 31cbee71365..00000000000 --- a/drivers/md/raid6.h +++ /dev/null @@ -1,139 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2003 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -#ifndef LINUX_RAID_RAID6_H -#define LINUX_RAID_RAID6_H - -#ifdef __KERNEL__ - -/* Set to 1 to use kernel-wide empty_zero_page */ -#define RAID6_USE_EMPTY_ZERO_PAGE 0 - -#include <linux/module.h> -#include <linux/stddef.h> -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/mempool.h> -#include <linux/list.h> -#include <linux/vmalloc.h> -#include <linux/raid/md.h> -#include <linux/raid/raid5.h> - -typedef raid5_conf_t raid6_conf_t; /* Same configuration */ - -/* Additional compute_parity mode -- updates the parity w/o LOCKING */ -#define UPDATE_PARITY 4 - -/* We need a pre-zeroed page... if we don't want to use the kernel-provided - one define it here */ -#if RAID6_USE_EMPTY_ZERO_PAGE -# define raid6_empty_zero_page empty_zero_page -#else -extern const char raid6_empty_zero_page[PAGE_SIZE]; -#endif - -#else /* ! 
__KERNEL__ */ -/* Used for testing in user space */ - -#include <errno.h> -#include <inttypes.h> -#include <limits.h> -#include <stddef.h> -#include <sys/mman.h> -#include <sys/types.h> - -/* Not standard, but glibc defines it */ -#define BITS_PER_LONG __WORDSIZE - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; - -#ifndef PAGE_SIZE -# define PAGE_SIZE 4096 -#endif -extern const char raid6_empty_zero_page[PAGE_SIZE]; - -#define __init -#define __exit -#define __attribute_const__ __attribute__((const)) -#define noinline __attribute__((noinline)) - -#define preempt_enable() -#define preempt_disable() -#define cpu_has_feature(x) 1 -#define enable_kernel_altivec() -#define disable_kernel_altivec() - -#endif /* __KERNEL__ */ - -/* Routine choices */ -struct raid6_calls { - void (*gen_syndrome)(int, size_t, void **); - int (*valid)(void); /* Returns 1 if this routine set is usable */ - const char *name; /* Name of this routine set */ - int prefer; /* Has special performance attribute */ -}; - -/* Selected algorithm */ -extern struct raid6_calls raid6_call; - -/* Algorithm list */ -extern const struct raid6_calls * const raid6_algos[]; -int raid6_select_algo(void); - -/* Return values from chk_syndrome */ -#define RAID6_OK 0 -#define RAID6_P_BAD 1 -#define RAID6_Q_BAD 2 -#define RAID6_PQ_BAD 3 - -/* Galois field tables */ -extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); -extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); -extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); -extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); - -/* Recovery routines */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); -void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs); - -/* Some definitions to allow code to be compiled for testing in userspace */ -#ifndef __KERNEL__ - -# define jiffies raid6_jiffies() -# define printk printf -# define GFP_KERNEL 0 -# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0)) -# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE) - -static inline void cpu_relax(void) -{ - /* Nothing */ -} - -#undef HZ -#define HZ 1000 -static inline uint32_t raid6_jiffies(void) -{ - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec*1000 + tv.tv_usec/1000; -} - -#endif /* ! __KERNEL__ */ - -#endif /* LINUX_RAID_RAID6_H */ diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c deleted file mode 100644 index 21987e3dbe6..00000000000 --- a/drivers/md/raid6algos.c +++ /dev/null @@ -1,154 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. 
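For orientation, here is how the dispatch interface declared above is meant to be driven: raid6_select_algo() benchmarks every entry of raid6_algos[] and latches the winner in raid6_call, after which all syndrome generation goes through the gen_syndrome function pointer (compare test.c further down in this diff). The sketch below is editorial, in the userspace-testing flavour of the header; the helper name and disk count are mine, and the pointer layout, data blocks first, then P, then Q, matches the gen_syndrome() implementations later in the diff.

#include <stdlib.h>
#include "raid6.h"	// struct raid6_calls, raid6_call, raid6_select_algo()

#define DATA_DISKS	4
#define NDISKS		(DATA_DISKS + 2)	// data blocks plus P and Q

// hypothetical helper: compute P and Q for one stripe of 'bytes' bytes
static void make_pq(void *blocks[NDISKS], size_t bytes)
{
	static int initialised;

	if (!initialised) {
		// benchmark raid6_algos[] once and latch the winner in raid6_call
		if (raid6_select_algo() != 0)
			abort();
		initialised = 1;
	}

	// blocks[0..DATA_DISKS-1] hold data; blocks[NDISKS-2] receives P,
	// blocks[NDISKS-1] receives Q
	raid6_call.gen_syndrome(NDISKS, bytes, blocks);
}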
- * - * ----------------------------------------------------------------------- */ - -/* - * raid6algos.c - * - * Algorithm list and algorithm selection for RAID-6 - */ - -#include "raid6.h" -#ifndef __KERNEL__ -#include <sys/mman.h> -#include <stdio.h> -#endif - -struct raid6_calls raid6_call; - -/* Various routine sets */ -extern const struct raid6_calls raid6_intx1; -extern const struct raid6_calls raid6_intx2; -extern const struct raid6_calls raid6_intx4; -extern const struct raid6_calls raid6_intx8; -extern const struct raid6_calls raid6_intx16; -extern const struct raid6_calls raid6_intx32; -extern const struct raid6_calls raid6_mmxx1; -extern const struct raid6_calls raid6_mmxx2; -extern const struct raid6_calls raid6_sse1x1; -extern const struct raid6_calls raid6_sse1x2; -extern const struct raid6_calls raid6_sse2x1; -extern const struct raid6_calls raid6_sse2x2; -extern const struct raid6_calls raid6_sse2x4; -extern const struct raid6_calls raid6_altivec1; -extern const struct raid6_calls raid6_altivec2; -extern const struct raid6_calls raid6_altivec4; -extern const struct raid6_calls raid6_altivec8; - -const struct raid6_calls * const raid6_algos[] = { - &raid6_intx1, - &raid6_intx2, - &raid6_intx4, - &raid6_intx8, -#if defined(__ia64__) - &raid6_intx16, - &raid6_intx32, -#endif -#if defined(__i386__) && !defined(__arch_um__) - &raid6_mmxx1, - &raid6_mmxx2, - &raid6_sse1x1, - &raid6_sse1x2, - &raid6_sse2x1, - &raid6_sse2x2, -#endif -#if defined(__x86_64__) && !defined(__arch_um__) - &raid6_sse2x1, - &raid6_sse2x2, - &raid6_sse2x4, -#endif -#ifdef CONFIG_ALTIVEC - &raid6_altivec1, - &raid6_altivec2, - &raid6_altivec4, - &raid6_altivec8, -#endif - NULL -}; - -#ifdef __KERNEL__ -#define RAID6_TIME_JIFFIES_LG2 4 -#else -/* Need more time to be stable in userspace */ -#define RAID6_TIME_JIFFIES_LG2 9 -#endif - -/* Try to pick the best algorithm */ -/* This code uses the gfmul table as convenient data set to abuse */ - -int __init raid6_select_algo(void) -{ - const struct raid6_calls * const * algo; - const struct raid6_calls * best; - char *syndromes; - void *dptrs[(65536/PAGE_SIZE)+2]; - int i, disks; - unsigned long perf, bestperf; - int bestprefer; - unsigned long j0, j1; - - disks = (65536/PAGE_SIZE)+2; - for ( i = 0 ; i < disks-2 ; i++ ) { - dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; - } - - /* Normal code - use a 2-page allocation to avoid D$ conflict */ - syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); - - if ( !syndromes ) { - printk("raid6: Yikes! No memory available.\n"); - return -ENOMEM; - } - - dptrs[disks-2] = syndromes; - dptrs[disks-1] = syndromes + PAGE_SIZE; - - bestperf = 0; bestprefer = 0; best = NULL; - - for ( algo = raid6_algos ; *algo ; algo++ ) { - if ( !(*algo)->valid || (*algo)->valid() ) { - perf = 0; - - preempt_disable(); - j0 = jiffies; - while ( (j1 = jiffies) == j0 ) - cpu_relax(); - while (time_before(jiffies, - j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { - (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); - perf++; - } - preempt_enable(); - - if ( (*algo)->prefer > bestprefer || - ((*algo)->prefer == bestprefer && - perf > bestperf) ) { - best = *algo; - bestprefer = best->prefer; - bestperf = perf; - } - printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, - (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); - } - } - - if (best) { - printk("raid6: using algorithm %s (%ld MB/s)\n", - best->name, - (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); - raid6_call = *best; - } else - printk("raid6: Yikes! 
No algorithm found!\n"); - - free_pages((unsigned long)syndromes, 1); - - return best ? 0 : -EINVAL; -} diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc deleted file mode 100644 index b9afd35b881..00000000000 --- a/drivers/md/raid6altivec.uc +++ /dev/null @@ -1,130 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * raid6altivec$#.c - * - * $#-way unrolled portable integer math RAID-6 instruction set - * - * This file is postprocessed using unroll.pl - * - * <benh> hpa: in process, - * you can just "steal" the vec unit with enable_kernel_altivec() (but - * bracked this with preempt_disable/enable or in a lock) - */ - -#include "raid6.h" - -#ifdef CONFIG_ALTIVEC - -#include <altivec.h> -#ifdef __KERNEL__ -# include <asm/system.h> -# include <asm/cputable.h> -#endif - -/* - * This is the C data type to use. We use a vector of - * signed char so vec_cmpgt() will generate the right - * instruction. - */ - -typedef vector signed char unative_t; - -#define NBYTES(x) ((vector signed char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) -#define NSIZE sizeof(unative_t) - -/* - * The SHLBYTE() operation shifts each byte left by 1, *not* - * rolling over into the next byte - */ -static inline __attribute_const__ unative_t SHLBYTE(unative_t v) -{ - return vec_add(v,v); -} - -/* - * The MASK() operation returns 0xFF in any byte for which the high - * bit is 1, 0x00 for any byte for which the high bit is 0. 
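The MB/s figures printed by raid6_select_algo() above fall out of the shift (perf*HZ) >> (20 - 16 + RAID6_TIME_JIFFIES_LG2): each benchmark call touches (disks-2)*PAGE_SIZE = 65536 bytes of data, and perf calls are timed over 1 << RAID6_TIME_JIFFIES_LG2 jiffies. A minimal editorial restatement of that arithmetic (the function name is mine, not part of the tree):

// bytes/sec = perf * 2^16 * HZ / 2^lg2
// MB/sec    = bytes/sec / 2^20 = (perf * HZ) >> (20 - 16 + lg2)
static unsigned long raid6_perf_to_mbs(unsigned long perf, unsigned long hz, int lg2)
{
	return (perf * hz) >> (20 - 16 + lg2);
}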
- */ -static inline __attribute_const__ unative_t MASK(unative_t v) -{ - unative_t zv = NBYTES(0); - - /* vec_cmpgt returns a vector bool char; thus the need for the cast */ - return (unative_t)vec_cmpgt(zv, v); -} - - -/* This is noinline to make damned sure that gcc doesn't move any of the - Altivec code around the enable/disable code */ -static void noinline -raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - unative_t wd$$, wq$$, wp$$, w1$$, w2$$; - unative_t x1d = NBYTES(0x1d); - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { - wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; - for ( z = z0-1 ; z >= 0 ; z-- ) { - wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; - wp$$ = vec_xor(wp$$, wd$$); - w2$$ = MASK(wq$$); - w1$$ = SHLBYTE(wq$$); - w2$$ = vec_and(w2$$, x1d); - w1$$ = vec_xor(w1$$, w2$$); - wq$$ = vec_xor(w1$$, wd$$); - } - *(unative_t *)&p[d+NSIZE*$$] = wp$$; - *(unative_t *)&q[d+NSIZE*$$] = wq$$; - } -} - -static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - preempt_disable(); - enable_kernel_altivec(); - - raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs); - - preempt_enable(); -} - -int raid6_have_altivec(void); -#if $# == 1 -int raid6_have_altivec(void) -{ - /* This assumes either all CPUs have Altivec or none does */ -# ifdef __KERNEL__ - return cpu_has_feature(CPU_FTR_ALTIVEC); -# else - return 1; -# endif -} -#endif - -const struct raid6_calls raid6_altivec$# = { - raid6_altivec$#_gen_syndrome, - raid6_have_altivec, - "altivecx$#", - 0 -}; - -#endif /* CONFIG_ALTIVEC */ diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc deleted file mode 100644 index ad004cee0e2..00000000000 --- a/drivers/md/raid6int.uc +++ /dev/null @@ -1,117 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * raid6int$#.c - * - * $#-way unrolled portable integer math RAID-6 instruction set - * - * This file is postprocessed using unroll.pl - */ - -#include "raid6.h" - -/* - * This is the C data type to use - */ - -/* Change this from BITS_PER_LONG if there is something better... */ -#if BITS_PER_LONG == 64 -# define NBYTES(x) ((x) * 0x0101010101010101UL) -# define NSIZE 8 -# define NSHIFT 3 -# define NSTRING "64" -typedef u64 unative_t; -#else -# define NBYTES(x) ((x) * 0x01010101U) -# define NSIZE 4 -# define NSHIFT 2 -# define NSTRING "32" -typedef u32 unative_t; -#endif - - - -/* - * IA-64 wants insane amounts of unrolling. On other architectures that - * is just a waste of space. - */ -#if ($# <= 8) || defined(__ia64__) - - -/* - * These sub-operations are separate inlines since they can sometimes be - * specially optimized using architecture-specific hacks. 
- */ - -/* - * The SHLBYTE() operation shifts each byte left by 1, *not* - * rolling over into the next byte - */ -static inline __attribute_const__ unative_t SHLBYTE(unative_t v) -{ - unative_t vv; - - vv = (v << 1) & NBYTES(0xfe); - return vv; -} - -/* - * The MASK() operation returns 0xFF in any byte for which the high - * bit is 1, 0x00 for any byte for which the high bit is 0. - */ -static inline __attribute_const__ unative_t MASK(unative_t v) -{ - unative_t vv; - - vv = v & NBYTES(0x80); - vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */ - return vv; -} - - -static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - unative_t wd$$, wq$$, wp$$, w1$$, w2$$; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { - wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; - for ( z = z0-1 ; z >= 0 ; z-- ) { - wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; - wp$$ ^= wd$$; - w2$$ = MASK(wq$$); - w1$$ = SHLBYTE(wq$$); - w2$$ &= NBYTES(0x1d); - w1$$ ^= w2$$; - wq$$ = w1$$ ^ wd$$; - } - *(unative_t *)&p[d+NSIZE*$$] = wp$$; - *(unative_t *)&q[d+NSIZE*$$] = wq$$; - } -} - -const struct raid6_calls raid6_intx$# = { - raid6_int$#_gen_syndrome, - NULL, /* always valid */ - "int" NSTRING "x$#", - 0 -}; - -#endif diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c deleted file mode 100644 index d4e4a1bd70a..00000000000 --- a/drivers/md/raid6mmx.c +++ /dev/null @@ -1,142 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. 
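Both the unrolled integer code above and the SIMD versions that follow repeat the same inner step: multiply every byte of the running Q value by 2 in GF(2^8) with the RAID-6 polynomial 0x11d, which is exactly what the MASK()/SHLBYTE()/0x1d sequence expands to. A scalar, byte-at-a-time equivalent, for illustration only (the helper name is mine):

#include <stdint.h>

// multiply one GF(2^8) element by x (i.e. by 2) modulo x^8 + x^4 + x^3 + x^2 + 1
static inline uint8_t gf256_mul2(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0x00));
}

Per byte, the syndrome loops then reduce to p ^= d; q = gf256_mul2(q) ^ d, iterated from the highest data disk down to disk 0.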
- * - * ----------------------------------------------------------------------- */ - -/* - * raid6mmx.c - * - * MMX implementation of RAID-6 syndrome functions - */ - -#if defined(__i386__) && !defined(__arch_um__) - -#include "raid6.h" -#include "raid6x86.h" - -/* Shared with raid6sse1.c */ -const struct raid6_mmx_constants { - u64 x1d; -} raid6_mmx_constants = { - 0x1d1d1d1d1d1d1d1dULL, -}; - -static int raid6_have_mmx(void) -{ - /* Not really "boot_cpu" but "all_cpus" */ - return boot_cpu_has(X86_FEATURE_MMX); -} - -/* - * Plain MMX implementation - */ -static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); - asm volatile("pxor %mm5,%mm5"); /* Zero temp */ - - for ( d = 0 ; d < bytes ; d += 8 ) { - asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("movq %mm2,%mm4"); /* Q[0] */ - for ( z = z0-1 ; z >= 0 ; z-- ) { - asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); - asm volatile("pcmpgtb %mm4,%mm5"); - asm volatile("paddb %mm4,%mm4"); - asm volatile("pand %mm0,%mm5"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm5,%mm5"); - asm volatile("pxor %mm6,%mm2"); - asm volatile("pxor %mm6,%mm4"); - } - asm volatile("movq %%mm2,%0" : "=m" (p[d])); - asm volatile("pxor %mm2,%mm2"); - asm volatile("movq %%mm4,%0" : "=m" (q[d])); - asm volatile("pxor %mm4,%mm4"); - } - - kernel_fpu_end(); -} - -const struct raid6_calls raid6_mmxx1 = { - raid6_mmx1_gen_syndrome, - raid6_have_mmx, - "mmxx1", - 0 -}; - -/* - * Unrolled-by-2 MMX implementation - */ -static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); - asm volatile("pxor %mm5,%mm5"); /* Zero temp */ - asm volatile("pxor %mm7,%mm7"); /* Zero temp */ - - for ( d = 0 ; d < bytes ; d += 16 ) { - asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); - asm volatile("movq %mm2,%mm4"); /* Q[0] */ - asm volatile("movq %mm3,%mm6"); /* Q[1] */ - for ( z = z0-1 ; z >= 0 ; z-- ) { - asm volatile("pcmpgtb %mm4,%mm5"); - asm volatile("pcmpgtb %mm6,%mm7"); - asm volatile("paddb %mm4,%mm4"); - asm volatile("paddb %mm6,%mm6"); - asm volatile("pand %mm0,%mm5"); - asm volatile("pand %mm0,%mm7"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm7,%mm6"); - asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); - asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); - asm volatile("pxor %mm5,%mm2"); - asm volatile("pxor %mm7,%mm3"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm7,%mm6"); - asm volatile("pxor %mm5,%mm5"); - asm volatile("pxor %mm7,%mm7"); - } - asm volatile("movq %%mm2,%0" : "=m" (p[d])); - asm volatile("movq %%mm3,%0" : "=m" (p[d+8])); - asm volatile("movq %%mm4,%0" : "=m" (q[d])); - asm volatile("movq %%mm6,%0" : "=m" (q[d+8])); - } - - kernel_fpu_end(); -} - -const struct raid6_calls raid6_mmxx2 = { - raid6_mmx2_gen_syndrome, - raid6_have_mmx, - "mmxx2", - 0 -}; - -#endif diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c deleted file mode 100644 index 
a8c4d9451bd..00000000000 --- a/drivers/md/raid6recov.c +++ /dev/null @@ -1,133 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * raid6recov.c - * - * RAID-6 data recovery in dual failure mode. In single failure mode, - * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct - * the syndrome.) - */ - -#include "raid6.h" - -/* Recover two failed data blocks. */ -void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, - void **ptrs) -{ - u8 *p, *q, *dp, *dq; - u8 px, qx, db; - const u8 *pbmul; /* P multiplier table for B data */ - const u8 *qmul; /* Q multiplier table (for both) */ - - p = (u8 *)ptrs[disks-2]; - q = (u8 *)ptrs[disks-1]; - - /* Compute syndrome with zero for the missing data pages - Use the dead data pages as temporary storage for - delta p and delta q */ - dp = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; - ptrs[disks-2] = dp; - dq = (u8 *)ptrs[failb]; - ptrs[failb] = (void *)raid6_empty_zero_page; - ptrs[disks-1] = dq; - - raid6_call.gen_syndrome(disks, bytes, ptrs); - - /* Restore pointer table */ - ptrs[faila] = dp; - ptrs[failb] = dq; - ptrs[disks-2] = p; - ptrs[disks-1] = q; - - /* Now, pick the proper data tables */ - pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; - qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; - - /* Now do it... */ - while ( bytes-- ) { - px = *p ^ *dp; - qx = qmul[*q ^ *dq]; - *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ - *dp++ = db ^ px; /* Reconstructed A */ - p++; q++; - } -} - - - - -/* Recover failure of one data block plus the P block */ -void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) -{ - u8 *p, *q, *dq; - const u8 *qmul; /* Q multiplier table */ - - p = (u8 *)ptrs[disks-2]; - q = (u8 *)ptrs[disks-1]; - - /* Compute syndrome with zero for the missing data page - Use the dead data page as temporary storage for delta q */ - dq = (u8 *)ptrs[faila]; - ptrs[faila] = (void *)raid6_empty_zero_page; - ptrs[disks-1] = dq; - - raid6_call.gen_syndrome(disks, bytes, ptrs); - - /* Restore pointer table */ - ptrs[faila] = dq; - ptrs[disks-1] = q; - - /* Now, pick the proper data tables */ - qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; - - /* Now do it... */ - while ( bytes-- ) { - *p++ ^= *dq = qmul[*q ^ *dq]; - q++; dq++; - } -} - - -#ifndef __KERNEL__ /* Testing only */ - -/* Recover two failed blocks. */ -void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) -{ - if ( faila > failb ) { - int tmp = faila; - faila = failb; - failb = tmp; - } - - if ( failb == disks-1 ) { - if ( faila == disks-2 ) { - /* P+Q failure. Just rebuild the syndrome. */ - raid6_call.gen_syndrome(disks, bytes, ptrs); - } else { - /* data+Q failure. Reconstruct data from P, - then rebuild syndrome. */ - /* NOT IMPLEMENTED - equivalent to RAID-5 */ - } - } else { - if ( failb == disks-2 ) { - /* data+P failure. */ - raid6_datap_recov(disks, bytes, faila, ptrs); - } else { - /* data+data failure. 
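For reference, the table lookups in raid6_2data_recov() above are the closed-form solution of the two RAID-6 equations. The restatement below is editorial, with g = 2 the GF(2^8) generator, a < b the failed data slots, ⊕ denoting XOR, and P', Q' the syndromes recomputed with those slots zeroed (the dp/dq buffers in the code):

	P ⊕ P' = Da ⊕ Db
	Q ⊕ Q' = g^a·Da ⊕ g^b·Db

	Db = (P ⊕ P')/(g^(b-a) ⊕ 1)  ⊕  (Q ⊕ Q')/(g^a ⊕ g^b)
	Da = Db ⊕ (P ⊕ P')

pbmul = raid6_gfmul[raid6_gfexi[failb-faila]] multiplies by 1/(g^(b-a) ⊕ 1) and qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]] multiplies by 1/(g^a ⊕ g^b), so the byte loop computes Db first and then Da = Db ⊕ px. raid6_datap_recov() is the one-unknown case: only Q is usable, Q ⊕ Q' = g^a·Da, hence the single g^(-a) multiplier qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]].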
*/ - raid6_2data_recov(disks, bytes, faila, failb, ptrs); - } - } -} - -#endif diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c deleted file mode 100644 index 0666237276f..00000000000 --- a/drivers/md/raid6sse1.c +++ /dev/null @@ -1,162 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. - * - * ----------------------------------------------------------------------- */ - -/* - * raid6sse1.c - * - * SSE-1/MMXEXT implementation of RAID-6 syndrome functions - * - * This is really an MMX implementation, but it requires SSE-1 or - * AMD MMXEXT for prefetch support and a few other features. The - * support for nontemporal memory accesses is enough to make this - * worthwhile as a separate implementation. - */ - -#if defined(__i386__) && !defined(__arch_um__) - -#include "raid6.h" -#include "raid6x86.h" - -/* Defined in raid6mmx.c */ -extern const struct raid6_mmx_constants { - u64 x1d; -} raid6_mmx_constants; - -static int raid6_have_sse1_or_mmxext(void) -{ - /* Not really boot_cpu but "all_cpus" */ - return boot_cpu_has(X86_FEATURE_MMX) && - (boot_cpu_has(X86_FEATURE_XMM) || - boot_cpu_has(X86_FEATURE_MMXEXT)); -} - -/* - * Plain SSE1 implementation - */ -static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); - asm volatile("pxor %mm5,%mm5"); /* Zero temp */ - - for ( d = 0 ; d < bytes ; d += 8 ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); - asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); - asm volatile("movq %mm2,%mm4"); /* Q[0] */ - asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d])); - for ( z = z0-2 ; z >= 0 ; z-- ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); - asm volatile("pcmpgtb %mm4,%mm5"); - asm volatile("paddb %mm4,%mm4"); - asm volatile("pand %mm0,%mm5"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm5,%mm5"); - asm volatile("pxor %mm6,%mm2"); - asm volatile("pxor %mm6,%mm4"); - asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); - } - asm volatile("pcmpgtb %mm4,%mm5"); - asm volatile("paddb %mm4,%mm4"); - asm volatile("pand %mm0,%mm5"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm5,%mm5"); - asm volatile("pxor %mm6,%mm2"); - asm volatile("pxor %mm6,%mm4"); - - asm volatile("movntq %%mm2,%0" : "=m" (p[d])); - asm volatile("movntq %%mm4,%0" : "=m" (q[d])); - } - - asm volatile("sfence" : : : "memory"); - kernel_fpu_end(); -} - -const struct raid6_calls raid6_sse1x1 = { - raid6_sse11_gen_syndrome, - raid6_have_sse1_or_mmxext, - "sse1x1", - 1 /* Has cache hints */ -}; - -/* - * Unrolled-by-2 SSE1 implementation - */ -static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity 
*/ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); - asm volatile("pxor %mm5,%mm5"); /* Zero temp */ - asm volatile("pxor %mm7,%mm7"); /* Zero temp */ - - /* We uniformly assume a single prefetch covers at least 16 bytes */ - for ( d = 0 ; d < bytes ; d += 16 ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); - asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */ - asm volatile("movq %mm2,%mm4"); /* Q[0] */ - asm volatile("movq %mm3,%mm6"); /* Q[1] */ - for ( z = z0-1 ; z >= 0 ; z-- ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); - asm volatile("pcmpgtb %mm4,%mm5"); - asm volatile("pcmpgtb %mm6,%mm7"); - asm volatile("paddb %mm4,%mm4"); - asm volatile("paddb %mm6,%mm6"); - asm volatile("pand %mm0,%mm5"); - asm volatile("pand %mm0,%mm7"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm7,%mm6"); - asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); - asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); - asm volatile("pxor %mm5,%mm2"); - asm volatile("pxor %mm7,%mm3"); - asm volatile("pxor %mm5,%mm4"); - asm volatile("pxor %mm7,%mm6"); - asm volatile("pxor %mm5,%mm5"); - asm volatile("pxor %mm7,%mm7"); - } - asm volatile("movntq %%mm2,%0" : "=m" (p[d])); - asm volatile("movntq %%mm3,%0" : "=m" (p[d+8])); - asm volatile("movntq %%mm4,%0" : "=m" (q[d])); - asm volatile("movntq %%mm6,%0" : "=m" (q[d+8])); - } - - asm volatile("sfence" : :: "memory"); - kernel_fpu_end(); -} - -const struct raid6_calls raid6_sse1x2 = { - raid6_sse12_gen_syndrome, - raid6_have_sse1_or_mmxext, - "sse1x2", - 1 /* Has cache hints */ -}; - -#endif diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c deleted file mode 100644 index b034ad86803..00000000000 --- a/drivers/md/raid6sse2.c +++ /dev/null @@ -1,262 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. 
- * - * ----------------------------------------------------------------------- */ - -/* - * raid6sse2.c - * - * SSE-2 implementation of RAID-6 syndrome functions - * - */ - -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - -#include "raid6.h" -#include "raid6x86.h" - -static const struct raid6_sse_constants { - u64 x1d[2]; -} raid6_sse_constants __attribute__((aligned(16))) = { - { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL }, -}; - -static int raid6_have_sse2(void) -{ - /* Not really boot_cpu but "all_cpus" */ - return boot_cpu_has(X86_FEATURE_MMX) && - boot_cpu_has(X86_FEATURE_FXSR) && - boot_cpu_has(X86_FEATURE_XMM) && - boot_cpu_has(X86_FEATURE_XMM2); -} - -/* - * Plain SSE2 implementation - */ -static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); - asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ - - for ( d = 0 ; d < bytes ; d += 16 ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); - asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); - asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ - asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); - for ( z = z0-2 ; z >= 0 ; z-- ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); - asm volatile("pcmpgtb %xmm4,%xmm5"); - asm volatile("paddb %xmm4,%xmm4"); - asm volatile("pand %xmm0,%xmm5"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm5,%xmm5"); - asm volatile("pxor %xmm6,%xmm2"); - asm volatile("pxor %xmm6,%xmm4"); - asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); - } - asm volatile("pcmpgtb %xmm4,%xmm5"); - asm volatile("paddb %xmm4,%xmm4"); - asm volatile("pand %xmm0,%xmm5"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm5,%xmm5"); - asm volatile("pxor %xmm6,%xmm2"); - asm volatile("pxor %xmm6,%xmm4"); - - asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); - asm volatile("pxor %xmm2,%xmm2"); - asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); - asm volatile("pxor %xmm4,%xmm4"); - } - - asm volatile("sfence" : : : "memory"); - kernel_fpu_end(); -} - -const struct raid6_calls raid6_sse2x1 = { - raid6_sse21_gen_syndrome, - raid6_have_sse2, - "sse2x1", - 1 /* Has cache hints */ -}; - -/* - * Unrolled-by-2 SSE2 implementation - */ -static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); - asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ - asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ - - /* We uniformly assume a single prefetch covers at least 32 bytes */ - for ( d = 0 ; d < bytes ; d += 32 ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); - asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ - asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */ - asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ - asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ - for ( z = z0-1 ; z >= 0 ; z-- ) { - asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); - asm volatile("pcmpgtb %xmm4,%xmm5"); 
- asm volatile("pcmpgtb %xmm6,%xmm7"); - asm volatile("paddb %xmm4,%xmm4"); - asm volatile("paddb %xmm6,%xmm6"); - asm volatile("pand %xmm0,%xmm5"); - asm volatile("pand %xmm0,%xmm7"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm7,%xmm6"); - asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); - asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); - asm volatile("pxor %xmm5,%xmm2"); - asm volatile("pxor %xmm7,%xmm3"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm7,%xmm6"); - asm volatile("pxor %xmm5,%xmm5"); - asm volatile("pxor %xmm7,%xmm7"); - } - asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); - asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); - asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); - asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); - } - - asm volatile("sfence" : : : "memory"); - kernel_fpu_end(); -} - -const struct raid6_calls raid6_sse2x2 = { - raid6_sse22_gen_syndrome, - raid6_have_sse2, - "sse2x2", - 1 /* Has cache hints */ -}; - -#endif - -#if defined(__x86_64__) && !defined(__arch_um__) - -/* - * Unrolled-by-4 SSE2 implementation - */ -static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) -{ - u8 **dptr = (u8 **)ptrs; - u8 *p, *q; - int d, z, z0; - - z0 = disks - 3; /* Highest data disk */ - p = dptr[z0+1]; /* XOR parity */ - q = dptr[z0+2]; /* RS syndrome */ - - kernel_fpu_begin(); - - asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); - asm volatile("pxor %xmm2,%xmm2"); /* P[0] */ - asm volatile("pxor %xmm3,%xmm3"); /* P[1] */ - asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */ - asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ - asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */ - asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ - asm volatile("pxor %xmm10,%xmm10"); /* P[2] */ - asm volatile("pxor %xmm11,%xmm11"); /* P[3] */ - asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */ - asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */ - asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */ - asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */ - - for ( d = 0 ; d < bytes ; d += 64 ) { - for ( z = z0 ; z >= 0 ; z-- ) { - /* The second prefetch seems to improve performance... 
*/ - asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); - asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); - asm volatile("pcmpgtb %xmm4,%xmm5"); - asm volatile("pcmpgtb %xmm6,%xmm7"); - asm volatile("pcmpgtb %xmm12,%xmm13"); - asm volatile("pcmpgtb %xmm14,%xmm15"); - asm volatile("paddb %xmm4,%xmm4"); - asm volatile("paddb %xmm6,%xmm6"); - asm volatile("paddb %xmm12,%xmm12"); - asm volatile("paddb %xmm14,%xmm14"); - asm volatile("pand %xmm0,%xmm5"); - asm volatile("pand %xmm0,%xmm7"); - asm volatile("pand %xmm0,%xmm13"); - asm volatile("pand %xmm0,%xmm15"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm7,%xmm6"); - asm volatile("pxor %xmm13,%xmm12"); - asm volatile("pxor %xmm15,%xmm14"); - asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); - asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); - asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); - asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); - asm volatile("pxor %xmm5,%xmm2"); - asm volatile("pxor %xmm7,%xmm3"); - asm volatile("pxor %xmm13,%xmm10"); - asm volatile("pxor %xmm15,%xmm11"); - asm volatile("pxor %xmm5,%xmm4"); - asm volatile("pxor %xmm7,%xmm6"); - asm volatile("pxor %xmm13,%xmm12"); - asm volatile("pxor %xmm15,%xmm14"); - asm volatile("pxor %xmm5,%xmm5"); - asm volatile("pxor %xmm7,%xmm7"); - asm volatile("pxor %xmm13,%xmm13"); - asm volatile("pxor %xmm15,%xmm15"); - } - asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); - asm volatile("pxor %xmm2,%xmm2"); - asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); - asm volatile("pxor %xmm3,%xmm3"); - asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); - asm volatile("pxor %xmm10,%xmm10"); - asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); - asm volatile("pxor %xmm11,%xmm11"); - asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); - asm volatile("pxor %xmm4,%xmm4"); - asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); - asm volatile("pxor %xmm6,%xmm6"); - asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); - asm volatile("pxor %xmm12,%xmm12"); - asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); - asm volatile("pxor %xmm14,%xmm14"); - } - - asm volatile("sfence" : : : "memory"); - kernel_fpu_end(); -} - -const struct raid6_calls raid6_sse2x4 = { - raid6_sse24_gen_syndrome, - raid6_have_sse2, - "sse2x4", - 1 /* Has cache hints */ -}; - -#endif diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile deleted file mode 100644 index 78e0396adf2..00000000000 --- a/drivers/md/raid6test/Makefile +++ /dev/null @@ -1,75 +0,0 @@ -# -# This is a simple Makefile to test some of the RAID-6 code -# from userspace. -# - -CC = gcc -OPTFLAGS = -O2 # Adjust as desired -CFLAGS = -I.. 
-g $(OPTFLAGS) -LD = ld -PERL = perl -AR = ar -RANLIB = ranlib - -.c.o: - $(CC) $(CFLAGS) -c -o $@ $< - -%.c: ../%.c - cp -f $< $@ - -%.uc: ../%.uc - cp -f $< $@ - -all: raid6.a raid6test - -raid6.a: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \ - raid6int32.o \ - raid6mmx.o raid6sse1.o raid6sse2.o \ - raid6altivec1.o raid6altivec2.o raid6altivec4.o raid6altivec8.o \ - raid6recov.o raid6algos.o \ - raid6tables.o - rm -f $@ - $(AR) cq $@ $^ - $(RANLIB) $@ - -raid6test: test.c raid6.a - $(CC) $(CFLAGS) -o raid6test $^ - -raid6altivec1.c: raid6altivec.uc ../unroll.pl - $(PERL) ../unroll.pl 1 < raid6altivec.uc > $@ - -raid6altivec2.c: raid6altivec.uc ../unroll.pl - $(PERL) ../unroll.pl 2 < raid6altivec.uc > $@ - -raid6altivec4.c: raid6altivec.uc ../unroll.pl - $(PERL) ../unroll.pl 4 < raid6altivec.uc > $@ - -raid6altivec8.c: raid6altivec.uc ../unroll.pl - $(PERL) ../unroll.pl 8 < raid6altivec.uc > $@ - -raid6int1.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 1 < raid6int.uc > $@ - -raid6int2.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 2 < raid6int.uc > $@ - -raid6int4.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 4 < raid6int.uc > $@ - -raid6int8.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 8 < raid6int.uc > $@ - -raid6int16.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 16 < raid6int.uc > $@ - -raid6int32.c: raid6int.uc ../unroll.pl - $(PERL) ../unroll.pl 32 < raid6int.uc > $@ - -raid6tables.c: mktables - ./mktables > raid6tables.c - -clean: - rm -f *.o *.a mktables mktables.c raid6int.uc raid6*.c raid6test - -spotless: clean - rm -f *~ diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c deleted file mode 100644 index 559cc41b258..00000000000 --- a/drivers/md/raid6test/test.c +++ /dev/null @@ -1,124 +0,0 @@ -/* -*- linux-c -*- ------------------------------------------------------- * - * - * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved - * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2 or (at your - * option) any later version; incorporated herein by reference. 
- * - * ----------------------------------------------------------------------- */ - -/* - * raid6test.c - * - * Test RAID-6 recovery with various algorithms - */ - -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include "raid6.h" - -#define NDISKS 16 /* Including P and Q */ - -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); -struct raid6_calls raid6_call; - -char *dataptrs[NDISKS]; -char data[NDISKS][PAGE_SIZE]; -char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; - -static void makedata(void) -{ - int i, j; - - for (i = 0; i < NDISKS; i++) { - for (j = 0; j < PAGE_SIZE; j++) - data[i][j] = rand(); - - dataptrs[i] = data[i]; - } -} - -static char disk_type(int d) -{ - switch (d) { - case NDISKS-2: - return 'P'; - case NDISKS-1: - return 'Q'; - default: - return 'D'; - } -} - -static int test_disks(int i, int j) -{ - int erra, errb; - - memset(recovi, 0xf0, PAGE_SIZE); - memset(recovj, 0xba, PAGE_SIZE); - - dataptrs[i] = recovi; - dataptrs[j] = recovj; - - raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); - - erra = memcmp(data[i], recovi, PAGE_SIZE); - errb = memcmp(data[j], recovj, PAGE_SIZE); - - if (i < NDISKS-2 && j == NDISKS-1) { - /* We don't implement the DQ failure scenario, since it's - equivalent to a RAID-5 failure (XOR, then recompute Q) */ - erra = errb = 0; - } else { - printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", - raid6_call.name, - i, disk_type(i), - j, disk_type(j), - (!erra && !errb) ? "OK" : - !erra ? "ERRB" : - !errb ? "ERRA" : "ERRAB"); - } - - dataptrs[i] = data[i]; - dataptrs[j] = data[j]; - - return erra || errb; -} - -int main(int argc, char *argv[]) -{ - const struct raid6_calls *const *algo; - int i, j; - int err = 0; - - makedata(); - - for (algo = raid6_algos; *algo; algo++) { - if (!(*algo)->valid || (*algo)->valid()) { - raid6_call = **algo; - - /* Nuke syndromes */ - memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); - - /* Generate assumed good syndrome */ - raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, - (void **)&dataptrs); - - for (i = 0; i < NDISKS-1; i++) - for (j = i+1; j < NDISKS; j++) - err += test_disks(i, j); - } - printf("\n"); - } - - printf("\n"); - /* Pick the best algorithm test */ - raid6_select_algo(); - - if (err) - printf("\n*** ERRORS FOUND ***\n"); - - return err; -} diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h deleted file mode 100644 index 99fea7a70ca..00000000000 --- a/drivers/md/raid6x86.h +++ /dev/null @@ -1,61 +0,0 @@ -/* ----------------------------------------------------------------------- * - * - * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, Inc., 53 Temple Place Ste 330, - * Bostom MA 02111-1307, USA; either version 2 of the License, or - * (at your option) any later version; incorporated herein by reference. 
- * - * ----------------------------------------------------------------------- */ - -/* - * raid6x86.h - * - * Definitions common to x86 and x86-64 RAID-6 code only - */ - -#ifndef LINUX_RAID_RAID6X86_H -#define LINUX_RAID_RAID6X86_H - -#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) - -#ifdef __KERNEL__ /* Real code */ - -#include <asm/i387.h> - -#else /* Dummy code for user space testing */ - -static inline void kernel_fpu_begin(void) -{ -} - -static inline void kernel_fpu_end(void) -{ -} - -#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions - * (fast save and restore) */ -#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ -#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ -#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ - -/* Should work well enough on modern CPUs for testing */ -static inline int boot_cpu_has(int flag) -{ - u32 eax = (flag >> 5) ? 0x80000001 : 1; - u32 edx; - - asm volatile("cpuid" - : "+a" (eax), "=d" (edx) - : : "ecx", "ebx"); - - return (edx >> (flag & 31)) & 1; -} - -#endif /* ndef __KERNEL__ */ - -#endif -#endif diff --git a/drivers/md/unroll.pl b/drivers/md/unroll.pl deleted file mode 100644 index 3acc710a20e..00000000000 --- a/drivers/md/unroll.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl -# -# Take a piece of C code and for each line which contains the sequence $$ -# repeat n times with $ replaced by 0...n-1; the sequence $# is replaced -# by the unrolling factor, and $* with a single $ -# - -($n) = @ARGV; -$n += 0; - -while ( defined($line = <STDIN>) ) { - if ( $line =~ /\$\$/ ) { - $rep = $n; - } else { - $rep = 1; - } - for ( $i = 0 ; $i < $rep ; $i++ ) { - $tmp = $line; - $tmp =~ s/\$\$/$i/g; - $tmp =~ s/\$\#/$n/g; - $tmp =~ s/\$\*/\$/g; - print $tmp; - } -} |
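As a concrete illustration of the unroll.pl preprocessor above (the input line is my own example, not taken from the tree): running "perl unroll.pl 2" over a source line that contains the $$ marker,

	wq$$ ^= wd$$;		// $#-way unrolled

emits that line once per unroll step, with $$ replaced by the step number and $# by the unroll factor:

	wq0 ^= wd0;		// 2-way unrolled
	wq1 ^= wd1;		// 2-way unrolled

Lines without $$ are passed through once, and $* is turned back into a literal $.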
