diff options
Diffstat (limited to 'drivers/block/aoe/aoedev.c')
| -rw-r--r-- | drivers/block/aoe/aoedev.c | 455 |
1 files changed, 352 insertions, 103 deletions
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 6b5110a4745..e774c50b684 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ /* * aoedev.c * AoE device utility functions; maintains device list. @@ -9,30 +9,140 @@ #include <linux/netdevice.h> #include <linux/delay.h> #include <linux/slab.h> +#include <linux/bitmap.h> +#include <linux/kdev_t.h> +#include <linux/moduleparam.h> +#include <linux/string.h> #include "aoe.h" static void dummy_timer(ulong); -static void aoedev_freedev(struct aoedev *); static void freetgt(struct aoedev *d, struct aoetgt *t); static void skbpoolfree(struct aoedev *d); +static int aoe_dyndevs = 1; +module_param(aoe_dyndevs, int, 0644); +MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices."); + static struct aoedev *devlist; static DEFINE_SPINLOCK(devlist_lock); -struct aoedev * -aoedev_by_aoeaddr(int maj, int min) +/* Because some systems will have one, many, or no + * - partitions, + * - slots per shelf, + * - or shelves, + * we need some flexibility in the way the minor numbers + * are allocated. So they are dynamic. + */ +#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS) + +static DEFINE_SPINLOCK(used_minors_lock); +static DECLARE_BITMAP(used_minors, N_DEVS); + +static int +minor_get_dyn(ulong *sysminor) { - struct aoedev *d; ulong flags; + ulong n; + int error = 0; + + spin_lock_irqsave(&used_minors_lock, flags); + n = find_first_zero_bit(used_minors, N_DEVS); + if (n < N_DEVS) + set_bit(n, used_minors); + else + error = -1; + spin_unlock_irqrestore(&used_minors_lock, flags); + + *sysminor = n * AOE_PARTITIONS; + return error; +} - spin_lock_irqsave(&devlist_lock, flags); +static int +minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin) +{ + ulong flags; + ulong n; + int error = 0; + enum { + /* for backwards compatibility when !aoe_dyndevs, + * a static number of supported slots per shelf */ + NPERSHELF = 16, + }; + + if (aoemin >= NPERSHELF) { + pr_err("aoe: %s %d slots per shelf\n", + "static minor device numbers support only", + NPERSHELF); + error = -1; + goto out; + } - for (d=devlist; d; d=d->next) - if (d->aoemajor == maj && d->aoeminor == min) - break; + n = aoemaj * NPERSHELF + aoemin; + if (n >= N_DEVS) { + pr_err("aoe: %s with e%ld.%d\n", + "cannot use static minor device numbers", + aoemaj, aoemin); + error = -1; + goto out; + } + + spin_lock_irqsave(&used_minors_lock, flags); + if (test_bit(n, used_minors)) { + pr_err("aoe: %s %lu\n", + "existing device already has static minor number", + n); + error = -1; + } else + set_bit(n, used_minors); + spin_unlock_irqrestore(&used_minors_lock, flags); + *sysminor = n * AOE_PARTITIONS; +out: + return error; +} + +static int +minor_get(ulong *sysminor, ulong aoemaj, int aoemin) +{ + if (aoe_dyndevs) + return minor_get_dyn(sysminor); + else + return minor_get_static(sysminor, aoemaj, aoemin); +} + +static void +minor_free(ulong minor) +{ + ulong flags; + + minor /= AOE_PARTITIONS; + BUG_ON(minor >= N_DEVS); + spin_lock_irqsave(&used_minors_lock, flags); + BUG_ON(!test_bit(minor, used_minors)); + clear_bit(minor, used_minors); + spin_unlock_irqrestore(&used_minors_lock, flags); +} + +/* + * Users who grab a pointer to the device with aoedev_by_aoeaddr + * automatically get a reference count and must be responsible + * for performing a aoedev_put. With the addition of async + * kthread processing I'm no longer confident that we can + * guarantee consistency in the face of device flushes. + * + * For the time being, we only bother to add extra references for + * frames sitting on the iocq. When the kthreads finish processing + * these frames, they will aoedev_put the device. + */ + +void +aoedev_put(struct aoedev *d) +{ + ulong flags; + + spin_lock_irqsave(&devlist_lock, flags); + d->ref--; spin_unlock_irqrestore(&devlist_lock, flags); - return d; } static void @@ -47,128 +157,247 @@ dummy_timer(ulong vp) add_timer(&d->timer); } +static void +aoe_failip(struct aoedev *d) +{ + struct request *rq; + struct bio *bio; + unsigned long n; + + aoe_failbuf(d, d->ip.buf); + + rq = d->ip.rq; + if (rq == NULL) + return; + while ((bio = d->ip.nxbio)) { + clear_bit(BIO_UPTODATE, &bio->bi_flags); + d->ip.nxbio = bio->bi_next; + n = (unsigned long) rq->special; + rq->special = (void *) --n; + } + if ((unsigned long) rq->special == 0) + aoe_end_request(d, rq, 0); +} + +static void +downdev_frame(struct list_head *pos) +{ + struct frame *f; + + f = list_entry(pos, struct frame, head); + list_del(pos); + if (f->buf) { + f->buf->nframesout--; + aoe_failbuf(f->t->d, f->buf); + } + aoe_freetframe(f); +} + void aoedev_downdev(struct aoedev *d) { - struct aoetgt **t, **te; - struct frame *f, *e; - struct buf *buf; - struct bio *bio; + struct aoetgt *t, **tt, **te; + struct list_head *head, *pos, *nx; + struct request *rq; + int i; - t = d->targets; - te = t + NTARGETS; - for (; t < te && *t; t++) { - f = (*t)->frames; - e = f + (*t)->nframes; - for (; f < e; f->tag = FREETAG, f->buf = NULL, f++) { - if (f->tag == FREETAG || f->buf == NULL) - continue; - buf = f->buf; - bio = buf->bio; - if (--buf->nframesout == 0 - && buf != d->inprocess) { - mempool_free(buf, d->bufpool); - bio_endio(bio, -EIO); - } - } - (*t)->maxout = (*t)->nframes; - (*t)->nout = 0; + d->flags &= ~DEVFL_UP; + + /* clean out active and to-be-retransmitted buffers */ + for (i = 0; i < NFACTIVE; i++) { + head = &d->factive[i]; + list_for_each_safe(pos, nx, head) + downdev_frame(pos); } - buf = d->inprocess; - if (buf) { - bio = buf->bio; - mempool_free(buf, d->bufpool); - bio_endio(bio, -EIO); + head = &d->rexmitq; + list_for_each_safe(pos, nx, head) + downdev_frame(pos); + + /* reset window dressings */ + tt = d->targets; + te = tt + d->ntargets; + for (; tt < te && (t = *tt); tt++) { + aoecmd_wreset(t); + t->nout = 0; } - d->inprocess = NULL; - d->htgt = NULL; - - while (!list_empty(&d->bufq)) { - buf = container_of(d->bufq.next, struct buf, bufs); - list_del(d->bufq.next); - bio = buf->bio; - mempool_free(buf, d->bufpool); - bio_endio(bio, -EIO); + + /* clean out the in-process request (if any) */ + aoe_failip(d); + + /* fast fail all pending I/O */ + if (d->blkq) { + while ((rq = blk_peek_request(d->blkq))) { + blk_start_request(rq); + aoe_end_request(d, rq, 1); + } } if (d->gd) set_capacity(d->gd, 0); +} - d->flags &= ~DEVFL_UP; +/* return whether the user asked for this particular + * device to be flushed + */ +static int +user_req(char *s, size_t slen, struct aoedev *d) +{ + const char *p; + size_t lim; + + if (!d->gd) + return 0; + p = kbasename(d->gd->disk_name); + lim = sizeof(d->gd->disk_name); + lim -= p - d->gd->disk_name; + if (slen < lim) + lim = slen; + + return !strncmp(s, p, lim); } static void -aoedev_freedev(struct aoedev *d) +freedev(struct aoedev *d) { struct aoetgt **t, **e; + int freeing = 0; + unsigned long flags; + + spin_lock_irqsave(&d->lock, flags); + if (d->flags & DEVFL_TKILL + && !(d->flags & DEVFL_FREEING)) { + d->flags |= DEVFL_FREEING; + freeing = 1; + } + spin_unlock_irqrestore(&d->lock, flags); + if (!freeing) + return; - cancel_work_sync(&d->work); + del_timer_sync(&d->timer); if (d->gd) { + aoedisk_rm_debugfs(d); aoedisk_rm_sysfs(d); del_gendisk(d->gd); put_disk(d->gd); + blk_cleanup_queue(d->blkq); } t = d->targets; - e = t + NTARGETS; + e = t + d->ntargets; for (; t < e && *t; t++) freetgt(d, *t); if (d->bufpool) mempool_destroy(d->bufpool); skbpoolfree(d); - blk_cleanup_queue(d->blkq); - kfree(d); + minor_free(d->sysminor); + + spin_lock_irqsave(&d->lock, flags); + d->flags |= DEVFL_FREED; + spin_unlock_irqrestore(&d->lock, flags); } -int -aoedev_flush(const char __user *str, size_t cnt) +enum flush_parms { + NOT_EXITING = 0, + EXITING = 1, +}; + +static int +flush(const char __user *str, size_t cnt, int exiting) { ulong flags; struct aoedev *d, **dd; - struct aoedev *rmd = NULL; char buf[16]; int all = 0; + int specified = 0; /* flush a specific device */ + unsigned int skipflags; + + skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL; - if (cnt >= 3) { + if (!exiting && cnt >= 3) { if (cnt > sizeof buf) cnt = sizeof buf; if (copy_from_user(buf, str, cnt)) return -EFAULT; all = !strncmp(buf, "all", 3); + if (!all) + specified = 1; } + flush_scheduled_work(); + /* pass one: without sleeping, do aoedev_downdev */ spin_lock_irqsave(&devlist_lock, flags); - dd = &devlist; - while ((d = *dd)) { + for (d = devlist; d; d = d->next) { spin_lock(&d->lock); - if ((!all && (d->flags & DEVFL_UP)) - || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) - || d->nopen) { - spin_unlock(&d->lock); - dd = &d->next; - continue; - } - *dd = d->next; + if (exiting) { + /* unconditionally take each device down */ + } else if (specified) { + if (!user_req(buf, cnt, d)) + goto cont; + } else if ((!all && (d->flags & DEVFL_UP)) + || d->flags & skipflags + || d->nopen + || d->ref) + goto cont; + aoedev_downdev(d); d->flags |= DEVFL_TKILL; +cont: spin_unlock(&d->lock); - d->next = rmd; - rmd = d; } spin_unlock_irqrestore(&devlist_lock, flags); - while ((d = rmd)) { - rmd = d->next; - del_timer_sync(&d->timer); - aoedev_freedev(d); /* must be able to sleep */ + + /* pass two: call freedev, which might sleep, + * for aoedevs marked with DEVFL_TKILL + */ +restart: + spin_lock_irqsave(&devlist_lock, flags); + for (d = devlist; d; d = d->next) { + spin_lock(&d->lock); + if (d->flags & DEVFL_TKILL + && !(d->flags & DEVFL_FREEING)) { + spin_unlock(&d->lock); + spin_unlock_irqrestore(&devlist_lock, flags); + freedev(d); + goto restart; + } + spin_unlock(&d->lock); } + + /* pass three: remove aoedevs marked with DEVFL_FREED */ + for (dd = &devlist, d = *dd; d; d = *dd) { + struct aoedev *doomed = NULL; + + spin_lock(&d->lock); + if (d->flags & DEVFL_FREED) { + *dd = d->next; + doomed = d; + } else { + dd = &d->next; + } + spin_unlock(&d->lock); + if (doomed) + kfree(doomed->targets); + kfree(doomed); + } + spin_unlock_irqrestore(&devlist_lock, flags); + return 0; } -/* I'm not really sure that this is a realistic problem, but if the -network driver goes gonzo let's just leak memory after complaining. */ +int +aoedev_flush(const char __user *str, size_t cnt) +{ + return flush(str, cnt, NOT_EXITING); +} + +/* This has been confirmed to occur once with Tms=3*1000 due to the + * driver changing link and not processing its transmit ring. The + * problem is hard enough to solve by returning an error that I'm + * still punting on "solving" this. + */ static void skbfree(struct sk_buff *skb) { - enum { Sms = 100, Tms = 3*1000}; + enum { Sms = 250, Tms = 30 * 1000}; int i = Tms / Sms; if (skb == NULL) @@ -182,6 +411,7 @@ skbfree(struct sk_buff *skb) "cannot free skb -- memory leaked."); return; } + skb->truesize -= skb->data_len; skb_shinfo(skb)->nr_frags = skb->data_len = 0; skb_trim(skb, 0); dev_kfree_skb(skb); @@ -198,26 +428,43 @@ skbpoolfree(struct aoedev *d) __skb_queue_head_init(&d->skbpool); } -/* find it or malloc it */ +/* find it or allocate it */ struct aoedev * -aoedev_by_sysminor_m(ulong sysminor) +aoedev_by_aoeaddr(ulong maj, int min, int do_alloc) { struct aoedev *d; + int i; ulong flags; + ulong sysminor = 0; spin_lock_irqsave(&devlist_lock, flags); for (d=devlist; d; d=d->next) - if (d->sysminor == sysminor) + if (d->aoemajor == maj && d->aoeminor == min) { + spin_lock(&d->lock); + if (d->flags & DEVFL_TKILL) { + spin_unlock(&d->lock); + d = NULL; + goto out; + } + d->ref++; + spin_unlock(&d->lock); break; - if (d) + } + if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0) goto out; d = kcalloc(1, sizeof *d, GFP_ATOMIC); if (!d) goto out; + d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC); + if (!d->targets) { + kfree(d); + d = NULL; + goto out; + } + d->ntargets = NTARGETS; INIT_WORK(&d->work, aoecmd_sleepwork); spin_lock_init(&d->lock); - skb_queue_head_init(&d->sendq); skb_queue_head_init(&d->skbpool); init_timer(&d->timer); d->timer.data = (ulong) d; @@ -226,11 +473,15 @@ aoedev_by_sysminor_m(ulong sysminor) add_timer(&d->timer); d->bufpool = NULL; /* defer to aoeblk_gdalloc */ d->tgt = d->targets; - INIT_LIST_HEAD(&d->bufq); + d->ref = 1; + for (i = 0; i < NFACTIVE; i++) + INIT_LIST_HEAD(&d->factive[i]); + INIT_LIST_HEAD(&d->rexmitq); d->sysminor = sysminor; - d->aoemajor = AOEMAJOR(sysminor); - d->aoeminor = AOEMINOR(sysminor); - d->mintimer = MINTIMER; + d->aoemajor = maj; + d->aoeminor = min; + d->rttavg = RTTAVG_INIT; + d->rttdev = RTTDEV_INIT; d->next = devlist; devlist = d; out: @@ -241,33 +492,31 @@ aoedev_by_sysminor_m(ulong sysminor) static void freetgt(struct aoedev *d, struct aoetgt *t) { - struct frame *f, *e; + struct frame *f; + struct list_head *pos, *nx, *head; + struct aoeif *ifp; - f = t->frames; - e = f + t->nframes; - for (; f < e; f++) + for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) { + if (!ifp->nd) + break; + dev_put(ifp->nd); + } + + head = &t->ffree; + list_for_each_safe(pos, nx, head) { + list_del(pos); + f = list_entry(pos, struct frame, head); skbfree(f->skb); - kfree(t->frames); + kfree(f); + } kfree(t); } void aoedev_exit(void) { - struct aoedev *d; - ulong flags; - - while ((d = devlist)) { - devlist = d->next; - - spin_lock_irqsave(&d->lock, flags); - aoedev_downdev(d); - d->flags |= DEVFL_TKILL; - spin_unlock_irqrestore(&d->lock, flags); - - del_timer_sync(&d->timer); - aoedev_freedev(d); - } + flush_scheduled_work(); + flush(NULL, 0, EXITING); } int __init |
