aboutsummaryrefslogtreecommitdiff
path: root/drivers/block/aoe/aoecmd.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/aoe/aoecmd.c')
-rw-r--r--drivers/block/aoe/aoecmd.c1911
1 files changed, 1545 insertions, 366 deletions
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index b5be4b7d7b5..422b7d84f68 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1,75 +1,113 @@
-/* Copyright (c) 2004 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoecmd.c
* Filesystem request handling methods
*/
+#include <linux/ata.h>
+#include <linux/slab.h>
#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
+#include <linux/genhd.h>
+#include <linux/moduleparam.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <net/net_namespace.h>
+#include <asm/unaligned.h>
+#include <linux/uio.h>
#include "aoe.h"
-#define TIMERTICK (HZ / 10)
-#define MINTIMER (2 * TIMERTICK)
-#define MAXTIMER (HZ << 1)
-#define MAXWAIT (60 * 3) /* After MAXWAIT seconds, give up and fail dev */
+#define MAXIOC (8192) /* default meant to avoid most soft lockups */
+
+static void ktcomplete(struct frame *, struct sk_buff *);
+static int count_targets(struct aoedev *d, int *untainted);
+
+static struct buf *nextbuf(struct aoedev *);
+
+static int aoe_deadsecs = 60 * 3;
+module_param(aoe_deadsecs, int, 0644);
+MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
+
+static int aoe_maxout = 64;
+module_param(aoe_maxout, int, 0644);
+MODULE_PARM_DESC(aoe_maxout,
+ "Only aoe_maxout outstanding packets for every MAC on eX.Y.");
+
+/* The number of online cpus during module initialization gives us a
+ * convenient heuristic cap on the parallelism used for ktio threads
+ * doing I/O completion. It is not important that the cap equal the
+ * actual number of running CPUs at any given time, but because of CPU
+ * hotplug, we take care to use ncpus instead of using
+ * num_online_cpus() after module initialization.
+ */
+static int ncpus;
+
+/* mutex lock used for synchronization while thread spawning */
+static DEFINE_MUTEX(ktio_spawn_lock);
+
+static wait_queue_head_t *ktiowq;
+static struct ktstate *kts;
+
+/* io completion queue */
+struct iocq_ktio {
+ struct list_head head;
+ spinlock_t lock;
+};
+static struct iocq_ktio *iocq;
+
+static struct page *empty_page;
static struct sk_buff *
-new_skb(struct net_device *if_dev, ulong len)
+new_skb(ulong len)
{
struct sk_buff *skb;
- skb = alloc_skb(len, GFP_ATOMIC);
+ skb = alloc_skb(len + MAX_HEADER, GFP_ATOMIC);
if (skb) {
- skb->nh.raw = skb->mac.raw = skb->data;
- skb->dev = if_dev;
+ skb_reserve(skb, MAX_HEADER);
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
skb->protocol = __constant_htons(ETH_P_AOE);
- skb->priority = 0;
- skb_put(skb, len);
- skb->next = skb->prev = NULL;
-
- /* tell the network layer not to perform IP checksums
- * or to get the NIC to do it
- */
- skb->ip_summed = CHECKSUM_NONE;
+ skb_checksum_none_assert(skb);
}
return skb;
}
-static struct sk_buff *
-skb_prepare(struct aoedev *d, struct frame *f)
+static struct frame *
+getframe_deferred(struct aoedev *d, u32 tag)
{
- struct sk_buff *skb;
- char *p;
-
- skb = new_skb(d->ifp, f->ndata + f->writedatalen);
- if (!skb) {
- printk(KERN_INFO "aoe: skb_prepare: failure to allocate skb\n");
- return NULL;
- }
-
- p = skb->mac.raw;
- memcpy(p, f->data, f->ndata);
+ struct list_head *head, *pos, *nx;
+ struct frame *f;
- if (f->writedatalen) {
- p += sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
- memcpy(p, f->bufaddr, f->writedatalen);
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (f->tag == tag) {
+ list_del(pos);
+ return f;
+ }
}
-
- return skb;
+ return NULL;
}
static struct frame *
-getframe(struct aoedev *d, int tag)
+getframe(struct aoedev *d, u32 tag)
{
- struct frame *f, *e;
+ struct frame *f;
+ struct list_head *head, *pos, *nx;
+ u32 n;
- f = d->frames;
- e = f + d->nframes;
- for (; f<e; f++)
- if (f->tag == tag)
+ n = tag % NFACTIVE;
+ head = &d->factive[n];
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (f->tag == tag) {
+ list_del(pos);
return f;
+ }
+ }
return NULL;
}
@@ -87,13 +125,13 @@ newtag(struct aoedev *d)
return n |= (++d->lasttag & 0x7fff) << 16;
}
-static int
-aoehdr_atainit(struct aoedev *d, struct aoe_hdr *h)
+static u32
+aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
{
u32 host_tag = newtag(d);
- memcpy(h->src, d->ifp->dev_addr, sizeof h->src);
- memcpy(h->dst, d->addr, sizeof h->dst);
+ memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
+ memcpy(h->dst, t->addr, sizeof h->dst);
h->type = __constant_cpu_to_be16(ETH_P_AOE);
h->verfl = AOE_HVER;
h->major = cpu_to_be16(d->aoemajor);
@@ -104,144 +142,394 @@ aoehdr_atainit(struct aoedev *d, struct aoe_hdr *h)
return host_tag;
}
+static inline void
+put_lba(struct aoe_atahdr *ah, sector_t lba)
+{
+ ah->lba0 = lba;
+ ah->lba1 = lba >>= 8;
+ ah->lba2 = lba >>= 8;
+ ah->lba3 = lba >>= 8;
+ ah->lba4 = lba >>= 8;
+ ah->lba5 = lba >>= 8;
+}
+
+static struct aoeif *
+ifrotate(struct aoetgt *t)
+{
+ struct aoeif *ifp;
+
+ ifp = t->ifp;
+ ifp++;
+ if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL)
+ ifp = t->ifs;
+ if (ifp->nd == NULL)
+ return NULL;
+ return t->ifp = ifp;
+}
+
static void
-aoecmd_ata_rw(struct aoedev *d, struct frame *f)
+skb_pool_put(struct aoedev *d, struct sk_buff *skb)
+{
+ __skb_queue_tail(&d->skbpool, skb);
+}
+
+static struct sk_buff *
+skb_pool_get(struct aoedev *d)
+{
+ struct sk_buff *skb = skb_peek(&d->skbpool);
+
+ if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) {
+ __skb_unlink(skb, &d->skbpool);
+ return skb;
+ }
+ if (skb_queue_len(&d->skbpool) < NSKBPOOLMAX &&
+ (skb = new_skb(ETH_ZLEN)))
+ return skb;
+
+ return NULL;
+}
+
+void
+aoe_freetframe(struct frame *f)
+{
+ struct aoetgt *t;
+
+ t = f->t;
+ f->buf = NULL;
+ memset(&f->iter, 0, sizeof(f->iter));
+ f->r_skb = NULL;
+ f->flags = 0;
+ list_add(&f->head, &t->ffree);
+}
+
+static struct frame *
+newtframe(struct aoedev *d, struct aoetgt *t)
{
+ struct frame *f;
+ struct sk_buff *skb;
+ struct list_head *pos;
+
+ if (list_empty(&t->ffree)) {
+ if (t->falloc >= NSKBPOOLMAX*2)
+ return NULL;
+ f = kcalloc(1, sizeof(*f), GFP_ATOMIC);
+ if (f == NULL)
+ return NULL;
+ t->falloc++;
+ f->t = t;
+ } else {
+ pos = t->ffree.next;
+ list_del(pos);
+ f = list_entry(pos, struct frame, head);
+ }
+
+ skb = f->skb;
+ if (skb == NULL) {
+ f->skb = skb = new_skb(ETH_ZLEN);
+ if (!skb) {
+bail: aoe_freetframe(f);
+ return NULL;
+ }
+ }
+
+ if (atomic_read(&skb_shinfo(skb)->dataref) != 1) {
+ skb = skb_pool_get(d);
+ if (skb == NULL)
+ goto bail;
+ skb_pool_put(d, f->skb);
+ f->skb = skb;
+ }
+
+ skb->truesize -= skb->data_len;
+ skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+ skb_trim(skb, 0);
+ return f;
+}
+
+static struct frame *
+newframe(struct aoedev *d)
+{
+ struct frame *f;
+ struct aoetgt *t, **tt;
+ int totout = 0;
+ int use_tainted;
+ int has_untainted;
+
+ if (!d->targets || !d->targets[0]) {
+ printk(KERN_ERR "aoe: NULL TARGETS!\n");
+ return NULL;
+ }
+ tt = d->tgt; /* last used target */
+ for (use_tainted = 0, has_untainted = 0;;) {
+ tt++;
+ if (tt >= &d->targets[d->ntargets] || !*tt)
+ tt = d->targets;
+ t = *tt;
+ if (!t->taint) {
+ has_untainted = 1;
+ totout += t->nout;
+ }
+ if (t->nout < t->maxout
+ && (use_tainted || !t->taint)
+ && t->ifp->nd) {
+ f = newtframe(d, t);
+ if (f) {
+ ifrotate(t);
+ d->tgt = tt;
+ return f;
+ }
+ }
+ if (tt == d->tgt) { /* we've looped and found nada */
+ if (!use_tainted && !has_untainted)
+ use_tainted = 1;
+ else
+ break;
+ }
+ }
+ if (totout == 0) {
+ d->kicked++;
+ d->flags |= DEVFL_KICKME;
+ }
+ return NULL;
+}
+
+static void
+skb_fillup(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter)
+{
+ int frag = 0;
+ struct bio_vec bv;
+
+ __bio_for_each_segment(bv, bio, iter, iter)
+ skb_fill_page_desc(skb, frag++, bv.bv_page,
+ bv.bv_offset, bv.bv_len);
+}
+
+static void
+fhash(struct frame *f)
+{
+ struct aoedev *d = f->t->d;
+ u32 n;
+
+ n = f->tag % NFACTIVE;
+ list_add_tail(&f->head, &d->factive[n]);
+}
+
+static void
+ata_rw_frameinit(struct frame *f)
+{
+ struct aoetgt *t;
struct aoe_hdr *h;
struct aoe_atahdr *ah;
- struct buf *buf;
struct sk_buff *skb;
- ulong bcnt;
- register sector_t sector;
char writebit, extbit;
+ skb = f->skb;
+ h = (struct aoe_hdr *) skb_mac_header(skb);
+ ah = (struct aoe_atahdr *) (h + 1);
+ skb_put(skb, sizeof(*h) + sizeof(*ah));
+ memset(h, 0, skb->len);
+
writebit = 0x10;
extbit = 0x4;
- buf = d->inprocess;
-
- sector = buf->sector;
- bcnt = buf->bv_resid;
- if (bcnt > MAXATADATA)
- bcnt = MAXATADATA;
-
- /* initialize the headers & frame */
- h = (struct aoe_hdr *) f->data;
- ah = (struct aoe_atahdr *) (h+1);
- f->ndata = sizeof *h + sizeof *ah;
- memset(h, 0, f->ndata);
- f->tag = aoehdr_atainit(d, h);
+ t = f->t;
+ f->tag = aoehdr_atainit(t->d, t, h);
+ fhash(f);
+ t->nout++;
f->waited = 0;
- f->buf = buf;
- f->bufaddr = buf->bufaddr;
+ f->waited_total = 0;
/* set up ata header */
- ah->scnt = bcnt >> 9;
- ah->lba0 = sector;
- ah->lba1 = sector >>= 8;
- ah->lba2 = sector >>= 8;
- ah->lba3 = sector >>= 8;
- if (d->flags & DEVFL_EXT) {
+ ah->scnt = f->iter.bi_size >> 9;
+ put_lba(ah, f->iter.bi_sector);
+ if (t->d->flags & DEVFL_EXT) {
ah->aflags |= AOEAFL_EXT;
- ah->lba4 = sector >>= 8;
- ah->lba5 = sector >>= 8;
} else {
extbit = 0;
ah->lba3 &= 0x0f;
ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */
}
-
- if (bio_data_dir(buf->bio) == WRITE) {
+ if (f->buf && bio_data_dir(f->buf->bio) == WRITE) {
+ skb_fillup(skb, f->buf->bio, f->iter);
ah->aflags |= AOEAFL_WRITE;
- f->writedatalen = bcnt;
+ skb->len += f->iter.bi_size;
+ skb->data_len = f->iter.bi_size;
+ skb->truesize += f->iter.bi_size;
+ t->wpkts++;
} else {
+ t->rpkts++;
writebit = 0;
- f->writedatalen = 0;
}
- ah->cmdstat = WIN_READ | writebit | extbit;
+ ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;
+ skb->dev = t->ifp->nd;
+}
+
+static int
+aoecmd_ata_rw(struct aoedev *d)
+{
+ struct frame *f;
+ struct buf *buf;
+ struct sk_buff *skb;
+ struct sk_buff_head queue;
+
+ buf = nextbuf(d);
+ if (buf == NULL)
+ return 0;
+ f = newframe(d);
+ if (f == NULL)
+ return 0;
+
+ /* initialize the headers & frame */
+ f->buf = buf;
+ f->iter = buf->iter;
+ f->iter.bi_size = min_t(unsigned long,
+ d->maxbcnt ?: DEFAULTBCNT,
+ f->iter.bi_size);
+ bio_advance_iter(buf->bio, &buf->iter, f->iter.bi_size);
+
+ if (!buf->iter.bi_size)
+ d->ip.buf = NULL;
/* mark all tracking fields and load out */
buf->nframesout += 1;
- buf->bufaddr += bcnt;
- buf->bv_resid -= bcnt;
-/* printk(KERN_INFO "aoe: bv_resid=%ld\n", buf->bv_resid); */
- buf->resid -= bcnt;
- buf->sector += bcnt >> 9;
- if (buf->resid == 0) {
- d->inprocess = NULL;
- } else if (buf->bv_resid == 0) {
- buf->bv++;
- buf->bv_resid = buf->bv->bv_len;
- buf->bufaddr = page_address(buf->bv->bv_page) + buf->bv->bv_offset;
- }
-
- skb = skb_prepare(d, f);
+
+ ata_rw_frameinit(f);
+
+ skb = skb_clone(f->skb, GFP_ATOMIC);
if (skb) {
- skb->next = NULL;
- if (d->sendq_hd)
- d->sendq_tl->next = skb;
- else
- d->sendq_hd = skb;
- d->sendq_tl = skb;
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, skb);
+ aoenet_xmit(&queue);
}
+ return 1;
}
-/* enters with d->lock held */
-void
-aoecmd_work(struct aoedev *d)
+/* some callers cannot sleep, and they can call this function,
+ * transmitting the packets later, when interrupts are on
+ */
+static void
+aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *queue)
{
- struct frame *f;
- struct buf *buf;
-loop:
- f = getframe(d, FREETAG);
- if (f == NULL)
- return;
- if (d->inprocess == NULL) {
- if (list_empty(&d->bufq))
- return;
- buf = container_of(d->bufq.next, struct buf, bufs);
- list_del(d->bufq.next);
-/*printk(KERN_INFO "aoecmd_work: bi_size=%ld\n", buf->bio->bi_size); */
- d->inprocess = buf;
+ struct aoe_hdr *h;
+ struct aoe_cfghdr *ch;
+ struct sk_buff *skb;
+ struct net_device *ifp;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, ifp) {
+ dev_hold(ifp);
+ if (!is_aoe_netif(ifp))
+ goto cont;
+
+ skb = new_skb(sizeof *h + sizeof *ch);
+ if (skb == NULL) {
+ printk(KERN_INFO "aoe: skb alloc failure\n");
+ goto cont;
+ }
+ skb_put(skb, sizeof *h + sizeof *ch);
+ skb->dev = ifp;
+ __skb_queue_tail(queue, skb);
+ h = (struct aoe_hdr *) skb_mac_header(skb);
+ memset(h, 0, sizeof *h + sizeof *ch);
+
+ memset(h->dst, 0xff, sizeof h->dst);
+ memcpy(h->src, ifp->dev_addr, sizeof h->src);
+ h->type = __constant_cpu_to_be16(ETH_P_AOE);
+ h->verfl = AOE_HVER;
+ h->major = cpu_to_be16(aoemajor);
+ h->minor = aoeminor;
+ h->cmd = AOECMD_CFG;
+
+cont:
+ dev_put(ifp);
}
- aoecmd_ata_rw(d, f);
- goto loop;
+ rcu_read_unlock();
}
static void
-rexmit(struct aoedev *d, struct frame *f)
+resend(struct aoedev *d, struct frame *f)
{
struct sk_buff *skb;
+ struct sk_buff_head queue;
struct aoe_hdr *h;
+ struct aoetgt *t;
char buf[128];
u32 n;
+ t = f->t;
n = newtag(d);
+ skb = f->skb;
+ if (ifrotate(t) == NULL) {
+ /* probably can't happen, but set it up to fail anyway */
+ pr_info("aoe: resend: no interfaces to rotate to.\n");
+ ktcomplete(f, NULL);
+ return;
+ }
+ h = (struct aoe_hdr *) skb_mac_header(skb);
+
+ if (!(f->flags & FFL_PROBE)) {
+ snprintf(buf, sizeof(buf),
+ "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
+ "retransmit", d->aoemajor, d->aoeminor,
+ f->tag, jiffies, n,
+ h->src, h->dst, t->nout);
+ aoechr_error(buf);
+ }
- snprintf(buf, sizeof buf,
- "%15s e%ld.%ld oldtag=%08x@%08lx newtag=%08x\n",
- "retransmit",
- d->aoemajor, d->aoeminor, f->tag, jiffies, n);
- aoechr_error(buf);
-
- h = (struct aoe_hdr *) f->data;
f->tag = n;
+ fhash(f);
h->tag = cpu_to_be32(n);
+ memcpy(h->dst, t->addr, sizeof h->dst);
+ memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
- skb = skb_prepare(d, f);
- if (skb) {
- skb->next = NULL;
- if (d->sendq_hd)
- d->sendq_tl->next = skb;
- else
- d->sendq_hd = skb;
- d->sendq_tl = skb;
+ skb->dev = t->ifp->nd;
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, skb);
+ aoenet_xmit(&queue);
+}
+
+static int
+tsince_hr(struct frame *f)
+{
+ struct timeval now;
+ int n;
+
+ do_gettimeofday(&now);
+ n = now.tv_usec - f->sent.tv_usec;
+ n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
+
+ if (n < 0)
+ n = -n;
+
+ /* For relatively long periods, use jiffies to avoid
+ * discrepancies caused by updates to the system time.
+ *
+ * On system with HZ of 1000, 32-bits is over 49 days
+ * worth of jiffies, or over 71 minutes worth of usecs.
+ *
+ * Jiffies overflow is handled by subtraction of unsigned ints:
+ * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
+ * $3 = 4
+ * (gdb)
+ */
+ if (n > USEC_PER_SEC / 4) {
+ n = ((u32) jiffies) - f->sent_jiffs;
+ n *= USEC_PER_SEC / HZ;
}
+
+ return n;
}
static int
-tsince(int tag)
+tsince(u32 tag)
{
int n;
@@ -249,78 +537,473 @@ tsince(int tag)
n -= tag & 0xffff;
if (n < 0)
n += 1<<16;
- return n;
+ return jiffies_to_usecs(n + 1);
+}
+
+static struct aoeif *
+getif(struct aoetgt *t, struct net_device *nd)
+{
+ struct aoeif *p, *e;
+
+ p = t->ifs;
+ e = p + NAOEIFS;
+ for (; p < e; p++)
+ if (p->nd == nd)
+ return p;
+ return NULL;
+}
+
+static void
+ejectif(struct aoetgt *t, struct aoeif *ifp)
+{
+ struct aoeif *e;
+ struct net_device *nd;
+ ulong n;
+
+ nd = ifp->nd;
+ e = t->ifs + NAOEIFS - 1;
+ n = (e - ifp) * sizeof *ifp;
+ memmove(ifp, ifp+1, n);
+ e->nd = NULL;
+ dev_put(nd);
+}
+
+static struct frame *
+reassign_frame(struct frame *f)
+{
+ struct frame *nf;
+ struct sk_buff *skb;
+
+ nf = newframe(f->t->d);
+ if (!nf)
+ return NULL;
+ if (nf->t == f->t) {
+ aoe_freetframe(nf);
+ return NULL;
+ }
+
+ skb = nf->skb;
+ nf->skb = f->skb;
+ nf->buf = f->buf;
+ nf->iter = f->iter;
+ nf->waited = 0;
+ nf->waited_total = f->waited_total;
+ nf->sent = f->sent;
+ nf->sent_jiffs = f->sent_jiffs;
+ f->skb = skb;
+
+ return nf;
+}
+
+static void
+probe(struct aoetgt *t)
+{
+ struct aoedev *d;
+ struct frame *f;
+ struct sk_buff *skb;
+ struct sk_buff_head queue;
+ size_t n, m;
+ int frag;
+
+ d = t->d;
+ f = newtframe(d, t);
+ if (!f) {
+ pr_err("%s %pm for e%ld.%d: %s\n",
+ "aoe: cannot probe remote address",
+ t->addr,
+ (long) d->aoemajor, d->aoeminor,
+ "no frame available");
+ return;
+ }
+ f->flags |= FFL_PROBE;
+ ifrotate(t);
+ f->iter.bi_size = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT;
+ ata_rw_frameinit(f);
+ skb = f->skb;
+ for (frag = 0, n = f->iter.bi_size; n > 0; ++frag, n -= m) {
+ if (n < PAGE_SIZE)
+ m = n;
+ else
+ m = PAGE_SIZE;
+ skb_fill_page_desc(skb, frag, empty_page, 0, m);
+ }
+ skb->len += f->iter.bi_size;
+ skb->data_len = f->iter.bi_size;
+ skb->truesize += f->iter.bi_size;
+
+ skb = skb_clone(f->skb, GFP_ATOMIC);
+ if (skb) {
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, skb);
+ aoenet_xmit(&queue);
+ }
+}
+
+static long
+rto(struct aoedev *d)
+{
+ long t;
+
+ t = 2 * d->rttavg >> RTTSCALE;
+ t += 8 * d->rttdev >> RTTDSCALE;
+ if (t == 0)
+ t = 1;
+
+ return t;
+}
+
+static void
+rexmit_deferred(struct aoedev *d)
+{
+ struct aoetgt *t;
+ struct frame *f;
+ struct frame *nf;
+ struct list_head *pos, *nx, *head;
+ int since;
+ int untainted;
+
+ count_targets(d, &untainted);
+
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ t = f->t;
+ if (t->taint) {
+ if (!(f->flags & FFL_PROBE)) {
+ nf = reassign_frame(f);
+ if (nf) {
+ if (t->nout_probes == 0
+ && untainted > 0) {
+ probe(t);
+ t->nout_probes++;
+ }
+ list_replace(&f->head, &nf->head);
+ pos = &nf->head;
+ aoe_freetframe(f);
+ f = nf;
+ t = f->t;
+ }
+ } else if (untainted < 1) {
+ /* don't probe w/o other untainted aoetgts */
+ goto stop_probe;
+ } else if (tsince_hr(f) < t->taint * rto(d)) {
+ /* reprobe slowly when taint is high */
+ continue;
+ }
+ } else if (f->flags & FFL_PROBE) {
+stop_probe: /* don't probe untainted aoetgts */
+ list_del(pos);
+ aoe_freetframe(f);
+ /* leaving d->kicked, because this is routine */
+ f->t->d->flags |= DEVFL_KICKME;
+ continue;
+ }
+ if (t->nout >= t->maxout)
+ continue;
+ list_del(pos);
+ t->nout++;
+ if (f->flags & FFL_PROBE)
+ t->nout_probes++;
+ since = tsince_hr(f);
+ f->waited += since;
+ f->waited_total += since;
+ resend(d, f);
+ }
+}
+
+/* An aoetgt accumulates demerits quickly, and successful
+ * probing redeems the aoetgt slowly.
+ */
+static void
+scorn(struct aoetgt *t)
+{
+ int n;
+
+ n = t->taint++;
+ t->taint += t->taint * 2;
+ if (n > t->taint)
+ t->taint = n;
+ if (t->taint > MAX_TAINT)
+ t->taint = MAX_TAINT;
+}
+
+static int
+count_targets(struct aoedev *d, int *untainted)
+{
+ int i, good;
+
+ for (i = good = 0; i < d->ntargets && d->targets[i]; ++i)
+ if (d->targets[i]->taint == 0)
+ good++;
+
+ if (untainted)
+ *untainted = good;
+ return i;
}
static void
rexmit_timer(ulong vp)
{
struct aoedev *d;
- struct frame *f, *e;
- struct sk_buff *sl;
+ struct aoetgt *t;
+ struct aoeif *ifp;
+ struct frame *f;
+ struct list_head *head, *pos, *nx;
+ LIST_HEAD(flist);
register long timeout;
ulong flags, n;
+ int i;
+ int utgts; /* number of aoetgt descriptors (not slots) */
+ int since;
d = (struct aoedev *) vp;
- sl = NULL;
-
- /* timeout is always ~150% of the moving average */
- timeout = d->rttavg;
- timeout += timeout >> 1;
spin_lock_irqsave(&d->lock, flags);
+ /* timeout based on observed timings and variations */
+ timeout = rto(d);
+
+ utgts = count_targets(d, NULL);
+
if (d->flags & DEVFL_TKILL) {
-tdie: spin_unlock_irqrestore(&d->lock, flags);
+ spin_unlock_irqrestore(&d->lock, flags);
return;
}
- f = d->frames;
- e = f + d->nframes;
- for (; f<e; f++) {
- if (f->tag != FREETAG && tsince(f->tag) >= timeout) {
- n = f->waited += timeout;
- n /= HZ;
- if (n > MAXWAIT) { /* waited too long. device failure. */
- aoedev_downdev(d);
- goto tdie;
+
+ /* collect all frames to rexmit into flist */
+ for (i = 0; i < NFACTIVE; i++) {
+ head = &d->factive[i];
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (tsince_hr(f) < timeout)
+ break; /* end of expired frames */
+ /* move to flist for later processing */
+ list_move_tail(pos, &flist);
+ }
+ }
+
+ /* process expired frames */
+ while (!list_empty(&flist)) {
+ pos = flist.next;
+ f = list_entry(pos, struct frame, head);
+ since = tsince_hr(f);
+ n = f->waited_total + since;
+ n /= USEC_PER_SEC;
+ if (aoe_deadsecs
+ && n > aoe_deadsecs
+ && !(f->flags & FFL_PROBE)) {
+ /* Waited too long. Device failure.
+ * Hang all frames on first hash bucket for downdev
+ * to clean up.
+ */
+ list_splice(&flist, &d->factive[0]);
+ aoedev_downdev(d);
+ goto out;
+ }
+
+ t = f->t;
+ n = f->waited + since;
+ n /= USEC_PER_SEC;
+ if (aoe_deadsecs && utgts > 0
+ && (n > aoe_deadsecs / utgts || n > HARD_SCORN_SECS))
+ scorn(t); /* avoid this target */
+
+ if (t->maxout != 1) {
+ t->ssthresh = t->maxout / 2;
+ t->maxout = 1;
+ }
+
+ if (f->flags & FFL_PROBE) {
+ t->nout_probes--;
+ } else {
+ ifp = getif(t, f->skb->dev);
+ if (ifp && ++ifp->lost > (t->nframes << 1)
+ && (ifp != t->ifs || t->ifs[1].nd)) {
+ ejectif(t, ifp);
+ ifp = NULL;
}
- rexmit(d, f);
}
+ list_move_tail(pos, &d->rexmitq);
+ t->nout--;
}
+ rexmit_deferred(d);
- sl = d->sendq_hd;
- d->sendq_hd = d->sendq_tl = NULL;
- if (sl) {
- n = d->rttavg <<= 1;
- if (n > MAXTIMER)
- d->rttavg = MAXTIMER;
+out:
+ if ((d->flags & DEVFL_KICKME) && d->blkq) {
+ d->flags &= ~DEVFL_KICKME;
+ d->blkq->request_fn(d->blkq);
}
d->timer.expires = jiffies + TIMERTICK;
add_timer(&d->timer);
spin_unlock_irqrestore(&d->lock, flags);
+}
+
+static unsigned long
+rqbiocnt(struct request *r)
+{
+ struct bio *bio;
+ unsigned long n = 0;
+
+ __rq_for_each_bio(bio, r)
+ n++;
+ return n;
+}
+
+/* This can be removed if we are certain that no users of the block
+ * layer will ever use zero-count pages in bios. Otherwise we have to
+ * protect against the put_page sometimes done by the network layer.
+ *
+ * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
+ * discussion.
+ *
+ * We cannot use get_page in the workaround, because it insists on a
+ * positive page count as a precondition. So we use _count directly.
+ */
+static void
+bio_pageinc(struct bio *bio)
+{
+ struct bio_vec bv;
+ struct page *page;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(bv, bio, iter) {
+ /* Non-zero page count for non-head members of
+ * compound pages is no longer allowed by the kernel.
+ */
+ page = compound_head(bv.bv_page);
+ atomic_inc(&page->_count);
+ }
+}
+
+static void
+bio_pagedec(struct bio *bio)
+{
+ struct page *page;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(bv, bio, iter) {
+ page = compound_head(bv.bv_page);
+ atomic_dec(&page->_count);
+ }
+}
+
+static void
+bufinit(struct buf *buf, struct request *rq, struct bio *bio)
+{
+ memset(buf, 0, sizeof(*buf));
+ buf->rq = rq;
+ buf->bio = bio;
+ buf->iter = bio->bi_iter;
+ bio_pageinc(bio);
+}
+
+static struct buf *
+nextbuf(struct aoedev *d)
+{
+ struct request *rq;
+ struct request_queue *q;
+ struct buf *buf;
+ struct bio *bio;
+
+ q = d->blkq;
+ if (q == NULL)
+ return NULL; /* initializing */
+ if (d->ip.buf)
+ return d->ip.buf;
+ rq = d->ip.rq;
+ if (rq == NULL) {
+ rq = blk_peek_request(q);
+ if (rq == NULL)
+ return NULL;
+ blk_start_request(rq);
+ d->ip.rq = rq;
+ d->ip.nxbio = rq->bio;
+ rq->special = (void *) rqbiocnt(rq);
+ }
+ buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
+ if (buf == NULL) {
+ pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
+ return NULL;
+ }
+ bio = d->ip.nxbio;
+ bufinit(buf, rq, bio);
+ bio = bio->bi_next;
+ d->ip.nxbio = bio;
+ if (bio == NULL)
+ d->ip.rq = NULL;
+ return d->ip.buf = buf;
+}
+
+/* enters with d->lock held */
+void
+aoecmd_work(struct aoedev *d)
+{
+ rexmit_deferred(d);
+ while (aoecmd_ata_rw(d))
+ ;
+}
+
+/* this function performs work that has been deferred until sleeping is OK
+ */
+void
+aoecmd_sleepwork(struct work_struct *work)
+{
+ struct aoedev *d = container_of(work, struct aoedev, work);
+ struct block_device *bd;
+ u64 ssize;
- aoenet_xmit(sl);
+ if (d->flags & DEVFL_GDALLOC)
+ aoeblk_gdalloc(d);
+
+ if (d->flags & DEVFL_NEWSIZE) {
+ ssize = get_capacity(d->gd);
+ bd = bdget_disk(d->gd, 0);
+ if (bd) {
+ mutex_lock(&bd->bd_inode->i_mutex);
+ i_size_write(bd->bd_inode, (loff_t)ssize<<9);
+ mutex_unlock(&bd->bd_inode->i_mutex);
+ bdput(bd);
+ }
+ spin_lock_irq(&d->lock);
+ d->flags |= DEVFL_UP;
+ d->flags &= ~DEVFL_NEWSIZE;
+ spin_unlock_irq(&d->lock);
+ }
}
static void
-ataid_complete(struct aoedev *d, unsigned char *id)
+ata_ident_fixstring(u16 *id, int ns)
+{
+ u16 s;
+
+ while (ns-- > 0) {
+ s = *id;
+ *id++ = s >> 8 | s << 8;
+ }
+}
+
+static void
+ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
{
u64 ssize;
u16 n;
/* word 83: command set supported */
- n = le16_to_cpup((__le16 *) &id[83<<1]);
+ n = get_unaligned_le16(&id[83 << 1]);
/* word 86: command set/feature enabled */
- n |= le16_to_cpup((__le16 *) &id[86<<1]);
+ n |= get_unaligned_le16(&id[86 << 1]);
if (n & (1<<10)) { /* bit 10: LBA 48 */
d->flags |= DEVFL_EXT;
/* word 100: number lba48 sectors */
- ssize = le64_to_cpup((__le64 *) &id[100<<1]);
+ ssize = get_unaligned_le64(&id[100 << 1]);
/* set as in ide-disk.c:init_idedisk_capacity */
d->geo.cylinders = ssize;
@@ -331,317 +1014,813 @@ ataid_complete(struct aoedev *d, unsigned char *id)
d->flags &= ~DEVFL_EXT;
/* number lba28 sectors */
- ssize = le32_to_cpup((__le32 *) &id[60<<1]);
+ ssize = get_unaligned_le32(&id[60 << 1]);
/* NOTE: obsolete in ATA 6 */
- d->geo.cylinders = le16_to_cpup((__le16 *) &id[54<<1]);
- d->geo.heads = le16_to_cpup((__le16 *) &id[55<<1]);
- d->geo.sectors = le16_to_cpup((__le16 *) &id[56<<1]);
+ d->geo.cylinders = get_unaligned_le16(&id[54 << 1]);
+ d->geo.heads = get_unaligned_le16(&id[55 << 1]);
+ d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
}
+
+ ata_ident_fixstring((u16 *) &id[10<<1], 10); /* serial */
+ ata_ident_fixstring((u16 *) &id[23<<1], 4); /* firmware */
+ ata_ident_fixstring((u16 *) &id[27<<1], 20); /* model */
+ memcpy(d->ident, id, sizeof(d->ident));
+
+ if (d->ssize != ssize)
+ printk(KERN_INFO
+ "aoe: %pm e%ld.%d v%04x has %llu sectors\n",
+ t->addr,
+ d->aoemajor, d->aoeminor,
+ d->fw_ver, (long long)ssize);
d->ssize = ssize;
d->geo.start = 0;
- if (d->gd != NULL) {
- d->gd->capacity = ssize;
- d->flags |= DEVFL_UP;
- return;
- }
- if (d->flags & DEVFL_WORKON) {
- printk(KERN_INFO "aoe: ataid_complete: can't schedule work, it's already on! "
- "(This really shouldn't happen).\n");
+ if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
return;
- }
- INIT_WORK(&d->work, aoeblk_gdalloc, d);
+ if (d->gd != NULL) {
+ set_capacity(d->gd, ssize);
+ d->flags |= DEVFL_NEWSIZE;
+ } else
+ d->flags |= DEVFL_GDALLOC;
schedule_work(&d->work);
- d->flags |= DEVFL_WORKON;
}
static void
-calc_rttavg(struct aoedev *d, int rtt)
+calc_rttavg(struct aoedev *d, struct aoetgt *t, int rtt)
{
register long n;
n = rtt;
- if (n < MINTIMER)
- n = MINTIMER;
- else if (n > MAXTIMER)
- n = MAXTIMER;
- /* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */
- n -= d->rttavg;
- d->rttavg += n >> 2;
+ /* cf. Congestion Avoidance and Control, Jacobson & Karels, 1988
+  *
+  * d->rttavg and d->rttdev are kept scaled: the sample's error
+  * against (rttavg >> RTTSCALE) is added to rttavg, and then the
+  * magnitude of that error, less (rttdev >> RTTDSCALE), is added
+  * to rttdev.
+  */
+ n -= d->rttavg >> RTTSCALE;
+ d->rttavg += n;
+ if (n < 0)
+ n = -n;
+ n -= d->rttdev >> RTTDSCALE;
+ d->rttdev += n;
+
+ if (!t || t->maxout >= t->nframes) /* no target given, or window already at nframes */
+ return;
+ if (t->maxout < t->ssthresh) /* below ssthresh: widen the window on every sample */
+ t->maxout += 1;
+ else if (t->nout == t->maxout && t->next_cwnd-- == 0) { /* else widen only when the window is full and the countdown has expired */
+ t->maxout += 1;
+ t->next_cwnd = t->maxout; /* restart the countdown at the new window size */
+ }
}
-void
-aoecmd_ata_rsp(struct sk_buff *skb)
+static struct aoetgt *
+gettgt(struct aoedev *d, char *addr) /* return d's target whose addr bytes equal addr, or NULL */
{
- struct aoedev *d;
- struct aoe_hdr *hin;
- struct aoe_atahdr *ahin, *ahout;
- struct frame *f;
- struct buf *buf;
- struct sk_buff *sl;
- register long n;
- ulong flags;
- char ebuf[128];
- u16 aoemajor;
+ struct aoetgt **t, **e;
- hin = (struct aoe_hdr *) skb->mac.raw;
- aoemajor = be16_to_cpu(hin->major);
- d = aoedev_by_aoeaddr(aoemajor, hin->minor);
- if (d == NULL) {
- snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
- "for unknown device %d.%d\n",
- aoemajor, hin->minor);
- aoechr_error(ebuf);
- return;
- }
+ t = d->targets;
+ e = t + d->ntargets;
+ for (; t < e && *t; t++) /* stop at the end of the array or at the first empty slot */
+ if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
+ return *t;
+ return NULL; /* no populated slot matched */
+}
- spin_lock_irqsave(&d->lock, flags);
+static void
+bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt) /* copy skb payload into bio's segments, walking from iter */
+{
+ int soff = 0; /* running byte offset into skb */
+ struct bio_vec bv;
- f = getframe(d, be32_to_cpu(hin->tag));
- if (f == NULL) {
- spin_unlock_irqrestore(&d->lock, flags);
- snprintf(ebuf, sizeof ebuf,
- "%15s e%d.%d tag=%08x@%08lx\n",
- "unexpected rsp",
- be16_to_cpu(hin->major),
- hin->minor,
- be32_to_cpu(hin->tag),
- jiffies);
- aoechr_error(ebuf);
- return;
+ iter.bi_size = cnt; /* iter was passed by value, so this only changes the local copy used for the walk */
+
+ __bio_for_each_segment(bv, bio, iter, iter) {
+ char *p = page_address(bv.bv_page) + bv.bv_offset; /* destination address of this segment */
+ skb_copy_bits(skb, soff, p, bv.bv_len);
+ soff += bv.bv_len;
}
+}
- calc_rttavg(d, tsince(f->tag));
+/*
+ * Complete request rq one bio at a time.  Each bio is ended with 0
+ * when fastfail is clear and the bio still has BIO_UPTODATE set, and
+ * with -EIO otherwise.  The loop runs for as long as
+ * __blk_end_request() returns nonzero.  Unless fastfail is set, the
+ * device's queue is run afterwards.
+ */
+void
+aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
+{
+	struct request_queue *queue = d->blkq;
+	struct bio *cur;
+	int err;
+
+	/* rq is no longer the request in progress on this device */
+	if (d->ip.rq == rq)
+		d->ip.rq = NULL;
+
+	for (;;) {
+		cur = rq->bio;
+		if (!fastfail && test_bit(BIO_UPTODATE, &cur->bi_flags))
+			err = 0;
+		else
+			err = -EIO;
+		if (!__blk_end_request(rq, err, cur->bi_iter.bi_size))
+			break;
+	}
+
+	/* cf. http://lkml.org/lkml/2006/10/31/28 */
+	if (!fastfail)
+		__blk_run_queue(queue);
+}
- ahin = (struct aoe_atahdr *) (hin+1);
- ahout = (struct aoe_atahdr *) (f->data + sizeof(struct aoe_hdr));
- buf = f->buf;
+/*
+ * Release buf back to d->bufpool and decrement the counter kept in
+ * rq->special of its request.  When that counter reaches zero the
+ * request itself is ended via aoe_end_request().
+ */
+static void
+aoe_end_buf(struct aoedev *d, struct buf *buf)
+{
+	struct request *req = buf->rq;
+	unsigned long remaining;
+
+	if (d->ip.buf == buf)
+		d->ip.buf = NULL;
+	bio_pagedec(buf->bio);
+	mempool_free(buf, d->bufpool);
+
+	/* rq->special holds an integer count stored in a pointer */
+	remaining = (unsigned long) req->special - 1;
+	req->special = (void *) remaining;
+	if (remaining != 0)
+		return;
+	aoe_end_request(d, req, 0);
+}
+
+static void
+ktiocomplete(struct frame *f) /* process one response frame dequeued by ktio, then release the frame, the skb and a device reference */
+{
+ struct aoe_hdr *hin, *hout;
+ struct aoe_atahdr *ahin, *ahout;
+ struct buf *buf;
+ struct sk_buff *skb;
+ struct aoetgt *t;
+ struct aoeif *ifp;
+ struct aoedev *d;
+ long n;
+ int untainted;
+
+ if (f == NULL)
+ return;
+ t = f->t;
+ d = t->d;
+ skb = f->r_skb; /* the response skb stored by ktcomplete */
+ buf = f->buf;
+ if (f->flags & FFL_PROBE) /* probe frames skip response parsing entirely */
+ goto out;
+ if (!skb) /* just fail the buf. */
+ goto noskb;
+
+ hout = (struct aoe_hdr *) skb_mac_header(f->skb); /* headers of the request we sent */
+ ahout = (struct aoe_atahdr *) (hout+1);
+
+ hin = (struct aoe_hdr *) skb->data; /* headers of the response; pulled off so skb->data/len cover only the payload */
+ skb_pull(skb, sizeof(*hin));
+ ahin = (struct aoe_atahdr *) skb->data;
+ skb_pull(skb, sizeof(*ahin));
if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
- printk(KERN_CRIT "aoe: aoecmd_ata_rsp: ata error cmd=%2.2Xh "
- "stat=%2.2Xh from e%ld.%ld\n",
+ pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
ahout->cmdstat, ahin->cmdstat,
d->aoemajor, d->aoeminor);
- if (buf)
- buf->flags |= BUFFL_FAIL;
- } else {
- switch (ahout->cmdstat) {
- case WIN_READ:
- case WIN_READ_EXT:
- n = ahout->scnt << 9;
- if (skb->len - sizeof *hin - sizeof *ahin < n) {
- printk(KERN_CRIT "aoe: aoecmd_ata_rsp: runt "
- "ata data size in read. skb->len=%d\n",
- skb->len);
- /* fail frame f? just returning will rexmit. */
- spin_unlock_irqrestore(&d->lock, flags);
- return;
- }
- memcpy(f->bufaddr, ahin+1, n);
- case WIN_WRITE:
- case WIN_WRITE_EXT:
+noskb: if (buf)
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); /* mark the bio as failed */
+ goto out;
+ }
+
+ n = ahout->scnt << 9; /* requested sector count times 512 */
+ switch (ahout->cmdstat) {
+ case ATA_CMD_PIO_READ:
+ case ATA_CMD_PIO_READ_EXT:
+ if (skb->len < n) {
+ pr_err("%s e%ld.%d. skb->len=%d need=%ld\n",
+ "aoe: runt data size in read from",
+ (long) d->aoemajor, d->aoeminor,
+ skb->len, n);
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
break;
- case WIN_IDENTIFY:
- if (skb->len - sizeof *hin - sizeof *ahin < 512) {
- printk(KERN_INFO "aoe: aoecmd_ata_rsp: runt data size "
- "in ataid. skb->len=%d\n", skb->len);
- spin_unlock_irqrestore(&d->lock, flags);
- return;
- }
- ataid_complete(d, (char *) (ahin+1));
- /* d->flags |= DEVFL_WC_UPDATE; */
+ }
+ if (n > f->iter.bi_size) {
+ pr_err_ratelimited("%s e%ld.%d. bytes=%ld need=%u\n",
+ "aoe: too-large data size in read from",
+ (long) d->aoemajor, d->aoeminor,
+ n, f->iter.bi_size);
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+ break;
+ }
+ bvcpy(skb, f->buf->bio, f->iter, n); /* no break: falls through to the write cases to reset ifp->lost */
+ case ATA_CMD_PIO_WRITE:
+ case ATA_CMD_PIO_WRITE_EXT:
+ spin_lock_irq(&d->lock);
+ ifp = getif(t, skb->dev);
+ if (ifp)
+ ifp->lost = 0;
+ spin_unlock_irq(&d->lock);
+ break;
+ case ATA_CMD_ID_ATA:
+ if (skb->len < 512) {
+ pr_info("%s e%ld.%d. skb->len=%d need=512\n",
+ "aoe: runt data size in ataid from",
+ (long) d->aoemajor, d->aoeminor,
+ skb->len);
break;
- default:
- printk(KERN_INFO "aoe: aoecmd_ata_rsp: unrecognized "
- "outbound ata command %2.2Xh for %d.%d\n",
- ahout->cmdstat,
- be16_to_cpu(hin->major),
- hin->minor);
}
+ if (skb_linearize(skb)) /* skb->data is handed to ataid_complete as one flat buffer */
+ break;
+ spin_lock_irq(&d->lock);
+ ataid_complete(d, t, skb->data);
+ spin_unlock_irq(&d->lock);
+ break;
+ default:
+ pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n",
+ ahout->cmdstat,
+ be16_to_cpu(get_unaligned(&hin->major)),
+ hin->minor);
}
-
- if (buf) {
- buf->nframesout -= 1;
- if (buf->nframesout == 0 && buf->resid == 0) {
- unsigned long duration = jiffies - buf->start_time;
- unsigned long n_sect = buf->bio->bi_size >> 9;
- struct gendisk *disk = d->gd;
-
- if (bio_data_dir(buf->bio) == WRITE) {
- disk_stat_inc(disk, writes);
- disk_stat_add(disk, write_ticks, duration);
- disk_stat_add(disk, write_sectors, n_sect);
- } else {
- disk_stat_inc(disk, reads);
- disk_stat_add(disk, read_ticks, duration);
- disk_stat_add(disk, read_sectors, n_sect);
- }
- disk_stat_add(disk, io_ticks, duration);
- n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
- bio_endio(buf->bio, buf->bio->bi_size, n);
- mempool_free(buf, d->bufpool);
+out:
+ spin_lock_irq(&d->lock);
+ if (t->taint > 0
+ && --t->taint > 0
+ && t->nout_probes == 0) { /* target still tainted after the decrement and no probe outstanding */
+ count_targets(d, &untainted);
+ if (untainted > 0) {
+ probe(t);
+ t->nout_probes++;
}
}
- f->buf = NULL;
- f->tag = FREETAG;
+ aoe_freetframe(f);
- aoecmd_work(d);
+ if (buf && --buf->nframesout == 0 && buf->iter.bi_size == 0) /* this was buf's last outstanding frame and buf->iter has no bytes left */
+ aoe_end_buf(d, buf);
- sl = d->sendq_hd;
- d->sendq_hd = d->sendq_tl = NULL;
+ spin_unlock_irq(&d->lock);
+ aoedev_put(d); /* the reference aoecmd_ata_rsp left for ktio to release */
+ dev_kfree_skb(skb);
+}
- spin_unlock_irqrestore(&d->lock, flags);
+/* Enters with iocq.lock held.
+ * Returns true iff responses needing processing remain.
+ *
+ * Dequeues up to MAXIOC frames from iocq[id] and completes each one
+ * with the queue lock dropped.  The lock is held again on return.
+ * If a frame's device maps to a ktio thread that has not been
+ * started yet, that thread is spawned here under ktio_spawn_lock.
+ */
+static int
+ktio(int id)
+{
+	struct frame *f;
+	struct list_head *pos;
+	int i;
+	int actual_id;
+
+	for (i = 0; ; ++i) {
+		if (i == MAXIOC)
+			return 1;
+		if (list_empty(&iocq[id].head))
+			return 0;
+		pos = iocq[id].head.next;
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
+		spin_unlock_irq(&iocq[id].lock);
+
+		/* Work out which thread this device belongs to before
+		 * completing the frame: ktiocomplete() releases the
+		 * frame with aoe_freetframe() and drops the device
+		 * reference with aoedev_put(), so f, f->t and f->t->d
+		 * must not be dereferenced afterwards.
+		 */
+		actual_id = f->t->d->aoeminor % ncpus;
+		ktiocomplete(f);
+
+		/* Figure out if extra threads are required. */
+		if (!kts[actual_id].active) {
+			/* ktcomplete only redirects work to queue 0 */
+			BUG_ON(id != 0);
+			mutex_lock(&ktio_spawn_lock);
+			/* re-check under the mutex before spawning */
+			if (!kts[actual_id].active
+				&& aoe_ktstart(&kts[actual_id]) == 0)
+				kts[actual_id].active = 1;
+			mutex_unlock(&ktio_spawn_lock);
+		}
+		spin_lock_irq(&iocq[id].lock);
+	}
+}
- aoenet_xmit(sl);
+static int
+kthread(void *vp) /* thread body: call k->fn repeatedly, sleeping on k->waitq whenever it reports no more work */
+{
+ struct ktstate *k;
+ DECLARE_WAITQUEUE(wait, current);
+ int more;
+
+ k = vp;
+ current->flags |= PF_NOFREEZE;
+ set_user_nice(current, -10);
+ complete(&k->rendez); /* tell spawner we're running */
+ do {
+ spin_lock_irq(k->lock);
+ more = k->fn(k->id); /* k->fn is called with k->lock held */
+ if (!more) {
+ add_wait_queue(k->waitq, &wait); /* registered and marked sleeping while k->lock is still held */
+ __set_current_state(TASK_INTERRUPTIBLE);
+ }
+ spin_unlock_irq(k->lock);
+ if (!more) {
+ schedule();
+ remove_wait_queue(k->waitq, &wait);
+ } else
+ cond_resched(); /* more work pending: yield briefly, then go around again */
+ } while (!kthread_should_stop());
+ complete(&k->rendez); /* tell spawner we're stopping */
+ return 0;
}
void
-aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
+aoe_ktstop(struct ktstate *k)
+{
+ kthread_stop(k->task); /* ask the thread started by aoe_ktstart to exit */
+ wait_for_completion(&k->rendez); /* the thread completes k->rendez just before it returns */
+}
+
+/*
+ * Start the kernel thread described by k, named k->name, and block
+ * until it has signalled k->rendez.  On success the task is recorded
+ * in k->task and k->rendez is re-armed for aoe_ktstop().
+ * Returns 0 on success and -ENOMEM if the thread cannot be created.
+ */
+int
+aoe_ktstart(struct ktstate *k)
+{
+	struct task_struct *kt;
+
+	init_completion(&k->rendez);
+	kt = kthread_run(kthread, k, "%s", k->name);
+	if (!kt || IS_ERR(kt))
+		return -ENOMEM;
+
+	k->task = kt;
+	/* allow kthread to start */
+	wait_for_completion(&k->rendez);
+	/* for waiting for exit later */
+	init_completion(&k->rendez);
+	return 0;
+}
+
+/* pass it off to kthreads for processing
+ *
+ * Stores skb in f->r_skb, appends f to the completion queue chosen
+ * by the device's aoeminor modulo ncpus, and wakes that queue's
+ * waiters.  Queue 0 is used instead when the chosen thread is not
+ * active.
+ */
+static void
+ktcomplete(struct frame *f, struct sk_buff *skb)
{
+ int id;
+ ulong flags;
+
+ f->r_skb = skb;
+ id = f->t->d->aoeminor % ncpus;
+ spin_lock_irqsave(&iocq[id].lock, flags);
+ if (!kts[id].active) {
+ spin_unlock_irqrestore(&iocq[id].lock, flags);
+ /* The thread with id has not been spawned yet,
+ * so delegate the work to the main thread and
+ * try spawning a new thread.
+ */
+ id = 0;
+ spin_lock_irqsave(&iocq[id].lock, flags);
+ }
+ list_add_tail(&f->head, &iocq[id].head);
+ spin_unlock_irqrestore(&iocq[id].lock, flags);
+ wake_up(&ktiowq[id]);
+}
+
+struct sk_buff *
+aoecmd_ata_rsp(struct sk_buff *skb) /* returns skb when it was not queued for ktio, NULL when it was */
+{
+ struct aoedev *d;
struct aoe_hdr *h;
- struct aoe_cfghdr *ch;
- struct sk_buff *skb, *sl;
- struct net_device *ifp;
+ struct frame *f;
+ u32 n;
+ ulong flags;
+ char ebuf[128];
+ u16 aoemajor;
- sl = NULL;
+ h = (struct aoe_hdr *) skb->data;
+ aoemajor = be16_to_cpu(get_unaligned(&h->major));
+ d = aoedev_by_aoeaddr(aoemajor, h->minor, 0);
+ if (d == NULL) {
+ snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
+ "for unknown device %d.%d\n",
+ aoemajor, h->minor);
+ aoechr_error(ebuf);
+ return skb; /* hand the skb back to the caller */
+ }
- read_lock(&dev_base_lock);
- for (ifp = dev_base; ifp; dev_put(ifp), ifp = ifp->next) {
- dev_hold(ifp);
- if (!is_aoe_netif(ifp))
- continue;
+ spin_lock_irqsave(&d->lock, flags);
- skb = new_skb(ifp, sizeof *h + sizeof *ch);
- if (skb == NULL) {
- printk(KERN_INFO "aoe: aoecmd_cfg: skb alloc failure\n");
- continue;
+ n = be32_to_cpu(get_unaligned(&h->tag)); /* tag identifying the request this responds to */
+ f = getframe(d, n);
+ if (f) {
+ calc_rttavg(d, f->t, tsince_hr(f)); /* passes the target, so its window may grow */
+ f->t->nout--;
+ if (f->flags & FFL_PROBE)
+ f->t->nout_probes--;
+ } else {
+ f = getframe_deferred(d, n); /* second lookup when getframe() found nothing */
+ if (f) {
+ calc_rttavg(d, NULL, tsince_hr(f)); /* NULL target: only the device averages are updated */
+ } else {
+ calc_rttavg(d, NULL, tsince(n));
+ spin_unlock_irqrestore(&d->lock, flags);
+ aoedev_put(d); /* nothing is queued for ktio, so drop the reference here */
+ snprintf(ebuf, sizeof(ebuf),
+ "%15s e%d.%d tag=%08x@%08lx s=%pm d=%pm\n",
+ "unexpected rsp",
+ get_unaligned_be16(&h->major),
+ h->minor,
+ get_unaligned_be32(&h->tag),
+ jiffies,
+ h->src,
+ h->dst);
+ aoechr_error(ebuf);
+ return skb;
}
- h = (struct aoe_hdr *) skb->mac.raw;
- memset(h, 0, sizeof *h + sizeof *ch);
+ }
+ aoecmd_work(d);
- memset(h->dst, 0xff, sizeof h->dst);
- memcpy(h->src, ifp->dev_addr, sizeof h->src);
- h->type = __constant_cpu_to_be16(ETH_P_AOE);
- h->verfl = AOE_HVER;
- h->major = cpu_to_be16(aoemajor);
- h->minor = aoeminor;
- h->cmd = AOECMD_CFG;
+ spin_unlock_irqrestore(&d->lock, flags);
- skb->next = sl;
- sl = skb;
- }
- read_unlock(&dev_base_lock);
+ ktcomplete(f, skb); /* queue frame and skb for a ktio thread */
- aoenet_xmit(sl);
+ /*
+ * Note here that we do not perform an aoedev_put, as we are
+ * leaving this reference for the ktio to release.
+ */
+ return NULL;
}
-
-/*
- * Since we only call this in one place (and it only prepares one frame)
- * we just return the skb. Usually we'd chain it up to the aoedev sendq.
- */
-static struct sk_buff *
+
+void
+aoecmd_cfg(ushort aoemajor, unsigned char aoeminor) /* build config packets for the given address and transmit them */
+{
+ struct sk_buff_head queue;
+
+ __skb_queue_head_init(&queue);
+ aoecmd_cfg_pkts(aoemajor, aoeminor, &queue); /* defined outside this view; presumably fills queue */
+ aoenet_xmit(&queue);
+}
+
+struct sk_buff *
aoecmd_ata_id(struct aoedev *d)
{
struct aoe_hdr *h;
struct aoe_atahdr *ah;
struct frame *f;
struct sk_buff *skb;
+ struct aoetgt *t;
- f = getframe(d, FREETAG);
- if (f == NULL) {
- printk(KERN_CRIT "aoe: aoecmd_ata_id: can't get a frame. "
- "This shouldn't happen.\n");
+ f = newframe(d);
+ if (f == NULL)
return NULL;
- }
+
+ t = *d->tgt; /* the target d->tgt currently points at */
/* initialize the headers & frame */
- h = (struct aoe_hdr *) f->data;
+ skb = f->skb;
+ h = (struct aoe_hdr *) skb_mac_header(skb);
ah = (struct aoe_atahdr *) (h+1);
- f->ndata = sizeof *h + sizeof *ah;
- memset(h, 0, f->ndata);
- f->tag = aoehdr_atainit(d, h);
+ skb_put(skb, sizeof *h + sizeof *ah);
+ memset(h, 0, skb->len);
+ f->tag = aoehdr_atainit(d, t, h);
+ fhash(f);
+ t->nout++; /* one more frame outstanding on this target */
f->waited = 0;
- f->writedatalen = 0;
-
- /* this message initializes the device, so we reset the rttavg */
- d->rttavg = MAXTIMER;
+ f->waited_total = 0;
/* set up ata header */
ah->scnt = 1;
- ah->cmdstat = WIN_IDENTIFY;
+ ah->cmdstat = ATA_CMD_ID_ATA;
ah->lba3 = 0xa0;
- skb = skb_prepare(d, f);
+ skb->dev = t->ifp->nd;
- /* we now want to start the rexmit tracking */
- d->flags &= ~DEVFL_TKILL;
- d->timer.data = (ulong) d;
+ d->rttavg = RTTAVG_INIT; /* reset the round-trip estimators to their initial values */
+ d->rttdev = RTTDEV_INIT;
d->timer.function = rexmit_timer;
- d->timer.expires = jiffies + TIMERTICK;
- add_timer(&d->timer);
+
+ skb = skb_clone(skb, GFP_ATOMIC); /* the clone is returned; the frame keeps f->skb */
+ if (skb) {
+ do_gettimeofday(&f->sent); /* record the send time only when there is a clone to return */
+ f->sent_jiffs = (u32) jiffies;
+ }
return skb;
}
-
+
+/*
+ * Replace d->targets with a zero-filled array of twice the size,
+ * carrying over the existing entries and re-pointing d->tgt at the
+ * same index in the new array.  Returns the address of the first
+ * slot past the old entries, or NULL if the allocation fails, in
+ * which case d is left unchanged.
+ */
+static struct aoetgt **
+grow_targets(struct aoedev *d)
+{
+	ulong cur = d->ntargets;
+	ulong want = cur * 2;
+	struct aoetgt **grown;
+
+	grown = kcalloc(want, sizeof(*d->targets), GFP_ATOMIC);
+	if (grown == NULL)
+		return NULL;
+
+	memmove(grown, d->targets, sizeof(*d->targets) * cur);
+	/* keep d->tgt at the same offset within the new array */
+	d->tgt = &grown[d->tgt - d->targets];
+	kfree(d->targets);
+	d->ntargets = want;
+	d->targets = grown;
+
+	return grown + cur;
+}
+
+/*
+ * Allocate a target for address addr with nframes frames and store
+ * it in the first empty slot of d->targets, growing the array when
+ * every slot is taken.  Returns the new target, or NULL (after
+ * logging) when memory cannot be allocated.
+ */
+static struct aoetgt *
+addtgt(struct aoedev *d, char *addr, ulong nframes)
+{
+	struct aoetgt **slot = d->targets;
+	struct aoetgt **end = slot + d->ntargets;
+	struct aoetgt *tgt;
+
+	/* find the first unused slot */
+	while (slot < end && *slot != NULL)
+		slot++;
+
+	if (slot == end) {
+		slot = grow_targets(d);
+		if (slot == NULL)
+			goto nomem;
+	}
+
+	tgt = kzalloc(sizeof(*tgt), GFP_ATOMIC);
+	if (tgt == NULL)
+		goto nomem;
+
+	tgt->d = d;
+	tgt->nframes = nframes;
+	memcpy(tgt->addr, addr, sizeof tgt->addr);
+	tgt->ifp = tgt->ifs;
+	aoecmd_wreset(tgt);
+	/* overrides the maxout of 1 that aoecmd_wreset just set */
+	tgt->maxout = tgt->nframes / 2;
+	INIT_LIST_HEAD(&tgt->ffree);
+	*slot = tgt;
+	return tgt;
+
+ nomem:
+	pr_info("aoe: cannot allocate memory to add target\n");
+	return NULL;
+}
+
+/*
+ * Recompute d->maxbcnt as the smallest minbcnt among the populated
+ * targets of d (0 when there are none), logging when it changes.
+ */
+static void
+setdbcnt(struct aoedev *d)
+{
+	struct aoetgt **cur = d->targets;
+	struct aoetgt **end = cur + d->ntargets;
+	int lowest = 0;
+
+	while (cur < end && *cur != NULL) {
+		/* 0 means "nothing seen yet" */
+		if (lowest == 0 || (*cur)->minbcnt < lowest)
+			lowest = (*cur)->minbcnt;
+		cur++;
+	}
+
+	if (d->maxbcnt == lowest)
+		return;
+
+	d->maxbcnt = lowest;
+	pr_info("aoe: e%ld.%d: setting %d byte data frames\n",
+		d->aoemajor, d->aoeminor, lowest);
+}
+
+/*
+ * Record bcnt as the byte count for interface nd on target t.  An
+ * existing entry for nd is updated in place; otherwise nd is added
+ * in the first free slot with a held reference.  t->minbcnt becomes
+ * the minimum of bcnt and the byte counts of the other interfaces
+ * scanned, and the device-wide value is refreshed via setdbcnt().
+ * If nd is new and all NAOEIFS slots are in use, an error is logged
+ * and nothing else is changed.
+ */
+static void
+setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt)
+{
+	struct aoeif *slot = t->ifs;
+	struct aoeif *end = slot + NAOEIFS;
+	int lowest = bcnt;
+
+	/* walk the valid interfaces; the first NULL nd ends them */
+	while (slot < end && slot->nd != NULL) {
+		if (slot->nd == nd) {
+			/* we're updating */
+			slot->bcnt = bcnt;
+			nd = NULL;
+		} else if (slot->bcnt < lowest) {
+			/* find the min interface */
+			lowest = slot->bcnt;
+		}
+		slot++;
+	}
+
+	if (nd != NULL) {
+		if (slot == end) {
+			pr_err("aoe: device setifbcnt failure; too many interfaces.\n");
+			return;
+		}
+		dev_hold(nd);
+		slot->nd = nd;
+		slot->bcnt = bcnt;
+	}
+
+	t->minbcnt = lowest;
+	setdbcnt(t->d);
+}
+
void
aoecmd_cfg_rsp(struct sk_buff *skb)
{
struct aoedev *d;
struct aoe_hdr *h;
struct aoe_cfghdr *ch;
- ulong flags, sysminor, aoemajor;
- u16 bufcnt;
+ struct aoetgt *t;
+ ulong flags, aoemajor;
struct sk_buff *sl;
- enum { MAXFRAMES = 8 };
+ struct sk_buff_head queue;
+ u16 n;
- h = (struct aoe_hdr *) skb->mac.raw;
+ sl = NULL; /* stays NULL unless an identify command is built below */
+ h = (struct aoe_hdr *) skb_mac_header(skb);
ch = (struct aoe_cfghdr *) (h+1);
/*
* Enough people have their dip switches set backwards to
* warrant a loud message for this special case.
*/
- aoemajor = be16_to_cpu(h->major);
+ aoemajor = get_unaligned_be16(&h->major);
if (aoemajor == 0xfff) {
- printk(KERN_CRIT "aoe: aoecmd_cfg_rsp: Warning: shelf "
- "address is all ones. Check shelf dip switches\n");
+ printk(KERN_ERR "aoe: Warning: shelf address is all ones. "
+ "Check shelf dip switches.\n");
return;
}
-
- sysminor = SYSMINOR(aoemajor, h->minor);
- if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) {
- printk(KERN_INFO
- "aoe: e%ld.%d: minor number too large\n",
+ if (aoemajor == 0xffff) {
+ pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n",
+ aoemajor, (int) h->minor);
+ return;
+ }
+ if (h->minor == 0xff) {
+ pr_info("aoe: e%ld.%d: broadcast slot number invalid\n",
aoemajor, (int) h->minor);
return;
}
- bufcnt = be16_to_cpu(ch->bufcnt);
- if (bufcnt > MAXFRAMES) /* keep it reasonable */
- bufcnt = MAXFRAMES;
+ n = be16_to_cpu(ch->bufcnt);
+ if (n > aoe_maxout) /* keep it reasonable */
+ n = aoe_maxout;
- d = aoedev_set(sysminor, h->src, skb->dev, bufcnt);
+ d = aoedev_by_aoeaddr(aoemajor, h->minor, 1);
if (d == NULL) {
- printk(KERN_INFO "aoe: aoecmd_cfg_rsp: device set failure\n");
+ pr_info("aoe: device allocation failure\n");
return;
}
spin_lock_irqsave(&d->lock, flags);
- if (d->flags & (DEVFL_UP | DEVFL_CLOSEWAIT)) {
- spin_unlock_irqrestore(&d->lock, flags);
+ t = gettgt(d, h->src); /* look the responder up by its source address */
+ if (t) {
+ t->nframes = n;
+ if (n < t->maxout) /* the window exceeds the new frame count: reset it */
+ aoecmd_wreset(t);
+ } else {
+ t = addtgt(d, h->src, n);
+ if (!t)
+ goto bail;
+ }
+ n = skb->dev->mtu; /* n is reused from here on as a byte count */
+ n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
+ n /= 512; /* whole 512-byte units that fit after the two headers */
+ if (n > ch->scnt) /* cap at the count from the config header */
+ n = ch->scnt;
+ n = n ? n * 512 : DEFAULTBCNT; /* back to bytes, or the default when nothing fits */
+ setifbcnt(t, skb->dev, n);
+
+ /* don't change users' perspective */
+ if (d->nopen == 0) {
+ d->fw_ver = be16_to_cpu(ch->fwver);
+ sl = aoecmd_ata_id(d);
+ }
+bail:
+ spin_unlock_irqrestore(&d->lock, flags);
+ aoedev_put(d);
+ if (sl) { /* transmit the identify command after the lock has been dropped */
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, sl);
+ aoenet_xmit(&queue);
+ }
+}
+
+void
+aoecmd_wreset(struct aoetgt *t) /* reset t's window state; the new values depend only on t->nframes */
+{
+ t->maxout = 1; /* window restarts at a single outstanding frame */
+ t->ssthresh = t->nframes / 2;
+ t->next_cwnd = t->nframes;
+}
+
+/*
+ * Return d to its initial state: initial round-trip estimators,
+ * maxbcnt cleared, and the window of every populated target reset.
+ */
+void
+aoecmd_cleanslate(struct aoedev *d)
+{
+	struct aoetgt **cur;
+	struct aoetgt **end;
+
+	d->maxbcnt = 0;
+	d->rttavg = RTTAVG_INIT;
+	d->rttdev = RTTDEV_INIT;
+
+	cur = d->targets;
+	end = cur + d->ntargets;
+	/* populated slots are contiguous from the start of the array */
+	while (cur < end && *cur != NULL) {
+		aoecmd_wreset(*cur);
+		cur++;
+	}
+}
+
+void
+aoe_failbuf(struct aoedev *d, struct buf *buf) /* mark buf as failed; a NULL buf is ignored */
+{
+ if (buf == NULL)
return;
+ buf->iter.bi_size = 0; /* zero the remaining byte count */
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+ if (buf->nframesout == 0) /* end it now only when no frames are still outstanding */
+ aoe_end_buf(d, buf);
+}
+
+void
+aoe_flush_iocq(void) /* flush the completion queue of every active ktio slot */
+{
+ int i;
+
+ for (i = 0; i < ncpus; i++) {
+ if (kts[i].active) /* slots that were never activated are skipped */
+ aoe_flush_iocq_by_index(i);
}
+}
- d->fw_ver = be16_to_cpu(ch->fwver);
+void
+aoe_flush_iocq_by_index(int id) /* discard every frame queued on iocq[id], failing the bufs they belong to */
+{
+ struct frame *f;
+ struct aoedev *d;
+ LIST_HEAD(flist);
+ struct list_head *pos;
+ struct sk_buff *skb;
+ ulong flags;
- /* we get here only if the device is new */
- sl = aoecmd_ata_id(d);
+ spin_lock_irqsave(&iocq[id].lock, flags);
+ list_splice_init(&iocq[id].head, &flist); /* move the whole queue to a private list so it can be walked unlocked */
+ spin_unlock_irqrestore(&iocq[id].lock, flags);
+ while (!list_empty(&flist)) {
+ pos = flist.next;
+ list_del(pos);
+ f = list_entry(pos, struct frame, head);
+ d = f->t->d;
+ skb = f->r_skb;
+ spin_lock_irqsave(&d->lock, flags);
+ if (f->buf) {
+ f->buf->nframesout--; /* this frame no longer counts as outstanding */
+ aoe_failbuf(d, f->buf);
+ }
+ aoe_freetframe(f);
+ spin_unlock_irqrestore(&d->lock, flags);
+ dev_kfree_skb(skb);
+ aoedev_put(d); /* same per-frame release that ktiocomplete performs */
+ }
+}
- spin_unlock_irqrestore(&d->lock, flags);
+/*
+ * Module-init setup for I/O completion: allocates the shared zeroed
+ * page, sizes the per-thread arrays (iocq, kts, ktiowq) from the
+ * number of online CPUs, initialises every slot, and starts the
+ * first ktio thread (slot 0).  Other slots are left inactive.
+ *
+ * Returns 0 on success or -ENOMEM on failure.  On failure everything
+ * allocated here, including the zeroed page, is released again.
+ */
+int __init
+aoecmd_init(void)
+{
+	void *p;
+	int i;
+	int ret;
+
+	/* get_zeroed_page returns page with ref count 1 */
+	p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT);
+	if (!p)
+		return -ENOMEM;
+	empty_page = virt_to_page(p);
+
+	ncpus = num_online_cpus();
+
+	iocq = kcalloc(ncpus, sizeof(struct iocq_ktio), GFP_KERNEL);
+	if (!iocq) {
+		ret = -ENOMEM;
+		goto iocq_fail;
+	}
+
+	kts = kcalloc(ncpus, sizeof(struct ktstate), GFP_KERNEL);
+	if (!kts) {
+		ret = -ENOMEM;
+		goto kts_fail;
+	}
+
+	ktiowq = kcalloc(ncpus, sizeof(wait_queue_head_t), GFP_KERNEL);
+	if (!ktiowq) {
+		ret = -ENOMEM;
+		goto ktiowq_fail;
+	}
+
+	mutex_init(&ktio_spawn_lock);
+
+	for (i = 0; i < ncpus; i++) {
+		INIT_LIST_HEAD(&iocq[i].head);
+		spin_lock_init(&iocq[i].lock);
+		init_waitqueue_head(&ktiowq[i]);
+		snprintf(kts[i].name, sizeof(kts[i].name), "aoe_ktio%d", i);
+		kts[i].fn = ktio;
+		kts[i].waitq = &ktiowq[i];
+		kts[i].lock = &iocq[i].lock;
+		kts[i].id = i;
+		kts[i].active = 0;
+	}
+	/* slot 0 is the only thread started up front */
+	kts[0].active = 1;
+	if (aoe_ktstart(&kts[0])) {
+		ret = -ENOMEM;
+		goto ktstart_fail;
+	}
+	return 0;
- aoenet_xmit(sl);
+ktstart_fail:
+	kfree(ktiowq);
+ktiowq_fail:
+	kfree(kts);
+kts_fail:
+	kfree(iocq);
+iocq_fail:
+	/* release the zeroed page too, so a failed init leaks nothing */
+	free_page((unsigned long) p);
+	empty_page = NULL;
+
+	return ret;
}
+void
+aoecmd_exit(void) /* stop the ktio threads, flush their queues, and free what aoecmd_init allocated */
+{
+ int i;
+
+ for (i = 0; i < ncpus; i++)
+ if (kts[i].active)
+ aoe_ktstop(&kts[i]);
+
+ aoe_flush_iocq(); /* fail whatever is still queued now that no thread will process it */
+
+ /* Free up the iocq and thread-specific configuration
+ * allocated during startup.
+ */
+ kfree(iocq);
+ kfree(kts);
+ kfree(ktiowq);
+
+ free_page((unsigned long) page_address(empty_page));
+ empty_page = NULL;
+}