aboutsummaryrefslogtreecommitdiff
path: root/drivers/block/aoe
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block/aoe')
-rw-r--r--drivers/block/aoe/Makefile2
-rw-r--r--drivers/block/aoe/aoe.h145
-rw-r--r--drivers/block/aoe/aoeblk.c305
-rw-r--r--drivers/block/aoe/aoechr.c42
-rw-r--r--drivers/block/aoe/aoecmd.c1800
-rw-r--r--drivers/block/aoe/aoedev.c453
-rw-r--r--drivers/block/aoe/aoemain.c12
-rw-r--r--drivers/block/aoe/aoenet.c76
8 files changed, 2041 insertions, 794 deletions
diff --git a/drivers/block/aoe/Makefile b/drivers/block/aoe/Makefile
index e76d997183c..06ea82cdf27 100644
--- a/drivers/block/aoe/Makefile
+++ b/drivers/block/aoe/Makefile
@@ -3,4 +3,4 @@
#
obj-$(CONFIG_ATA_OVER_ETH) += aoe.o
-aoe-objs := aoeblk.o aoechr.o aoecmd.o aoedev.o aoemain.o aoenet.o
+aoe-y := aoeblk.o aoechr.o aoecmd.o aoedev.o aoemain.o aoenet.o
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 5e41e6dd657..9220f8e833d 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -1,5 +1,5 @@
-/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
-#define VERSION "47"
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
+#define VERSION "85"
#define AOE_MAJOR 152
#define DEVICE_NAME "aoe"
@@ -10,10 +10,7 @@
#define AOE_PARTITIONS (16)
#endif
-#define SYSMINOR(aoemajor, aoeminor) ((aoemajor) * NPERSHELF + (aoeminor))
-#define AOEMAJOR(sysminor) ((sysminor) / NPERSHELF)
-#define AOEMINOR(sysminor) ((sysminor) % NPERSHELF)
-#define WHITESPACE " \t\v\f\n"
+#define WHITESPACE " \t\v\f\n,"
enum {
AOECMD_ATA,
@@ -75,105 +72,136 @@ enum {
DEVFL_UP = 1, /* device is installed in system and ready for AoE->ATA commands */
DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */
DEVFL_EXT = (1<<2), /* device accepts lba48 commands */
- DEVFL_CLOSEWAIT = (1<<3), /* device is waiting for all closes to revalidate */
- DEVFL_GDALLOC = (1<<4), /* need to alloc gendisk */
+ DEVFL_GDALLOC = (1<<3), /* need to alloc gendisk */
+ DEVFL_GD_NOW = (1<<4), /* allocating gendisk */
DEVFL_KICKME = (1<<5), /* slow polling network card catch */
DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */
-
- BUFFL_FAIL = 1,
+ DEVFL_FREEING = (1<<7), /* set when device is being cleaned up */
+ DEVFL_FREED = (1<<8), /* device has been cleaned up */
};
enum {
DEFAULTBCNT = 2 * 512, /* 2 sectors */
- NPERSHELF = 16, /* number of slots per shelf address */
- FREETAG = -1,
MIN_BUFS = 16,
- NTARGETS = 8,
+ NTARGETS = 4,
NAOEIFS = 8,
- NSKBPOOLMAX = 128,
+ NSKBPOOLMAX = 256,
+ NFACTIVE = 61,
TIMERTICK = HZ / 10,
- MINTIMER = HZ >> 2,
- MAXTIMER = HZ << 1,
- HELPWAIT = 20,
+ RTTSCALE = 8,
+ RTTDSCALE = 3,
+ RTTAVG_INIT = USEC_PER_SEC / 4 << RTTSCALE,
+ RTTDEV_INIT = RTTAVG_INIT / 4,
+
+ HARD_SCORN_SECS = 10, /* try another remote port after this */
+ MAX_TAINT = 1000, /* cap on aoetgt taint */
};
struct buf {
- struct list_head bufs;
- ulong stime; /* for disk stats */
- ulong flags;
ulong nframesout;
- ulong resid;
- ulong bv_resid;
- ulong bv_off;
- sector_t sector;
struct bio *bio;
- struct bio_vec *bv;
+ struct bvec_iter iter;
+ struct request *rq;
+};
+
+enum frame_flags {
+ FFL_PROBE = 1,
};
struct frame {
- int tag;
+ struct list_head head;
+ u32 tag;
+ struct timeval sent; /* high-res time packet was sent */
+ u32 sent_jiffs; /* low-res jiffies-based sent time */
ulong waited;
+ ulong waited_total;
+ struct aoetgt *t; /* parent target I belong to */
+ struct sk_buff *skb; /* command skb freed on module exit */
+ struct sk_buff *r_skb; /* response skb for async processing */
struct buf *buf;
- char *bufaddr;
- ulong bcnt;
- sector_t lba;
- struct sk_buff *skb;
+ struct bvec_iter iter;
+ char flags;
};
struct aoeif {
struct net_device *nd;
- unsigned char lost;
- unsigned char lostjumbo;
- ushort maxbcnt;
+ ulong lost;
+ int bcnt;
};
struct aoetgt {
unsigned char addr[6];
- ushort nframes;
- struct frame *frames;
+ ushort nframes; /* cap on frames to use */
+ struct aoedev *d; /* parent device I belong to */
+ struct list_head ffree; /* list of free frames */
struct aoeif ifs[NAOEIFS];
struct aoeif *ifp; /* current aoeif in use */
- ushort nout;
- ushort maxout;
- u16 lasttag; /* last tag sent */
- u16 useme;
- ulong lastwadj; /* last window adjustment */
+ ushort nout; /* number of AoE commands outstanding */
+ ushort maxout; /* current value for max outstanding */
+ ushort next_cwnd; /* incr maxout after decrementing to zero */
+ ushort ssthresh; /* slow start threshold */
+ ulong falloc; /* number of allocated frames */
+ int taint; /* how much we want to avoid this aoetgt */
+ int minbcnt;
int wpkts, rpkts;
- int dataref;
+ char nout_probes;
};
struct aoedev {
struct aoedev *next;
ulong sysminor;
ulong aoemajor;
+ u32 rttavg; /* scaled AoE round trip time average */
+ u32 rttdev; /* scaled round trip time mean deviation */
u16 aoeminor;
u16 flags;
u16 nopen; /* (bd_openers isn't available without sleeping) */
- u16 rttavg; /* round trip average of requests/responses */
- u16 mintimer;
u16 fw_ver; /* version of blade's firmware */
+ u16 lasttag; /* last tag sent */
+ u16 useme;
+ ulong ref;
struct work_struct work;/* disk create work struct */
struct gendisk *gd;
- struct request_queue blkq;
- struct hd_geometry geo;
+ struct dentry *debugfs;
+ struct request_queue *blkq;
+ struct hd_geometry geo;
sector_t ssize;
struct timer_list timer;
spinlock_t lock;
- struct sk_buff_head sendq;
struct sk_buff_head skbpool;
mempool_t *bufpool; /* for deadlock-free Buf allocation */
- struct list_head bufq; /* queue of bios to work on */
- struct buf *inprocess; /* the one we're currently working on */
- struct aoetgt *targets[NTARGETS];
+ struct { /* pointers to work in progress */
+ struct buf *buf;
+ struct bio *nxbio;
+ struct request *rq;
+ } ip;
+ ulong maxbcnt;
+ struct list_head factive[NFACTIVE]; /* hash of active frames */
+ struct list_head rexmitq; /* deferred retransmissions */
+ struct aoetgt **targets;
+ ulong ntargets; /* number of allocated aoetgt pointers */
struct aoetgt **tgt; /* target in use when working */
- struct aoetgt **htgt; /* target needing rexmit assistance */
+ ulong kicked;
+ char ident[512];
};
+/* kthread tracking */
+struct ktstate {
+ struct completion rendez;
+ struct task_struct *task;
+ wait_queue_head_t *waitq;
+ int (*fn) (int);
+ char name[12];
+ spinlock_t *lock;
+ int id;
+ int active;
+};
int aoeblk_init(void);
void aoeblk_exit(void);
void aoeblk_gdalloc(void *);
+void aoedisk_rm_debugfs(struct aoedev *d);
void aoedisk_rm_sysfs(struct aoedev *d);
int aoechr_init(void);
@@ -182,22 +210,31 @@ void aoechr_error(char *);
void aoecmd_work(struct aoedev *d);
void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor);
-void aoecmd_ata_rsp(struct sk_buff *);
+struct sk_buff *aoecmd_ata_rsp(struct sk_buff *);
void aoecmd_cfg_rsp(struct sk_buff *);
void aoecmd_sleepwork(struct work_struct *);
+void aoecmd_wreset(struct aoetgt *t);
void aoecmd_cleanslate(struct aoedev *);
+void aoecmd_exit(void);
+int aoecmd_init(void);
struct sk_buff *aoecmd_ata_id(struct aoedev *);
+void aoe_freetframe(struct frame *);
+void aoe_flush_iocq(void);
+void aoe_flush_iocq_by_index(int);
+void aoe_end_request(struct aoedev *, struct request *, int);
+int aoe_ktstart(struct ktstate *k);
+void aoe_ktstop(struct ktstate *k);
int aoedev_init(void);
void aoedev_exit(void);
-struct aoedev *aoedev_by_aoeaddr(int maj, int min);
-struct aoedev *aoedev_by_sysminor_m(ulong sysminor);
+struct aoedev *aoedev_by_aoeaddr(ulong maj, int min, int do_alloc);
void aoedev_downdev(struct aoedev *d);
int aoedev_flush(const char __user *str, size_t size);
+void aoe_failbuf(struct aoedev *, struct buf *);
+void aoedev_put(struct aoedev *);
int aoenet_init(void);
void aoenet_exit(void);
void aoenet_xmit(struct sk_buff_head *);
int is_aoe_netif(struct net_device *ifp);
int set_aoe_iflist(const char __user *str, size_t size);
-
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 2307a271bdc..dd73e1ff175 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -1,19 +1,35 @@
-/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoeblk.c
* block device routines
*/
+#include <linux/kernel.h>
#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/ioctl.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
#include <linux/genhd.h>
#include <linux/netdevice.h>
+#include <linux/mutex.h>
+#include <linux/export.h>
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <scsi/sg.h>
#include "aoe.h"
+static DEFINE_MUTEX(aoeblk_mutex);
static struct kmem_cache *buf_pool_cache;
+static struct dentry *aoe_debugfs_dir;
+
+/* GPFS needs a larger value than the default. */
+static int aoe_maxsectors;
+module_param(aoe_maxsectors, int, 0644);
+MODULE_PARM_DESC(aoe_maxsectors,
+ "When nonzero, set the maximum number of sectors per I/O request");
static ssize_t aoedisk_show_state(struct device *dev,
struct device_attribute *attr, char *page)
@@ -53,7 +69,7 @@ static ssize_t aoedisk_show_netif(struct device *dev,
nd = nds;
ne = nd + ARRAY_SIZE(nds);
t = d->targets;
- te = t + NTARGETS;
+ te = t + d->ntargets;
for (; t < te && *t; t++) {
ifp = (*t)->ifs;
e = ifp + NAOEIFS;
@@ -85,6 +101,63 @@ static ssize_t aoedisk_show_fwver(struct device *dev,
return snprintf(page, PAGE_SIZE, "0x%04x\n", (unsigned int) d->fw_ver);
}
+static ssize_t aoedisk_show_payload(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct aoedev *d = disk->private_data;
+
+ return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt);
+}
+
+static int aoedisk_debugfs_show(struct seq_file *s, void *ignored)
+{
+ struct aoedev *d;
+ struct aoetgt **t, **te;
+ struct aoeif *ifp, *ife;
+ unsigned long flags;
+ char c;
+
+ d = s->private;
+ seq_printf(s, "rttavg: %d rttdev: %d\n",
+ d->rttavg >> RTTSCALE,
+ d->rttdev >> RTTDSCALE);
+ seq_printf(s, "nskbpool: %d\n", skb_queue_len(&d->skbpool));
+ seq_printf(s, "kicked: %ld\n", d->kicked);
+ seq_printf(s, "maxbcnt: %ld\n", d->maxbcnt);
+ seq_printf(s, "ref: %ld\n", d->ref);
+
+ spin_lock_irqsave(&d->lock, flags);
+ t = d->targets;
+ te = t + d->ntargets;
+ for (; t < te && *t; t++) {
+ c = '\t';
+ seq_printf(s, "falloc: %ld\n", (*t)->falloc);
+ seq_printf(s, "ffree: %p\n",
+ list_empty(&(*t)->ffree) ? NULL : (*t)->ffree.next);
+ seq_printf(s, "%pm:%d:%d:%d\n", (*t)->addr, (*t)->nout,
+ (*t)->maxout, (*t)->nframes);
+ seq_printf(s, "\tssthresh:%d\n", (*t)->ssthresh);
+ seq_printf(s, "\ttaint:%d\n", (*t)->taint);
+ seq_printf(s, "\tr:%d\n", (*t)->rpkts);
+ seq_printf(s, "\tw:%d\n", (*t)->wpkts);
+ ifp = (*t)->ifs;
+ ife = ifp + ARRAY_SIZE((*t)->ifs);
+ for (; ifp->nd && ifp < ife; ifp++) {
+ seq_printf(s, "%c%s", c, ifp->nd->name);
+ c = ',';
+ }
+ seq_puts(s, "\n");
+ }
+ spin_unlock_irqrestore(&d->lock, flags);
+
+ return 0;
+}
+
+static int aoe_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, aoedisk_debugfs_show, inode->i_private);
+}
static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL);
static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL);
@@ -93,12 +166,14 @@ static struct device_attribute dev_attr_firmware_version = {
.attr = { .name = "firmware-version", .mode = S_IRUGO },
.show = aoedisk_show_fwver,
};
+static DEVICE_ATTR(payload, S_IRUGO, aoedisk_show_payload, NULL);
static struct attribute *aoe_attrs[] = {
&dev_attr_state.attr,
&dev_attr_mac.attr,
&dev_attr_netif.attr,
&dev_attr_firmware_version.attr,
+ &dev_attr_payload.attr,
NULL,
};
@@ -106,6 +181,44 @@ static const struct attribute_group attr_group = {
.attrs = aoe_attrs,
};
+static const struct file_operations aoe_debugfs_fops = {
+ .open = aoe_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void
+aoedisk_add_debugfs(struct aoedev *d)
+{
+ struct dentry *entry;
+ char *p;
+
+ if (aoe_debugfs_dir == NULL)
+ return;
+ p = strchr(d->gd->disk_name, '/');
+ if (p == NULL)
+ p = d->gd->disk_name;
+ else
+ p++;
+ BUG_ON(*p == '\0');
+ entry = debugfs_create_file(p, 0444, aoe_debugfs_dir, d,
+ &aoe_debugfs_fops);
+ if (IS_ERR_OR_NULL(entry)) {
+ pr_info("aoe: cannot create debugfs file for %s\n",
+ d->gd->disk_name);
+ return;
+ }
+ BUG_ON(d->debugfs);
+ d->debugfs = entry;
+}
+void
+aoedisk_rm_debugfs(struct aoedev *d)
+{
+ debugfs_remove(d->debugfs);
+ d->debugfs = NULL;
+}
+
static int
aoedisk_add_sysfs(struct aoedev *d)
{
@@ -123,17 +236,29 @@ aoeblk_open(struct block_device *bdev, fmode_t mode)
struct aoedev *d = bdev->bd_disk->private_data;
ulong flags;
+ if (!virt_addr_valid(d)) {
+ pr_crit("aoe: invalid device pointer in %s\n",
+ __func__);
+ WARN_ON(1);
+ return -ENODEV;
+ }
+ if (!(d->flags & DEVFL_UP) || d->flags & DEVFL_TKILL)
+ return -ENODEV;
+
+ mutex_lock(&aoeblk_mutex);
spin_lock_irqsave(&d->lock, flags);
- if (d->flags & DEVFL_UP) {
+ if (d->flags & DEVFL_UP && !(d->flags & DEVFL_TKILL)) {
d->nopen++;
spin_unlock_irqrestore(&d->lock, flags);
+ mutex_unlock(&aoeblk_mutex);
return 0;
}
spin_unlock_irqrestore(&d->lock, flags);
+ mutex_unlock(&aoeblk_mutex);
return -ENODEV;
}
-static int
+static void
aoeblk_release(struct gendisk *disk, fmode_t mode)
{
struct aoedev *d = disk->private_data;
@@ -144,78 +269,28 @@ aoeblk_release(struct gendisk *disk, fmode_t mode)
if (--d->nopen == 0) {
spin_unlock_irqrestore(&d->lock, flags);
aoecmd_cfg(d->aoemajor, d->aoeminor);
- return 0;
+ return;
}
spin_unlock_irqrestore(&d->lock, flags);
-
- return 0;
}
-static int
-aoeblk_make_request(struct request_queue *q, struct bio *bio)
+static void
+aoeblk_request(struct request_queue *q)
{
- struct sk_buff_head queue;
struct aoedev *d;
- struct buf *buf;
- ulong flags;
-
- blk_queue_bounce(q, &bio);
-
- if (bio == NULL) {
- printk(KERN_ERR "aoe: bio is NULL\n");
- BUG();
- return 0;
- }
- d = bio->bi_bdev->bd_disk->private_data;
- if (d == NULL) {
- printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
- BUG();
- bio_endio(bio, -ENXIO);
- return 0;
- } else if (bio->bi_io_vec == NULL) {
- printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
- BUG();
- bio_endio(bio, -ENXIO);
- return 0;
- }
- buf = mempool_alloc(d->bufpool, GFP_NOIO);
- if (buf == NULL) {
- printk(KERN_INFO "aoe: buf allocation failure\n");
- bio_endio(bio, -ENOMEM);
- return 0;
- }
- memset(buf, 0, sizeof(*buf));
- INIT_LIST_HEAD(&buf->bufs);
- buf->stime = jiffies;
- buf->bio = bio;
- buf->resid = bio->bi_size;
- buf->sector = bio->bi_sector;
- buf->bv = &bio->bi_io_vec[bio->bi_idx];
- buf->bv_resid = buf->bv->bv_len;
- WARN_ON(buf->bv_resid == 0);
- buf->bv_off = buf->bv->bv_offset;
-
- spin_lock_irqsave(&d->lock, flags);
+ struct request *rq;
+ d = q->queuedata;
if ((d->flags & DEVFL_UP) == 0) {
- printk(KERN_INFO "aoe: device %ld.%d is not up\n",
+ pr_info_ratelimited("aoe: device %ld.%d is not up\n",
d->aoemajor, d->aoeminor);
- spin_unlock_irqrestore(&d->lock, flags);
- mempool_free(buf, d->bufpool);
- bio_endio(bio, -ENXIO);
- return 0;
+ while ((rq = blk_peek_request(q))) {
+ blk_start_request(rq);
+ aoe_end_request(d, rq, 1);
+ }
+ return;
}
-
- list_add_tail(&buf->bufs, &d->bufq);
-
aoecmd_work(d);
- __skb_queue_head_init(&queue);
- skb_queue_splice_init(&d->sendq, &queue);
-
- spin_unlock_irqrestore(&d->lock, flags);
- aoenet_xmit(&queue);
-
- return 0;
}
static int
@@ -234,9 +309,38 @@ aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
-static struct block_device_operations aoe_bdops = {
+static int
+aoeblk_ioctl(struct block_device *bdev, fmode_t mode, uint cmd, ulong arg)
+{
+ struct aoedev *d;
+
+ if (!arg)
+ return -EINVAL;
+
+ d = bdev->bd_disk->private_data;
+ if ((d->flags & DEVFL_UP) == 0) {
+ pr_err("aoe: disk not up\n");
+ return -ENODEV;
+ }
+
+ if (cmd == HDIO_GET_IDENTITY) {
+ if (!copy_to_user((void __user *) arg, &d->ident,
+ sizeof(d->ident)))
+ return 0;
+ return -EFAULT;
+ }
+
+ /* udev calls scsi_id, which uses SG_IO, resulting in noise */
+ if (cmd != SG_IO)
+ pr_info("aoe: unknown ioctl 0x%x\n", cmd);
+
+ return -ENOTTY;
+}
+
+static const struct block_device_operations aoe_bdops = {
.open = aoeblk_open,
.release = aoeblk_release,
+ .ioctl = aoeblk_ioctl,
.getgeo = aoeblk_getgeo,
.owner = THIS_MODULE,
};
@@ -247,37 +351,67 @@ aoeblk_gdalloc(void *vp)
{
struct aoedev *d = vp;
struct gendisk *gd;
+ mempool_t *mp;
+ struct request_queue *q;
+ enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, };
ulong flags;
+ int late = 0;
+
+ spin_lock_irqsave(&d->lock, flags);
+ if (d->flags & DEVFL_GDALLOC
+ && !(d->flags & DEVFL_TKILL)
+ && !(d->flags & DEVFL_GD_NOW))
+ d->flags |= DEVFL_GD_NOW;
+ else
+ late = 1;
+ spin_unlock_irqrestore(&d->lock, flags);
+ if (late)
+ return;
gd = alloc_disk(AOE_PARTITIONS);
if (gd == NULL) {
- printk(KERN_ERR
- "aoe: cannot allocate disk structure for %ld.%d\n",
+ pr_err("aoe: cannot allocate disk structure for %ld.%d\n",
d->aoemajor, d->aoeminor);
goto err;
}
- d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache);
- if (d->bufpool == NULL) {
+ mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab,
+ buf_pool_cache);
+ if (mp == NULL) {
printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n",
d->aoemajor, d->aoeminor);
goto err_disk;
}
-
- blk_queue_make_request(&d->blkq, aoeblk_make_request);
- if (bdi_init(&d->blkq.backing_dev_info))
+ q = blk_init_queue(aoeblk_request, &d->lock);
+ if (q == NULL) {
+ pr_err("aoe: cannot allocate block queue for %ld.%d\n",
+ d->aoemajor, d->aoeminor);
goto err_mempool;
+ }
+
spin_lock_irqsave(&d->lock, flags);
+ WARN_ON(!(d->flags & DEVFL_GD_NOW));
+ WARN_ON(!(d->flags & DEVFL_GDALLOC));
+ WARN_ON(d->flags & DEVFL_TKILL);
+ WARN_ON(d->gd);
+ WARN_ON(d->flags & DEVFL_UP);
+ blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS);
+ q->backing_dev_info.name = "aoe";
+ q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
+ d->bufpool = mp;
+ d->blkq = gd->queue = q;
+ q->queuedata = d;
+ d->gd = gd;
+ if (aoe_maxsectors)
+ blk_queue_max_hw_sectors(q, aoe_maxsectors);
gd->major = AOE_MAJOR;
- gd->first_minor = d->sysminor * AOE_PARTITIONS;
+ gd->first_minor = d->sysminor;
gd->fops = &aoe_bdops;
gd->private_data = d;
set_capacity(gd, d->ssize);
snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
d->aoemajor, d->aoeminor);
- gd->queue = &d->blkq;
- d->gd = gd;
d->flags &= ~DEVFL_GDALLOC;
d->flags |= DEVFL_UP;
@@ -285,21 +419,30 @@ aoeblk_gdalloc(void *vp)
add_disk(gd);
aoedisk_add_sysfs(d);
+ aoedisk_add_debugfs(d);
+
+ spin_lock_irqsave(&d->lock, flags);
+ WARN_ON(!(d->flags & DEVFL_GD_NOW));
+ d->flags &= ~DEVFL_GD_NOW;
+ spin_unlock_irqrestore(&d->lock, flags);
return;
err_mempool:
- mempool_destroy(d->bufpool);
+ mempool_destroy(mp);
err_disk:
put_disk(gd);
err:
spin_lock_irqsave(&d->lock, flags);
- d->flags &= ~DEVFL_GDALLOC;
+ d->flags &= ~DEVFL_GD_NOW;
+ schedule_work(&d->work);
spin_unlock_irqrestore(&d->lock, flags);
}
void
aoeblk_exit(void)
{
+ debugfs_remove_recursive(aoe_debugfs_dir);
+ aoe_debugfs_dir = NULL;
kmem_cache_destroy(buf_pool_cache);
}
@@ -311,7 +454,11 @@ aoeblk_init(void)
0, 0, NULL);
if (buf_pool_cache == NULL)
return -ENOMEM;
-
+ aoe_debugfs_dir = debugfs_create_dir("aoe", NULL);
+ if (IS_ERR_OR_NULL(aoe_debugfs_dir)) {
+ pr_info("aoe: cannot create debugfs directory\n");
+ aoe_debugfs_dir = NULL;
+ }
return 0;
}
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 200efc4d2c1..ab41be625a5 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoechr.c
* AoE character device driver
@@ -8,8 +8,10 @@
#include <linux/blkdev.h>
#include <linux/completion.h>
#include <linux/delay.h>
-#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
#include <linux/skbuff.h>
+#include <linux/export.h>
#include "aoe.h"
enum {
@@ -36,6 +38,12 @@ struct ErrMsg {
char *msg;
};
+static DEFINE_MUTEX(aoechr_mutex);
+
+/* A ring buffer of error messages, to be read through
+ * "/dev/etherd/err". When no messages are present,
+ * readers will block waiting for messages to appear.
+ */
static struct ErrMsg emsgs[NMSG];
static int emsgs_head_idx, emsgs_tail_idx;
static struct completion emsgs_comp;
@@ -83,34 +91,34 @@ revalidate(const char __user *str, size_t size)
if (copy_from_user(buf, str, size))
return -EFAULT;
- /* should be e%d.%d format */
n = sscanf(buf, "e%d.%d", &major, &minor);
if (n != 2) {
- printk(KERN_ERR "aoe: invalid device specification\n");
+ pr_err("aoe: invalid device specification %s\n", buf);
return -EINVAL;
}
- d = aoedev_by_aoeaddr(major, minor);
+ d = aoedev_by_aoeaddr(major, minor, 0);
if (!d)
return -EINVAL;
spin_lock_irqsave(&d->lock, flags);
aoecmd_cleanslate(d);
+ aoecmd_cfg(major, minor);
loop:
skb = aoecmd_ata_id(d);
spin_unlock_irqrestore(&d->lock, flags);
/* try again if we are able to sleep a bit,
* otherwise give up this revalidation
*/
- if (!skb && !msleep_interruptible(200)) {
+ if (!skb && !msleep_interruptible(250)) {
spin_lock_irqsave(&d->lock, flags);
goto loop;
}
+ aoedev_put(d);
if (skb) {
struct sk_buff_head queue;
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
}
- aoecmd_cfg(major, minor);
return 0;
}
@@ -131,13 +139,12 @@ bail: spin_unlock_irqrestore(&emsgs_lock, flags);
return;
}
- mp = kmalloc(n, GFP_ATOMIC);
+ mp = kmemdup(msg, n, GFP_ATOMIC);
if (mp == NULL) {
printk(KERN_ERR "aoe: allocation failure, len=%ld\n", n);
goto bail;
}
- memcpy(mp, msg, n);
em->msg = mp;
em->flags |= EMFL_VALID;
em->len = n;
@@ -171,6 +178,7 @@ aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp
break;
case MINOR_FLUSH:
ret = aoedev_flush(buf, cnt);
+ break;
}
if (ret == 0)
ret = cnt;
@@ -182,16 +190,16 @@ aoechr_open(struct inode *inode, struct file *filp)
{
int n, i;
- lock_kernel();
+ mutex_lock(&aoechr_mutex);
n = iminor(inode);
filp->private_data = (void *) (unsigned long) n;
for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
if (chardevs[i].minor == n) {
- unlock_kernel();
+ mutex_unlock(&aoechr_mutex);
return 0;
}
- unlock_kernel();
+ mutex_unlock(&aoechr_mutex);
return -EINVAL;
}
@@ -264,15 +272,21 @@ static const struct file_operations aoe_fops = {
.open = aoechr_open,
.release = aoechr_rel,
.owner = THIS_MODULE,
+ .llseek = noop_llseek,
};
+static char *aoe_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev));
+}
+
int __init
aoechr_init(void)
{
int n, i;
n = register_chrdev(AOE_MAJOR, "aoechr", &aoe_fops);
- if (n < 0) {
+ if (n < 0) {
printk(KERN_ERR "aoe: can't register char device\n");
return n;
}
@@ -283,6 +297,8 @@ aoechr_init(void)
unregister_chrdev(AOE_MAJOR, "aoechr");
return PTR_ERR(aoe_class);
}
+ aoe_class->devnode = aoe_devnode;
+
for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
device_create(aoe_class, NULL,
MKDEV(AOE_MAJOR, chardevs[i].minor), NULL,
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 45c5a33daf4..422b7d84f68 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1,59 +1,113 @@
-/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoecmd.c
* Filesystem request handling methods
*/
+#include <linux/ata.h>
+#include <linux/slab.h>
#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/genhd.h>
#include <linux/moduleparam.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
#include <net/net_namespace.h>
#include <asm/unaligned.h>
+#include <linux/uio.h>
#include "aoe.h"
+#define MAXIOC (8192) /* default meant to avoid most soft lockups */
+
+static void ktcomplete(struct frame *, struct sk_buff *);
+static int count_targets(struct aoedev *d, int *untainted);
+
+static struct buf *nextbuf(struct aoedev *);
+
static int aoe_deadsecs = 60 * 3;
module_param(aoe_deadsecs, int, 0644);
MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
-static int aoe_maxout = 16;
+static int aoe_maxout = 64;
module_param(aoe_maxout, int, 0644);
MODULE_PARM_DESC(aoe_maxout,
"Only aoe_maxout outstanding packets for every MAC on eX.Y.");
+/* The number of online cpus during module initialization gives us a
+ * convenient heuristic cap on the parallelism used for ktio threads
+ * doing I/O completion. It is not important that the cap equal the
+ * actual number of running CPUs at any given time, but because of CPU
+ * hotplug, we take care to use ncpus instead of using
+ * num_online_cpus() after module initialization.
+ */
+static int ncpus;
+
+/* mutex lock used for synchronization while thread spawning */
+static DEFINE_MUTEX(ktio_spawn_lock);
+
+static wait_queue_head_t *ktiowq;
+static struct ktstate *kts;
+
+/* io completion queue */
+struct iocq_ktio {
+ struct list_head head;
+ spinlock_t lock;
+};
+static struct iocq_ktio *iocq;
+
+static struct page *empty_page;
+
static struct sk_buff *
new_skb(ulong len)
{
struct sk_buff *skb;
- skb = alloc_skb(len, GFP_ATOMIC);
+ skb = alloc_skb(len + MAX_HEADER, GFP_ATOMIC);
if (skb) {
+ skb_reserve(skb, MAX_HEADER);
skb_reset_mac_header(skb);
skb_reset_network_header(skb);
skb->protocol = __constant_htons(ETH_P_AOE);
- skb->priority = 0;
- skb->next = skb->prev = NULL;
-
- /* tell the network layer not to perform IP checksums
- * or to get the NIC to do it
- */
- skb->ip_summed = CHECKSUM_NONE;
+ skb_checksum_none_assert(skb);
}
return skb;
}
static struct frame *
-getframe(struct aoetgt *t, int tag)
+getframe_deferred(struct aoedev *d, u32 tag)
{
- struct frame *f, *e;
+ struct list_head *head, *pos, *nx;
+ struct frame *f;
- f = t->frames;
- e = f + t->nframes;
- for (; f<e; f++)
- if (f->tag == tag)
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (f->tag == tag) {
+ list_del(pos);
return f;
+ }
+ }
+ return NULL;
+}
+
+static struct frame *
+getframe(struct aoedev *d, u32 tag)
+{
+ struct frame *f;
+ struct list_head *head, *pos, *nx;
+ u32 n;
+
+ n = tag % NFACTIVE;
+ head = &d->factive[n];
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (f->tag == tag) {
+ list_del(pos);
+ return f;
+ }
+ }
return NULL;
}
@@ -63,18 +117,18 @@ getframe(struct aoetgt *t, int tag)
* This driver reserves tag -1 to mean "unused frame."
*/
static int
-newtag(struct aoetgt *t)
+newtag(struct aoedev *d)
{
register ulong n;
n = jiffies & 0xffff;
- return n |= (++t->lasttag & 0x7fff) << 16;
+ return n |= (++d->lasttag & 0x7fff) << 16;
}
-static int
+static u32
aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
{
- u32 host_tag = newtag(t);
+ u32 host_tag = newtag(d);
memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
memcpy(h->dst, t->addr, sizeof h->dst);
@@ -99,16 +153,18 @@ put_lba(struct aoe_atahdr *ah, sector_t lba)
ah->lba5 = lba >>= 8;
}
-static void
+static struct aoeif *
ifrotate(struct aoetgt *t)
{
- t->ifp++;
- if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL)
- t->ifp = t->ifs;
- if (t->ifp->nd == NULL) {
- printk(KERN_INFO "aoe: no interface to rotate to\n");
- BUG();
- }
+ struct aoeif *ifp;
+
+ ifp = t->ifp;
+ ifp++;
+ if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL)
+ ifp = t->ifs;
+ if (ifp->nd == NULL)
+ return NULL;
+ return t->ifp = ifp;
}
static void
@@ -133,161 +189,221 @@ skb_pool_get(struct aoedev *d)
return NULL;
}
-/* freeframe is where we do our load balancing so it's a little hairy. */
+void
+aoe_freetframe(struct frame *f)
+{
+ struct aoetgt *t;
+
+ t = f->t;
+ f->buf = NULL;
+ memset(&f->iter, 0, sizeof(f->iter));
+ f->r_skb = NULL;
+ f->flags = 0;
+ list_add(&f->head, &t->ffree);
+}
+
static struct frame *
-freeframe(struct aoedev *d)
+newtframe(struct aoedev *d, struct aoetgt *t)
{
- struct frame *f, *e, *rf;
- struct aoetgt **t;
+ struct frame *f;
struct sk_buff *skb;
+ struct list_head *pos;
+
+ if (list_empty(&t->ffree)) {
+ if (t->falloc >= NSKBPOOLMAX*2)
+ return NULL;
+ f = kcalloc(1, sizeof(*f), GFP_ATOMIC);
+ if (f == NULL)
+ return NULL;
+ t->falloc++;
+ f->t = t;
+ } else {
+ pos = t->ffree.next;
+ list_del(pos);
+ f = list_entry(pos, struct frame, head);
+ }
- if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */
+ skb = f->skb;
+ if (skb == NULL) {
+ f->skb = skb = new_skb(ETH_ZLEN);
+ if (!skb) {
+bail: aoe_freetframe(f);
+ return NULL;
+ }
+ }
+
+ if (atomic_read(&skb_shinfo(skb)->dataref) != 1) {
+ skb = skb_pool_get(d);
+ if (skb == NULL)
+ goto bail;
+ skb_pool_put(d, f->skb);
+ f->skb = skb;
+ }
+
+ skb->truesize -= skb->data_len;
+ skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+ skb_trim(skb, 0);
+ return f;
+}
+
+static struct frame *
+newframe(struct aoedev *d)
+{
+ struct frame *f;
+ struct aoetgt *t, **tt;
+ int totout = 0;
+ int use_tainted;
+ int has_untainted;
+
+ if (!d->targets || !d->targets[0]) {
printk(KERN_ERR "aoe: NULL TARGETS!\n");
return NULL;
}
- t = d->tgt;
- t++;
- if (t >= &d->targets[NTARGETS] || !*t)
- t = d->targets;
- for (;;) {
- if ((*t)->nout < (*t)->maxout
- && t != d->htgt
- && (*t)->ifp->nd) {
- rf = NULL;
- f = (*t)->frames;
- e = f + (*t)->nframes;
- for (; f < e; f++) {
- if (f->tag != FREETAG)
- continue;
- skb = f->skb;
- if (!skb
- && !(f->skb = skb = new_skb(ETH_ZLEN)))
- continue;
- if (atomic_read(&skb_shinfo(skb)->dataref)
- != 1) {
- if (!rf)
- rf = f;
- continue;
- }
-gotone: skb_shinfo(skb)->nr_frags = skb->data_len = 0;
- skb_trim(skb, 0);
- d->tgt = t;
- ifrotate(*t);
+ tt = d->tgt; /* last used target */
+ for (use_tainted = 0, has_untainted = 0;;) {
+ tt++;
+ if (tt >= &d->targets[d->ntargets] || !*tt)
+ tt = d->targets;
+ t = *tt;
+ if (!t->taint) {
+ has_untainted = 1;
+ totout += t->nout;
+ }
+ if (t->nout < t->maxout
+ && (use_tainted || !t->taint)
+ && t->ifp->nd) {
+ f = newtframe(d, t);
+ if (f) {
+ ifrotate(t);
+ d->tgt = tt;
return f;
}
- /* Work can be done, but the network layer is
- holding our precious packets. Try to grab
- one from the pool. */
- f = rf;
- if (f == NULL) { /* more paranoia */
- printk(KERN_ERR
- "aoe: freeframe: %s.\n",
- "unexpected null rf");
- d->flags |= DEVFL_KICKME;
- return NULL;
- }
- skb = skb_pool_get(d);
- if (skb) {
- skb_pool_put(d, f->skb);
- f->skb = skb;
- goto gotone;
- }
- (*t)->dataref++;
- if ((*t)->nout == 0)
- d->flags |= DEVFL_KICKME;
}
- if (t == d->tgt) /* we've looped and found nada */
- break;
- t++;
- if (t >= &d->targets[NTARGETS] || !*t)
- t = d->targets;
+ if (tt == d->tgt) { /* we've looped and found nada */
+ if (!use_tainted && !has_untainted)
+ use_tainted = 1;
+ else
+ break;
+ }
+ }
+ if (totout == 0) {
+ d->kicked++;
+ d->flags |= DEVFL_KICKME;
}
return NULL;
}
-static int
-aoecmd_ata_rw(struct aoedev *d)
+static void
+skb_fillup(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter)
{
- struct frame *f;
+ int frag = 0;
+ struct bio_vec bv;
+
+ __bio_for_each_segment(bv, bio, iter, iter)
+ skb_fill_page_desc(skb, frag++, bv.bv_page,
+ bv.bv_offset, bv.bv_len);
+}
+
+static void
+fhash(struct frame *f)
+{
+ struct aoedev *d = f->t->d;
+ u32 n;
+
+ n = f->tag % NFACTIVE;
+ list_add_tail(&f->head, &d->factive[n]);
+}
+
+static void
+ata_rw_frameinit(struct frame *f)
+{
+ struct aoetgt *t;
struct aoe_hdr *h;
struct aoe_atahdr *ah;
- struct buf *buf;
- struct bio_vec *bv;
- struct aoetgt *t;
struct sk_buff *skb;
- ulong bcnt;
char writebit, extbit;
- writebit = 0x10;
- extbit = 0x4;
-
- f = freeframe(d);
- if (f == NULL)
- return 0;
- t = *d->tgt;
- buf = d->inprocess;
- bv = buf->bv;
- bcnt = t->ifp->maxbcnt;
- if (bcnt == 0)
- bcnt = DEFAULTBCNT;
- if (bcnt > buf->bv_resid)
- bcnt = buf->bv_resid;
- /* initialize the headers & frame */
skb = f->skb;
h = (struct aoe_hdr *) skb_mac_header(skb);
- ah = (struct aoe_atahdr *) (h+1);
- skb_put(skb, sizeof *h + sizeof *ah);
+ ah = (struct aoe_atahdr *) (h + 1);
+ skb_put(skb, sizeof(*h) + sizeof(*ah));
memset(h, 0, skb->len);
- f->tag = aoehdr_atainit(d, t, h);
+
+ writebit = 0x10;
+ extbit = 0x4;
+
+ t = f->t;
+ f->tag = aoehdr_atainit(t->d, t, h);
+ fhash(f);
t->nout++;
f->waited = 0;
- f->buf = buf;
- f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
- f->bcnt = bcnt;
- f->lba = buf->sector;
+ f->waited_total = 0;
/* set up ata header */
- ah->scnt = bcnt >> 9;
- put_lba(ah, buf->sector);
- if (d->flags & DEVFL_EXT) {
+ ah->scnt = f->iter.bi_size >> 9;
+ put_lba(ah, f->iter.bi_sector);
+ if (t->d->flags & DEVFL_EXT) {
ah->aflags |= AOEAFL_EXT;
} else {
extbit = 0;
ah->lba3 &= 0x0f;
ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */
}
- if (bio_data_dir(buf->bio) == WRITE) {
- skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
+ if (f->buf && bio_data_dir(f->buf->bio) == WRITE) {
+ skb_fillup(skb, f->buf->bio, f->iter);
ah->aflags |= AOEAFL_WRITE;
- skb->len += bcnt;
- skb->data_len = bcnt;
+ skb->len += f->iter.bi_size;
+ skb->data_len = f->iter.bi_size;
+ skb->truesize += f->iter.bi_size;
t->wpkts++;
} else {
t->rpkts++;
writebit = 0;
}
- ah->cmdstat = WIN_READ | writebit | extbit;
+ ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;
+ skb->dev = t->ifp->nd;
+}
+
+static int
+aoecmd_ata_rw(struct aoedev *d)
+{
+ struct frame *f;
+ struct buf *buf;
+ struct sk_buff *skb;
+ struct sk_buff_head queue;
+
+ buf = nextbuf(d);
+ if (buf == NULL)
+ return 0;
+ f = newframe(d);
+ if (f == NULL)
+ return 0;
+
+ /* initialize the headers & frame */
+ f->buf = buf;
+ f->iter = buf->iter;
+ f->iter.bi_size = min_t(unsigned long,
+ d->maxbcnt ?: DEFAULTBCNT,
+ f->iter.bi_size);
+ bio_advance_iter(buf->bio, &buf->iter, f->iter.bi_size);
+
+ if (!buf->iter.bi_size)
+ d->ip.buf = NULL;
/* mark all tracking fields and load out */
buf->nframesout += 1;
- buf->bv_off += bcnt;
- buf->bv_resid -= bcnt;
- buf->resid -= bcnt;
- buf->sector += bcnt >> 9;
- if (buf->resid == 0) {
- d->inprocess = NULL;
- } else if (buf->bv_resid == 0) {
- buf->bv = ++bv;
- buf->bv_resid = bv->bv_len;
- WARN_ON(buf->bv_resid == 0);
- buf->bv_off = bv->bv_offset;
- }
- skb->dev = t->ifp->nd;
- skb = skb_clone(skb, GFP_ATOMIC);
- if (skb)
- __skb_queue_tail(&d->sendq, skb);
+ ata_rw_frameinit(f);
+
+ skb = skb_clone(f->skb, GFP_ATOMIC);
+ if (skb) {
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, skb);
+ aoenet_xmit(&queue);
+ }
return 1;
}
@@ -302,8 +418,8 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
struct sk_buff *skb;
struct net_device *ifp;
- read_lock(&dev_base_lock);
- for_each_netdev(&init_net, ifp) {
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, ifp) {
dev_hold(ifp);
if (!is_aoe_netif(ifp))
goto cont;
@@ -330,64 +446,90 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
cont:
dev_put(ifp);
}
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static void
-resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
+resend(struct aoedev *d, struct frame *f)
{
struct sk_buff *skb;
+ struct sk_buff_head queue;
struct aoe_hdr *h;
- struct aoe_atahdr *ah;
+ struct aoetgt *t;
char buf[128];
u32 n;
- ifrotate(t);
- n = newtag(t);
+ t = f->t;
+ n = newtag(d);
skb = f->skb;
+ if (ifrotate(t) == NULL) {
+ /* probably can't happen, but set it up to fail anyway */
+ pr_info("aoe: resend: no interfaces to rotate to.\n");
+ ktcomplete(f, NULL);
+ return;
+ }
h = (struct aoe_hdr *) skb_mac_header(skb);
- ah = (struct aoe_atahdr *) (h+1);
- snprintf(buf, sizeof buf,
- "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
- "retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n,
- h->src, h->dst, t->nout);
- aoechr_error(buf);
+ if (!(f->flags & FFL_PROBE)) {
+ snprintf(buf, sizeof(buf),
+ "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
+ "retransmit", d->aoemajor, d->aoeminor,
+ f->tag, jiffies, n,
+ h->src, h->dst, t->nout);
+ aoechr_error(buf);
+ }
f->tag = n;
+ fhash(f);
h->tag = cpu_to_be32(n);
memcpy(h->dst, t->addr, sizeof h->dst);
memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
- switch (ah->cmdstat) {
- default:
- break;
- case WIN_READ:
- case WIN_READ_EXT:
- case WIN_WRITE:
- case WIN_WRITE_EXT:
- put_lba(ah, f->lba);
-
- n = f->bcnt;
- if (n > DEFAULTBCNT)
- n = DEFAULTBCNT;
- ah->scnt = n >> 9;
- if (ah->aflags & AOEAFL_WRITE) {
- skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
- offset_in_page(f->bufaddr), n);
- skb->len = sizeof *h + sizeof *ah + n;
- skb->data_len = n;
- }
- }
skb->dev = t->ifp->nd;
skb = skb_clone(skb, GFP_ATOMIC);
if (skb == NULL)
return;
- __skb_queue_tail(&d->sendq, skb);
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, skb);
+ aoenet_xmit(&queue);
+}
+
+static int
+tsince_hr(struct frame *f)
+{
+ struct timeval now;
+ int n;
+
+ do_gettimeofday(&now);
+ n = now.tv_usec - f->sent.tv_usec;
+ n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
+
+ if (n < 0)
+ n = -n;
+
+ /* For relatively long periods, use jiffies to avoid
+ * discrepancies caused by updates to the system time.
+ *
+ * On system with HZ of 1000, 32-bits is over 49 days
+ * worth of jiffies, or over 71 minutes worth of usecs.
+ *
+ * Jiffies overflow is handled by subtraction of unsigned ints:
+ * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
+ * $3 = 4
+ * (gdb)
+ */
+ if (n > USEC_PER_SEC / 4) {
+ n = ((u32) jiffies) - f->sent_jiffs;
+ n *= USEC_PER_SEC / HZ;
+ }
+
+ return n;
}
static int
-tsince(int tag)
+tsince(u32 tag)
{
int n;
@@ -395,7 +537,7 @@ tsince(int tag)
n -= tag & 0xffff;
if (n < 0)
n += 1<<16;
- return n;
+ return jiffies_to_usecs(n + 1);
}
static struct aoeif *
@@ -411,195 +553,399 @@ getif(struct aoetgt *t, struct net_device *nd)
return NULL;
}
-static struct aoeif *
-addif(struct aoetgt *t, struct net_device *nd)
-{
- struct aoeif *p;
-
- p = getif(t, NULL);
- if (!p)
- return NULL;
- p->nd = nd;
- p->maxbcnt = DEFAULTBCNT;
- p->lost = 0;
- p->lostjumbo = 0;
- return p;
-}
-
static void
ejectif(struct aoetgt *t, struct aoeif *ifp)
{
struct aoeif *e;
+ struct net_device *nd;
ulong n;
+ nd = ifp->nd;
e = t->ifs + NAOEIFS - 1;
n = (e - ifp) * sizeof *ifp;
memmove(ifp, ifp+1, n);
e->nd = NULL;
+ dev_put(nd);
}
-static int
-sthtith(struct aoedev *d)
+static struct frame *
+reassign_frame(struct frame *f)
+{
+ struct frame *nf;
+ struct sk_buff *skb;
+
+ nf = newframe(f->t->d);
+ if (!nf)
+ return NULL;
+ if (nf->t == f->t) {
+ aoe_freetframe(nf);
+ return NULL;
+ }
+
+ skb = nf->skb;
+ nf->skb = f->skb;
+ nf->buf = f->buf;
+ nf->iter = f->iter;
+ nf->waited = 0;
+ nf->waited_total = f->waited_total;
+ nf->sent = f->sent;
+ nf->sent_jiffs = f->sent_jiffs;
+ f->skb = skb;
+
+ return nf;
+}
+
+static void
+probe(struct aoetgt *t)
{
- struct frame *f, *e, *nf;
+ struct aoedev *d;
+ struct frame *f;
struct sk_buff *skb;
- struct aoetgt *ht = *d->htgt;
+ struct sk_buff_head queue;
+ size_t n, m;
+ int frag;
+
+ d = t->d;
+ f = newtframe(d, t);
+ if (!f) {
+ pr_err("%s %pm for e%ld.%d: %s\n",
+ "aoe: cannot probe remote address",
+ t->addr,
+ (long) d->aoemajor, d->aoeminor,
+ "no frame available");
+ return;
+ }
+ f->flags |= FFL_PROBE;
+ ifrotate(t);
+ f->iter.bi_size = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT;
+ ata_rw_frameinit(f);
+ skb = f->skb;
+ for (frag = 0, n = f->iter.bi_size; n > 0; ++frag, n -= m) {
+ if (n < PAGE_SIZE)
+ m = n;
+ else
+ m = PAGE_SIZE;
+ skb_fill_page_desc(skb, frag, empty_page, 0, m);
+ }
+ skb->len += f->iter.bi_size;
+ skb->data_len = f->iter.bi_size;
+ skb->truesize += f->iter.bi_size;
- f = ht->frames;
- e = f + ht->nframes;
- for (; f < e; f++) {
- if (f->tag == FREETAG)
+ skb = skb_clone(f->skb, GFP_ATOMIC);
+ if (skb) {
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, skb);
+ aoenet_xmit(&queue);
+ }
+}
+
+static long
+rto(struct aoedev *d)
+{
+ long t;
+
+ t = 2 * d->rttavg >> RTTSCALE;
+ t += 8 * d->rttdev >> RTTDSCALE;
+ if (t == 0)
+ t = 1;
+
+ return t;
+}
+
+static void
+rexmit_deferred(struct aoedev *d)
+{
+ struct aoetgt *t;
+ struct frame *f;
+ struct frame *nf;
+ struct list_head *pos, *nx, *head;
+ int since;
+ int untainted;
+
+ count_targets(d, &untainted);
+
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ t = f->t;
+ if (t->taint) {
+ if (!(f->flags & FFL_PROBE)) {
+ nf = reassign_frame(f);
+ if (nf) {
+ if (t->nout_probes == 0
+ && untainted > 0) {
+ probe(t);
+ t->nout_probes++;
+ }
+ list_replace(&f->head, &nf->head);
+ pos = &nf->head;
+ aoe_freetframe(f);
+ f = nf;
+ t = f->t;
+ }
+ } else if (untainted < 1) {
+ /* don't probe w/o other untainted aoetgts */
+ goto stop_probe;
+ } else if (tsince_hr(f) < t->taint * rto(d)) {
+ /* reprobe slowly when taint is high */
+ continue;
+ }
+ } else if (f->flags & FFL_PROBE) {
+stop_probe: /* don't probe untainted aoetgts */
+ list_del(pos);
+ aoe_freetframe(f);
+ /* leaving d->kicked, because this is routine */
+ f->t->d->flags |= DEVFL_KICKME;
continue;
- nf = freeframe(d);
- if (!nf)
- return 0;
- skb = nf->skb;
- *nf = *f;
- f->skb = skb;
- f->tag = FREETAG;
- nf->waited = 0;
- ht->nout--;
- (*d->tgt)->nout++;
- resend(d, *d->tgt, nf);
+ }
+ if (t->nout >= t->maxout)
+ continue;
+ list_del(pos);
+ t->nout++;
+ if (f->flags & FFL_PROBE)
+ t->nout_probes++;
+ since = tsince_hr(f);
+ f->waited += since;
+ f->waited_total += since;
+ resend(d, f);
}
- /* he's clean, he's useless. take away his interfaces */
- memset(ht->ifs, 0, sizeof ht->ifs);
- d->htgt = NULL;
- return 1;
}
-static inline unsigned char
-ata_scnt(unsigned char *packet) {
- struct aoe_hdr *h;
- struct aoe_atahdr *ah;
+/* An aoetgt accumulates demerits quickly, and successful
+ * probing redeems the aoetgt slowly.
+ */
+static void
+scorn(struct aoetgt *t)
+{
+ int n;
- h = (struct aoe_hdr *) packet;
- ah = (struct aoe_atahdr *) (h+1);
- return ah->scnt;
+ n = t->taint++;
+ t->taint += t->taint * 2;
+ if (n > t->taint)
+ t->taint = n;
+ if (t->taint > MAX_TAINT)
+ t->taint = MAX_TAINT;
+}
+
+static int
+count_targets(struct aoedev *d, int *untainted)
+{
+ int i, good;
+
+ for (i = good = 0; i < d->ntargets && d->targets[i]; ++i)
+ if (d->targets[i]->taint == 0)
+ good++;
+
+ if (untainted)
+ *untainted = good;
+ return i;
}
static void
rexmit_timer(ulong vp)
{
- struct sk_buff_head queue;
struct aoedev *d;
- struct aoetgt *t, **tt, **te;
+ struct aoetgt *t;
struct aoeif *ifp;
- struct frame *f, *e;
+ struct frame *f;
+ struct list_head *head, *pos, *nx;
+ LIST_HEAD(flist);
register long timeout;
ulong flags, n;
+ int i;
+ int utgts; /* number of aoetgt descriptors (not slots) */
+ int since;
d = (struct aoedev *) vp;
- /* timeout is always ~150% of the moving average */
- timeout = d->rttavg;
- timeout += timeout >> 1;
-
spin_lock_irqsave(&d->lock, flags);
+ /* timeout based on observed timings and variations */
+ timeout = rto(d);
+
+ utgts = count_targets(d, NULL);
+
if (d->flags & DEVFL_TKILL) {
spin_unlock_irqrestore(&d->lock, flags);
return;
}
- tt = d->targets;
- te = tt + NTARGETS;
- for (; tt < te && *tt; tt++) {
- t = *tt;
- f = t->frames;
- e = f + t->nframes;
- for (; f < e; f++) {
- if (f->tag == FREETAG
- || tsince(f->tag) < timeout)
- continue;
- n = f->waited += timeout;
- n /= HZ;
- if (n > aoe_deadsecs) {
- /* waited too long. device failure. */
- aoedev_downdev(d);
- break;
- }
- if (n > HELPWAIT /* see if another target can help */
- && (tt != d->targets || d->targets[1]))
- d->htgt = tt;
+ /* collect all frames to rexmit into flist */
+ for (i = 0; i < NFACTIVE; i++) {
+ head = &d->factive[i];
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (tsince_hr(f) < timeout)
+ break; /* end of expired frames */
+ /* move to flist for later processing */
+ list_move_tail(pos, &flist);
+ }
+ }
+
+ /* process expired frames */
+ while (!list_empty(&flist)) {
+ pos = flist.next;
+ f = list_entry(pos, struct frame, head);
+ since = tsince_hr(f);
+ n = f->waited_total + since;
+ n /= USEC_PER_SEC;
+ if (aoe_deadsecs
+ && n > aoe_deadsecs
+ && !(f->flags & FFL_PROBE)) {
+ /* Waited too long. Device failure.
+ * Hang all frames on first hash bucket for downdev
+ * to clean up.
+ */
+ list_splice(&flist, &d->factive[0]);
+ aoedev_downdev(d);
+ goto out;
+ }
- if (t->nout == t->maxout) {
- if (t->maxout > 1)
- t->maxout--;
- t->lastwadj = jiffies;
- }
+ t = f->t;
+ n = f->waited + since;
+ n /= USEC_PER_SEC;
+ if (aoe_deadsecs && utgts > 0
+ && (n > aoe_deadsecs / utgts || n > HARD_SCORN_SECS))
+ scorn(t); /* avoid this target */
+ if (t->maxout != 1) {
+ t->ssthresh = t->maxout / 2;
+ t->maxout = 1;
+ }
+
+ if (f->flags & FFL_PROBE) {
+ t->nout_probes--;
+ } else {
ifp = getif(t, f->skb->dev);
if (ifp && ++ifp->lost > (t->nframes << 1)
&& (ifp != t->ifs || t->ifs[1].nd)) {
ejectif(t, ifp);
ifp = NULL;
}
-
- if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
- && ifp && ++ifp->lostjumbo > (t->nframes << 1)
- && ifp->maxbcnt != DEFAULTBCNT) {
- printk(KERN_INFO
- "aoe: e%ld.%d: "
- "too many lost jumbo on "
- "%s:%pm - "
- "falling back to %d frames.\n",
- d->aoemajor, d->aoeminor,
- ifp->nd->name, t->addr,
- DEFAULTBCNT);
- ifp->maxbcnt = 0;
- }
- resend(d, t, f);
}
-
- /* window check */
- if (t->nout == t->maxout
- && t->maxout < t->nframes
- && (jiffies - t->lastwadj)/HZ > 10) {
- t->maxout++;
- t->lastwadj = jiffies;
- }
- }
-
- if (!skb_queue_empty(&d->sendq)) {
- n = d->rttavg <<= 1;
- if (n > MAXTIMER)
- d->rttavg = MAXTIMER;
+ list_move_tail(pos, &d->rexmitq);
+ t->nout--;
}
+ rexmit_deferred(d);
- if (d->flags & DEVFL_KICKME || d->htgt) {
+out:
+ if ((d->flags & DEVFL_KICKME) && d->blkq) {
d->flags &= ~DEVFL_KICKME;
- aoecmd_work(d);
+ d->blkq->request_fn(d->blkq);
}
- __skb_queue_head_init(&queue);
- skb_queue_splice_init(&d->sendq, &queue);
-
d->timer.expires = jiffies + TIMERTICK;
add_timer(&d->timer);
spin_unlock_irqrestore(&d->lock, flags);
+}
- aoenet_xmit(&queue);
+static unsigned long
+rqbiocnt(struct request *r)
+{
+ struct bio *bio;
+ unsigned long n = 0;
+
+ __rq_for_each_bio(bio, r)
+ n++;
+ return n;
+}
+
+/* This can be removed if we are certain that no users of the block
+ * layer will ever use zero-count pages in bios. Otherwise we have to
+ * protect against the put_page sometimes done by the network layer.
+ *
+ * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
+ * discussion.
+ *
+ * We cannot use get_page in the workaround, because it insists on a
+ * positive page count as a precondition. So we use _count directly.
+ */
+static void
+bio_pageinc(struct bio *bio)
+{
+ struct bio_vec bv;
+ struct page *page;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(bv, bio, iter) {
+ /* Non-zero page count for non-head members of
+ * compound pages is no longer allowed by the kernel.
+ */
+ page = compound_head(bv.bv_page);
+ atomic_inc(&page->_count);
+ }
+}
+
+static void
+bio_pagedec(struct bio *bio)
+{
+ struct page *page;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(bv, bio, iter) {
+ page = compound_head(bv.bv_page);
+ atomic_dec(&page->_count);
+ }
+}
+
+static void
+bufinit(struct buf *buf, struct request *rq, struct bio *bio)
+{
+ memset(buf, 0, sizeof(*buf));
+ buf->rq = rq;
+ buf->bio = bio;
+ buf->iter = bio->bi_iter;
+ bio_pageinc(bio);
+}
+
+static struct buf *
+nextbuf(struct aoedev *d)
+{
+ struct request *rq;
+ struct request_queue *q;
+ struct buf *buf;
+ struct bio *bio;
+
+ q = d->blkq;
+ if (q == NULL)
+ return NULL; /* initializing */
+ if (d->ip.buf)
+ return d->ip.buf;
+ rq = d->ip.rq;
+ if (rq == NULL) {
+ rq = blk_peek_request(q);
+ if (rq == NULL)
+ return NULL;
+ blk_start_request(rq);
+ d->ip.rq = rq;
+ d->ip.nxbio = rq->bio;
+ rq->special = (void *) rqbiocnt(rq);
+ }
+ buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
+ if (buf == NULL) {
+ pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
+ return NULL;
+ }
+ bio = d->ip.nxbio;
+ bufinit(buf, rq, bio);
+ bio = bio->bi_next;
+ d->ip.nxbio = bio;
+ if (bio == NULL)
+ d->ip.rq = NULL;
+ return d->ip.buf = buf;
}
/* enters with d->lock held */
void
aoecmd_work(struct aoedev *d)
{
- struct buf *buf;
-loop:
- if (d->htgt && !sthtith(d))
- return;
- if (d->inprocess == NULL) {
- if (list_empty(&d->bufq))
- return;
- buf = container_of(d->bufq.next, struct buf, bufs);
- list_del(d->bufq.next);
- d->inprocess = buf;
- }
- if (aoecmd_ata_rw(d))
- goto loop;
+ rexmit_deferred(d);
+ while (aoecmd_ata_rw(d))
+ ;
}
/* this function performs work that has been deferred until sleeping is OK
@@ -608,28 +954,36 @@ void
aoecmd_sleepwork(struct work_struct *work)
{
struct aoedev *d = container_of(work, struct aoedev, work);
+ struct block_device *bd;
+ u64 ssize;
if (d->flags & DEVFL_GDALLOC)
aoeblk_gdalloc(d);
if (d->flags & DEVFL_NEWSIZE) {
- struct block_device *bd;
- unsigned long flags;
- u64 ssize;
-
ssize = get_capacity(d->gd);
bd = bdget_disk(d->gd, 0);
-
if (bd) {
mutex_lock(&bd->bd_inode->i_mutex);
i_size_write(bd->bd_inode, (loff_t)ssize<<9);
mutex_unlock(&bd->bd_inode->i_mutex);
bdput(bd);
}
- spin_lock_irqsave(&d->lock, flags);
+ spin_lock_irq(&d->lock);
d->flags |= DEVFL_UP;
d->flags &= ~DEVFL_NEWSIZE;
- spin_unlock_irqrestore(&d->lock, flags);
+ spin_unlock_irq(&d->lock);
+ }
+}
+
+static void
+ata_ident_fixstring(u16 *id, int ns)
+{
+ u16 s;
+
+ while (ns-- > 0) {
+ s = *id;
+ *id++ = s >> 8 | s << 8;
}
}
@@ -668,6 +1022,11 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
}
+ ata_ident_fixstring((u16 *) &id[10<<1], 10); /* serial */
+ ata_ident_fixstring((u16 *) &id[23<<1], 4); /* firmware */
+ ata_ident_fixstring((u16 *) &id[27<<1], 20); /* model */
+ memcpy(d->ident, id, sizeof(d->ident));
+
if (d->ssize != ssize)
printk(KERN_INFO
"aoe: %pm e%ld.%d v%04x has %llu sectors\n",
@@ -687,26 +1046,28 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
}
static void
-calc_rttavg(struct aoedev *d, int rtt)
+calc_rttavg(struct aoedev *d, struct aoetgt *t, int rtt)
{
register long n;
n = rtt;
- if (n < 0) {
- n = -rtt;
- if (n < MINTIMER)
- n = MINTIMER;
- else if (n > MAXTIMER)
- n = MAXTIMER;
- d->mintimer += (n - d->mintimer) >> 1;
- } else if (n < d->mintimer)
- n = d->mintimer;
- else if (n > MAXTIMER)
- n = MAXTIMER;
-
- /* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */
- n -= d->rttavg;
- d->rttavg += n >> 2;
+
+ /* cf. Congestion Avoidance and Control, Jacobson & Karels, 1988 */
+ n -= d->rttavg >> RTTSCALE;
+ d->rttavg += n;
+ if (n < 0)
+ n = -n;
+ n -= d->rttdev >> RTTDSCALE;
+ d->rttdev += n;
+
+ if (!t || t->maxout >= t->nframes)
+ return;
+ if (t->maxout < t->ssthresh)
+ t->maxout += 1;
+ else if (t->nout == t->maxout && t->next_cwnd-- == 0) {
+ t->maxout += 1;
+ t->next_cwnd = t->maxout;
+ }
}
static struct aoetgt *
@@ -715,166 +1076,352 @@ gettgt(struct aoedev *d, char *addr)
struct aoetgt **t, **e;
t = d->targets;
- e = t + NTARGETS;
+ e = t + d->ntargets;
for (; t < e && *t; t++)
if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
return *t;
return NULL;
}
-static inline void
-diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector)
+static void
+bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt)
{
- unsigned long n_sect = bio->bi_size >> 9;
- const int rw = bio_data_dir(bio);
- struct hd_struct *part;
- int cpu;
-
- cpu = part_stat_lock();
- part = disk_map_sector_rcu(disk, sector);
+ int soff = 0;
+ struct bio_vec bv;
- part_stat_inc(cpu, part, ios[rw]);
- part_stat_add(cpu, part, ticks[rw], duration);
- part_stat_add(cpu, part, sectors[rw], n_sect);
- part_stat_add(cpu, part, io_ticks, duration);
+ iter.bi_size = cnt;
- part_stat_unlock();
+ __bio_for_each_segment(bv, bio, iter, iter) {
+ char *p = page_address(bv.bv_page) + bv.bv_offset;
+ skb_copy_bits(skb, soff, p, bv.bv_len);
+ soff += bv.bv_len;
+ }
}
void
-aoecmd_ata_rsp(struct sk_buff *skb)
+aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
+{
+ struct bio *bio;
+ int bok;
+ struct request_queue *q;
+
+ q = d->blkq;
+ if (rq == d->ip.rq)
+ d->ip.rq = NULL;
+ do {
+ bio = rq->bio;
+ bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
+ } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size));
+
+ /* cf. http://lkml.org/lkml/2006/10/31/28 */
+ if (!fastfail)
+ __blk_run_queue(q);
+}
+
+static void
+aoe_end_buf(struct aoedev *d, struct buf *buf)
+{
+ struct request *rq;
+ unsigned long n;
+
+ if (buf == d->ip.buf)
+ d->ip.buf = NULL;
+ rq = buf->rq;
+ bio_pagedec(buf->bio);
+ mempool_free(buf, d->bufpool);
+ n = (unsigned long) rq->special;
+ rq->special = (void *) --n;
+ if (n == 0)
+ aoe_end_request(d, rq, 0);
+}
+
+static void
+ktiocomplete(struct frame *f)
{
- struct sk_buff_head queue;
- struct aoedev *d;
struct aoe_hdr *hin, *hout;
struct aoe_atahdr *ahin, *ahout;
- struct frame *f;
struct buf *buf;
+ struct sk_buff *skb;
struct aoetgt *t;
struct aoeif *ifp;
- register long n;
- ulong flags;
- char ebuf[128];
- u16 aoemajor;
-
- hin = (struct aoe_hdr *) skb_mac_header(skb);
- aoemajor = get_unaligned_be16(&hin->major);
- d = aoedev_by_aoeaddr(aoemajor, hin->minor);
- if (d == NULL) {
- snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
- "for unknown device %d.%d\n",
- aoemajor, hin->minor);
- aoechr_error(ebuf);
- return;
- }
-
- spin_lock_irqsave(&d->lock, flags);
+ struct aoedev *d;
+ long n;
+ int untainted;
- n = get_unaligned_be32(&hin->tag);
- t = gettgt(d, hin->src);
- if (t == NULL) {
- printk(KERN_INFO "aoe: can't find target e%ld.%d:%pm\n",
- d->aoemajor, d->aoeminor, hin->src);
- spin_unlock_irqrestore(&d->lock, flags);
- return;
- }
- f = getframe(t, n);
- if (f == NULL) {
- calc_rttavg(d, -tsince(n));
- spin_unlock_irqrestore(&d->lock, flags);
- snprintf(ebuf, sizeof ebuf,
- "%15s e%d.%d tag=%08x@%08lx\n",
- "unexpected rsp",
- get_unaligned_be16(&hin->major),
- hin->minor,
- get_unaligned_be32(&hin->tag),
- jiffies);
- aoechr_error(ebuf);
+ if (f == NULL)
return;
- }
- calc_rttavg(d, tsince(f->tag));
+ t = f->t;
+ d = t->d;
+ skb = f->r_skb;
+ buf = f->buf;
+ if (f->flags & FFL_PROBE)
+ goto out;
+ if (!skb) /* just fail the buf. */
+ goto noskb;
- ahin = (struct aoe_atahdr *) (hin+1);
hout = (struct aoe_hdr *) skb_mac_header(f->skb);
ahout = (struct aoe_atahdr *) (hout+1);
- buf = f->buf;
+ hin = (struct aoe_hdr *) skb->data;
+ skb_pull(skb, sizeof(*hin));
+ ahin = (struct aoe_atahdr *) skb->data;
+ skb_pull(skb, sizeof(*ahin));
if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
- printk(KERN_ERR
- "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
+ pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
ahout->cmdstat, ahin->cmdstat,
d->aoemajor, d->aoeminor);
- if (buf)
- buf->flags |= BUFFL_FAIL;
- } else {
- if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */
- d->htgt = NULL;
- n = ahout->scnt << 9;
- switch (ahout->cmdstat) {
- case WIN_READ:
- case WIN_READ_EXT:
- if (skb->len - sizeof *hin - sizeof *ahin < n) {
- printk(KERN_ERR
- "aoe: %s. skb->len=%d need=%ld\n",
- "runt data size in read", skb->len, n);
- /* fail frame f? just returning will rexmit. */
- spin_unlock_irqrestore(&d->lock, flags);
- return;
- }
- memcpy(f->bufaddr, ahin+1, n);
- case WIN_WRITE:
- case WIN_WRITE_EXT:
- ifp = getif(t, skb->dev);
- if (ifp) {
- ifp->lost = 0;
- if (n > DEFAULTBCNT)
- ifp->lostjumbo = 0;
- }
- if (f->bcnt -= n) {
- f->lba += n >> 9;
- f->bufaddr += n;
- resend(d, t, f);
- goto xmit;
- }
+noskb: if (buf)
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+ goto out;
+ }
+
+ n = ahout->scnt << 9;
+ switch (ahout->cmdstat) {
+ case ATA_CMD_PIO_READ:
+ case ATA_CMD_PIO_READ_EXT:
+ if (skb->len < n) {
+ pr_err("%s e%ld.%d. skb->len=%d need=%ld\n",
+ "aoe: runt data size in read from",
+ (long) d->aoemajor, d->aoeminor,
+ skb->len, n);
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
break;
- case WIN_IDENTIFY:
- if (skb->len - sizeof *hin - sizeof *ahin < 512) {
- printk(KERN_INFO
- "aoe: runt data size in ataid. skb->len=%d\n",
- skb->len);
- spin_unlock_irqrestore(&d->lock, flags);
- return;
- }
- ataid_complete(d, t, (char *) (ahin+1));
+ }
+ if (n > f->iter.bi_size) {
+ pr_err_ratelimited("%s e%ld.%d. bytes=%ld need=%u\n",
+ "aoe: too-large data size in read from",
+ (long) d->aoemajor, d->aoeminor,
+ n, f->iter.bi_size);
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+ break;
+ }
+ bvcpy(skb, f->buf->bio, f->iter, n);
+ case ATA_CMD_PIO_WRITE:
+ case ATA_CMD_PIO_WRITE_EXT:
+ spin_lock_irq(&d->lock);
+ ifp = getif(t, skb->dev);
+ if (ifp)
+ ifp->lost = 0;
+ spin_unlock_irq(&d->lock);
+ break;
+ case ATA_CMD_ID_ATA:
+ if (skb->len < 512) {
+ pr_info("%s e%ld.%d. skb->len=%d need=512\n",
+ "aoe: runt data size in ataid from",
+ (long) d->aoemajor, d->aoeminor,
+ skb->len);
break;
- default:
- printk(KERN_INFO
- "aoe: unrecognized ata command %2.2Xh for %d.%d\n",
- ahout->cmdstat,
- get_unaligned_be16(&hin->major),
- hin->minor);
+ }
+ if (skb_linearize(skb))
+ break;
+ spin_lock_irq(&d->lock);
+ ataid_complete(d, t, skb->data);
+ spin_unlock_irq(&d->lock);
+ break;
+ default:
+ pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n",
+ ahout->cmdstat,
+ be16_to_cpu(get_unaligned(&hin->major)),
+ hin->minor);
+ }
+out:
+ spin_lock_irq(&d->lock);
+ if (t->taint > 0
+ && --t->taint > 0
+ && t->nout_probes == 0) {
+ count_targets(d, &untainted);
+ if (untainted > 0) {
+ probe(t);
+ t->nout_probes++;
}
}
- if (buf && --buf->nframesout == 0 && buf->resid == 0) {
- diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector);
- n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
- bio_endio(buf->bio, n);
- mempool_free(buf, d->bufpool);
+ aoe_freetframe(f);
+
+ if (buf && --buf->nframesout == 0 && buf->iter.bi_size == 0)
+ aoe_end_buf(d, buf);
+
+ spin_unlock_irq(&d->lock);
+ aoedev_put(d);
+ dev_kfree_skb(skb);
+}
+
+/* Enters with iocq.lock held.
+ * Returns true iff responses needing processing remain.
+ */
+static int
+ktio(int id)
+{
+ struct frame *f;
+ struct list_head *pos;
+ int i;
+ int actual_id;
+
+ for (i = 0; ; ++i) {
+ if (i == MAXIOC)
+ return 1;
+ if (list_empty(&iocq[id].head))
+ return 0;
+ pos = iocq[id].head.next;
+ list_del(pos);
+ f = list_entry(pos, struct frame, head);
+ spin_unlock_irq(&iocq[id].lock);
+ ktiocomplete(f);
+
+ /* Figure out if extra threads are required. */
+ actual_id = f->t->d->aoeminor % ncpus;
+
+ if (!kts[actual_id].active) {
+ BUG_ON(id != 0);
+ mutex_lock(&ktio_spawn_lock);
+ if (!kts[actual_id].active
+ && aoe_ktstart(&kts[actual_id]) == 0)
+ kts[actual_id].active = 1;
+ mutex_unlock(&ktio_spawn_lock);
+ }
+ spin_lock_irq(&iocq[id].lock);
}
+}
- f->buf = NULL;
- f->tag = FREETAG;
- t->nout--;
+static int
+kthread(void *vp)
+{
+ struct ktstate *k;
+ DECLARE_WAITQUEUE(wait, current);
+ int more;
+
+ k = vp;
+ current->flags |= PF_NOFREEZE;
+ set_user_nice(current, -10);
+ complete(&k->rendez); /* tell spawner we're running */
+ do {
+ spin_lock_irq(k->lock);
+ more = k->fn(k->id);
+ if (!more) {
+ add_wait_queue(k->waitq, &wait);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ }
+ spin_unlock_irq(k->lock);
+ if (!more) {
+ schedule();
+ remove_wait_queue(k->waitq, &wait);
+ } else
+ cond_resched();
+ } while (!kthread_should_stop());
+ complete(&k->rendez); /* tell spawner we're stopping */
+ return 0;
+}
+
+void
+aoe_ktstop(struct ktstate *k)
+{
+ kthread_stop(k->task);
+ wait_for_completion(&k->rendez);
+}
+
+int
+aoe_ktstart(struct ktstate *k)
+{
+ struct task_struct *task;
+
+ init_completion(&k->rendez);
+ task = kthread_run(kthread, k, "%s", k->name);
+ if (task == NULL || IS_ERR(task))
+ return -ENOMEM;
+ k->task = task;
+ wait_for_completion(&k->rendez); /* allow kthread to start */
+ init_completion(&k->rendez); /* for waiting for exit later */
+ return 0;
+}
+
+/* pass it off to kthreads for processing */
+static void
+ktcomplete(struct frame *f, struct sk_buff *skb)
+{
+ int id;
+ ulong flags;
+
+ f->r_skb = skb;
+ id = f->t->d->aoeminor % ncpus;
+ spin_lock_irqsave(&iocq[id].lock, flags);
+ if (!kts[id].active) {
+ spin_unlock_irqrestore(&iocq[id].lock, flags);
+ /* The thread with id has not been spawned yet,
+ * so delegate the work to the main thread and
+ * try spawning a new thread.
+ */
+ id = 0;
+ spin_lock_irqsave(&iocq[id].lock, flags);
+ }
+ list_add_tail(&f->head, &iocq[id].head);
+ spin_unlock_irqrestore(&iocq[id].lock, flags);
+ wake_up(&ktiowq[id]);
+}
+
+struct sk_buff *
+aoecmd_ata_rsp(struct sk_buff *skb)
+{
+ struct aoedev *d;
+ struct aoe_hdr *h;
+ struct frame *f;
+ u32 n;
+ ulong flags;
+ char ebuf[128];
+ u16 aoemajor;
+
+ h = (struct aoe_hdr *) skb->data;
+ aoemajor = be16_to_cpu(get_unaligned(&h->major));
+ d = aoedev_by_aoeaddr(aoemajor, h->minor, 0);
+ if (d == NULL) {
+ snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
+ "for unknown device %d.%d\n",
+ aoemajor, h->minor);
+ aoechr_error(ebuf);
+ return skb;
+ }
+ spin_lock_irqsave(&d->lock, flags);
+
+ n = be32_to_cpu(get_unaligned(&h->tag));
+ f = getframe(d, n);
+ if (f) {
+ calc_rttavg(d, f->t, tsince_hr(f));
+ f->t->nout--;
+ if (f->flags & FFL_PROBE)
+ f->t->nout_probes--;
+ } else {
+ f = getframe_deferred(d, n);
+ if (f) {
+ calc_rttavg(d, NULL, tsince_hr(f));
+ } else {
+ calc_rttavg(d, NULL, tsince(n));
+ spin_unlock_irqrestore(&d->lock, flags);
+ aoedev_put(d);
+ snprintf(ebuf, sizeof(ebuf),
+ "%15s e%d.%d tag=%08x@%08lx s=%pm d=%pm\n",
+ "unexpected rsp",
+ get_unaligned_be16(&h->major),
+ h->minor,
+ get_unaligned_be32(&h->tag),
+ jiffies,
+ h->src,
+ h->dst);
+ aoechr_error(ebuf);
+ return skb;
+ }
+ }
aoecmd_work(d);
-xmit:
- __skb_queue_head_init(&queue);
- skb_queue_splice_init(&d->sendq, &queue);
spin_unlock_irqrestore(&d->lock, flags);
- aoenet_xmit(&queue);
+
+ ktcomplete(f, skb);
+
+ /*
+ * Note here that we do not perform an aoedev_put, as we are
+ * leaving this reference for the ktio to release.
+ */
+ return NULL;
}
void
@@ -886,7 +1433,7 @@ aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
aoecmd_cfg_pkts(aoemajor, aoeminor, &queue);
aoenet_xmit(&queue);
}
-
+
struct sk_buff *
aoecmd_ata_id(struct aoedev *d)
{
@@ -896,7 +1443,7 @@ aoecmd_ata_id(struct aoedev *d)
struct sk_buff *skb;
struct aoetgt *t;
- f = freeframe(d);
+ f = newframe(d);
if (f == NULL)
return NULL;
@@ -909,56 +1456,132 @@ aoecmd_ata_id(struct aoedev *d)
skb_put(skb, sizeof *h + sizeof *ah);
memset(h, 0, skb->len);
f->tag = aoehdr_atainit(d, t, h);
+ fhash(f);
t->nout++;
f->waited = 0;
+ f->waited_total = 0;
/* set up ata header */
ah->scnt = 1;
- ah->cmdstat = WIN_IDENTIFY;
+ ah->cmdstat = ATA_CMD_ID_ATA;
ah->lba3 = 0xa0;
skb->dev = t->ifp->nd;
- d->rttavg = MAXTIMER;
+ d->rttavg = RTTAVG_INIT;
+ d->rttdev = RTTDEV_INIT;
d->timer.function = rexmit_timer;
- return skb_clone(skb, GFP_ATOMIC);
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (skb) {
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ }
+
+ return skb;
+}
+
+static struct aoetgt **
+grow_targets(struct aoedev *d)
+{
+ ulong oldn, newn;
+ struct aoetgt **tt;
+
+ oldn = d->ntargets;
+ newn = oldn * 2;
+ tt = kcalloc(newn, sizeof(*d->targets), GFP_ATOMIC);
+ if (!tt)
+ return NULL;
+ memmove(tt, d->targets, sizeof(*d->targets) * oldn);
+ d->tgt = tt + (d->tgt - d->targets);
+ kfree(d->targets);
+ d->targets = tt;
+ d->ntargets = newn;
+
+ return &d->targets[oldn];
}
-
+
static struct aoetgt *
addtgt(struct aoedev *d, char *addr, ulong nframes)
{
struct aoetgt *t, **tt, **te;
- struct frame *f, *e;
tt = d->targets;
- te = tt + NTARGETS;
+ te = tt + d->ntargets;
for (; tt < te && *tt; tt++)
;
if (tt == te) {
- printk(KERN_INFO
- "aoe: device addtgt failure; too many targets\n");
- return NULL;
- }
- t = kcalloc(1, sizeof *t, GFP_ATOMIC);
- f = kcalloc(nframes, sizeof *f, GFP_ATOMIC);
- if (!t || !f) {
- kfree(f);
- kfree(t);
- printk(KERN_INFO "aoe: cannot allocate memory to add target\n");
- return NULL;
+ tt = grow_targets(d);
+ if (!tt)
+ goto nomem;
}
-
+ t = kzalloc(sizeof(*t), GFP_ATOMIC);
+ if (!t)
+ goto nomem;
t->nframes = nframes;
- t->frames = f;
- e = f + nframes;
- for (; f < e; f++)
- f->tag = FREETAG;
+ t->d = d;
memcpy(t->addr, addr, sizeof t->addr);
t->ifp = t->ifs;
- t->maxout = t->nframes;
+ aoecmd_wreset(t);
+ t->maxout = t->nframes / 2;
+ INIT_LIST_HEAD(&t->ffree);
return *tt = t;
+
+ nomem:
+ pr_info("aoe: cannot allocate memory to add target\n");
+ return NULL;
+}
+
+static void
+setdbcnt(struct aoedev *d)
+{
+ struct aoetgt **t, **e;
+ int bcnt = 0;
+
+ t = d->targets;
+ e = t + d->ntargets;
+ for (; t < e && *t; t++)
+ if (bcnt == 0 || bcnt > (*t)->minbcnt)
+ bcnt = (*t)->minbcnt;
+ if (bcnt != d->maxbcnt) {
+ d->maxbcnt = bcnt;
+ pr_info("aoe: e%ld.%d: setting %d byte data frames\n",
+ d->aoemajor, d->aoeminor, bcnt);
+ }
+}
+
+static void
+setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt)
+{
+ struct aoedev *d;
+ struct aoeif *p, *e;
+ int minbcnt;
+
+ d = t->d;
+ minbcnt = bcnt;
+ p = t->ifs;
+ e = p + NAOEIFS;
+ for (; p < e; p++) {
+ if (p->nd == NULL)
+ break; /* end of the valid interfaces */
+ if (p->nd == nd) {
+ p->bcnt = bcnt; /* we're updating */
+ nd = NULL;
+ } else if (minbcnt > p->bcnt)
+ minbcnt = p->bcnt; /* find the min interface */
+ }
+ if (nd) {
+ if (p == e) {
+ pr_err("aoe: device setifbcnt failure; too many interfaces.\n");
+ return;
+ }
+ dev_hold(nd);
+ p->nd = nd;
+ p->bcnt = bcnt;
+ }
+ t->minbcnt = minbcnt;
+ setdbcnt(d);
}
void
@@ -968,11 +1591,12 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
struct aoe_hdr *h;
struct aoe_cfghdr *ch;
struct aoetgt *t;
- struct aoeif *ifp;
- ulong flags, sysminor, aoemajor;
+ ulong flags, aoemajor;
struct sk_buff *sl;
+ struct sk_buff_head queue;
u16 n;
+ sl = NULL;
h = (struct aoe_hdr *) skb_mac_header(skb);
ch = (struct aoe_cfghdr *) (h+1);
@@ -986,10 +1610,13 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
"Check shelf dip switches.\n");
return;
}
-
- sysminor = SYSMINOR(aoemajor, h->minor);
- if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) {
- printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n",
+ if (aoemajor == 0xffff) {
+ pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n",
+ aoemajor, (int) h->minor);
+ return;
+ }
+ if (h->minor == 0xff) {
+ pr_info("aoe: e%ld.%d: broadcast slot number invalid\n",
aoemajor, (int) h->minor);
return;
}
@@ -998,63 +1625,41 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
if (n > aoe_maxout) /* keep it reasonable */
n = aoe_maxout;
- d = aoedev_by_sysminor_m(sysminor);
+ d = aoedev_by_aoeaddr(aoemajor, h->minor, 1);
if (d == NULL) {
- printk(KERN_INFO "aoe: device sysminor_m failure\n");
+ pr_info("aoe: device allocation failure\n");
return;
}
spin_lock_irqsave(&d->lock, flags);
t = gettgt(d, h->src);
- if (!t) {
+ if (t) {
+ t->nframes = n;
+ if (n < t->maxout)
+ aoecmd_wreset(t);
+ } else {
t = addtgt(d, h->src, n);
- if (!t) {
- spin_unlock_irqrestore(&d->lock, flags);
- return;
- }
- }
- ifp = getif(t, skb->dev);
- if (!ifp) {
- ifp = addif(t, skb->dev);
- if (!ifp) {
- printk(KERN_INFO
- "aoe: device addif failure; "
- "too many interfaces?\n");
- spin_unlock_irqrestore(&d->lock, flags);
- return;
- }
- }
- if (ifp->maxbcnt) {
- n = ifp->nd->mtu;
- n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
- n /= 512;
- if (n > ch->scnt)
- n = ch->scnt;
- n = n ? n * 512 : DEFAULTBCNT;
- if (n != ifp->maxbcnt) {
- printk(KERN_INFO
- "aoe: e%ld.%d: setting %d%s%s:%pm\n",
- d->aoemajor, d->aoeminor, n,
- " byte data frames on ", ifp->nd->name,
- t->addr);
- ifp->maxbcnt = n;
- }
+ if (!t)
+ goto bail;
}
+ n = skb->dev->mtu;
+ n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
+ n /= 512;
+ if (n > ch->scnt)
+ n = ch->scnt;
+ n = n ? n * 512 : DEFAULTBCNT;
+ setifbcnt(t, skb->dev, n);
/* don't change users' perspective */
- if (d->nopen) {
- spin_unlock_irqrestore(&d->lock, flags);
- return;
+ if (d->nopen == 0) {
+ d->fw_ver = be16_to_cpu(ch->fwver);
+ sl = aoecmd_ata_id(d);
}
- d->fw_ver = be16_to_cpu(ch->fwver);
-
- sl = aoecmd_ata_id(d);
-
+bail:
spin_unlock_irqrestore(&d->lock, flags);
-
+ aoedev_put(d);
if (sl) {
- struct sk_buff_head queue;
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, sl);
aoenet_xmit(&queue);
@@ -1062,23 +1667,160 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
}
void
+aoecmd_wreset(struct aoetgt *t)
+{
+ t->maxout = 1;
+ t->ssthresh = t->nframes / 2;
+ t->next_cwnd = t->nframes;
+}
+
+void
aoecmd_cleanslate(struct aoedev *d)
{
struct aoetgt **t, **te;
- struct aoeif *p, *e;
- d->mintimer = MINTIMER;
+ d->rttavg = RTTAVG_INIT;
+ d->rttdev = RTTDEV_INIT;
+ d->maxbcnt = 0;
t = d->targets;
- te = t + NTARGETS;
- for (; t < te && *t; t++) {
- (*t)->maxout = (*t)->nframes;
- p = (*t)->ifs;
- e = p + NAOEIFS;
- for (; p < e; p++) {
- p->lostjumbo = 0;
- p->lost = 0;
- p->maxbcnt = DEFAULTBCNT;
+ te = t + d->ntargets;
+ for (; t < te && *t; t++)
+ aoecmd_wreset(*t);
+}
+
+void
+aoe_failbuf(struct aoedev *d, struct buf *buf)
+{
+ if (buf == NULL)
+ return;
+ buf->iter.bi_size = 0;
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+ if (buf->nframesout == 0)
+ aoe_end_buf(d, buf);
+}
+
+void
+aoe_flush_iocq(void)
+{
+ int i;
+
+ for (i = 0; i < ncpus; i++) {
+ if (kts[i].active)
+ aoe_flush_iocq_by_index(i);
+ }
+}
+
+void
+aoe_flush_iocq_by_index(int id)
+{
+ struct frame *f;
+ struct aoedev *d;
+ LIST_HEAD(flist);
+ struct list_head *pos;
+ struct sk_buff *skb;
+ ulong flags;
+
+ spin_lock_irqsave(&iocq[id].lock, flags);
+ list_splice_init(&iocq[id].head, &flist);
+ spin_unlock_irqrestore(&iocq[id].lock, flags);
+ while (!list_empty(&flist)) {
+ pos = flist.next;
+ list_del(pos);
+ f = list_entry(pos, struct frame, head);
+ d = f->t->d;
+ skb = f->r_skb;
+ spin_lock_irqsave(&d->lock, flags);
+ if (f->buf) {
+ f->buf->nframesout--;
+ aoe_failbuf(d, f->buf);
}
+ aoe_freetframe(f);
+ spin_unlock_irqrestore(&d->lock, flags);
+ dev_kfree_skb(skb);
+ aoedev_put(d);
+ }
+}
+
+int __init
+aoecmd_init(void)
+{
+ void *p;
+ int i;
+ int ret;
+
+ /* get_zeroed_page returns page with ref count 1 */
+ p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT);
+ if (!p)
+ return -ENOMEM;
+ empty_page = virt_to_page(p);
+
+ ncpus = num_online_cpus();
+
+ iocq = kcalloc(ncpus, sizeof(struct iocq_ktio), GFP_KERNEL);
+ if (!iocq)
+ return -ENOMEM;
+
+ kts = kcalloc(ncpus, sizeof(struct ktstate), GFP_KERNEL);
+ if (!kts) {
+ ret = -ENOMEM;
+ goto kts_fail;
+ }
+
+ ktiowq = kcalloc(ncpus, sizeof(wait_queue_head_t), GFP_KERNEL);
+ if (!ktiowq) {
+ ret = -ENOMEM;
+ goto ktiowq_fail;
+ }
+
+ mutex_init(&ktio_spawn_lock);
+
+ for (i = 0; i < ncpus; i++) {
+ INIT_LIST_HEAD(&iocq[i].head);
+ spin_lock_init(&iocq[i].lock);
+ init_waitqueue_head(&ktiowq[i]);
+ snprintf(kts[i].name, sizeof(kts[i].name), "aoe_ktio%d", i);
+ kts[i].fn = ktio;
+ kts[i].waitq = &ktiowq[i];
+ kts[i].lock = &iocq[i].lock;
+ kts[i].id = i;
+ kts[i].active = 0;
+ }
+ kts[0].active = 1;
+ if (aoe_ktstart(&kts[0])) {
+ ret = -ENOMEM;
+ goto ktstart_fail;
}
+ return 0;
+
+ktstart_fail:
+ kfree(ktiowq);
+ktiowq_fail:
+ kfree(kts);
+kts_fail:
+ kfree(iocq);
+
+ return ret;
+}
+
+void
+aoecmd_exit(void)
+{
+ int i;
+
+ for (i = 0; i < ncpus; i++)
+ if (kts[i].active)
+ aoe_ktstop(&kts[i]);
+
+ aoe_flush_iocq();
+
+ /* Free up the iocq and thread speicific configuration
+ * allocated during startup.
+ */
+ kfree(iocq);
+ kfree(kts);
+ kfree(ktiowq);
+
+ free_page((unsigned long) page_address(empty_page));
+ empty_page = NULL;
}
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index eeea477d960..e774c50b684 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoedev.c
* AoE device utility functions; maintains device list.
@@ -8,30 +8,141 @@
#include <linux/blkdev.h>
#include <linux/netdevice.h>
#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+#include <linux/kdev_t.h>
+#include <linux/moduleparam.h>
+#include <linux/string.h>
#include "aoe.h"
static void dummy_timer(ulong);
-static void aoedev_freedev(struct aoedev *);
static void freetgt(struct aoedev *d, struct aoetgt *t);
static void skbpoolfree(struct aoedev *d);
+static int aoe_dyndevs = 1;
+module_param(aoe_dyndevs, int, 0644);
+MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");
+
static struct aoedev *devlist;
static DEFINE_SPINLOCK(devlist_lock);
-struct aoedev *
-aoedev_by_aoeaddr(int maj, int min)
+/* Because some systems will have one, many, or no
+ * - partitions,
+ * - slots per shelf,
+ * - or shelves,
+ * we need some flexibility in the way the minor numbers
+ * are allocated. So they are dynamic.
+ */
+#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)
+
+static DEFINE_SPINLOCK(used_minors_lock);
+static DECLARE_BITMAP(used_minors, N_DEVS);
+
+static int
+minor_get_dyn(ulong *sysminor)
{
- struct aoedev *d;
ulong flags;
+ ulong n;
+ int error = 0;
+
+ spin_lock_irqsave(&used_minors_lock, flags);
+ n = find_first_zero_bit(used_minors, N_DEVS);
+ if (n < N_DEVS)
+ set_bit(n, used_minors);
+ else
+ error = -1;
+ spin_unlock_irqrestore(&used_minors_lock, flags);
+
+ *sysminor = n * AOE_PARTITIONS;
+ return error;
+}
- spin_lock_irqsave(&devlist_lock, flags);
+static int
+minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
+{
+ ulong flags;
+ ulong n;
+ int error = 0;
+ enum {
+ /* for backwards compatibility when !aoe_dyndevs,
+ * a static number of supported slots per shelf */
+ NPERSHELF = 16,
+ };
+
+ if (aoemin >= NPERSHELF) {
+ pr_err("aoe: %s %d slots per shelf\n",
+ "static minor device numbers support only",
+ NPERSHELF);
+ error = -1;
+ goto out;
+ }
- for (d=devlist; d; d=d->next)
- if (d->aoemajor == maj && d->aoeminor == min)
- break;
+ n = aoemaj * NPERSHELF + aoemin;
+ if (n >= N_DEVS) {
+ pr_err("aoe: %s with e%ld.%d\n",
+ "cannot use static minor device numbers",
+ aoemaj, aoemin);
+ error = -1;
+ goto out;
+ }
+
+ spin_lock_irqsave(&used_minors_lock, flags);
+ if (test_bit(n, used_minors)) {
+ pr_err("aoe: %s %lu\n",
+ "existing device already has static minor number",
+ n);
+ error = -1;
+ } else
+ set_bit(n, used_minors);
+ spin_unlock_irqrestore(&used_minors_lock, flags);
+ *sysminor = n * AOE_PARTITIONS;
+out:
+ return error;
+}
+
+static int
+minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
+{
+ if (aoe_dyndevs)
+ return minor_get_dyn(sysminor);
+ else
+ return minor_get_static(sysminor, aoemaj, aoemin);
+}
+
+static void
+minor_free(ulong minor)
+{
+ ulong flags;
+
+ minor /= AOE_PARTITIONS;
+ BUG_ON(minor >= N_DEVS);
+
+ spin_lock_irqsave(&used_minors_lock, flags);
+ BUG_ON(!test_bit(minor, used_minors));
+ clear_bit(minor, used_minors);
+ spin_unlock_irqrestore(&used_minors_lock, flags);
+}
+
+/*
+ * Users who grab a pointer to the device with aoedev_by_aoeaddr
+ * automatically get a reference count and must be responsible
+ * for performing a aoedev_put. With the addition of async
+ * kthread processing I'm no longer confident that we can
+ * guarantee consistency in the face of device flushes.
+ *
+ * For the time being, we only bother to add extra references for
+ * frames sitting on the iocq. When the kthreads finish processing
+ * these frames, they will aoedev_put the device.
+ */
+
+void
+aoedev_put(struct aoedev *d)
+{
+ ulong flags;
+ spin_lock_irqsave(&devlist_lock, flags);
+ d->ref--;
spin_unlock_irqrestore(&devlist_lock, flags);
- return d;
}
static void
@@ -46,127 +157,247 @@ dummy_timer(ulong vp)
add_timer(&d->timer);
}
+static void
+aoe_failip(struct aoedev *d)
+{
+ struct request *rq;
+ struct bio *bio;
+ unsigned long n;
+
+ aoe_failbuf(d, d->ip.buf);
+
+ rq = d->ip.rq;
+ if (rq == NULL)
+ return;
+ while ((bio = d->ip.nxbio)) {
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ d->ip.nxbio = bio->bi_next;
+ n = (unsigned long) rq->special;
+ rq->special = (void *) --n;
+ }
+ if ((unsigned long) rq->special == 0)
+ aoe_end_request(d, rq, 0);
+}
+
+static void
+downdev_frame(struct list_head *pos)
+{
+ struct frame *f;
+
+ f = list_entry(pos, struct frame, head);
+ list_del(pos);
+ if (f->buf) {
+ f->buf->nframesout--;
+ aoe_failbuf(f->t->d, f->buf);
+ }
+ aoe_freetframe(f);
+}
+
void
aoedev_downdev(struct aoedev *d)
{
- struct aoetgt **t, **te;
- struct frame *f, *e;
- struct buf *buf;
- struct bio *bio;
+ struct aoetgt *t, **tt, **te;
+ struct list_head *head, *pos, *nx;
+ struct request *rq;
+ int i;
- t = d->targets;
- te = t + NTARGETS;
- for (; t < te && *t; t++) {
- f = (*t)->frames;
- e = f + (*t)->nframes;
- for (; f < e; f->tag = FREETAG, f->buf = NULL, f++) {
- if (f->tag == FREETAG || f->buf == NULL)
- continue;
- buf = f->buf;
- bio = buf->bio;
- if (--buf->nframesout == 0
- && buf != d->inprocess) {
- mempool_free(buf, d->bufpool);
- bio_endio(bio, -EIO);
- }
- }
- (*t)->maxout = (*t)->nframes;
- (*t)->nout = 0;
+ d->flags &= ~DEVFL_UP;
+
+ /* clean out active and to-be-retransmitted buffers */
+ for (i = 0; i < NFACTIVE; i++) {
+ head = &d->factive[i];
+ list_for_each_safe(pos, nx, head)
+ downdev_frame(pos);
}
- buf = d->inprocess;
- if (buf) {
- bio = buf->bio;
- mempool_free(buf, d->bufpool);
- bio_endio(bio, -EIO);
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head)
+ downdev_frame(pos);
+
+ /* reset window dressings */
+ tt = d->targets;
+ te = tt + d->ntargets;
+ for (; tt < te && (t = *tt); tt++) {
+ aoecmd_wreset(t);
+ t->nout = 0;
}
- d->inprocess = NULL;
- d->htgt = NULL;
-
- while (!list_empty(&d->bufq)) {
- buf = container_of(d->bufq.next, struct buf, bufs);
- list_del(d->bufq.next);
- bio = buf->bio;
- mempool_free(buf, d->bufpool);
- bio_endio(bio, -EIO);
+
+ /* clean out the in-process request (if any) */
+ aoe_failip(d);
+
+ /* fast fail all pending I/O */
+ if (d->blkq) {
+ while ((rq = blk_peek_request(d->blkq))) {
+ blk_start_request(rq);
+ aoe_end_request(d, rq, 1);
+ }
}
if (d->gd)
set_capacity(d->gd, 0);
+}
- d->flags &= ~DEVFL_UP;
+/* return whether the user asked for this particular
+ * device to be flushed
+ */
+static int
+user_req(char *s, size_t slen, struct aoedev *d)
+{
+ const char *p;
+ size_t lim;
+
+ if (!d->gd)
+ return 0;
+ p = kbasename(d->gd->disk_name);
+ lim = sizeof(d->gd->disk_name);
+ lim -= p - d->gd->disk_name;
+ if (slen < lim)
+ lim = slen;
+
+ return !strncmp(s, p, lim);
}
static void
-aoedev_freedev(struct aoedev *d)
+freedev(struct aoedev *d)
{
struct aoetgt **t, **e;
+ int freeing = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&d->lock, flags);
+ if (d->flags & DEVFL_TKILL
+ && !(d->flags & DEVFL_FREEING)) {
+ d->flags |= DEVFL_FREEING;
+ freeing = 1;
+ }
+ spin_unlock_irqrestore(&d->lock, flags);
+ if (!freeing)
+ return;
+ del_timer_sync(&d->timer);
if (d->gd) {
+ aoedisk_rm_debugfs(d);
aoedisk_rm_sysfs(d);
del_gendisk(d->gd);
put_disk(d->gd);
+ blk_cleanup_queue(d->blkq);
}
t = d->targets;
- e = t + NTARGETS;
+ e = t + d->ntargets;
for (; t < e && *t; t++)
freetgt(d, *t);
if (d->bufpool)
mempool_destroy(d->bufpool);
skbpoolfree(d);
- kfree(d);
+ minor_free(d->sysminor);
+
+ spin_lock_irqsave(&d->lock, flags);
+ d->flags |= DEVFL_FREED;
+ spin_unlock_irqrestore(&d->lock, flags);
}
-int
-aoedev_flush(const char __user *str, size_t cnt)
+enum flush_parms {
+ NOT_EXITING = 0,
+ EXITING = 1,
+};
+
+static int
+flush(const char __user *str, size_t cnt, int exiting)
{
ulong flags;
struct aoedev *d, **dd;
- struct aoedev *rmd = NULL;
char buf[16];
int all = 0;
+ int specified = 0; /* flush a specific device */
+ unsigned int skipflags;
+
+ skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;
- if (cnt >= 3) {
+ if (!exiting && cnt >= 3) {
if (cnt > sizeof buf)
cnt = sizeof buf;
if (copy_from_user(buf, str, cnt))
return -EFAULT;
all = !strncmp(buf, "all", 3);
+ if (!all)
+ specified = 1;
}
flush_scheduled_work();
+ /* pass one: without sleeping, do aoedev_downdev */
spin_lock_irqsave(&devlist_lock, flags);
- dd = &devlist;
- while ((d = *dd)) {
+ for (d = devlist; d; d = d->next) {
spin_lock(&d->lock);
- if ((!all && (d->flags & DEVFL_UP))
- || (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
- || d->nopen) {
- spin_unlock(&d->lock);
- dd = &d->next;
- continue;
- }
- *dd = d->next;
+ if (exiting) {
+ /* unconditionally take each device down */
+ } else if (specified) {
+ if (!user_req(buf, cnt, d))
+ goto cont;
+ } else if ((!all && (d->flags & DEVFL_UP))
+ || d->flags & skipflags
+ || d->nopen
+ || d->ref)
+ goto cont;
+
aoedev_downdev(d);
d->flags |= DEVFL_TKILL;
+cont:
spin_unlock(&d->lock);
- d->next = rmd;
- rmd = d;
}
spin_unlock_irqrestore(&devlist_lock, flags);
- while ((d = rmd)) {
- rmd = d->next;
- del_timer_sync(&d->timer);
- aoedev_freedev(d); /* must be able to sleep */
+
+ /* pass two: call freedev, which might sleep,
+ * for aoedevs marked with DEVFL_TKILL
+ */
+restart:
+ spin_lock_irqsave(&devlist_lock, flags);
+ for (d = devlist; d; d = d->next) {
+ spin_lock(&d->lock);
+ if (d->flags & DEVFL_TKILL
+ && !(d->flags & DEVFL_FREEING)) {
+ spin_unlock(&d->lock);
+ spin_unlock_irqrestore(&devlist_lock, flags);
+ freedev(d);
+ goto restart;
+ }
+ spin_unlock(&d->lock);
}
+
+ /* pass three: remove aoedevs marked with DEVFL_FREED */
+ for (dd = &devlist, d = *dd; d; d = *dd) {
+ struct aoedev *doomed = NULL;
+
+ spin_lock(&d->lock);
+ if (d->flags & DEVFL_FREED) {
+ *dd = d->next;
+ doomed = d;
+ } else {
+ dd = &d->next;
+ }
+ spin_unlock(&d->lock);
+ if (doomed)
+ kfree(doomed->targets);
+ kfree(doomed);
+ }
+ spin_unlock_irqrestore(&devlist_lock, flags);
+
return 0;
}
-/* I'm not really sure that this is a realistic problem, but if the
-network driver goes gonzo let's just leak memory after complaining. */
+int
+aoedev_flush(const char __user *str, size_t cnt)
+{
+ return flush(str, cnt, NOT_EXITING);
+}
+
+/* This has been confirmed to occur once with Tms=3*1000 due to the
+ * driver changing link and not processing its transmit ring. The
+ * problem is hard enough to solve by returning an error that I'm
+ * still punting on "solving" this.
+ */
static void
skbfree(struct sk_buff *skb)
{
- enum { Sms = 100, Tms = 3*1000};
+ enum { Sms = 250, Tms = 30 * 1000};
int i = Tms / Sms;
if (skb == NULL)
@@ -180,6 +411,7 @@ skbfree(struct sk_buff *skb)
"cannot free skb -- memory leaked.");
return;
}
+ skb->truesize -= skb->data_len;
skb_shinfo(skb)->nr_frags = skb->data_len = 0;
skb_trim(skb, 0);
dev_kfree_skb(skb);
@@ -196,26 +428,43 @@ skbpoolfree(struct aoedev *d)
__skb_queue_head_init(&d->skbpool);
}
-/* find it or malloc it */
+/* find it or allocate it */
struct aoedev *
-aoedev_by_sysminor_m(ulong sysminor)
+aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
{
struct aoedev *d;
+ int i;
ulong flags;
+ ulong sysminor = 0;
spin_lock_irqsave(&devlist_lock, flags);
for (d=devlist; d; d=d->next)
- if (d->sysminor == sysminor)
+ if (d->aoemajor == maj && d->aoeminor == min) {
+ spin_lock(&d->lock);
+ if (d->flags & DEVFL_TKILL) {
+ spin_unlock(&d->lock);
+ d = NULL;
+ goto out;
+ }
+ d->ref++;
+ spin_unlock(&d->lock);
break;
- if (d)
+ }
+ if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
goto out;
d = kcalloc(1, sizeof *d, GFP_ATOMIC);
if (!d)
goto out;
+ d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC);
+ if (!d->targets) {
+ kfree(d);
+ d = NULL;
+ goto out;
+ }
+ d->ntargets = NTARGETS;
INIT_WORK(&d->work, aoecmd_sleepwork);
spin_lock_init(&d->lock);
- skb_queue_head_init(&d->sendq);
skb_queue_head_init(&d->skbpool);
init_timer(&d->timer);
d->timer.data = (ulong) d;
@@ -224,11 +473,15 @@ aoedev_by_sysminor_m(ulong sysminor)
add_timer(&d->timer);
d->bufpool = NULL; /* defer to aoeblk_gdalloc */
d->tgt = d->targets;
- INIT_LIST_HEAD(&d->bufq);
+ d->ref = 1;
+ for (i = 0; i < NFACTIVE; i++)
+ INIT_LIST_HEAD(&d->factive[i]);
+ INIT_LIST_HEAD(&d->rexmitq);
d->sysminor = sysminor;
- d->aoemajor = AOEMAJOR(sysminor);
- d->aoeminor = AOEMINOR(sysminor);
- d->mintimer = MINTIMER;
+ d->aoemajor = maj;
+ d->aoeminor = min;
+ d->rttavg = RTTAVG_INIT;
+ d->rttdev = RTTDEV_INIT;
d->next = devlist;
devlist = d;
out:
@@ -239,35 +492,31 @@ aoedev_by_sysminor_m(ulong sysminor)
static void
freetgt(struct aoedev *d, struct aoetgt *t)
{
- struct frame *f, *e;
+ struct frame *f;
+ struct list_head *pos, *nx, *head;
+ struct aoeif *ifp;
+
+ for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
+ if (!ifp->nd)
+ break;
+ dev_put(ifp->nd);
+ }
- f = t->frames;
- e = f + t->nframes;
- for (; f < e; f++)
+ head = &t->ffree;
+ list_for_each_safe(pos, nx, head) {
+ list_del(pos);
+ f = list_entry(pos, struct frame, head);
skbfree(f->skb);
- kfree(t->frames);
+ kfree(f);
+ }
kfree(t);
}
void
aoedev_exit(void)
{
- struct aoedev *d;
- ulong flags;
-
flush_scheduled_work();
-
- while ((d = devlist)) {
- devlist = d->next;
-
- spin_lock_irqsave(&d->lock, flags);
- aoedev_downdev(d);
- d->flags |= DEVFL_TKILL;
- spin_unlock_irqrestore(&d->lock, flags);
-
- del_timer_sync(&d->timer);
- aoedev_freedev(d);
- }
+ flush(NULL, 0, EXITING);
}
int __init
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
index 7f83ad90e76..4b987c2fefb 100644
--- a/drivers/block/aoe/aoemain.c
+++ b/drivers/block/aoe/aoemain.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoemain.c
* Module initialization routines, discover timer
@@ -61,6 +61,7 @@ aoe_exit(void)
aoenet_exit();
unregister_blkdev(AOE_MAJOR, DEVICE_NAME);
+ aoecmd_exit();
aoechr_exit();
aoedev_exit();
aoeblk_exit(); /* free cache after de-allocating bufs */
@@ -83,17 +84,20 @@ aoe_init(void)
ret = aoenet_init();
if (ret)
goto net_fail;
+ ret = aoecmd_init();
+ if (ret)
+ goto cmd_fail;
ret = register_blkdev(AOE_MAJOR, DEVICE_NAME);
if (ret < 0) {
printk(KERN_ERR "aoe: can't register major\n");
goto blkreg_fail;
}
-
printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION);
discover_timer(TINIT);
return 0;
-
blkreg_fail:
+ aoecmd_exit();
+ cmd_fail:
aoenet_exit();
net_fail:
aoeblk_exit();
@@ -101,7 +105,7 @@ aoe_init(void)
aoechr_exit();
chr_fail:
aoedev_exit();
-
+
printk(KERN_INFO "aoe: initialisation failure.\n");
return ret;
}
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index c6099ba9a4b..63773a90581 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -1,9 +1,10 @@
-/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoenet.c
* Ethernet portion of AoE driver
*/
+#include <linux/gfp.h>
#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/netdevice.h>
@@ -30,7 +31,10 @@ enum {
static char aoe_iflist[IFLISTSZ];
module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600);
-MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=\"dev1 [dev2 ...]\"");
+MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]");
+
+static wait_queue_head_t txwq;
+static struct ktstate kts;
#ifndef MODULE
static int __init aoe_iflist_setup(char *str)
@@ -43,6 +47,28 @@ static int __init aoe_iflist_setup(char *str)
__setup("aoe_iflist=", aoe_iflist_setup);
#endif
+static spinlock_t txlock;
+static struct sk_buff_head skbtxq;
+
+/* enters with txlock held */
+static int
+tx(int id) __must_hold(&txlock)
+{
+ struct sk_buff *skb;
+ struct net_device *ifp;
+
+ while ((skb = skb_dequeue(&skbtxq))) {
+ spin_unlock_irq(&txlock);
+ ifp = skb->dev;
+ if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit())
+ pr_warn("aoe: packet could not be sent on %s. %s\n",
+ ifp ? ifp->name : "netif",
+ "consider increasing tx_queue_len");
+ spin_lock_irq(&txlock);
+ }
+ return 0;
+}
+
int
is_aoe_netif(struct net_device *ifp)
{
@@ -87,21 +113,27 @@ void
aoenet_xmit(struct sk_buff_head *queue)
{
struct sk_buff *skb, *tmp;
+ ulong flags;
skb_queue_walk_safe(queue, skb, tmp) {
__skb_unlink(skb, queue);
- dev_queue_xmit(skb);
+ spin_lock_irqsave(&txlock, flags);
+ skb_queue_tail(&skbtxq, skb);
+ spin_unlock_irqrestore(&txlock, flags);
+ wake_up(&txwq);
}
}
-/*
- * (1) len doesn't include the header by default. I want this.
+/*
+ * (1) len doesn't include the header by default. I want this.
*/
static int
aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
{
struct aoe_hdr *h;
+ struct aoe_atahdr *ah;
u32 n;
+ int sn;
if (dev_net(ifp) != &init_net)
goto exit;
@@ -109,13 +141,16 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
skb = skb_share_check(skb, GFP_ATOMIC);
if (skb == NULL)
return 0;
- if (skb_linearize(skb))
- goto exit;
if (!is_aoe_netif(ifp))
goto exit;
skb_push(skb, ETH_HLEN); /* (1) */
-
- h = (struct aoe_hdr *) skb_mac_header(skb);
+ sn = sizeof(*h) + sizeof(*ah);
+ if (skb->len >= sn) {
+ sn -= skb_headlen(skb);
+ if (sn > 0 && !__pskb_pull_tail(skb, sn))
+ goto exit;
+ }
+ h = (struct aoe_hdr *) skb->data;
n = get_unaligned_be32(&h->tag);
if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
goto exit;
@@ -136,7 +171,8 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
switch (h->cmd) {
case AOECMD_ATA:
- aoecmd_ata_rsp(skb);
+ /* ata_rsp may keep skb for later processing or give it back */
+ skb = aoecmd_ata_rsp(skb);
break;
case AOECMD_CFG:
aoecmd_cfg_rsp(skb);
@@ -144,14 +180,18 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
default:
if (h->cmd >= AOECMD_VEND_MIN)
break; /* don't complain about vendor commands */
- printk(KERN_INFO "aoe: unknown cmd %d\n", h->cmd);
+ pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd);
+ break;
}
+
+ if (!skb)
+ return 0;
exit:
dev_kfree_skb(skb);
return 0;
}
-static struct packet_type aoe_pt = {
+static struct packet_type aoe_pt __read_mostly = {
.type = __constant_htons(ETH_P_AOE),
.func = aoenet_rcv,
};
@@ -159,6 +199,16 @@ static struct packet_type aoe_pt = {
int __init
aoenet_init(void)
{
+ skb_queue_head_init(&skbtxq);
+ init_waitqueue_head(&txwq);
+ spin_lock_init(&txlock);
+ kts.lock = &txlock;
+ kts.fn = tx;
+ kts.waitq = &txwq;
+ kts.id = 0;
+ snprintf(kts.name, sizeof(kts.name), "aoe_tx%d", kts.id);
+ if (aoe_ktstart(&kts))
+ return -EAGAIN;
dev_add_pack(&aoe_pt);
return 0;
}
@@ -166,6 +216,8 @@ aoenet_init(void)
void
aoenet_exit(void)
{
+ aoe_ktstop(&kts);
+ skb_queue_purge(&skbtxq);
dev_remove_pack(&aoe_pt);
}