diff options
author | Jens Axboe <axboe@kernel.dk> | 2013-07-02 08:31:48 +0200 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2013-07-02 08:31:48 +0200 |
commit | 5f0e5afa0de4522abb3ea7d1369039b94e740ec5 (patch) | |
tree | 6a5be3db9ecfed8ef2150c6146f6d1e0d658ac8b /drivers/block | |
parent | d752b2696072ed52fd5afab08b601e2220a3b87e (diff) | |
parent | 9e895ace5d82df8929b16f58e9f515f6d54ab82d (diff) |
Merge tag 'v3.10-rc7' into for-3.11/drivers
Linux 3.10-rc7
Pull this in early to avoid doing it with the bcache merge,
since there are a number of changes to bcache between my old
base (3.10-rc1) and the new pull request.
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/brd.c | 4 | ||||
-rw-r--r-- | drivers/block/cciss.c | 32 | ||||
-rw-r--r-- | drivers/block/mtip32xx/mtip32xx.c | 8 | ||||
-rw-r--r-- | drivers/block/nvme-core.c | 62 | ||||
-rw-r--r-- | drivers/block/nvme-scsi.c | 3 | ||||
-rw-r--r-- | drivers/block/pktcdvd.c | 3 | ||||
-rw-r--r-- | drivers/block/rbd.c | 974 | ||||
-rw-r--r-- | drivers/block/xsysace.c | 3 |
8 files changed, 647 insertions, 442 deletions
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index f1a29f8e9d3..9bf4371755f 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -117,13 +117,13 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) spin_lock(&brd->brd_lock); idx = sector >> PAGE_SECTORS_SHIFT; + page->index = idx; if (radix_tree_insert(&brd->brd_pages, idx, page)) { __free_page(page); page = radix_tree_lookup(&brd->brd_pages, idx); BUG_ON(!page); BUG_ON(page->index != idx); - } else - page->index = idx; + } spin_unlock(&brd->brd_lock); radix_tree_preload_end(); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 6374dc10352..62b6c2cc80b 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -168,8 +168,6 @@ static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id); static int cciss_open(struct block_device *bdev, fmode_t mode); static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode); static void cciss_release(struct gendisk *disk, fmode_t mode); -static int do_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo); @@ -235,7 +233,7 @@ static const struct block_device_operations cciss_fops = { .owner = THIS_MODULE, .open = cciss_unlocked_open, .release = cciss_release, - .ioctl = do_ioctl, + .ioctl = cciss_ioctl, .getgeo = cciss_getgeo, #ifdef CONFIG_COMPAT .compat_ioctl = cciss_compat_ioctl, @@ -1143,16 +1141,6 @@ static void cciss_release(struct gendisk *disk, fmode_t mode) mutex_unlock(&cciss_mutex); } -static int do_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) -{ - int ret; - mutex_lock(&cciss_mutex); - ret = cciss_ioctl(bdev, mode, cmd, arg); - mutex_unlock(&cciss_mutex); - return ret; -} - #ifdef CONFIG_COMPAT static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode, @@ -1179,7 +1167,7 @@ static int cciss_compat_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_REGNEWD: case CCISS_RESCANDISK: case CCISS_GETLUNINFO: - return do_ioctl(bdev, mode, cmd, arg); + return cciss_ioctl(bdev, mode, cmd, arg); case CCISS_PASSTHRU32: return cciss_ioctl32_passthru(bdev, mode, cmd, arg); @@ -1219,7 +1207,7 @@ static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode, if (err) return -EFAULT; - err = do_ioctl(bdev, mode, CCISS_PASSTHRU, (unsigned long)p); + err = cciss_ioctl(bdev, mode, CCISS_PASSTHRU, (unsigned long)p); if (err) return err; err |= @@ -1261,7 +1249,7 @@ static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode, if (err) return -EFAULT; - err = do_ioctl(bdev, mode, CCISS_BIG_PASSTHRU, (unsigned long)p); + err = cciss_ioctl(bdev, mode, CCISS_BIG_PASSTHRU, (unsigned long)p); if (err) return err; err |= @@ -1311,11 +1299,14 @@ static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp) static int cciss_getintinfo(ctlr_info_t *h, void __user *argp) { cciss_coalint_struct intinfo; + unsigned long flags; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay); intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user (argp, &intinfo, sizeof(cciss_coalint_struct))) return -EFAULT; @@ -1356,12 +1347,15 @@ static int cciss_setintinfo(ctlr_info_t *h, void __user *argp) static int cciss_getnodename(ctlr_info_t *h, void __user *argp) { NodeName_type NodeName; + unsigned long flags; int i; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); for (i = 0; i < 16; i++) NodeName[i] = readb(&h->cfgtable->ServerName[i]); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user(argp, NodeName, sizeof(NodeName_type))) return -EFAULT; return 0; @@ -1398,10 +1392,13 @@ static int cciss_setnodename(ctlr_info_t *h, void __user *argp) static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp) { Heartbeat_type heartbeat; + unsigned long flags; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); heartbeat = readl(&h->cfgtable->HeartBeat); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type))) return -EFAULT; return 0; @@ -1410,10 +1407,13 @@ static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp) static int cciss_getbustypes(ctlr_info_t *h, void __user *argp) { BusTypes_type BusTypes; + unsigned long flags; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); BusTypes = readl(&h->cfgtable->BusTypes); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type))) return -EFAULT; return 0; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 847107ef0cc..20dd52a2f92 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3002,7 +3002,8 @@ static int mtip_hw_debugfs_init(struct driver_data *dd) static void mtip_hw_debugfs_exit(struct driver_data *dd) { - debugfs_remove_recursive(dd->dfs_node); + if (dd->dfs_node) + debugfs_remove_recursive(dd->dfs_node); } @@ -3863,7 +3864,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) struct driver_data *dd = queue->queuedata; struct scatterlist *sg; struct bio_vec *bvec; - int nents = 0; + int i, nents = 0; int tag = 0, unaligned = 0; if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { @@ -3921,11 +3922,12 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) } /* Create the scatter list for this bio. */ - bio_for_each_segment(bvec, bio, nents) { + bio_for_each_segment(bvec, bio, i) { sg_set_page(&sg[nents], bvec->bv_page, bvec->bv_len, bvec->bv_offset); + nents++; } /* Issue the read/write. */ diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 8efdfaa44a5..ce79a590b45 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -629,7 +629,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, struct nvme_command *cmnd; struct nvme_iod *iod; enum dma_data_direction dma_dir; - int cmdid, length, result = -ENOMEM; + int cmdid, length, result; u16 control; u32 dsmgmt; int psegs = bio_phys_segments(ns->queue, bio); @@ -640,6 +640,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return result; } + result = -ENOMEM; iod = nvme_alloc_iod(psegs, bio->bi_size, GFP_ATOMIC); if (!iod) goto nomem; @@ -977,6 +978,8 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) if (timeout && !time_after(now, info[cmdid].timeout)) continue; + if (info[cmdid].ctx == CMD_CTX_CANCELLED) + continue; dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); ctx = cancel_cmdid(nvmeq, cmdid, &fn); fn(nvmeq->dev, ctx, &cqe); @@ -1206,7 +1209,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, if (addr & 3) return ERR_PTR(-EINVAL); - if (!length) + if (!length || length > INT_MAX - PAGE_SIZE) return ERR_PTR(-EINVAL); offset = offset_in_page(addr); @@ -1227,7 +1230,8 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, sg_init_table(sg, count); for (i = 0; i < count; i++) { sg_set_page(&sg[i], pages[i], - min_t(int, length, PAGE_SIZE - offset), offset); + min_t(unsigned, length, PAGE_SIZE - offset), + offset); length -= (PAGE_SIZE - offset); offset = 0; } @@ -1435,7 +1439,7 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, nvme_free_iod(dev, iod); } - if (!status && copy_to_user(&ucmd->result, &cmd.result, + if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result, sizeof(cmd.result))) status = -EFAULT; @@ -1633,7 +1637,8 @@ static int set_queue_count(struct nvme_dev *dev, int count) static int nvme_setup_io_queues(struct nvme_dev *dev) { - int result, cpu, i, nr_io_queues, db_bar_size, q_depth; + struct pci_dev *pdev = dev->pci_dev; + int result, cpu, i, nr_io_queues, db_bar_size, q_depth, q_count; nr_io_queues = num_online_cpus(); result = set_queue_count(dev, nr_io_queues); @@ -1642,14 +1647,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (result < nr_io_queues) nr_io_queues = result; + q_count = nr_io_queues; /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, dev->queues[0]); db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); if (db_bar_size > 8192) { iounmap(dev->bar); - dev->bar = ioremap(pci_resource_start(dev->pci_dev, 0), - db_bar_size); + dev->bar = ioremap(pci_resource_start(pdev, 0), db_bar_size); dev->dbs = ((void __iomem *)dev->bar) + 4096; dev->queues[0]->q_db = dev->dbs; } @@ -1657,19 +1662,36 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) for (i = 0; i < nr_io_queues; i++) dev->entry[i].entry = i; for (;;) { - result = pci_enable_msix(dev->pci_dev, dev->entry, - nr_io_queues); + result = pci_enable_msix(pdev, dev->entry, nr_io_queues); if (result == 0) { break; } else if (result > 0) { nr_io_queues = result; continue; } else { - nr_io_queues = 1; + nr_io_queues = 0; break; } } + if (nr_io_queues == 0) { + nr_io_queues = q_count; + for (;;) { + result = pci_enable_msi_block(pdev, nr_io_queues); + if (result == 0) { + for (i = 0; i < nr_io_queues; i++) + dev->entry[i].vector = i + pdev->irq; + break; + } else if (result > 0) { + nr_io_queues = result; + continue; + } else { + nr_io_queues = 1; + break; + } + } + } + result = queue_request_irq(dev, dev->queues[0], "nvme admin"); /* XXX: handle failure here */ @@ -1850,7 +1872,10 @@ static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); nvme_dev_remove(dev); - pci_disable_msix(dev->pci_dev); + if (dev->pci_dev->msi_enabled) + pci_disable_msi(dev->pci_dev); + else if (dev->pci_dev->msix_enabled) + pci_disable_msix(dev->pci_dev); iounmap(dev->bar); nvme_release_instance(dev); nvme_release_prp_pools(dev); @@ -1923,8 +1948,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; pci_set_drvdata(pdev, dev); - dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)); - dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); + + if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); + else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32))) + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32)); + else + goto disable; + result = nvme_set_instance(dev); if (result) goto disable; @@ -1977,7 +2008,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) unmap: iounmap(dev->bar); disable_msix: - pci_disable_msix(pdev); + if (dev->pci_dev->msi_enabled) + pci_disable_msi(dev->pci_dev); + else if (dev->pci_dev->msix_enabled) + pci_disable_msix(dev->pci_dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); disable: diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index fed54b03989..102de2f52b5 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -44,7 +44,6 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/types.h> -#include <linux/version.h> #include <scsi/sg.h> #include <scsi/scsi.h> @@ -1654,7 +1653,7 @@ static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, } } -static u16 nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, +static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *mode_page, u8 page_code) { int res = SNTI_TRANSLATION_SUCCESS; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3c08983e600..f5d0ea11d9f 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -83,7 +83,8 @@ #define MAX_SPEED 0xffff -#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1)) +#define ZONE(sector, pd) (((sector) + (pd)->offset) & \ + ~(sector_t)((pd)->settings.size - 1)) static DEFINE_MUTEX(pktcdvd_mutex); static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ca63104136e..49394e3f31b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -55,6 +55,39 @@ #define SECTOR_SHIFT 9 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) +/* + * Increment the given counter and return its updated value. + * If the counter is already 0 it will not be incremented. + * If the counter is already at its maximum value returns + * -EINVAL without updating it. + */ +static int atomic_inc_return_safe(atomic_t *v) +{ + unsigned int counter; + + counter = (unsigned int)__atomic_add_unless(v, 1, 0); + if (counter <= (unsigned int)INT_MAX) + return (int)counter; + + atomic_dec(v); + + return -EINVAL; +} + +/* Decrement the counter. Return the resulting value, or -EINVAL */ +static int atomic_dec_return_safe(atomic_t *v) +{ + int counter; + + counter = atomic_dec_return(v); + if (counter >= 0) + return counter; + + atomic_inc(v); + + return -EINVAL; +} + #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -100,21 +133,20 @@ * block device image metadata (in-memory version) */ struct rbd_image_header { - /* These four fields never change for a given rbd image */ + /* These six fields never change for a given rbd image */ char *object_prefix; - u64 features; __u8 obj_order; __u8 crypt_type; __u8 comp_type; + u64 stripe_unit; + u64 stripe_count; + u64 features; /* Might be changeable someday? */ /* The remaining fields need to be updated occasionally */ u64 image_size; struct ceph_snap_context *snapc; - char *snap_names; - u64 *snap_sizes; - - u64 stripe_unit; - u64 stripe_count; + char *snap_names; /* format 1 only */ + u64 *snap_sizes; /* format 1 only */ }; /* @@ -225,6 +257,7 @@ struct rbd_obj_request { }; }; struct page **copyup_pages; + u32 copyup_page_count; struct ceph_osd_request *osd_req; @@ -257,6 +290,7 @@ struct rbd_img_request { struct rbd_obj_request *obj_request; /* obj req initiator */ }; struct page **copyup_pages; + u32 copyup_page_count; spinlock_t completion_lock;/* protects next_completion */ u32 next_completion; rbd_img_callback_t callback; @@ -311,6 +345,7 @@ struct rbd_device { struct rbd_spec *parent_spec; u64 parent_overlap; + atomic_t parent_ref; struct rbd_device *parent; /* protects updating the header */ @@ -359,7 +394,8 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); -static int rbd_dev_image_probe(struct rbd_device *rbd_dev); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); +static void rbd_spec_put(struct rbd_spec *spec); static struct bus_attribute rbd_bus_attrs[] = { __ATTR(add, S_IWUSR, NULL, rbd_add), @@ -426,7 +462,8 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request); static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); static int rbd_dev_refresh(struct rbd_device *rbd_dev); -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev); +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); +static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev); static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u64 snap_id); static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, @@ -482,8 +519,8 @@ static const struct block_device_operations rbd_bd_ops = { }; /* - * Initialize an rbd client instance. - * We own *ceph_opts. + * Initialize an rbd client instance. Success or not, this function + * consumes ceph_opts. */ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) { @@ -638,7 +675,8 @@ static int parse_rbd_opts_token(char *c, void *private) /* * Get a ceph client with specific addr and configuration, if one does - * not exist create it. + * not exist create it. Either way, ceph_opts is consumed by this + * function. */ static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) { @@ -726,88 +764,123 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) } /* - * Create a new header structure, translate header format from the on-disk - * header. + * Fill an rbd image header with information from the given format 1 + * on-disk header. */ -static int rbd_header_from_disk(struct rbd_image_header *header, +static int rbd_header_from_disk(struct rbd_device *rbd_dev, struct rbd_image_header_ondisk *ondisk) { + struct rbd_image_header *header = &rbd_dev->header; + bool first_time = header->object_prefix == NULL; + struct ceph_snap_context *snapc; + char *object_prefix = NULL; + char *snap_names = NULL; + u64 *snap_sizes = NULL; u32 snap_count; - size_t len; size_t size; + int ret = -ENOMEM; u32 i; - memset(header, 0, sizeof (*header)); + /* Allocate this now to avoid having to handle failure below */ - snap_count = le32_to_cpu(ondisk->snap_count); + if (first_time) { + size_t len; - len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix)); - header->object_prefix = kmalloc(len + 1, GFP_KERNEL); - if (!header->object_prefix) - return -ENOMEM; - memcpy(header->object_prefix, ondisk->object_prefix, len); - header->object_prefix[len] = '\0'; + len = strnlen(ondisk->object_prefix, + sizeof (ondisk->object_prefix)); + object_prefix = kmalloc(len + 1, GFP_KERNEL); + if (!object_prefix) + return -ENOMEM; + memcpy(object_prefix, ondisk->object_prefix, len); + object_prefix[len] = '\0'; + } + + /* Allocate the snapshot context and fill it in */ + snap_count = le32_to_cpu(ondisk->snap_count); + snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); + if (!snapc) + goto out_err; + snapc->seq = le64_to_cpu(ondisk->snap_seq); if (snap_count) { + struct rbd_image_snap_ondisk *snaps; u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); - /* Save a copy of the snapshot names */ + /* We'll keep a copy of the snapshot names... */ - if (snap_names_len > (u64) SIZE_MAX) - return -EIO; - header->snap_names = kmalloc(snap_names_len, GFP_KERNEL); - if (!header->snap_names) + if (snap_names_len > (u64)SIZE_MAX) + goto out_2big; + snap_names = kmalloc(snap_names_len, GFP_KERNEL); + if (!snap_names) + goto out_err; + + /* ...as well as the array of their sizes. */ + + size = snap_count * sizeof (*header->snap_sizes); + snap_sizes = kmalloc(size, GFP_KERNEL); + if (!snap_sizes) goto out_err; + /* - * Note that rbd_dev_v1_header_read() guarantees - * the ondisk buffer we're working with has + * Copy the names, and fill in each snapshot's id + * and size. + * + * Note that rbd_dev_v1_header_info() guarantees the + * ondisk buffer we're working with has * snap_names_len bytes beyond the end of the * snapshot id array, this memcpy() is safe. */ - memcpy(header->snap_names, &ondisk->snaps[snap_count], - snap_names_len); + memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); + snaps = ondisk->snaps; + for (i = 0; i < snap_count; i++) { + snapc->snaps[i] = le64_to_cpu(snaps[i].id); + snap_sizes[i] = le64_to_cpu(snaps[i].image_size); + } + } - /* Record each snapshot's size */ + /* We won't fail any more, fill in the header */ - size = snap_count * sizeof (*header->snap_sizes); - header->snap_sizes = kmalloc(size, GFP_KERNEL); - if (!header->snap_sizes) - goto out_err; - for (i = 0; i < snap_count; i++) - header->snap_sizes[i] = - le64_to_cpu(ondisk->snaps[i].image_size); + down_write(&rbd_dev->header_rwsem); + if (first_time) { + header->object_prefix = object_prefix; + header->obj_order = ondisk->options.order; + header->crypt_type = ondisk->options.crypt_type; + header->comp_type = ondisk->options.comp_type; + /* The rest aren't used for format 1 images */ + header->stripe_unit = 0; + header->stripe_count = 0; + header->features = 0; } else { - header->snap_names = NULL; - header->snap_sizes = NULL; + ceph_put_snap_context(header->snapc); + kfree(header->snap_names); + kfree(header->snap_sizes); } - header->features = 0; /* No features support in v1 images */ - header->obj_order = ondisk->options.order; - header->crypt_type = ondisk->options.crypt_type; - header->comp_type = ondisk->options.comp_type; - - /* Allocate and fill in the snapshot context */ + /* The remaining fields always get updated (when we refresh) */ header->image_size = le64_to_cpu(ondisk->image_size); + header->snapc = snapc; + header->snap_names = snap_names; + header->snap_sizes = snap_sizes; - header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); - if (!header->snapc) - goto out_err; - header->snapc->seq = le64_to_cpu(ondisk->snap_seq); - for (i = 0; i < snap_count; i++) - header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); + /* Make sure mapping size is consistent with header info */ - return 0; + if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time) + if (rbd_dev->mapping.size != header->image_size) + rbd_dev->mapping.size = header->image_size; + up_write(&rbd_dev->header_rwsem); + + return 0; +out_2big: + ret = -EIO; out_err: - kfree(header->snap_sizes); - header->snap_sizes = NULL; - kfree(header->snap_names); - header->snap_names = NULL; - kfree(header->object_prefix); - header->object_prefix = NULL; + kfree(snap_sizes); + kfree(snap_names); + ceph_put_snap_context(snapc); + kfree(object_prefix); - return -ENOMEM; + return ret; } static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) @@ -934,20 +1007,11 @@ static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) { - const char *snap_name = rbd_dev->spec->snap_name; - u64 snap_id; + u64 snap_id = rbd_dev->spec->snap_id; u64 size = 0; u64 features = 0; int ret; - if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) { - snap_id = rbd_snap_id_by_name(rbd_dev, snap_name); - if (snap_id == CEPH_NOSNAP) - return -ENOENT; - } else { - snap_id = CEPH_NOSNAP; - } - ret = rbd_snap_size(rbd_dev, snap_id, &size); if (ret) return ret; @@ -958,11 +1022,6 @@ static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) rbd_dev->mapping.size = size; rbd_dev->mapping.features = features; - /* If we are mapping a snapshot it must be marked read-only */ - - if (snap_id != CEPH_NOSNAP) - rbd_dev->mapping.read_only = true; - return 0; } @@ -970,14 +1029,6 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) { rbd_dev->mapping.size = 0; rbd_dev->mapping.features = 0; - rbd_dev->mapping.read_only = true; -} - -static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev) -{ - rbd_dev->mapping.size = 0; - rbd_dev->mapping.features = 0; - rbd_dev->mapping.read_only = true; } static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) @@ -985,12 +1036,16 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) char *name; u64 segment; int ret; + char *name_format; name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); if (!name) return NULL; segment = offset >> rbd_dev->header.obj_order; - ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx", + name_format = "%s.%012llx"; + if (rbd_dev->image_format == 2) + name_format = "%s.%016llx"; + ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format, rbd_dev->header.object_prefix, segment); if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { pr_err("error formatting segment name for #%llu (%d)\n", @@ -1342,20 +1397,18 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request) kref_put(&obj_request->kref, rbd_obj_request_destroy); } -static void rbd_img_request_get(struct rbd_img_request *img_request) -{ - dout("%s: img %p (was %d)\n", __func__, img_request, - atomic_read(&img_request->kref.refcount)); - kref_get(&img_request->kref); -} - +static bool img_request_child_test(struct rbd_img_request *img_request); +static void rbd_parent_request_destroy(struct kref *kref); static void rbd_img_request_destroy(struct kref *kref); static void rbd_img_request_put(struct rbd_img_request *img_request) { rbd_assert(img_request != NULL); dout("%s: img %p (was %d)\n", __func__, img_request, atomic_read(&img_request->kref.refcount)); - kref_put(&img_request->kref, rbd_img_request_destroy); + if (img_request_child_test(img_request)) + kref_put(&img_request->kref, rbd_parent_request_destroy); + else + kref_put(&img_request->kref, rbd_img_request_destroy); } static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, @@ -1472,6 +1525,12 @@ static void img_request_child_set(struct rbd_img_request *img_request) smp_mb(); } +static void img_request_child_clear(struct rbd_img_request *img_request) +{ + clear_bit(IMG_REQ_CHILD, &img_request->flags); + smp_mb(); +} + static bool img_request_child_test(struct rbd_img_request *img_request) { smp_mb(); @@ -1484,6 +1543,12 @@ static void img_request_layered_set(struct rbd_img_request *img_request) smp_mb(); } +static void img_request_layered_clear(struct rbd_img_request *img_request) +{ + clear_bit(IMG_REQ_LAYERED, &img_request->flags); + smp_mb(); +} + static bool img_request_layered_test(struct rbd_img_request *img_request) { smp_mb(); @@ -1827,6 +1892,74 @@ static void rbd_obj_request_destroy(struct kref *kref) kmem_cache_free(rbd_obj_request_cache, obj_request); } +/* It's OK to call this for a device with no parent */ + +static void rbd_spec_put(struct rbd_spec *spec); +static void rbd_dev_unparent(struct rbd_device *rbd_dev) +{ + rbd_dev_remove_parent(rbd_dev); + rbd_spec_put(rbd_dev->parent_spec); + rbd_dev->parent_spec = NULL; + rbd_dev->parent_overlap = 0; +} + +/* + * Parent image reference counting is used to determine when an + * image's parent fields can be safely torn down--after there are no + * more in-flight requests to the parent image. When the last + * reference is dropped, cleaning them up is safe. + */ +static void rbd_dev_parent_put(struct rbd_device *rbd_dev) +{ + int counter; + + if (!rbd_dev->parent_spec) + return; + + counter = atomic_dec_return_safe(&rbd_dev->parent_ref); + if (counter > 0) + return; + + /* Last reference; clean up parent data structures */ + + if (!counter) + rbd_dev_unparent(rbd_dev); + else + rbd_warn(rbd_dev, "parent reference underflow\n"); +} + +/* + * If an image has a non-zero parent overlap, get a reference to its + * parent. + * + * We must get the reference before checking for the overlap to + * coordinate properly with zeroing the parent overlap in + * rbd_dev_v2_parent_info() when an image gets flattened. We + * drop it again if there is no overlap. + * + * Returns true if the rbd device has a parent with a non-zero + * overlap and a reference for it was successfully taken, or + * false otherwise. + */ +static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) +{ + int counter; + + if (!rbd_dev->parent_spec) + return false; + + counter = atomic_inc_return_safe(&rbd_dev->parent_ref); + if (counter > 0 && rbd_dev->parent_overlap) + return true; + + /* Image was flattened, but parent is not yet torn down */ + + if (counter < 0) + rbd_warn(rbd_dev, "parent reference overflow\n"); + + return false; +} + /* * Caller is responsible for filling in the list of object requests * that comprises the image request, and the Linux request pointer @@ -1835,8 +1968,7 @@ static void rbd_obj_request_destroy(struct kref *kref) static struct rbd_img_request *rbd_img_request_create( struct rbd_device *rbd_dev, u64 offset, u64 length, - bool write_request, - bool child_request) + bool write_request) { struct rbd_img_request *img_request; @@ -1861,9 +1993,7 @@ static struct rbd_img_request *rbd_img_request_create( } else { img_request->snap_id = rbd_dev->spec->snap_id; } - if (child_request) - img_request_child_set(img_request); - if (rbd_dev->parent_spec) + if (rbd_dev_parent_get(rbd_dev)) img_request_layered_set(img_request); spin_lock_init(&img_request->completion_lock); img_request->next_completion = 0; @@ -1873,9 +2003,6 @@ static struct rbd_img_request *rbd_img_request_create( INIT_LIST_HEAD(&img_request->obj_requests); kref_init(&img_request->kref); - rbd_img_request_get(img_request); /* Avoid a warning */ - rbd_img_request_put(img_request); /* TEMPORARY */ - dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, write_request ? "write" : "read", offset, length, img_request); @@ -1897,15 +2024,54 @@ static void rbd_img_request_destroy(struct kref *kref) rbd_img_obj_request_del(img_request, obj_request); rbd_assert(img_request->obj_request_count == 0); + if (img_request_layered_test(img_request)) { + img_request_layered_clear(img_request); + rbd_dev_parent_put(img_request->rbd_dev); + } + if (img_request_write_test(img_request)) ceph_put_snap_context(img_request->snapc); - if (img_request_child_test(img_request)) - rbd_obj_request_put(img_request->obj_request); - kmem_cache_free(rbd_img_request_cache, img_request); } +static struct rbd_img_request *rbd_parent_request_create( + struct rbd_obj_request *obj_request, + u64 img_offset, u64 length) +{ + struct rbd_img_request *parent_request; + struct rbd_device *rbd_dev; + + rbd_assert(obj_request->img_request); + rbd_dev = obj_request->img_request->rbd_dev; + + parent_request = rbd_img_request_create(rbd_dev->parent, + img_offset, length, false); + if (!parent_request) + return NULL; + + img_request_child_set(parent_request); + rbd_obj_request_get(obj_request); + parent_request->obj_request = obj_request; + + return parent_request; +} + +static void rbd_parent_request_destroy(struct kref *kref) +{ + struct rbd_img_request *parent_request; + struct rbd_obj_request *orig_request; + + parent_request = container_of(kref, struct rbd_img_request, kref); + orig_request = parent_request->obj_request; + + parent_request->obj_request = NULL; + rbd_obj_request_put(orig_request); + img_request_child_clear(parent_request); + + rbd_img_request_destroy(kref); +} + static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; @@ -2114,7 +2280,7 @@ rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; struct rbd_device *rbd_dev; - u64 length; + struct page **pages; u32 page_count; rbd_assert(obj_request->type == OBJ_REQUEST_BIO); @@ -2124,12 +2290,14 @@ rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) rbd_dev = img_request->rbd_dev; rbd_assert(rbd_dev); - length = (u64)1 << rbd_dev->header.obj_order; - page_count = (u32)calc_pages_for(0, length); - rbd_assert(obj_request->copyup_pages); |