diff options
Diffstat (limited to 'drivers/block/xen-blkfront.c')
| -rw-r--r-- | drivers/block/xen-blkfront.c | 832 |
1 files changed, 695 insertions, 137 deletions
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index e4fb3374dcd..5deb235bd18 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -44,6 +44,7 @@ #include <linux/mutex.h> #include <linux/scatterlist.h> #include <linux/bitmap.h> +#include <linux/list.h> #include <xen/xen.h> #include <xen/xenbus.h> @@ -64,15 +65,39 @@ enum blkif_state { BLKIF_STATE_SUSPENDED, }; +struct grant { + grant_ref_t gref; + unsigned long pfn; + struct list_head node; +}; + struct blk_shadow { struct blkif_request req; struct request *request; - unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct grant **grants_used; + struct grant **indirect_grants; + struct scatterlist *sg; +}; + +struct split_bio { + struct bio *bio; + atomic_t pending; + int err; }; static DEFINE_MUTEX(blkfront_mutex); static const struct block_device_operations xlvbd_block_fops; +/* + * Maximum number of segments in indirect requests, the actual value used by + * the frontend driver is the minimum of this value and the value provided + * by the backend driver. + */ + +static unsigned int xen_blkif_max_segments = 32; +module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); +MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); + #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) /* @@ -91,12 +116,14 @@ struct blkfront_info enum blkif_state connected; int ring_ref; struct blkif_front_ring ring; - struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int evtchn, irq; struct request_queue *rq; struct work_struct work; struct gnttab_free_callback callback; struct blk_shadow shadow[BLK_RING_SIZE]; + struct list_head grants; + struct list_head indirect_pages; + unsigned int persistent_gnts_c; unsigned long shadow_free; unsigned int feature_flush; unsigned int flush_op; @@ -104,6 +131,8 @@ struct blkfront_info unsigned int feature_secdiscard:1; unsigned int discard_granularity; unsigned int discard_alignment; + unsigned int feature_persistent:1; + unsigned int max_indirect_segments; int is_ready; }; @@ -132,6 +161,13 @@ static DEFINE_SPINLOCK(minor_lock); #define DEV_NAME "xvd" /* name in /dev */ +#define SEGS_PER_INDIRECT_FRAME \ + (PAGE_SIZE/sizeof(struct blkif_request_segment)) +#define INDIRECT_GREFS(_segs) \ + ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) + +static int blkfront_setup_indirect(struct blkfront_info *info); + static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; @@ -154,6 +190,77 @@ static int add_id_to_freelist(struct blkfront_info *info, return 0; } +static int fill_grant_buffer(struct blkfront_info *info, int num) +{ + struct page *granted_page; + struct grant *gnt_list_entry, *n; + int i = 0; + + while(i < num) { + gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); + if (!gnt_list_entry) + goto out_of_memory; + + if (info->feature_persistent) { + granted_page = alloc_page(GFP_NOIO); + if (!granted_page) { + kfree(gnt_list_entry); + goto out_of_memory; + } + gnt_list_entry->pfn = page_to_pfn(granted_page); + } + + gnt_list_entry->gref = GRANT_INVALID_REF; + list_add(&gnt_list_entry->node, &info->grants); + i++; + } + + return 0; + +out_of_memory: + list_for_each_entry_safe(gnt_list_entry, n, + &info->grants, node) { + list_del(&gnt_list_entry->node); + if (info->feature_persistent) + __free_page(pfn_to_page(gnt_list_entry->pfn)); + kfree(gnt_list_entry); + i--; + } + BUG_ON(i != 0); + return -ENOMEM; +} + +static struct grant *get_grant(grant_ref_t *gref_head, + unsigned long pfn, + struct blkfront_info *info) +{ + struct grant *gnt_list_entry; + unsigned long buffer_mfn; + + BUG_ON(list_empty(&info->grants)); + gnt_list_entry = list_first_entry(&info->grants, struct grant, + node); + list_del(&gnt_list_entry->node); + + if (gnt_list_entry->gref != GRANT_INVALID_REF) { + info->persistent_gnts_c--; + return gnt_list_entry; + } + + /* Assign a gref to this page */ + gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); + BUG_ON(gnt_list_entry->gref == -ENOSPC); + if (!info->feature_persistent) { + BUG_ON(!pfn); + gnt_list_entry->pfn = pfn; + } + buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); + gnttab_grant_foreign_access_ref(gnt_list_entry->gref, + info->xbdev->otherend_id, + buffer_mfn, 0); + return gnt_list_entry; +} + static const char *op_name(int op) { static const char *const names[] = { @@ -282,85 +389,175 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, static int blkif_queue_request(struct request *req) { struct blkfront_info *info = req->rq_disk->private_data; - unsigned long buffer_mfn; struct blkif_request *ring_req; unsigned long id; unsigned int fsect, lsect; - int i, ref; + int i, ref, n; + struct blkif_request_segment *segments = NULL; + + /* + * Used to store if we are able to queue the request by just using + * existing persistent grants, or if we have to get new grants, + * as there are not sufficiently many free. + */ + bool new_persistent_gnts; grant_ref_t gref_head; + struct grant *gnt_list_entry = NULL; struct scatterlist *sg; + int nseg, max_grefs; if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return 1; - if (gnttab_alloc_grant_references( - BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { - gnttab_request_free_callback( - &info->callback, - blkif_restart_queue_callback, - info, - BLKIF_MAX_SEGMENTS_PER_REQUEST); - return 1; - } + max_grefs = req->nr_phys_segments; + if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) + /* + * If we are using indirect segments we need to account + * for the indirect grefs used in the request. + */ + max_grefs += INDIRECT_GREFS(req->nr_phys_segments); + + /* Check if we have enough grants to allocate a requests */ + if (info->persistent_gnts_c < max_grefs) { + new_persistent_gnts = 1; + if (gnttab_alloc_grant_references( + max_grefs - info->persistent_gnts_c, + &gref_head) < 0) { + gnttab_request_free_callback( + &info->callback, + blkif_restart_queue_callback, + info, + max_grefs); + return 1; + } + } else + new_persistent_gnts = 0; /* Fill out a communications ring structure. */ ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); id = get_id_from_freelist(info); info->shadow[id].request = req; - ring_req->u.rw.id = id; - ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->u.rw.handle = info->handle; - - ring_req->operation = rq_data_dir(req) ? - BLKIF_OP_WRITE : BLKIF_OP_READ; - - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { - /* - * Ideally we can do an unordered flush-to-disk. In case the - * backend onlysupports barriers, use that. A barrier request - * a superset of FUA, so we can implement it the same - * way. (It's also a FLUSH+FUA, since it is - * guaranteed ordered WRT previous writes.) - */ - ring_req->operation = info->flush_op; - } - if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { - /* id, sector_number and handle are set above. */ ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + ring_req->u.discard.id = id; + ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; else ring_req->u.discard.flag = 0; } else { - ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req, - info->sg); - BUG_ON(ring_req->u.rw.nr_segments > - BLKIF_MAX_SEGMENTS_PER_REQUEST); - - for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) { - buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); + BUG_ON(info->max_indirect_segments == 0 && + req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + BUG_ON(info->max_indirect_segments && + req->nr_phys_segments > info->max_indirect_segments); + nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + ring_req->u.rw.id = id; + if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + /* + * The indirect operation can only be a BLKIF_OP_READ or + * BLKIF_OP_WRITE + */ + BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); + ring_req->operation = BLKIF_OP_INDIRECT; + ring_req->u.indirect.indirect_op = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.indirect.handle = info->handle; + ring_req->u.indirect.nr_segments = nseg; + } else { + ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.rw.handle = info->handle; + ring_req->operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { + /* + * Ideally we can do an unordered flush-to-disk. In case the + * backend onlysupports barriers, use that. A barrier request + * a superset of FUA, so we can implement it the same + * way. (It's also a FLUSH+FUA, since it is + * guaranteed ordered WRT previous writes.) + */ + ring_req->operation = info->flush_op; + } + ring_req->u.rw.nr_segments = nseg; + } + for_each_sg(info->shadow[id].sg, sg, nseg, i) { fsect = sg->offset >> 9; lsect = fsect + (sg->length >> 9) - 1; - /* install a grant reference. */ - ref = gnttab_claim_grant_reference(&gref_head); - BUG_ON(ref == -ENOSPC); - - gnttab_grant_foreign_access_ref( - ref, - info->xbdev->otherend_id, - buffer_mfn, - rq_data_dir(req)); - - info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); - ring_req->u.rw.seg[i] = + + if ((ring_req->operation == BLKIF_OP_INDIRECT) && + (i % SEGS_PER_INDIRECT_FRAME == 0)) { + unsigned long uninitialized_var(pfn); + + if (segments) + kunmap_atomic(segments); + + n = i / SEGS_PER_INDIRECT_FRAME; + if (!info->feature_persistent) { + struct page *indirect_page; + + /* Fetch a pre-allocated page to use for indirect grefs */ + BUG_ON(list_empty(&info->indirect_pages)); + indirect_page = list_first_entry(&info->indirect_pages, + struct page, lru); + list_del(&indirect_page->lru); + pfn = page_to_pfn(indirect_page); + } + gnt_list_entry = get_grant(&gref_head, pfn, info); + info->shadow[id].indirect_grants[n] = gnt_list_entry; + segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); + ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; + } + + gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info); + ref = gnt_list_entry->gref; + + info->shadow[id].grants_used[i] = gnt_list_entry; + + if (rq_data_dir(req) && info->feature_persistent) { + char *bvec_data; + void *shared_data; + + BUG_ON(sg->offset + sg->length > PAGE_SIZE); + + shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); + bvec_data = kmap_atomic(sg_page(sg)); + + /* + * this does not wipe data stored outside the + * range sg->offset..sg->offset+sg->length. + * Therefore, blkback *could* see data from + * previous requests. This is OK as long as + * persistent grants are shared with just one + * domain. It may need refactoring if this + * changes + */ + memcpy(shared_data + sg->offset, + bvec_data + sg->offset, + sg->length); + + kunmap_atomic(bvec_data); + kunmap_atomic(shared_data); + } + if (ring_req->operation != BLKIF_OP_INDIRECT) { + ring_req->u.rw.seg[i] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } else { + n = i % SEGS_PER_INDIRECT_FRAME; + segments[n] = (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } } + if (segments) + kunmap_atomic(segments); } info->ring.req_prod_pvt++; @@ -368,7 +565,8 @@ static int blkif_queue_request(struct request *req) /* Keep a private copy so we can reissue requests when recovering. */ info->shadow[id].req = *ring_req; - gnttab_free_grant_references(gref_head); + if (new_persistent_gnts) + gnttab_free_grant_references(gref_head); return 0; } @@ -414,10 +612,10 @@ static void do_blkif_request(struct request_queue *rq) } pr_debug("do_blk_req %p: cmd %p, sec %lx, " - "(%u/%u) buffer:%p [%s]\n", + "(%u/%u) [%s]\n", req, req->cmd, (unsigned long)blk_rq_pos(req), blk_rq_cur_sectors(req), blk_rq_sectors(req), - req->buffer, rq_data_dir(req) ? "write" : "read"); + rq_data_dir(req) ? "write" : "read"); if (blkif_queue_request(req)) { blk_requeue_request(rq, req); @@ -434,7 +632,9 @@ wait: flush_requests(info); } -static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) +static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, + unsigned int physical_sector_size, + unsigned int segments) { struct request_queue *rq; struct blkfront_info *info = gd->private_data; @@ -456,14 +656,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_logical_block_size(rq, sector_size); - blk_queue_max_hw_sectors(rq, 512); + blk_queue_physical_block_size(rq, physical_sector_size); + blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512); /* Each segment in a request is up to an aligned page in size. */ blk_queue_segment_boundary(rq, PAGE_SIZE - 1); blk_queue_max_segment_size(rq, PAGE_SIZE); /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_segments(rq, segments); /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); @@ -480,12 +681,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) static void xlvbd_flush(struct blkfront_info *info) { blk_queue_flush(info->rq, info->feature_flush); - printk(KERN_INFO "blkfront: %s: %s: %s\n", + printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n", info->gd->disk_name, info->flush_op == BLKIF_OP_WRITE_BARRIER ? "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? "flush diskcache" : "barrier or flush"), - info->feature_flush ? "enabled" : "disabled"); + info->feature_flush ? "enabled;" : "disabled;", + "persistent grants:", + info->feature_persistent ? "enabled;" : "disabled;", + "indirect descriptors:", + info->max_indirect_segments ? "enabled;" : "disabled;"); } static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -558,7 +763,8 @@ static char *encode_disk_name(char *ptr, unsigned int n) static int xlvbd_alloc_gendisk(blkif_sector_t capacity, struct blkfront_info *info, - u16 vdisk_info, u16 sector_size) + u16 vdisk_info, u16 sector_size, + unsigned int physical_sector_size) { struct gendisk *gd; int nr_minors = 1; @@ -625,7 +831,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, gd->driverfs_dev = &(info->xbdev->dev); set_capacity(gd, capacity); - if (xlvbd_init_blk_queue(gd, sector_size)) { + if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, + info->max_indirect_segments ? : + BLKIF_MAX_SEGMENTS_PER_REQUEST)) { del_gendisk(gd); goto release; } @@ -670,7 +878,7 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) spin_unlock_irqrestore(&info->io_lock, flags); /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work_sync(&info->work); + flush_work(&info->work); del_gendisk(info->gd); @@ -707,6 +915,10 @@ static void blkif_restart_queue(struct work_struct *work) static void blkif_free(struct blkfront_info *info, int suspend) { + struct grant *persistent_gnt; + struct grant *n; + int i, j, segs; + /* Prevent new requests being issued until we fix things up. */ spin_lock_irq(&info->io_lock); info->connected = suspend ? @@ -714,12 +926,86 @@ static void blkif_free(struct blkfront_info *info, int suspend) /* No more blkif_request(). */ if (info->rq) blk_stop_queue(info->rq); + + /* Remove all persistent grants */ + if (!list_empty(&info->grants)) { + list_for_each_entry_safe(persistent_gnt, n, + &info->grants, node) { + list_del(&persistent_gnt->node); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + info->persistent_gnts_c--; + } + if (info->feature_persistent) + __free_page(pfn_to_page(persistent_gnt->pfn)); + kfree(persistent_gnt); + } + } + BUG_ON(info->persistent_gnts_c != 0); + + /* + * Remove indirect pages, this only happens when using indirect + * descriptors but not persistent grants + */ + if (!list_empty(&info->indirect_pages)) { + struct page *indirect_page, *n; + + BUG_ON(info->feature_persistent); + list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_del(&indirect_page->lru); + __free_page(indirect_page); + } + } + + for (i = 0; i < BLK_RING_SIZE; i++) { + /* + * Clear persistent grants present in requests already + * on the shared ring + */ + if (!info->shadow[i].request) + goto free_shadow; + + segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? + info->shadow[i].req.u.indirect.nr_segments : + info->shadow[i].req.u.rw.nr_segments; + for (j = 0; j < segs; j++) { + persistent_gnt = info->shadow[i].grants_used[j]; + gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + if (info->feature_persistent) + __free_page(pfn_to_page(persistent_gnt->pfn)); + kfree(persistent_gnt); + } + + if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) + /* + * If this is not an indirect operation don't try to + * free indirect segments + */ + goto free_shadow; + + for (j = 0; j < INDIRECT_GREFS(segs); j++) { + persistent_gnt = info->shadow[i].indirect_grants[j]; + gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + __free_page(pfn_to_page(persistent_gnt->pfn)); + kfree(persistent_gnt); + } + +free_shadow: + kfree(info->shadow[i].grants_used); + info->shadow[i].grants_used = NULL; + kfree(info->shadow[i].indirect_grants); + info->shadow[i].indirect_grants = NULL; + kfree(info->shadow[i].sg); + info->shadow[i].sg = NULL; + } + /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); spin_unlock_irq(&info->io_lock); /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work_sync(&info->work); + flush_work(&info->work); /* Free resources associated with old device channel. */ if (info->ring_ref != GRANT_INVALID_REF) { @@ -734,13 +1020,86 @@ static void blkif_free(struct blkfront_info *info, int suspend) } -static void blkif_completion(struct blk_shadow *s) +static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, + struct blkif_response *bret) { - int i; - /* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place - * flag. */ - for (i = 0; i < s->req.u.rw.nr_segments; i++) - gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL); + int i = 0; + struct scatterlist *sg; + char *bvec_data; + void *shared_data; + int nseg; + + nseg = s->req.operation == BLKIF_OP_INDIRECT ? + s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; + + if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { + /* + * Copy the data received from the backend into the bvec. + * Since bv_offset can be different than 0, and bv_len different + * than PAGE_SIZE, we have to keep track of the current offset, + * to be sure we are copying the data from the right shared page. + */ + for_each_sg(s->sg, sg, nseg, i) { + BUG_ON(sg->offset + sg->length > PAGE_SIZE); + shared_data = kmap_atomic( + pfn_to_page(s->grants_used[i]->pfn)); + bvec_data = kmap_atomic(sg_page(sg)); + memcpy(bvec_data + sg->offset, + shared_data + sg->offset, + sg->length); + kunmap_atomic(bvec_data); + kunmap_atomic(shared_data); + } + } + /* Add the persistent grant into the list of free grants */ + for (i = 0; i < nseg; i++) { + if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { + /* + * If the grant is still mapped by the backend (the + * backend has chosen to make this grant persistent) + * we add it at the head of the list, so it will be + * reused first. + */ + if (!info->feature_persistent) + pr_alert_ratelimited("backed has not unmapped grant: %u\n", + s->grants_used[i]->gref); + list_add(&s->grants_used[i]->node, &info->grants); + info->persistent_gnts_c++; + } else { + /* + * If the grant is not mapped by the backend we end the + * foreign access and add it to the tail of the list, + * so it will not be picked again unless we run out of + * persistent grants. + */ + gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); + s->grants_used[i]->gref = GRANT_INVALID_REF; + list_add_tail(&s->grants_used[i]->node, &info->grants); + } + } + if (s->req.operation == BLKIF_OP_INDIRECT) { + for (i = 0; i < INDIRECT_GREFS(nseg); i++) { + if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { + if (!info->feature_persistent) + pr_alert_ratelimited("backed has not unmapped grant: %u\n", + s->indirect_grants[i]->gref); + list_add(&s->indirect_grants[i]->node, &info->grants); + info->persistent_gnts_c++; + } else { + struct page *indirect_page; + + gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL); + /* + * Add the used indirect page back to the list of + * available pages for indirect grefs. + */ + indirect_page = pfn_to_page(s->indirect_grants[i]->pfn); + list_add(&indirect_page->lru, &info->indirect_pages); + s->indirect_grants[i]->gref = GRANT_INVALID_REF; + list_add_tail(&s->indirect_grants[i]->node, &info->grants); + } + } + } } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -783,7 +1142,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) req = info->shadow[id].request; if (bret->operation != BLKIF_OP_DISCARD) - blkif_completion(&info->shadow[id]); + blkif_completion(&info->shadow[id], info, bret); if (add_id_to_freelist(info, id)) { WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", @@ -874,8 +1233,6 @@ static int setup_blkring(struct xenbus_device *dev, SHARED_RING_INIT(sring); FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); - sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); - err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); if (err < 0) { free_page((unsigned long)sring); @@ -888,9 +1245,8 @@ static int setup_blkring(struct xenbus_device *dev, if (err) goto fail; - err = bind_evtchn_to_irqhandler(info->evtchn, - blkif_interrupt, - IRQF_SAMPLE_RANDOM, "blkif", info); + err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, + "blkif", info); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); @@ -943,6 +1299,11 @@ again: message = "writing protocol"; goto abort_transaction; } + err = xenbus_printf(xbt, dev->nodename, + "feature-persistent", "%u", 1); + if (err) + dev_warn(&dev->dev, + "writing persistent grants feature to xenbus"); err = xenbus_transaction_end(xbt, 0); if (err) { @@ -995,7 +1356,7 @@ static int blkfront_probe(struct xenbus_device *dev, char *type; int len; /* no unplug has been done: do not hook devices != xen vbds */ - if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) { + if (xen_has_pv_and_legacy_disk_devices()) { int major; if (!VDEV_IS_EXTENDED(vdevice)) @@ -1030,6 +1391,9 @@ static int blkfront_probe(struct xenbus_device *dev, spin_lock_init(&info->io_lock); info->xbdev = dev; info->vdevice = vdevice; + INIT_LIST_HEAD(&info->grants); + INIT_LIST_HEAD(&info->indirect_pages); + info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); @@ -1051,20 +1415,39 @@ static int blkfront_probe(struct xenbus_device *dev, return 0; } +static void split_bio_end(struct bio *bio, int error) +{ + struct split_bio *split_bio = bio->bi_private; + + if (error) + split_bio->err = error; + + if (atomic_dec_and_test(&split_bio->pending)) { + split_bio->bio->bi_phys_segments = 0; + bio_endio(split_bio->bio, split_bio->err); + kfree(split_bio); + } + bio_put(bio); +} static int blkif_recover(struct blkfront_info *info) { int i; - struct blkif_request *req; + struct request *req, *n; struct blk_shadow *copy; - int j; + int rc; + struct bio *bio, *cloned_bio; + struct bio_list bio_list, merge_bio; + unsigned int segs, offset; + int pending, size; + struct split_bio *split_bio; + struct list_head requests; /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmalloc(sizeof(info->shadow), + copy = kmemdup(info->shadow, sizeof(info->shadow), GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); if (!copy) return -ENOMEM; - memcpy(copy, info->shadow, sizeof(info->shadow)); /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); @@ -1073,36 +1456,64 @@ static int blkif_recover(struct blkfront_info *info) info->shadow_free = info->ring.req_prod_pvt; info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; - /* Stage 3: Find pending requests and requeue them. */ + rc = blkfront_setup_indirect(info); + if (rc) { + kfree(copy); + return rc; + } + + segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; + blk_queue_max_segments(info->rq, segs); + bio_list_init(&bio_list); + INIT_LIST_HEAD(&requests); for (i = 0; i < BLK_RING_SIZE; i++) { /* Not in use? */ if (!copy[i].request) continue; - /* Grab a request slot and copy shadow state into it. */ - req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - *req = copy[i].req; - - /* We get a new request id, and must reset the shadow state. */ - req->u.rw.id = get_id_from_freelist(info); - memcpy(&info->shadow[req->u.rw.id], ©[i], sizeof(copy[i])); - - if (req->operation != BLKIF_OP_DISCARD) { - /* Rewrite any grant references invalidated by susp/resume. */ - for (j = 0; j < req->u.rw.nr_segments; j++) - gnttab_grant_foreign_access_ref( - req->u.rw.seg[j].gref, - info->xbdev->otherend_id, - pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]), - rq_data_dir(info->shadow[req->u.rw.id].request)); + /* + * Get the bios in the request so we can re-queue them. + */ + if (copy[i].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(©[i].request->queuelist, &requests); + continue; } - info->shadow[req->u.rw.id].req = *req; - - info->ring.req_prod_pvt++; + merge_bio.head = copy[i].request->bio; + merge_bio.tail = copy[i].request->biotail; + bio_list_merge(&bio_list, &merge_bio); + copy[i].request->bio = NULL; + blk_put_request(copy[i].request); } kfree(copy); + /* + * Empty the queue, this is important because we might have + * requests in the queue with more segments than what we + * can handle now. + */ + spin_lock_irq(&info->io_lock); + while ((req = blk_fetch_request(info->rq)) != NULL) { + if (req->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + list_add(&req->queuelist, &requests); + continue; + } + merge_bio.head = req->bio; + merge_bio.tail = req->biotail; + bio_list_merge(&bio_list, &merge_bio); + req->bio = NULL; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) + pr_alert("diskcache flush request found!\n"); + __blk_put_request(info->rq, req); + } + spin_unlock_irq(&info->io_lock); + xenbus_switch_state(info->xbdev, XenbusStateConnected); spin_lock_irq(&info->io_lock); @@ -1110,14 +1521,50 @@ static int blkif_recover(struct blkfront_info *info) /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; - /* Send off requeued requests */ - flush_requests(info); - /* Kick any other new requests queued since we resumed */ kick_pending_request_queues(info); + list_for_each_entry_safe(req, n, &requests, queuelist) { + /* Requeue pending requests (flush or discard) */ + list_del_init(&req->queuelist); + BUG_ON(req->nr_phys_segments > segs); + blk_requeue_request(info->rq, req); + } spin_unlock_irq(&info->io_lock); + while ((bio = bio_list_pop(&bio_list)) != NULL) { + /* Traverse the list of pending bios and re-queue them */ + if (bio_segments(bio) > segs) { + /* + * This bio has more segments than what we can + * handle, we have to split it. + */ + pending = (bio_segments(bio) + segs - 1) / segs; + split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO); + BUG_ON(split_bio == NULL); + atomic_set(&split_bio->pending, pending); + split_bio->bio = bio; + for (i = 0; i < pending; i++) { + offset = (i * segs * PAGE_SIZE) >> 9; + size = min((unsigned int)(segs * PAGE_SIZE) >> 9, + (unsigned int)bio_sectors(bio) - offset); + cloned_bio = bio_clone(bio, GFP_NOIO); + BUG_ON(cloned_bio == NULL); + bio_trim(cloned_bio, offset, size); + cloned_bio->bi_private = split_bio; + cloned_bio->bi_end_io = split_bio_end; + submit_bio(cloned_bio->bi_rw, cloned_bio); + } + /* + * Now we have to wait for all those smaller bios to + * end, so we can also end the "parent" bio. + */ + continue; + } + /* We don't need to split this bio */ + submit_bio(bio->bi_rw, bio); + } + return 0; } @@ -1137,8 +1584,12 @@ static int blkfront_resume(struct xenbus_device *dev) blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); err = talk_to_blkback(dev, info); - if (info->connected == BLKIF_STATE_SUSPENDED && !err) - err = blkif_recover(info); + + /* + * We have to wait for the backend to switch to + * connected state, since we want to read which + * features it supports. + */ return err; } @@ -1184,36 +1635,102 @@ blkfront_closing(struct blkfront_info *info) static void blkfront_setup_discard(struct blkfront_info *info) { int err; - char *type; unsigned int discard_granularity; unsigned int discard_alignment; unsigned int discard_secure; - type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL); - if (IS_ERR(type)) - return; + info->feature_discard = 1; + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-granularity", "%u", &discard_granularity, + "discard-alignment", "%u", &discard_alignment, + NULL); + if (!err) { + info->discard_granularity = discard_granularity; + info->discard_alignment = discard_alignment; + } + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-secure", "%d", &discard_secure, + NULL); + if (!err) + info->feature_secdiscard = !!discard_secure; +} - info->feature_secdiscard = 0; - if (strncmp(type, "phy", 3) == 0) { - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "discard-granularity", "%u", &discard_granularity, - "discard-alignment", "%u", &discard_alignment, - NULL); - if (!err) { - info->feature_discard = 1; - info->discard_granularity = discard_granularity; - info->discard_alignment = discard_alignment; - } - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "discard-secure", "%d", &discard_secure, +static int blkfront_setup_indirect(struct blkfront_info *info) +{ + unsigned int indirect_segments, segs; + int err, i; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-max-indirect-segments", "%u", &indirect_segments, NULL); - if (!err) - info->feature_secdiscard = discard_secure; + if (err) { + info->max_indirect_segments = 0; + segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; + } else { + info->max_indirect_segments = min(indirect_segments, + xen_blkif_max_segments); + segs = info->max_indirect_segments; + } + + err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE); + if (err) + goto out_of_memory; + + if (!info->feature_persistent && info->max_indirect_segments) { + /* + * We are using indirect descriptors but not persistent + * grants, we need to allocate a set of pages that can be + * used for mapping indirect grefs + */ + int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE; + + BUG_ON(!list_empty(&info->indirect_pages)); + for (i = 0; i < num; i++) { + struct page *indirect_page = alloc_page(GFP_NOIO); + if (!indirect_page) + goto out_of_memory; + list_add(&indirect_page->lru, &info->indirect_pages); + } + } + + for (i = 0; i < BLK_RING_SIZE; i++) { + info->shadow[i].grants_used = kzalloc( + sizeof(info->shadow[i].grants_used[0]) * segs, + GFP_NOIO); + info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO); + if (info->max_indirect_segments) + info->shadow[i].indirect_grants = kzalloc( + sizeof(info->shadow[i].indirect_grants[0]) * + INDIRECT_GREFS(segs), + GFP_NOIO); + if ((info->shadow[i].grants_used == NULL) || + (info->shadow[i].sg == NULL) || + (info->max_indirect_segments && + (info->shadow[i].indirect_grants == NULL))) + goto out_of_memory; + sg_init_table(info->shadow[i].sg, segs); + } - } else if (strncmp(type, "file", 4) == 0) - info->feature_discard = 1; - kfree(type); + return 0; + +out_of_memory: + for (i = 0; i < BLK_RING_SIZE; i++) { + kfree(info->shadow[i].grants_used); + info->shadow[i].grants_used = NULL; + kfree(info->shadow[i].sg); + info->shadow[i].sg = NULL; + kfree(info->shadow[i].indirect_grants); + info->shadow[i].indirect_grants = NULL; + } + if (!list_empty(&info->indirect_pages)) { + struct page *indirect_page, *n; + list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_del(&indirect_page->lru); + __free_page(indirect_page); + } + } + return -ENOMEM; } /* @@ -1224,9 +1741,10 @@ static void blkfront_connect(struct blkfront_info *info) { unsigned long long sectors; unsigned long sector_size; + unsigned int physical_sector_size; unsigned int binfo; int err; - int barrier, flush, discard; + int barrier, flush, discard, persistent; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -1243,8 +1761,15 @@ static void blkfront_connect(struct blkfront_info *info) set_capacity(info->gd, sectors); revalidate_disk(info->gd); - /* fall through */ + return; case BLKIF_STATE_SUSPENDED: + /* + * If we are recovering from suspension, we need to wait + * for the backend to announce it's features before + * reconnecting, at least we need to know if the backend + * supports indirect descriptors, and how many. + */ + blkif_recover(info); return; default: @@ -1266,6 +1791,16 @@ static void blkfront_connect(struct blkfront_info *info) return; } + /* + * physcial-sector-size is a newer field, so old backends may not + * provide this. Assume physical sector size to be the same as + * sector_size in that case. + */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "physical-sector-size", "%u", &physical_sector_size); + if (err != 1) + physical_sector_size = sector_size; + info->feature_flush = 0; info->flush_op = 0; @@ -1304,7 +1839,23 @@ static void blkfront_connect(struct blkfront_info *info) if (!err && discard) blkfront_setup_discard(info); - err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-persistent", "%u", &persistent, + NULL); + if (err) + info->feature_persistent = 0; + else + info->feature_persistent = persistent; + + err = blkfront_setup_indirect(info); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", + info->xbdev->otherend); + return; + } + + err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, + physical_sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", info->xbdev->otherend); @@ -1341,13 +1892,16 @@ static void blkback_changed(struct xenbus_device *dev, case XenbusStateReconfiguring: case XenbusStateReconfigured: case XenbusStateUnknown: - case XenbusStateClosed: break; case XenbusStateConnected: blkfront_connect(info); break; + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + /* Missed the backend's Closing state -- fallthrough */ case XenbusStateClosing: blkfront_closing(info); break; @@ -1438,7 +1992,7 @@ out: return err; } -static int blkif_release(struct gendisk *disk, fmode_t mode) +static void blkif_release(struct gendisk *disk, fmode_t mode) { struct blkfront_info *info = disk->private_data; struct block_device *bdev; @@ -1448,6 +2002,10 @@ static int blkif_release(struct gendisk *disk, fmode_t mode) bdev = bdget_disk(disk, 0); + if (!bdev) { + WARN(1, "Block device %s yanked out from us!\n", disk->disk_name); + goto out_mutex; + } if (bdev->bd_openers) goto out; @@ -1478,8 +2036,8 @@ static int blkif_release(struct gendisk *disk, fmode_t mode) out: bdput(bdev); +out_mutex: mutex_unlock(&blkfront_mutex); - return 0; } static const struct block_device_operations xlvbd_block_fops = @@ -1512,7 +2070,7 @@ static int __init xlblk_init(void) if (!xen_domain()) return -ENODEV; - if (xen_hvm_domain() && !xen_platform_pci_unplug) + if (!xen_has_pv_disk_devices()) return -ENODEV; if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { |
