diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-27 20:02:07 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-27 20:02:07 -0700 |
commit | 8d49a77568d1105ff3e64aec484dac059f54824e (patch) | |
tree | 633ee954a3cea97bf136dec933388a2e419e5dac | |
parent | 93567c43eb2a4771b9c590435928f9b3a428e568 (diff) | |
parent | 1ddd5049545e0aa1a0ed19bca4d9c9c3ce1ac8a2 (diff) |
Merge branch 'for-2.6.39/drivers' of git://git.kernel.dk/linux-2.6-block
* 'for-2.6.39/drivers' of git://git.kernel.dk/linux-2.6-block: (122 commits)
cciss: fix lost command issue
drbd: need include for bitops functions declarations
Revert "cciss: Add missing allocation in scsi_cmd_stack_setup and corresponding deallocation"
cciss: fix missed command status value CMD_UNABORTABLE
cciss: remove unnecessary casts
cciss: Mask off error bits of c->busaddr in cmd_special_free when calling pci_free_consistent
cciss: Inform controller we are using 32-bit tags.
cciss: hoist tag masking out of loop
cciss: Add missing allocation in scsi_cmd_stack_setup and corresponding deallocation
cciss: export resettable host attribute
drbd: drop code present under #ifdef which is relevant to 2.6.28 and below
drbd: Fixed handling of read errors on a 'VerifyS' node
drbd: Fixed handling of read errors on a 'VerifyT' node
drbd: Implemented real timeout checking for request processing time
drbd: Remove unused function atodb_endio()
drbd: improve log message if received sector offset exceeds local capacity
drbd: kill dead code
drbd: don't BUG_ON, if bio_add_page of a single page to an empty bio fails
drbd: Removed left over, now wrong comments
drbd: serialize admin requests for new verify run with pending bitmap io
...
-rw-r--r-- | Documentation/ABI/testing/sysfs-bus-pci-devices-cciss | 12 | ||||
-rw-r--r-- | drivers/block/cciss.c | 86 | ||||
-rw-r--r-- | drivers/block/cciss.h | 1 | ||||
-rw-r--r-- | drivers/block/cciss_cmd.h | 1 | ||||
-rw-r--r-- | drivers/block/cciss_scsi.c | 13 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 335 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 752 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 270 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 673 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 183 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 114 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 608 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 169 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.h | 36 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_strings.c | 6 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 360 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_wrappers.h | 2 | ||||
-rw-r--r-- | include/linux/drbd.h | 23 | ||||
-rw-r--r-- | include/linux/drbd_limits.h | 12 | ||||
-rw-r--r-- | include/linux/drbd_nl.h | 13 | ||||
-rw-r--r-- | include/linux/drbd_tag_magic.h | 1 |
21 files changed, 2267 insertions, 1403 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss index 4f29e5f1ebf..f5bb0a3bb8c 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss +++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss @@ -59,3 +59,15 @@ Kernel Version: 2.6.31 Contact: iss_storagedev@hp.com Description: Displays the usage count (number of opens) of logical drive Y of controller X. + +Where: /sys/bus/pci/devices/<dev>/ccissX/resettable +Date: February 2011 +Kernel Version: 2.6.38 +Contact: iss_storagedev@hp.com +Description: Value of 1 indicates the controller can honor the reset_devices + kernel parameter. Value of 0 indicates reset_devices cannot be + honored. This is to allow, for example, kexec tools to be able + to warn the user if they designate an unresettable device as + a dump device, as kdump requires resetting the device in order + to work reliably. + diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 35658f445fc..9bf13988f1a 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -193,7 +193,7 @@ static int __devinit cciss_find_cfg_addrs(struct pci_dev *pdev, u64 *cfg_offset); static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev, unsigned long *memory_bar); - +static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag); /* performant mode helper functions */ static void calc_bucket_map(int *bucket, int num_buckets, int nsgs, @@ -231,7 +231,7 @@ static const struct block_device_operations cciss_fops = { */ static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c) { - if (likely(h->transMethod == CFGTBL_Trans_Performant)) + if (likely(h->transMethod & CFGTBL_Trans_Performant)) c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1); } @@ -556,6 +556,44 @@ static void __devinit cciss_procinit(ctlr_info_t *h) #define to_hba(n) container_of(n, struct ctlr_info, dev) #define to_drv(n) container_of(n, drive_info_struct, dev) +/* List of controllers which cannot be reset on kexec with reset_devices */ +static u32 unresettable_controller[] = { + 0x324a103C, /* Smart Array P712m */ + 0x324b103C, /* SmartArray P711m */ + 0x3223103C, /* Smart Array P800 */ + 0x3234103C, /* Smart Array P400 */ + 0x3235103C, /* Smart Array P400i */ + 0x3211103C, /* Smart Array E200i */ + 0x3212103C, /* Smart Array E200 */ + 0x3213103C, /* Smart Array E200i */ + 0x3214103C, /* Smart Array E200i */ + 0x3215103C, /* Smart Array E200i */ + 0x3237103C, /* Smart Array E500 */ + 0x323D103C, /* Smart Array P700m */ + 0x409C0E11, /* Smart Array 6400 */ + 0x409D0E11, /* Smart Array 6400 EM */ +}; + +static int ctlr_is_resettable(struct ctlr_info *h) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++) + if (unresettable_controller[i] == h->board_id) + return 0; + return 1; +} + +static ssize_t host_show_resettable(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ctlr_info *h = to_hba(dev); + + return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h)); +} +static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL); + static ssize_t host_store_rescan(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -741,6 +779,7 @@ static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL); static struct attribute *cciss_host_attrs[] = { &dev_attr_rescan.attr, + &dev_attr_resettable.attr, NULL }; @@ -973,8 +1012,8 @@ static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c) temp64.val32.upper = c->ErrDesc.Addr.upper; pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct), c->err_info, (dma_addr_t) temp64.val); - pci_free_consistent(h->pdev, sizeof(CommandList_struct), - c, (dma_addr_t) c->busaddr); + pci_free_consistent(h->pdev, sizeof(CommandList_struct), c, + (dma_addr_t) cciss_tag_discard_error_bits(h, (u32) c->busaddr)); } static inline ctlr_info_t *get_host(struct gendisk *disk) @@ -1490,8 +1529,7 @@ static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp) return -EINVAL; if (!capable(CAP_SYS_RAWIO)) return -EPERM; - ioc = (BIG_IOCTL_Command_struct *) - kmalloc(sizeof(*ioc), GFP_KERNEL); + ioc = kmalloc(sizeof(*ioc), GFP_KERNEL); if (!ioc) { status = -ENOMEM; goto cleanup1; @@ -2653,6 +2691,10 @@ static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c) c->Request.CDB[0]); return_status = IO_NEEDS_RETRY; break; + case CMD_UNABORTABLE: + dev_warn(&h->pdev->dev, "cmd unabortable\n"); + return_status = IO_ERROR; + break; default: dev_warn(&h->pdev->dev, "cmd 0x%02x returned " "unknown status %x\n", c->Request.CDB[0], @@ -3103,6 +3145,13 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? DID_PASSTHROUGH : DID_ERROR); break; + case CMD_UNABORTABLE: + dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC ? + DID_PASSTHROUGH : DID_ERROR); + break; default: dev_warn(&h->pdev->dev, "cmd %p returned " "unknown status %x\n", cmd, @@ -3136,10 +3185,13 @@ static inline u32 cciss_tag_to_index(u32 tag) return tag >> DIRECT_LOOKUP_SHIFT; } -static inline u32 cciss_tag_discard_error_bits(u32 tag) +static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag) { -#define CCISS_ERROR_BITS 0x03 - return tag & ~CCISS_ERROR_BITS; +#define CCISS_PERF_ERROR_BITS ((1 << DIRECT_LOOKUP_SHIFT) - 1) +#define CCISS_SIMPLE_ERROR_BITS 0x03 + if (likely(h->transMethod & CFGTBL_Trans_Performant)) + return tag & ~CCISS_PERF_ERROR_BITS; + return tag & ~CCISS_SIMPLE_ERROR_BITS; } static inline void cciss_mark_tag_indexed(u32 *tag) @@ -3359,7 +3411,7 @@ static inline u32 next_command(ctlr_info_t *h) { u32 a; - if (unlikely(h->transMethod != CFGTBL_Trans_Performant)) + if (unlikely(!(h->transMethod & CFGTBL_Trans_Performant))) return h->access.command_completed(h); if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) { @@ -3394,14 +3446,12 @@ static inline u32 process_indexed_cmd(ctlr_info_t *h, u32 raw_tag) /* process completion of a non-indexed command */ static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag) { - u32 tag; CommandList_struct *c = NULL; __u32 busaddr_masked, tag_masked; - tag = cciss_tag_discard_error_bits(raw_tag); + tag_masked = cciss_tag_discard_error_bits(h, raw_tag); list_for_each_entry(c, &h->cmpQ, list) { - busaddr_masked = cciss_tag_discard_error_bits(c->busaddr); - tag_masked = cciss_tag_discard_error_bits(tag); + busaddr_masked = cciss_tag_discard_error_bits(h, c->busaddr); if (busaddr_masked == tag_masked) { finish_cmd(h, c, raw_tag); return next_command(h); @@ -3753,7 +3803,8 @@ static void __devinit cciss_wait_for_mode_change_ack(ctlr_info_t *h) } } -static __devinit void cciss_enter_performant_mode(ctlr_info_t *h) +static __devinit void cciss_enter_performant_mode(ctlr_info_t *h, + u32 use_short_tags) { /* This is a bit complicated. There are 8 registers on * the controller which we write to to tell it 8 different @@ -3808,7 +3859,7 @@ static __devinit void cciss_enter_performant_mode(ctlr_info_t *h) writel(0, &h->transtable->RepQCtrAddrHigh32); writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32); writel(0, &h->transtable->RepQAddr0High32); - writel(CFGTBL_Trans_Performant, + writel(CFGTBL_Trans_Performant | use_short_tags, &(h->cfgtable->HostWrite.TransportRequest)); writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); @@ -3855,7 +3906,8 @@ static void __devinit cciss_put_controller_into_performant_mode(ctlr_info_t *h) if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL)) goto clean_up; - cciss_enter_performant_mode(h); + cciss_enter_performant_mode(h, + trans_support & CFGTBL_Trans_use_short_tags); /* Change the access methods to the performant access methods */ h->access = SA5_performant_access; diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 579f7491849..554bbd907d1 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -222,6 +222,7 @@ static void SA5_submit_command( ctlr_info_t *h, CommandList_struct *c) h->ctlr, c->busaddr); #endif /* CCISS_DEBUG */ writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET); + readl(h->vaddr + SA5_REQUEST_PORT_OFFSET); h->commands_outstanding++; if ( h->commands_outstanding > h->max_outstanding) h->max_outstanding = h->commands_outstanding; diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h index 35463d2f0ee..cd441bef031 100644 --- a/drivers/block/cciss_cmd.h +++ b/drivers/block/cciss_cmd.h @@ -56,6 +56,7 @@ #define CFGTBL_Trans_Simple 0x00000002l #define CFGTBL_Trans_Performant 0x00000004l +#define CFGTBL_Trans_use_short_tags 0x20000000l #define CFGTBL_BusType_Ultra2 0x00000001l #define CFGTBL_BusType_Ultra3 0x00000002l diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 727d0225b7d..df793803f5a 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -824,13 +824,18 @@ static void complete_scsi_command(CommandList_struct *c, int timeout, break; case CMD_UNSOLICITED_ABORT: cmd->result = DID_ABORT << 16; - dev_warn(&h->pdev->dev, "%p aborted do to an " + dev_warn(&h->pdev->dev, "%p aborted due to an " "unsolicited abort\n", c); break; case CMD_TIMEOUT: cmd->result = DID_TIME_OUT << 16; dev_warn(&h->pdev->dev, "%p timedout\n", c); break; + case CMD_UNABORTABLE: + cmd->result = DID_ERROR << 16; + dev_warn(&h->pdev->dev, "c %p command " + "unabortable\n", c); + break; default: cmd->result = DID_ERROR << 16; dev_warn(&h->pdev->dev, @@ -1007,11 +1012,15 @@ cciss_scsi_interpret_error(ctlr_info_t *h, CommandList_struct *c) break; case CMD_UNSOLICITED_ABORT: dev_warn(&h->pdev->dev, - "%p aborted do to an unsolicited abort\n", c); + "%p aborted due to an unsolicited abort\n", c); break; case CMD_TIMEOUT: dev_warn(&h->pdev->dev, "%p timedout\n", c); break; + case CMD_UNABORTABLE: + dev_warn(&h->pdev->dev, + "%p unabortable\n", c); + break; default: dev_warn(&h->pdev->dev, "%p returned unknown status %x\n", diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index aca302492ff..2a1642bc451 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -92,7 +92,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; - if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) + if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); else submit_bio(rw, bio); @@ -176,13 +176,17 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) struct lc_element *al_ext; struct lc_element *tmp; unsigned long al_flags = 0; + int wake; spin_lock_irq(&mdev->al_lock); tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); if (unlikely(tmp != NULL)) { struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { + wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); spin_unlock_irq(&mdev->al_lock); + if (wake) + wake_up(&mdev->al_wait); return NULL; } } @@ -258,6 +262,33 @@ void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) spin_unlock_irqrestore(&mdev->al_lock, flags); } +#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) +/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT + * are still coupled, or assume too much about their relation. + * Code below will not work if this is violated. + * Will be cleaned up with some followup patch. + */ +# error FIXME +#endif + +static unsigned int al_extent_to_bm_page(unsigned int al_enr) +{ + return al_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); +} + +static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) +{ + return rs_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); +} + int w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) { @@ -285,7 +316,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) * For now, we must not write the transaction, * if we cannot write out the bitmap of the evicted extent. */ if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) - drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); + drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted)); /* The bitmap write may have failed, causing a state change. */ if (mdev->state.disk < D_INCONSISTENT) { @@ -334,7 +365,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) + mdev->ldev->md.al_offset + mdev->al_tr_pos; if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) - drbd_chk_io_error(mdev, 1, TRUE); + drbd_chk_io_error(mdev, 1, true); if (++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) @@ -511,225 +542,6 @@ cancel: return 1; } -static void atodb_endio(struct bio *bio, int error) -{ - struct drbd_atodb_wait *wc = bio->bi_private; - struct drbd_conf *mdev = wc->mdev; - struct page *page; - int uptodate = bio_flagged(bio, BIO_UPTODATE); - - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! */ - if (!error && !uptodate) - error = -EIO; - - drbd_chk_io_error(mdev, error, TRUE); - if (error && wc->error == 0) - wc->error = error; - - if (atomic_dec_and_test(&wc->count)) - complete(&wc->io_done); - - page = bio->bi_io_vec[0].bv_page; - put_page(page); - bio_put(bio); - mdev->bm_writ_cnt++; - put_ldev(mdev); -} - -/* sector to word */ -#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) - -/* activity log to on disk bitmap -- prepare bio unless that sector - * is already covered by previously prepared bios */ -static int atodb_prepare_unless_covered(struct drbd_conf *mdev, - struct bio **bios, - unsigned int enr, - struct drbd_atodb_wait *wc) __must_hold(local) -{ - struct bio *bio; - struct page *page; - sector_t on_disk_sector; - unsigned int page_offset = PAGE_SIZE; - int offset; - int i = 0; - int err = -ENOMEM; - - /* We always write aligned, full 4k blocks, - * so we can ignore the logical_block_size (for now) */ - enr &= ~7U; - on_disk_sector = enr + mdev->ldev->md.md_offset - + mdev->ldev->md.bm_offset; - - D_ASSERT(!(on_disk_sector & 7U)); - - /* Check if that enr is already covered by an already created bio. - * Caution, bios[] is not NULL terminated, - * but only initialized to all NULL. - * For completely scattered activity log, - * the last invocation iterates over all bios, - * and finds the last NULL entry. - */ - while ((bio = bios[i])) { - if (bio->bi_sector == on_disk_sector) - return 0; - i++; - } - /* bios[i] == NULL, the next not yet used slot */ - - /* GFP_KERNEL, we are not in the write-out path */ - bio = bio_alloc(GFP_KERNEL, 1); - if (bio == NULL) - return -ENOMEM; - - if (i > 0) { - const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec; - page_offset = prev_bv->bv_offset + prev_bv->bv_len; - page = prev_bv->bv_page; - } - if (page_offset == PAGE_SIZE) { - page = alloc_page(__GFP_HIGHMEM); - if (page == NULL) - goto out_bio_put; - page_offset = 0; - } else { - get_page(page); - } - - offset = S2W(enr); - drbd_bm_get_lel(mdev, offset, - min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset), - kmap(page) + page_offset); - kunmap(page); - - bio->bi_private = wc; - bio->bi_end_io = atodb_endio; - bio->bi_bdev = mdev->ldev->md_bdev; - bio->bi_sector = on_disk_sector; - - if (bio_add_page(bio, page, 4096, page_offset) != 4096) - goto out_put_page; - - atomic_inc(&wc->count); - /* we already know that we may do this... - * get_ldev_if_state(mdev,D_ATTACHING); - * just get the extra reference, so that the local_cnt reflects - * the number of pending IO requests DRBD at its backing device. - */ - atomic_inc(&mdev->local_cnt); - - bios[i] = bio; - - return 0; - -out_put_page: - err = -EINVAL; - put_page(page); -out_bio_put: - bio_put(bio); - return err; -} - -/** - * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents - * @mdev: DRBD device. - * - * Called when we detach (unconfigure) local storage, - * or when we go from R_PRIMARY to R_SECONDARY role. - */ -void drbd_al_to_on_disk_bm(struct drbd_conf *mdev) -{ - int i, nr_elements; - unsigned int enr; - struct bio **bios; - struct drbd_atodb_wait wc; - - ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING)) - return; /* sorry, I don't have any act_log etc... */ - - wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); - - nr_elements = mdev->act_log->nr_elements; - - /* GFP_KERNEL, we are not in anyone's write-out path */ - bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL); - if (!bios) - goto submit_one_by_one; - - atomic_set(&wc.count, 0); - init_completion(&wc.io_done); - wc.mdev = mdev; - wc.error = 0; - - for (i = 0; i < nr_elements; i++) { - enr = lc_element_by_index(mdev->act_log, i)->lc_number; - if (enr == LC_FREE) - continue; - /* next statement also does atomic_inc wc.count and local_cnt */ - if (atodb_prepare_unless_covered(mdev, bios, - enr/AL_EXT_PER_BM_SECT, - &wc)) - goto free_bios_submit_one_by_one; - } - - /* unnecessary optimization? */ - lc_unlock(mdev->act_log); - wake_up(&mdev->al_wait); - - /* all prepared, submit them */ - for (i = 0; i < nr_elements; i++) { - if (bios[i] == NULL) - break; - if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) { - bios[i]->bi_rw = WRITE; - bio_endio(bios[i], -EIO); - } else { - submit_bio(WRITE, bios[i]); - } - } - - /* always (try to) flush bitmap to stable storage */ - drbd_md_flush(mdev); - - /* In case we did not submit a single IO do not wait for - * them to complete. ( Because we would wait forever here. ) - * - * In case we had IOs and they are already complete, there - * is not point in waiting anyways. - * Therefore this if () ... */ - if (atomic_read(&wc.count)) - wait_for_completion(&wc.io_done); - - put_ldev(mdev); - - kfree(bios); - return; - - free_bios_submit_one_by_one: - /* free everything by calling the endio callback directly. */ - for (i = 0; i < nr_elements && bios[i]; i++) - bio_endio(bios[i], 0); - - kfree(bios); - - submit_one_by_one: - dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n"); - - for (i = 0; i < mdev->act_log->nr_elements; i++) { - enr = lc_element_by_index(mdev->act_log, i)->lc_number; - if (enr == LC_FREE) - continue; - /* Really slow: if we have al-extents 16..19 active, - * sector 4 will be written four times! Synchronous! */ - drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT); - } - - lc_unlock(mdev->act_log); - wake_up(&mdev->al_wait); - put_ldev(mdev); -} - /** * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents * @mdev: DRBD device. @@ -809,7 +621,7 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused return 1; } - drbd_bm_write_sect(mdev, udw->enr); + drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr)); put_ldev(mdev); kfree(udw); @@ -889,7 +701,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, dev_warn(DEV, "Kicking resync_lru element enr=%u " "out with rs_failed=%d\n", ext->lce.lc_number, ext->rs_failed); - set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); } ext->rs_left = rs_left; ext->rs_failed = success ? 0 : count; @@ -908,7 +719,6 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, drbd_queue_work_front(&mdev->data.work, &udw->w); } else { dev_warn(DEV, "Could not kmalloc an udw\n"); - set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); } } } else { @@ -919,6 +729,22 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, } } +void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go) +{ + unsigned long now = jiffies; + unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark]; + int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS; + if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { + if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go && + mdev->state.conn != C_PAUSED_SYNC_T && + mdev->state.conn != C_PAUSED_SYNC_S) { + mdev->rs_mark_time[next] = now; + mdev->rs_mark_left[next] = still_to_go; + mdev->rs_last_mark = next; + } + } +} + /* clear the bit corresponding to the piece of storage in question: * size byte of data starting from sector. Only clear a bits of the affected * one ore more _aligned_ BM_BLOCK_SIZE blocks. @@ -936,7 +762,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, int wake_up = 0; unsigned long flags; - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; @@ -969,21 +795,9 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, */ count = drbd_bm_clear_bits(mdev, sbnr, ebnr); if (count && get_ldev(mdev)) { - unsigned long now = jiffies; - unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark]; - int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS; - if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { - unsigned long tw = drbd_bm_total_weight(mdev); - if (mdev->rs_mark_left[mdev->rs_last_mark] != tw && - mdev->state.conn != C_PAUSED_SYNC_T && - mdev->state.conn != C_PAUSED_SYNC_S) { - mdev->rs_mark_time[next] = now; - mdev->rs_mark_left[next] = tw; - mdev->rs_last_mark = next; - } - } + drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev)); spin_lock_irqsave(&mdev->al_lock, flags); - drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); + drbd_try_clear_on_disk_bm(mdev, sector, count, true); spin_unlock_irqrestore(&mdev->al_lock, flags); /* just wake_up unconditional now, various lc_chaged(), @@ -998,27 +812,27 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, /* * this is intended to set one request worth of data out of sync. * affects at least 1 bit, - * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. + * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits. * * called by tl_clear and drbd_send_dblock (==drbd_make_request). * so this can be _any_ process. */ -void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, +int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, const char *file, const unsigned int line) { unsigned long sbnr, ebnr, lbnr, flags; sector_t esector, nr_sectors; - unsigned int enr, count; + unsigned int enr, count = 0; struct lc_element *e; - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "sector: %llus, size: %d\n", (unsigned long long)sector, size); - return; + return 0; } if (!get_ldev(mdev)) - return; /* no disk, no metadata, no bitmap to set bits in */ + return 0; /* no disk, no metadata, no bitmap to set bits in */ nr_sectors = drbd_get_capacity(mdev->this_bdev); esector = sector + (size >> 9) - 1; @@ -1048,6 +862,8 @@ void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, out: put_ldev(mdev); + + return count; } static @@ -1128,7 +944,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) unsigned int enr = BM_SECT_TO_EXT(sector); struct bm_extent *bm_ext; int i, sig; + int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait. + 200 times -> 20 seconds. */ +retry: sig = wait_event_interruptible(mdev->al_wait, (bm_ext = _bme_get(mdev, enr))); if (sig) @@ -1139,16 +958,25 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { sig = wait_event_interruptible(mdev->al_wait, - !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i)); - if (sig) { + !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) || + test_bit(BME_PRIORITY, &bm_ext->flags)); + + if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) { spin_lock_irq(&mdev->al_lock); if (lc_put(mdev->resync, &bm_ext->lce) == 0) { - clear_bit(BME_NO_WRITES, &bm_ext->flags); + bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ mdev->resync_locked--; wake_up(&mdev->al_wait); } spin_unlock_irq(&mdev->al_lock); - return -EINTR; + if (sig) + return -EINTR; + if (schedule_timeout_interruptible(HZ/10)) + return -EINTR; + if (sa && --sa == 0) + dev_warn(DEV,"drbd_rs_begin_io() stepped aside for 20sec." + "Resync stalled?\n"); + goto retry; } } set_bit(BME_LOCKED, &bm_ext->flags); @@ -1291,8 +1119,7 @@ void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector) } if (lc_put(mdev->resync, &bm_ext->lce) == 0) { - clear_bit(BME_LOCKED, &bm_ext->flags); - clear_bit(BME_NO_WRITES, &bm_ext->flags); + bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */ mdev->resync_locked--; wake_up(&mdev->al_wait); } @@ -1383,7 +1210,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) sector_t esector, nr_sectors; int wake_up = 0; - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); return; @@ -1420,7 +1247,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) mdev->rs_failed += count; if (get_ldev(mdev)) { - drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE); + drbd_try_clear_on_disk_bm(mdev, sector, count, false); put_ldev(mdev); } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 0645ca829a9..f0ae63d2df6 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -28,18 +28,58 @@ #include <linux/drbd.h> #include <linux/slab.h> #include <asm/kmap_types.h> + +#include <asm-generic/bitops/le.h> + #include "drbd_int.h" + /* OPAQUE outside this file! * interface defined in drbd_int.h * convention: * function name drbd_bm_... => used elsewhere, "public". * function name bm_... => internal to implementation, "private". + */ + + +/* + * LIMITATIONS: + * We want to support >= peta byte of backend storage, while for now still using + * a granularity of one bit per 4KiB of storage. + * 1 << 50 bytes backend storage (1 PiB) + * 1 << (50 - 12) bits needed + * 38 --> we need u64 to index and count bits + * 1 << (38 - 3) bitmap bytes needed + * 35 --> we still need u64 to index and count bytes + * (that's 32 GiB of bitmap for 1 PiB storage) + * 1 << (35 - 2) 32bit longs needed + * 33 --> we'd even need u64 to index and count 32bit long words. + * 1 << (35 - 3) 64bit longs needed + * 32 --> we could get away with a 32bit unsigned int to index and count + * 64bit long words, but I rather stay with unsigned long for now. + * We probably should neither count nor point to bytes or long words + * directly, but either by bitnumber, or by page index and offset. + * 1 << (35 - 12) + * 22 --> we need that much 4KiB pages of bitmap. + * 1 << (22 + 3) --> on a 64bit arch, + * we need 32 MiB to store the array of page pointers. + * + * Because I'm lazy, and because the resulting patch was too large, too ugly + * and still incomplete, on 32bit we still "only" support 16 TiB (minus some), + * (1 << 32) bits * 4k storage. + * - * Note that since find_first_bit returns int, at the current granularity of - * the bitmap (4KB per byte), this implementation "only" supports up to - * 1<<(32+12) == 16 TB... + * bitmap storage and IO: + * Bitmap is stored little endian on disk, and is kept little endian in + * core memory. Currently we still hold the full bitmap in core as long + * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage + * seems excessive. + * + * We plan to reduce the amount of in-core bitmap pages by pageing them in + * and out against their on-disk location as necessary, but need to make + * sure we don't cause too much meta data IO, and must not deadlock in + * tight memory situations. This needs some |