aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-06-04 15:37:44 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2010-06-04 15:37:44 -0700
commitd2dd328b7f7bc6cebe167648289337755944ad2a (patch)
tree5d664a2db1ac209f7537452ddc02597972f7aa37
parentc1518f12bab97a6d409a25aaccb02dc8895800f3 (diff)
parent1abec4fdbb142e3ccb6ce99832fae42129134a96 (diff)
Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block
* 'for-linus' of git://git.kernel.dk/linux-2.6-block: (27 commits) block: make blk_init_free_list and elevator_init idempotent block: avoid unconditionally freeing previously allocated request_queue pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface pipe: change the privilege required for growing a pipe beyond system max pipe: adjust minimum pipe size to 1 page block: disable preemption before using sched_clock() cciss: call BUG() earlier Preparing 8.3.8rc2 drbd: Reduce verbosity drbd: use drbd specific ratelimit instead of global printk_ratelimit drbd: fix hang on local read errors while disconnected drbd: Removed the now empty w_io_error() function drbd: removed duplicated #includes drbd: improve usage of MSG_MORE drbd: need to set socket bufsize early to take effect drbd: improve network latency, TCP_QUICKACK drbd: Revert "drbd: Create new current UUID as late as possible" brd: support discard Revert "writeback: fix WB_SYNC_NONE writeback from umount" Revert "writeback: ensure that WB_SYNC_NONE writeback with sb pinned is sync" ...
-rw-r--r--block/blk-core.c20
-rw-r--r--block/cfq-iosched.c101
-rw-r--r--block/elevator.c8
-rw-r--r--drivers/block/brd.c53
-rw-r--r--drivers/block/cciss_scsi.c2
-rw-r--r--drivers/block/drbd/drbd_int.h14
-rw-r--r--drivers/block/drbd/drbd_main.c68
-rw-r--r--drivers/block/drbd/drbd_receiver.c45
-rw-r--r--drivers/block/drbd/drbd_req.c54
-rw-r--r--drivers/block/drbd/drbd_req.h1
-rw-r--r--drivers/block/drbd/drbd_worker.c24
-rw-r--r--fs/fs-writeback.c64
-rw-r--r--fs/pipe.c77
-rw-r--r--fs/splice.c2
-rw-r--r--fs/sync.c2
-rw-r--r--include/linux/backing-dev.h2
-rw-r--r--include/linux/blkdev.h9
-rw-r--r--include/linux/drbd.h2
-rw-r--r--include/linux/iocontext.h1
-rw-r--r--include/linux/pipe_fs_i.h4
-rw-r--r--include/linux/writeback.h10
-rw-r--r--kernel/sysctl.c8
-rw-r--r--mm/page-writeback.c4
23 files changed, 311 insertions, 264 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 3bc5579d6f5..f84cce42fc5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -467,6 +467,9 @@ static int blk_init_free_list(struct request_queue *q)
{
struct request_list *rl = &q->rq;
+ if (unlikely(rl->rq_pool))
+ return 0;
+
rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
rl->elvpriv = 0;
@@ -570,9 +573,17 @@ EXPORT_SYMBOL(blk_init_queue);
struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
- struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+ struct request_queue *uninit_q, *q;
+
+ uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+ if (!uninit_q)
+ return NULL;
+
+ q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
+ if (!q)
+ blk_cleanup_queue(uninit_q);
- return blk_init_allocated_queue_node(q, rfn, lock, node_id);
+ return q;
}
EXPORT_SYMBOL(blk_init_queue_node);
@@ -592,10 +603,8 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
return NULL;
q->node = node_id;
- if (blk_init_free_list(q)) {
- kmem_cache_free(blk_requestq_cachep, q);
+ if (blk_init_free_list(q))
return NULL;
- }
q->request_fn = rfn;
q->prep_rq_fn = NULL;
@@ -618,7 +627,6 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
return q;
}
- blk_put_queue(q);
return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue_node);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ed897b5ef31..5ff4f4850e7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -64,6 +64,9 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);
+static DEFINE_SPINLOCK(cic_index_lock);
+static DEFINE_IDA(cic_index_ida);
+
#define CFQ_PRIO_LISTS IOPRIO_BE_NR
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
@@ -271,6 +274,7 @@ struct cfq_data {
unsigned int cfq_latency;
unsigned int cfq_group_isolation;
+ unsigned int cic_index;
struct list_head cic_list;
/*
@@ -430,6 +434,24 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic,
cic->cfqq[is_sync] = cfqq;
}
+#define CIC_DEAD_KEY 1ul
+#define CIC_DEAD_INDEX_SHIFT 1
+
+static inline void *cfqd_dead_key(struct cfq_data *cfqd)
+{
+ return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
+}
+
+static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
+{
+ struct cfq_data *cfqd = cic->key;
+
+ if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
+ return NULL;
+
+ return cfqd;
+}
+
/*
* We regard a request as SYNC, if it's either a read or has the SYNC bit
* set (in which case it could also be direct WRITE).
@@ -2510,11 +2532,12 @@ static void cfq_cic_free(struct cfq_io_context *cic)
static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
{
unsigned long flags;
+ unsigned long dead_key = (unsigned long) cic->key;
- BUG_ON(!cic->dead_key);
+ BUG_ON(!(dead_key & CIC_DEAD_KEY));
spin_lock_irqsave(&ioc->lock, flags);
- radix_tree_delete(&ioc->radix_root, cic->dead_key);
+ radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
hlist_del_rcu(&cic->cic_list);
spin_unlock_irqrestore(&ioc->lock, flags);
@@ -2537,15 +2560,10 @@ static void cfq_free_io_context(struct io_context *ioc)
__call_for_each_cic(ioc, cic_free_func);
}
-static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+static void cfq_put_cooperator(struct cfq_queue *cfqq)
{
struct cfq_queue *__cfqq, *next;
- if (unlikely(cfqq == cfqd->active_queue)) {
- __cfq_slice_expired(cfqd, cfqq, 0);
- cfq_schedule_dispatch(cfqd);
- }
-
/*
* If this queue was scheduled to merge with another queue, be
* sure to drop the reference taken on that queue (and others in
@@ -2561,6 +2579,16 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
cfq_put_queue(__cfqq);
__cfqq = next;
}
+}
+
+static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+ if (unlikely(cfqq == cfqd->active_queue)) {
+ __cfq_slice_expired(cfqd, cfqq, 0);
+ cfq_schedule_dispatch(cfqd);
+ }
+
+ cfq_put_cooperator(cfqq);
cfq_put_queue(cfqq);
}
@@ -2573,11 +2601,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
list_del_init(&cic->queue_list);
/*
- * Make sure key == NULL is seen for dead queues
+ * Make sure dead mark is seen for dead queues
*/
smp_wmb();
- cic->dead_key = (unsigned long) cic->key;
- cic->key = NULL;
+ cic->key = cfqd_dead_key(cfqd);
if (ioc->ioc_data == cic)
rcu_assign_pointer(ioc->ioc_data, NULL);
@@ -2596,7 +2623,7 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
static void cfq_exit_single_io_context(struct io_context *ioc,
struct cfq_io_context *cic)
{
- struct cfq_data *cfqd = cic->key;
+ struct cfq_data *cfqd = cic_to_cfqd(cic);
if (cfqd) {
struct request_queue *q = cfqd->queue;
@@ -2609,7 +2636,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
* race between exiting task and queue
*/
smp_read_barrier_depends();
- if (cic->key)
+ if (cic->key == cfqd)
__cfq_exit_single_io_context(cfqd, cic);
spin_unlock_irqrestore(q->queue_lock, flags);
@@ -2689,7 +2716,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
{
- struct cfq_data *cfqd = cic->key;
+ struct cfq_data *cfqd = cic_to_cfqd(cic);
struct cfq_queue *cfqq;
unsigned long flags;
@@ -2746,7 +2773,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
{
struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
- struct cfq_data *cfqd = cic->key;
+ struct cfq_data *cfqd = cic_to_cfqd(cic);
unsigned long flags;
struct request_queue *q;
@@ -2883,12 +2910,13 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
unsigned long flags;
WARN_ON(!list_empty(&cic->queue_list));
+ BUG_ON(cic->key != cfqd_dead_key(cfqd));
spin_lock_irqsave(&ioc->lock, flags);
BUG_ON(ioc->ioc_data == cic);
- radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
+ radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
hlist_del_rcu(&cic->cic_list);
spin_unlock_irqrestore(&ioc->lock, flags);
@@ -2900,7 +2928,6 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
{
struct cfq_io_context *cic;
unsigned long flags;
- void *k;
if (unlikely(!ioc))
return NULL;
@@ -2917,13 +2944,11 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
}
do {
- cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
+ cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
rcu_read_unlock();
if (!cic)
break;
- /* ->key must be copied to avoid race with cfq_exit_queue() */
- k = cic->key;
- if (unlikely(!k)) {
+ if (unlikely(cic->key != cfqd)) {
cfq_drop_dead_cic(cfqd, ioc, cic);
rcu_read_lock();
continue;
@@ -2956,7 +2981,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
spin_lock_irqsave(&ioc->lock, flags);
ret = radix_tree_insert(&ioc->radix_root,
- (unsigned long) cfqd, cic);
+ cfqd->cic_index, cic);
if (!ret)
hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
spin_unlock_irqrestore(&ioc->lock, flags);
@@ -3516,6 +3541,9 @@ split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
}
cic_set_cfqq(cic, NULL, 1);
+
+ cfq_put_cooperator(cfqq);
+
cfq_put_queue(cfqq);
return NULL;
}
@@ -3708,10 +3736,32 @@ static void cfq_exit_queue(struct elevator_queue *e)
cfq_shutdown_timer_wq(cfqd);
+ spin_lock(&cic_index_lock);
+ ida_remove(&cic_index_ida, cfqd->cic_index);
+ spin_unlock(&cic_index_lock);
+
/* Wait for cfqg->blkg->key accessors to exit their grace periods. */
call_rcu(&cfqd->rcu, cfq_cfqd_free);
}
+static int cfq_alloc_cic_index(void)
+{
+ int index, error;
+
+ do {
+ if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
+ return -ENOMEM;
+
+ spin_lock(&cic_index_lock);
+ error = ida_get_new(&cic_index_ida, &index);
+ spin_unlock(&cic_index_lock);
+ if (error && error != -EAGAIN)
+ return error;
+ } while (error);
+
+ return index;
+}
+
static void *cfq_init_queue(struct request_queue *q)
{
struct cfq_data *cfqd;
@@ -3719,10 +3769,16 @@ static void *cfq_init_queue(struct request_queue *q)
struct cfq_group *cfqg;
struct cfq_rb_root *st;
+ i = cfq_alloc_cic_index();
+ if (i < 0)
+ return NULL;
+
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
if (!cfqd)
return NULL;
+ cfqd->cic_index = i;
+
/* Init root service tree */
cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -3984,6 +4040,7 @@ static void __exit cfq_exit(void)
*/
if (elv_ioc_count_read(cfq_ioc_count))
wait_for_completion(&all_gone);
+ ida_destroy(&cic_index_ida);
cfq_slab_kill();
}
diff --git a/block/elevator.c b/block/elevator.c
index 6df2b5056b5..923a9139106 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -242,9 +242,11 @@ int elevator_init(struct request_queue *q, char *name)
{
struct elevator_type *e = NULL;
struct elevator_queue *eq;
- int ret = 0;
void *data;
+ if (unlikely(q->elevator))
+ return 0;
+
INIT_LIST_HEAD(&q->queue_head);
q->last_merge = NULL;
q->end_sector = 0;
@@ -284,7 +286,7 @@ int elevator_init(struct request_queue *q, char *name)
}
elevator_attach(q, eq, data);
- return ret;
+ return 0;
}
EXPORT_SYMBOL(elevator_init);
@@ -1097,7 +1099,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
struct elevator_type *__e;
int len = 0;
- if (!q->elevator)
+ if (!q->elevator || !blk_queue_stackable(q))
return sprintf(name, "none\n");
elv = e->elevator_type;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 6081e81d573..f1bf79d9bc0 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -133,6 +133,28 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
return page;
}
+static void brd_free_page(struct brd_device *brd, sector_t sector)
+{
+ struct page *page;
+ pgoff_t idx;
+
+ spin_lock(&brd->brd_lock);
+ idx = sector >> PAGE_SECTORS_SHIFT;
+ page = radix_tree_delete(&brd->brd_pages, idx);
+ spin_unlock(&brd->brd_lock);
+ if (page)
+ __free_page(page);
+}
+
+static void brd_zero_page(struct brd_device *brd, sector_t sector)
+{
+ struct page *page;
+
+ page = brd_lookup_page(brd, sector);
+ if (page)
+ clear_highpage(page);
+}
+
/*
* Free all backing store pages and radix tree. This must only be called when
* there are no other users of the device.
@@ -189,6 +211,24 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
return 0;
}
+static void discard_from_brd(struct brd_device *brd,
+ sector_t sector, size_t n)
+{
+ while (n >= PAGE_SIZE) {
+ /*
+ * Don't want to actually discard pages here because
+ * re-allocating the pages can result in writeback
+ * deadlocks under heavy load.
+ */
+ if (0)
+ brd_free_page(brd, sector);
+ else
+ brd_zero_page(brd, sector);
+ sector += PAGE_SIZE >> SECTOR_SHIFT;
+ n -= PAGE_SIZE;
+ }
+}
+
/*
* Copy n bytes from src to the brd starting at sector. Does not sleep.
*/
@@ -300,6 +340,12 @@ static int brd_make_request(struct request_queue *q, struct bio *bio)
get_capacity(bdev->bd_disk))
goto out;
+ if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) {
+ err = 0;
+ discard_from_brd(brd, sector, bio->bi_size);
+ goto out;
+ }
+
rw = bio_rw(bio);
if (rw == READA)
rw = READ;
@@ -320,7 +366,7 @@ out:
}
#ifdef CONFIG_BLK_DEV_XIP
-static int brd_direct_access (struct block_device *bdev, sector_t sector,
+static int brd_direct_access(struct block_device *bdev, sector_t sector,
void **kaddr, unsigned long *pfn)
{
struct brd_device *brd = bdev->bd_disk->private_data;
@@ -437,6 +483,11 @@ static struct brd_device *brd_alloc(int i)
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
+ brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
+ brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
+ brd->brd_queue->limits.discard_zeroes_data = 1;
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
+
disk = brd->brd_disk = alloc_disk(1 << part_shift);
if (!disk)
goto out_free_queue;
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index e1d0e2cfec7..3381505c8a6 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -188,11 +188,11 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *cmd)
sa = h->scsi_ctlr;
stk = &sa->cmd_stack;
+ stk->top++;
if (stk->top >= CMD_STACK_SIZE) {
printk("cciss: scsi_cmd_free called too many times.\n");
BUG();
}
- stk->top++;
stk->elem[stk->top] = (struct cciss_scsi_cmd_stack_elem_t *) cmd;
}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index e9654c8d5b6..485ed8c7d62 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -943,8 +943,7 @@ struct drbd_conf {
struct drbd_work resync_work,
unplug_work,
md_sync_work,
- delay_probe_work,
- uuid_work;
+ delay_probe_work;
struct timer_list resync_timer;
struct timer_list md_sync_timer;
struct timer_list delay_probe_timer;
@@ -1069,7 +1068,6 @@ struct drbd_conf {
struct timeval dps_time; /* delay-probes-start-time */
unsigned int dp_volume_last; /* send_cnt of last delay probe */
int c_sync_rate; /* current resync rate after delay_probe magic */
- atomic_t new_c_uuid;
};
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1476,7 +1474,6 @@ extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
-extern int w_io_error(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
@@ -1542,7 +1539,7 @@ static inline void drbd_tcp_nodelay(struct socket *sock)
static inline void drbd_tcp_quickack(struct socket *sock)
{
- int __user val = 1;
+ int __user val = 2;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
(char __user *)&val, sizeof(val));
}
@@ -1728,7 +1725,7 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
switch (mdev->ldev->dc.on_io_error) {
case EP_PASS_ON:
if (!forcedetach) {
- if (printk_ratelimit())
+ if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Local IO failed in %s."
"Passing error on...\n", where);
break;
@@ -2219,8 +2216,6 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
return 0;
if (test_bit(BITMAP_IO, &mdev->flags))
return 0;
- if (atomic_read(&mdev->new_c_uuid))
- return 0;
return 1;
}
@@ -2241,9 +2236,6 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
* to avoid races with the reconnect code,
* we need to atomic_inc within the spinlock. */
- if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1))
- drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work);
-
spin_lock_irq(&mdev->req_lock);
while (!__inc_ap_bio_cond(mdev)) {
prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index be2d2da9cdb..6b077f93acc 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1215,18 +1215,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
ns.pdsk == D_OUTDATED)) {
if (get_ldev(mdev)) {
if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
- mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE &&
- !atomic_read(&mdev->new_c_uuid))
- atomic_set(&mdev->new_c_uuid, 2);
+ mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+ drbd_uuid_new_current(mdev);
+ drbd_send_uuids(mdev);
+ }
put_ldev(mdev);
}
}
if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
- /* Diskless peer becomes primary or got connected do diskless, primary peer. */
- if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 &&
- !atomic_read(&mdev->new_c_uuid))
- atomic_set(&mdev->new_c_uuid, 2);
+ if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
+ drbd_uuid_new_current(mdev);
/* D_DISKLESS Peer becomes secondary */
if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1350,24 +1349,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
drbd_md_sync(mdev);
}
-static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
- if (get_ldev(mdev)) {
- if (mdev->ldev->md.uuid[UI_BITMAP] == 0) {
- drbd_uuid_new_current(mdev);
- if (get_net_conf(mdev)) {
- drbd_send_uuids(mdev);
- put_net_conf(mdev);
- }
- drbd_md_sync(mdev);
- }
- put_ldev(mdev);
- }
- atomic_dec(&mdev->new_c_uuid);
- wake_up(&mdev->misc_wait);
-
- return 1;
-}
static int drbd_thread_setup(void *arg)
{
@@ -2291,9 +2272,9 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *
* with page_count == 0 or PageSlab.
*/
static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
- int offset, size_t size)
+ int offset, size_t size, unsigned msg_flags)
{
- int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0);
+ int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
kunmap(page);
if (sent == size)
mdev->send_cnt += size>>9;
@@ -2301,7 +2282,7 @@ static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
}
static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
- int offset, size_t size)
+ int offset, size_t size, unsigned msg_flags)
{
mm_segment_t oldfs = get_fs();
int sent, ok;
@@ -2314,14 +2295,15 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
* __page_cache_release a page that would actually still be referenced
* by someone, leading to some obscure delayed Oops somewhere else. */
if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
- return _drbd_no_send_page(mdev, page, offset, size);
+ return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
+ msg_flags |= MSG_NOSIGNAL;
drbd_update_congested(mdev);
set_fs(KERNEL_DS);
do {
sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
offset, len,
- MSG_NOSIGNAL);
+ msg_flags);
if (sent == -EAGAIN) {
if (we_should_drop_the_connection(mdev,
mdev->data.socket))
@@ -2350,9 +2332,11 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ /* hint all but last page with MSG_MORE */
__bio_for_each_segment(bvec, bio, i, 0) {
if (!_drbd_no_send_page(mdev, bvec->bv_page,
- bvec->bv_offset, bvec->bv_len))
+ bvec->bv_offset, bvec->bv_len,
+ i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
return 0;
}
return 1;
@@ -2362,12 +2346,13 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
{
struct bio_vec *bvec;
int i;
+ /* hint all but last page with MSG_MORE */
__bio_for_each_segment(bvec, bio, i, 0) {
if (!_drbd_send_page(mdev, bvec->bv_page,
- bvec->bv_offset, bvec->bv_len))
+ bvec->bv_offset, bvec->bv_len,
+ i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
return 0;
}
-
return 1;
}
@@ -2375,9 +2360,11 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
{
struct page *page = e->pages;
unsigned len = e->size;
+ /* hint all but last page with MSG_MORE */
page_chain_for_each(page) {
unsigned l = min_t(unsigned, len, PAGE_SIZE);
- if (!_drbd_send_page(mdev, page, 0, l))
+ if (!_drbd_send_page(mdev, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0))
return 0;
len -= l;
}
@@ -2457,11 +2444,11 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
p.dp_flags = cpu_to_be32(dp_flags);
set_bit(UNPLUG_REMOTE, &mdev->flags);
ok = (sizeof(p) ==
- drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
+ drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
if (ok && dgs) {
dgb = mdev->int_dig_out;
drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
- ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+ ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
}
if (ok) {
if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
@@ -2510,11 +2497,11 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
return 0;
ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
- sizeof(p), MSG_MORE);
+ sizeof(p), dgs ? MSG_MORE : 0);
if (ok && dgs) {
dgb = mdev->int_dig_out;
drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
- ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
+ ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
}
if (ok)
ok = _drbd_send_zc_ee(mdev, e);
@@ -2708,7 +2695,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
atomic_set(&mdev->net_cnt, 0);
atomic_set(&mdev->packet_seq, 0);
atomic_set(&mdev->pp_in_use, 0);
- atomic_set(&mdev->new_c_uuid, 0);
mutex_init(&mdev->md_io_mutex);
mutex_init(&mdev->data.mutex);
@@ -2739,14 +2725,12 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
INIT_LIST_HEAD(&mdev->delay_probes);
INIT_LIST_HEAD(&mdev->delay_probe_work.list);
- INIT_LIST_HEAD(&mdev->uuid_work.list);
mdev->resync_work.cb = w_resync_inactive;
mdev->unplug_work.cb = w_send_write_hint;
mdev->md_sync_work.cb = w_md_sync;
mdev->bm_io_work.w.cb = w_bitmap_io;
mdev->delay_probe_work.cb = w_delay_probes;
- mdev->uuid_work.cb = w_new_current_uuid;
init_timer(&mdev->resync_timer);
init_timer(&mdev->md_sync_timer);
init_timer(&mdev->delay_probe_timer);
@@ -3799,7 +3783,7 @@ _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
if (ret) {
fault_count++;
- if (printk_ratelimit())
+ if (__ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "***Simulating %s failure\n",
_drbd_fault_str(type));
}
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index bc9ab7fb2cc..dff48701b84 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -42,7 +42,6 @@
#include <linux/unistd.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
-#include <linux/mm.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
#include "drbd_int.h"
@@ -571,6 +570,25 @@ static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
return rv;
}
+/* quoting tcp(7):
+ * On individual connections, the socket buffer size must be set prior to the
+ * listen(2) or connect(2) calls in order to have it take effect.
+ * This is our wrapper to do so.
+ */
+static void drbd_setbufsize(struct socket *sock, unsigned int snd,
+ unsigned int rcv)
+{
+ /* open coded SO_SNDBUF, SO_RCVBUF */
+ if (snd) {
+ sock->sk->sk_sndbuf = snd;
+ sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ }
+ if (rcv) {
+ sock->sk->sk_rcvbuf = rcv;
+ sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+}
+
static struct socket *drbd_try_connect(struct drbd_conf *mdev)
{
const char *what;
@@ -592,6 +610,8 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev)
sock->sk->sk_rcvtimeo =
sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
+ drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
+ mdev->net_conf->rcvbuf_size);
/* explicitly bind to the configured IP as source IP
* for the outgoing connections.
@@ -670,6 +690,8 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
s_listen->sk->sk_rcvtimeo = timeo;
s_listen->sk->sk_sndtimeo = timeo;
+ drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
+ mdev->net_conf->rcvbuf_size);
what = "bind before listen";
err = s_listen->ops->bind(s_listen,
@@ -856,16 +878,6 @@ retry:
sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
- if (mdev->net_conf->sndbuf_size) {
- sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size;
- sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- }
-
- if (mdev->net_conf->rcvbuf_size) {
- sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size;
- sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- }
-
/* NOT YET ...
* sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
* sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
@@ -1154,17 +1166,6 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
unsigned n_bios = 0;
unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
- if (atomic_read(&mdev->new_c_uuid)) {
- if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) {
- drbd_uuid_new_current(mdev);
- drbd_md_sync(mdev);
-
- atomic_dec(&mdev->new_c_uuid);
- wake_up(&mdev->misc_wait);
- }
- wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid));
- }
-
/* In most cases, we will only need one bio. But in case the lower
* level restrictions happen to be different at this offset on this
* side than those of the sending peer, we may need to submit the
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 3397f11d0ba..654f1ef5cbb 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -102,32 +102,7 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
}
}
- /* if it was a local io error, we want to notify our
- * peer about that, and see if we need to
- * detach the disk and stuff.
- * to avoid allocating some special work
- * struct, reuse the request. */
-
- /* THINK
- * why do we do this not when we detect the error,
- * but delay it until it is "done", i.e. possibly
- * until the next barrier ack? */
-
- if (rw == WRITE &&
- ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
- if (!(req->w.list.next == LIST_POISON1 ||
- list_empty(&req->w.list))) {
- /* DEBUG ASSERT only; if this triggers, we
- * probably corrupt the worker list here */
- dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
- dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
- }
- req->w.cb = w_io_error;
- drbd_queue_work(&mdev->data.work, &req->w);
- /* drbd_req_free() is done in w_io_error */
- } else {
- drbd_req_free(req);
- }
+ drbd_req_free(req);
}
static void queue_barrier(struct drbd_conf *mdev)
@@ -453,9 +428,6 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
- dev_alert(DEV, "Local WRITE failed sec=