From 6746aff74da293b5fd24e5c68b870b721e86cd5f Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Sep 2009 11:50:14 +0200 Subject: HWPOISON: shmem: call set_page_dirty() with locked page The dirtying of page and set_page_dirty() can be moved into the page lock. - In shmem_write_end(), the page was dirtied while the page lock was held, but it's being marked dirty just after dropping the page lock. - In shmem_symlink(), both dirtying and marking can be moved into page lock. It's valuable for the hwpoison code to know whether one bad page can be dropped without losing data. It mainly judges by testing the PG_dirty bit after taking the page lock. So it becomes important that the dirtying of page and the marking of dirtiness are both done inside the page lock. Which is a common practice, but sadly not a rule. The noticeable exceptions are - mapped pages - pages with buffer_heads The above pages could go dirty at any time. Fortunately the hwpoison will unmap the page and release the buffer_heads beforehand anyway. Many other types of pages (eg. metadata pages) can also be dirtied at will by their owners, the hwpoison code cannot do meaningful things to them anyway. Only the dirtiness of pagecache pages owned by regular files is of interest. v2: AK: Add comment about set_page_dirty rules (suggested by Peter Zijlstra) Acked-by: Hugh Dickins Reviewed-by: WANG Cong Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/page-writeback.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dd73d29c15a..bba82c414ba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1149,6 +1149,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) EXPORT_SYMBOL(redirty_page_for_writepage); /* + * Dirty a page. 
+ * + * For pages with a mapping this should be done under the page lock + * for the benefit of asynchronous memory errors who prefer a consistent + * dirty state. This rule can be broken in some special cases, + * but should be better not to. + * * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ -- cgit v1.2.3-18-g5258 From adea02a1bea71a508da32c04d715485a1fe62029 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 21 Sep 2009 17:01:42 -0700 Subject: mm: count only reclaimable lru pages global_lru_pages() / zone_lru_pages() can be used in two ways: - to estimate max reclaimable pages in determine_dirtyable_memory() - to calculate the slab scan ratio When swap is full or not present, the anon lru lists are not reclaimable and also won't be scanned. So the anon pages shall not be counted in both usage scenarios. Also rename to _reclaimable_pages: now they are counting the possibly reclaimable lru pages. It can greatly (and correctly) increase the slab scan rate under high memory pressure (when most file pages have been reclaimed and swap is full/absent), thus reduce false OOM kills. 
Acked-by: Peter Zijlstra Reviewed-by: Rik van Riel Reviewed-by: Christoph Lameter Reviewed-by: Minchan Kim Cc: KOSAKI Motohiro Signed-off-by: Wu Fengguang Acked-by: Johannes Weiner Reviewed-by: Minchan Kim Reviewed-by: Jesse Barnes Cc: David Howells Cc: "Li, Ming Chun" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d1ba4644105..5f378dd5880 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -380,7 +380,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); + x += zone_page_state(z, NR_FREE_PAGES) + + zone_reclaimable_pages(z); } /* * Make sure that the number of highmem pages is never larger @@ -404,7 +405,7 @@ unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); -- cgit v1.2.3-18-g5258 From 8d65af789f3e2cf4cfbdbf71a0f7a61ebcd41d38 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 23 Sep 2009 15:57:19 -0700 Subject: sysctl: remove "struct file *" argument of ->proc_handler It's unused. It isn't needed -- read or write flag is already passed and sysctl shouldn't care about the rest. It _was_ used in two places at arch/frv for some reason. Signed-off-by: Alexey Dobriyan Cc: David Howells Cc: "Eric W. Biederman" Cc: Al Viro Cc: Ralf Baechle Cc: Martin Schwidefsky Cc: Ingo Molnar Cc: "David S. 
Miller" Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5f378dd5880..be197f71b09 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -155,37 +155,37 @@ static void update_completion_period(void) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } int dirty_background_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_ratio = 0; return ret; } int dirty_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { update_completion_period(); vm_dirty_bytes = 0; @@ -195,13 +195,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write, int dirty_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; - ret = proc_doulongvec_minmax(table, 
write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { update_completion_period(); vm_dirty_ratio = 0; @@ -686,9 +686,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); return 0; } -- cgit v1.2.3-18-g5258 From 3a2e9a5a2afc1a2d2c548b8987f133235cebe933 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 23 Sep 2009 21:56:00 +0800 Subject: writeback: balance_dirty_pages() shall write more than dirtied pages Some filesystems may choose to write much more than ratelimit_pages before calling balance_dirty_pages_ratelimited_nr(). So it is safer to determine the number to write based on the real number of dirtied pages. Otherwise it is possible that loop { btrfs_file_write(): dirty 1024 pages balance_dirty_pages(): write up to 48 pages (= ratelimit_pages * 1.5) } in which the writeback rate cannot keep up with dirty rate, and the dirty pages go all the way beyond dirty_thresh. The increased write_chunk may make the dirtier more bumpy. So filesystems should take care not to dirty too much at a time (eg. > 4MB) without checking the ratelimit. 
Signed-off-by: Wu Fengguang Acked-by: Peter Zijlstra Signed-off-by: Jens Axboe --- mm/page-writeback.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5f378dd5880..cbd4cba468b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -44,12 +44,15 @@ static long ratelimit_pages = 32; /* * When balance_dirty_pages decides that the caller needs to perform some * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * It should be somewhat larger than dirtied pages to ensure that reasonably * large amounts of I/O are submitted. */ -static inline long sync_writeback_pages(void) +static inline long sync_writeback_pages(unsigned long dirtied) { - return ratelimit_pages + ratelimit_pages / 2; + if (dirtied < ratelimit_pages) + dirtied = ratelimit_pages; + + return dirtied + dirtied / 2; } /* The following parameters are exported via /proc/sys/vm */ @@ -477,7 +480,8 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, * If we're over `background_thresh' then pdflush is woken to perform some * writeout. 
*/ -static void balance_dirty_pages(struct address_space *mapping) +static void balance_dirty_pages(struct address_space *mapping, + unsigned long write_chunk) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; @@ -485,7 +489,6 @@ static void balance_dirty_pages(struct address_space *mapping) unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long pages_written = 0; - unsigned long write_chunk = sync_writeback_pages(); unsigned long pause = 1; struct backing_dev_info *bdi = mapping->backing_dev_info; @@ -640,9 +643,10 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { + ratelimit = sync_writeback_pages(*p); *p = 0; preempt_enable(); - balance_dirty_pages(mapping); + balance_dirty_pages(mapping, ratelimit); return; } preempt_enable(); -- cgit v1.2.3-18-g5258 From d3ddec7635b6fb37cb49e3553bdeea59642be653 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 23 Sep 2009 20:33:40 +0800 Subject: writeback: stop background writeback when below background threshold Treat bdi_start_writeback(0) as a special request to do background write, and stop such work when we are below the background dirty threshold. Also simplify the (nr_pages <= 0) checks. Since we already pass in nr_pages=LONG_MAX for WB_SYNC_ALL and background writes, we don't need to worry about it being decreased to zero. Reported-by: Richard Kennedy CC: Jan Kara Acked-by: Peter Zijlstra Signed-off-by: Wu Fengguang Signed-off-by: Jens Axboe --- mm/page-writeback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index cbd4cba468b..3c78fc31620 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -593,10 +593,10 @@ static void balance_dirty_pages(struct address_space *mapping, * background_thresh, to keep the amount of dirty memory low. 
*/ if ((laptop_mode && pages_written) || - (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS)) + (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) > background_thresh))) - bdi_start_writeback(bdi, nr_writeback); + bdi_start_writeback(bdi, 0); } void set_page_dirty_balance(struct page *page, int page_mkwrite) -- cgit v1.2.3-18-g5258 From 5b0830cb9085f4b69f9d57d7f3aaff322ffbec26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 23 Sep 2009 19:37:09 +0200 Subject: writeback: get rid of incorrect references to pdflush in comments Signed-off-by: Jens Axboe --- mm/page-writeback.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3c78fc31620..8bef063125b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -58,7 +58,7 @@ static inline long sync_writeback_pages(unsigned long dirtied) /* The following parameters are exported via /proc/sys/vm */ /* - * Start background writeback (via pdflush) at this percentage + * Start background writeback (via writeback threads) at this percentage */ int dirty_background_ratio = 10; @@ -477,8 +477,8 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. - * If we're over `background_thresh' then pdflush is woken to perform some - * writeout. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. 
*/ static void balance_dirty_pages(struct address_space *mapping, unsigned long write_chunk) @@ -582,7 +582,7 @@ static void balance_dirty_pages(struct address_space *mapping, bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) - return; /* pdflush is already working this queue */ + return; /* * In laptop mode, we wait until hitting the higher threshold before -- cgit v1.2.3-18-g5258 From a72bfd4dea053bb8e2233902c3f1893ef5485802 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 26 Sep 2009 00:07:46 +0200 Subject: writeback: pass in super_block to bdi_start_writeback() Sometimes we only want to write pages from a specific super_block, so allow that to be passed in. This fixes a problem with commit 56a131dcf7ed36c3c6e36bea448b674ea85ed5bb causing writeback on all super_blocks on a bdi, where we only really want to sync a specific sb from writeback_inodes_sb(). Signed-off-by: Jens Axboe --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 69b5fbabc8b..a3b14090b1f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -596,7 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping, (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS)) > background_thresh))) - bdi_start_writeback(bdi, 0); + bdi_start_writeback(bdi, NULL, 0); } void set_page_dirty_balance(struct page *page, int page_mkwrite) -- cgit v1.2.3-18-g5258