diff options
Diffstat (limited to 'mm/page-writeback.c')
| -rw-r--r-- | mm/page-writeback.c | 940 |
1 files changed, 675 insertions, 265 deletions
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ea3f83b2119..e0c943014eb 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -32,16 +32,27 @@ #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> -#include <linux/buffer_head.h> +#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ #include <linux/pagevec.h> +#include <linux/timer.h> +#include <linux/sched/rt.h> +#include <linux/mm_inline.h> #include <trace/events/writeback.h> +#include "internal.h" + /* * Sleep at most 200ms at a time in balance_dirty_pages(). */ #define MAX_PAUSE max(HZ/5, 1) /* + * Try to keep balance_dirty_pages() call intervals higher than this many pages + * by raising pause time to max_pause when falls below it. + */ +#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) + +/* * Estimate write bandwidth at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) @@ -89,6 +100,8 @@ unsigned long vm_dirty_bytes; */ unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ +EXPORT_SYMBOL_GPL(dirty_writeback_interval); + /* * The longest time for which data is allowed to remain dirty */ @@ -127,34 +140,196 @@ unsigned long global_dirty_limit; * measured in page writeback completions. * */ -static struct prop_descriptor vm_completions; +static struct fprop_global writeout_completions; + +static void writeout_period(unsigned long t); +/* Timer for aging of writeout_completions */ +static struct timer_list writeout_period_timer = + TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); +static unsigned long writeout_period_time = 0; + +/* + * Length of period for aging writeout fractions of bdis. This is an + * arbitrarily chosen number. The longer the period, the slower fractions will + * reflect changes in current writeout rate. + */ +#define VM_COMPLETIONS_PERIOD_LEN (3*HZ) + +/* + * In a memory zone, there is a certain amount of pages we consider + * available for the page cache, which is essentially the number of + * free and reclaimable pages, minus some zone reserves to protect + * lowmem and the ability to uphold the zone's watermarks without + * requiring writeback. + * + * This number of dirtyable pages is the base value of which the + * user-configurable dirty ratio is the effictive number of pages that + * are allowed to be actually dirtied. Per individual zone, or + * globally by using the sum of dirtyable pages over all zones. + * + * Because the user is allowed to specify the dirty limit globally as + * absolute number of bytes, calculating the per-zone dirty limit can + * require translating the configured limit into a percentage of + * global dirtyable memory first. + */ + +/** + * zone_dirtyable_memory - number of dirtyable pages in a zone + * @zone: the zone + * + * Returns the zone's number of pages potentially available for dirty + * page cache. This is the base value for the per-zone dirty limits. + */ +static unsigned long zone_dirtyable_memory(struct zone *zone) +{ + unsigned long nr_pages; + + nr_pages = zone_page_state(zone, NR_FREE_PAGES); + nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + + nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); + nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); + + return nr_pages; +} + +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { + struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_dirtyable_memory(z); + } + /* + * Unreclaimable memory (kernel memory or anonymous memory + * without swap) can bring down the dirtyable pages below + * the zone's dirty balance reserve and the above calculation + * will underflow. However we still want to add in nodes + * which are below threshold (negative values) to get a more + * accurate calculation but make sure that the total never + * underflows. + */ + if ((long)x < 0) + x = 0; + + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +/** + * global_dirtyable_memory - number of globally dirtyable pages + * + * Returns the global number of pages potentially available for dirty + * page cache. This is the base value for the global dirty limits. + */ +static unsigned long global_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES); + x -= min(x, dirty_balance_reserve); + + x += global_page_state(NR_INACTIVE_FILE); + x += global_page_state(NR_ACTIVE_FILE); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + + return x + 1; /* Ensure that we never return 0 */ +} /* - * couple the period to the dirty_ratio: + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * + * Calculate the dirty thresholds based on sysctl parameters + * - vm.dirty_background_ratio or vm.dirty_background_bytes + * - vm.dirty_ratio or vm.dirty_bytes + * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * real-time tasks. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +{ + unsigned long background; + unsigned long dirty; + unsigned long uninitialized_var(available_memory); + struct task_struct *tsk; + + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = global_dirtyable_memory(); + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else + dirty = (vm_dirty_ratio * available_memory) / 100; + + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; + + if (background >= dirty) + background = dirty / 2; + tsk = current; + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + background += background / 4; + dirty += dirty / 4; + } + *pbackground = background; + *pdirty = dirty; + trace_global_dirty_state(background, dirty); +} + +/** + * zone_dirty_limit - maximum number of dirty pages allowed in a zone + * @zone: the zone * - * period/2 ~ roundup_pow_of_two(dirty limit) + * Returns the maximum number of dirty pages allowed in a zone, based + * on the zone's dirtyable memory. */ -static int calc_period_shift(void) +static unsigned long zone_dirty_limit(struct zone *zone) { - unsigned long dirty_total; + unsigned long zone_memory = zone_dirtyable_memory(zone); + struct task_struct *tsk = current; + unsigned long dirty; if (vm_dirty_bytes) - dirty_total = vm_dirty_bytes / PAGE_SIZE; + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * + zone_memory / global_dirtyable_memory(); else - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / - 100; - return 2 + ilog2(dirty_total - 1); + dirty = vm_dirty_ratio * zone_memory / 100; + + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + dirty += dirty / 4; + + return dirty; } -/* - * update the period when the dirty threshold changes. +/** + * zone_dirty_ok - tells whether a zone is within its dirty limits + * @zone: the zone to check + * + * Returns %true when the dirty pages in @zone are within the zone's + * dirty limit, %false if the limit is exceeded. */ -static void update_completion_period(void) +bool zone_dirty_ok(struct zone *zone) { - int shift = calc_period_shift(); - prop_change_shift(&vm_completions, shift); + unsigned long limit = zone_dirty_limit(zone); - writeback_set_ratelimit(); + return zone_page_state(zone, NR_FILE_DIRTY) + + zone_page_state(zone, NR_UNSTABLE_NFS) + + zone_page_state(zone, NR_WRITEBACK) <= limit; } int dirty_background_ratio_handler(struct ctl_table *table, int write, @@ -190,13 +365,12 @@ int dirty_ratio_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { - update_completion_period(); + writeback_set_ratelimit(); vm_dirty_bytes = 0; } return ret; } - int dirty_bytes_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -206,12 +380,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write, ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { - update_completion_period(); + writeback_set_ratelimit(); vm_dirty_ratio = 0; } return ret; } +static unsigned long wp_next_time(unsigned long cur_time) +{ + cur_time += VM_COMPLETIONS_PERIOD_LEN; + /* 0 has a special meaning... */ + if (!cur_time) + return 1; + return cur_time; +} + /* * Increment the BDI's writeout completion count and the global writeout * completion count. Called from test_clear_page_writeback(). @@ -219,8 +402,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write, static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) { __inc_bdi_stat(bdi, BDI_WRITTEN); - __prop_inc_percpu_max(&vm_completions, &bdi->completions, - bdi->max_prop_frac); + __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, + bdi->max_prop_frac); + /* First event after period switching was turned off? */ + if (!unlikely(writeout_period_time)) { + /* + * We can race with other __bdi_writeout_inc calls here but + * it does not cause any harm since the resulting time when + * timer will fire and what is in writeout_period_time will be + * roughly the same. + */ + writeout_period_time = wp_next_time(jiffies); + mod_timer(&writeout_period_timer, writeout_period_time); + } } void bdi_writeout_inc(struct backing_dev_info *bdi) @@ -239,11 +433,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc); static void bdi_writeout_fraction(struct backing_dev_info *bdi, long *numerator, long *denominator) { - prop_fraction_percpu(&vm_completions, &bdi->completions, + fprop_fraction_percpu(&writeout_completions, &bdi->completions, numerator, denominator); } /* + * On idle system, we can be called long after we scheduled because we use + * deferred timers so count with missed periods. + */ +static void writeout_period(unsigned long t) +{ + int miss_periods = (jiffies - writeout_period_time) / + VM_COMPLETIONS_PERIOD_LEN; + + if (fprop_new_period(&writeout_completions, miss_periods + 1)) { + writeout_period_time = wp_next_time(writeout_period_time + + miss_periods * VM_COMPLETIONS_PERIOD_LEN); + mod_timer(&writeout_period_timer, writeout_period_time); + } else { + /* + * Aging has zeroed all fractions. Stop wasting CPU on period + * updates. + */ + writeout_period_time = 0; + } +} + +/* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not * exceed 100%. @@ -283,7 +499,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) ret = -EINVAL; } else { bdi->max_ratio = max_ratio; - bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; + bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; } spin_unlock_bh(&bdi_lock); @@ -291,67 +507,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); -/* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - -static unsigned long highmem_dirtyable_memory(unsigned long total) -{ -#ifdef CONFIG_HIGHMEM - int node; - unsigned long x = 0; - - for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = - &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - - x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z); - } - /* - * Make sure that the number of highmem pages is never larger - * than the number of the total dirtyable memory. This can only - * occur in very strange VM situations but we want to make sure - * that this does not occur. - */ - return min(x, total); -#else - return 0; -#endif -} - -/** - * determine_dirtyable_memory - amount of memory that may be used - * - * Returns the numebr of pages that can currently be freed and used - * by the kernel for direct mappings. - */ -unsigned long determine_dirtyable_memory(void) -{ - unsigned long x; - - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); - - if (!vm_highmem_is_dirtyable) - x -= highmem_dirtyable_memory(x); - - return x + 1; /* Ensure that we never return 0 */ -} - static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { @@ -363,47 +518,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh) return max(thresh, global_dirty_limit); } -/* - * global_dirty_limits - background-writeback and dirty-throttling thresholds - * - * Calculate the dirty thresholds based on sysctl parameters - * - vm.dirty_background_ratio or vm.dirty_background_bytes - * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. - */ -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) -{ - unsigned long background; - unsigned long dirty; - unsigned long uninitialized_var(available_memory); - struct task_struct *tsk; - - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = determine_dirtyable_memory(); - - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else - dirty = (vm_dirty_ratio * available_memory) / 100; - - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); - else - background = (dirty_background_ratio * available_memory) / 100; - - if (background >= dirty) - background = dirty / 2; - tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; - } - *pbackground = background; - *pdirty = dirty; - trace_global_dirty_state(background, dirty); -} - /** * bdi_dirty_limit - @bdi's share of dirty throttling threshold * @bdi: the backing_dev_info to query @@ -448,6 +562,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) } /* + * setpoint - dirty 3 + * f(dirty) := 1.0 + (----------------) + * limit - setpoint + * + * it's a 3rd order polynomial that subjects to + * + * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast + * (2) f(setpoint) = 1.0 => the balance point + * (3) f(limit) = 0 => the hard limit + * (4) df/dx <= 0 => negative feedback control + * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) + * => fast response on large errors; small oscillation near setpoint + */ +static long long pos_ratio_polynom(unsigned long setpoint, + unsigned long dirty, + unsigned long limit) +{ + long long pos_ratio; + long x; + + x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + limit - setpoint + 1); + pos_ratio = x; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + + return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); +} + +/* * Dirty position control. * * (o) global/bdi setpoints @@ -545,26 +690,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, /* * global setpoint * - * setpoint - dirty 3 - * f(dirty) := 1.0 + (----------------) - * limit - setpoint + * See comment for pos_ratio_polynom(). + */ + setpoint = (freerun + limit) / 2; + pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); + + /* + * The strictlimit feature is a tool preventing mistrusted filesystems + * from growing a large number of dirty pages before throttling. For + * such filesystems balance_dirty_pages always checks bdi counters + * against bdi limits. Even if global "nr_dirty" is under "freerun". + * This is especially important for fuse which sets bdi->max_ratio to + * 1% by default. Without strictlimit feature, fuse writeback may + * consume arbitrary amount of RAM because it is accounted in + * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * - * it's a 3rd order polynomial that subjects to + * Here, in bdi_position_ratio(), we calculate pos_ratio based on + * two values: bdi_dirty and bdi_thresh. Let's consider an example: + * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global + * limits are set by default to 10% and 20% (background and throttle). + * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is + * about ~6K pages (as the average of background and throttle bdi + * limits). The 3rd order polynomial will provide positive feedback if + * bdi_dirty is under bdi_setpoint and vice versa. * - * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast - * (2) f(setpoint) = 1.0 => the balance point - * (3) f(limit) = 0 => the hard limit - * (4) df/dx <= 0 => negative feedback control - * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) - * => fast response on large errors; small oscillation near setpoint + * Note, that we cannot use global counters in these calculations + * because we want to throttle process writing to a strictlimit BDI + * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB + * in the example above). */ - setpoint = (freerun + limit) / 2; - x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT, - limit - setpoint + 1); - pos_ratio = x; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long bdi_pos_ratio; + unsigned long bdi_bg_thresh; + + if (bdi_dirty < 8) + return min_t(long long, pos_ratio * 2, + 2 << RATELIMIT_CALC_SHIFT); + + if (bdi_dirty >= bdi_thresh) + return 0; + + bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); + bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, + bdi_bg_thresh); + + if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) + return 0; + + bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, + bdi_thresh); + + /* + * Typically, for strictlimit case, bdi_setpoint << setpoint + * and pos_ratio >> bdi_pos_ratio. In the other words global + * state ("dirty") is not limiting factor and we have to + * make decision based on bdi counters. But there is an + * important case when global pos_ratio should get precedence: + * global limits are exceeded (e.g. due to activities on other + * BDIs) while given strictlimit BDI is below limit. + * + * "pos_ratio * bdi_pos_ratio" would work for the case above, + * but it would look too non-natural for the case of all + * activity in the system coming from a single strictlimit BDI + * with bdi->max_ratio == 100%. + * + * Note that min() below somewhat changes the dynamics of the + * control system. Normally, pos_ratio value can be well over 3 + * (when globally we are at freerun and bdi is well below bdi + * setpoint). Now the maximum pos_ratio in the same situation + * is 2. We might want to tweak this if we observe the control + * system is too slow to adapt. + */ + return min(pos_ratio, bdi_pos_ratio); + } /* * We have computed basic pos_ratio above based on global situation. If @@ -625,7 +824,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, x_intercept = bdi_setpoint + span; if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty), + pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), x_intercept - bdi_setpoint + 1); } else pos_ratio /= 4; @@ -816,6 +1015,11 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, */ balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, dirty_rate | 1); + /* + * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw + */ + if (unlikely(balanced_dirty_ratelimit > write_bw)) + balanced_dirty_ratelimit = write_bw; /* * We could safely do this and return immediately: @@ -823,7 +1027,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * bdi->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated - * code makes use of task_ratelimit to filter out sigular points and + * code makes use of task_ratelimit to filter out singular points and * limit the step size. * * The below code essentially only uses the relative value of @@ -846,12 +1050,33 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * feel and care are stable dirty rate and small position error. * * |task_ratelimit - dirty_ratelimit| is used to limit the step size - * and filter out the sigular points of balanced_dirty_ratelimit. Which + * and filter out the singular points of balanced_dirty_ratelimit. Which * keeps jumping around randomly and can even leap far away at times * due to the small 200ms estimation period of dirty_rate (we want to * keep that period small to reduce time lags). */ step = 0; + + /* + * For strictlimit case, calculations above were based on bdi counters + * and limits (starting from pos_ratio = bdi_position_ratio() and up to + * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). + * Hence, to calculate "step" properly, we have to use bdi_dirty as + * "dirty" and bdi_setpoint as "setpoint". + * + * We rampup dirty_ratelimit forcibly if bdi_dirty is low because + * it's possible that bdi_thresh is close to zero due to inactivity + * of backing device (see the implementation of bdi_dirty_limit()). + */ + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = bdi_dirty; + if (bdi_dirty < 8) + setpoint = bdi_dirty + 1; + else + setpoint = (bdi_thresh + + bdi_dirty_limit(bdi, bg_thresh)) / 2; + } + if (dirty < setpoint) { x = min(bdi->balanced_dirty_ratelimit, min(balanced_dirty_ratelimit, task_ratelimit)); @@ -946,7 +1171,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, } /* - * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() + * After a task dirtied this many pages, balance_dirty_pages_ratelimited() * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive @@ -966,36 +1191,144 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi, unsigned long bdi_dirty) { unsigned long bw = bdi->avg_write_bandwidth; - unsigned long hi = ilog2(bw); - unsigned long lo = ilog2(bdi->dirty_ratelimit); unsigned long t; - /* target for 20ms max pause on 1-dd case */ - t = HZ / 50; + /* + * Limit pause time for small memory systems. If sleeping for too long + * time, a small pool of dirty/writeback pages may go empty and disk go + * idle. + * + * 8 serves as the safety ratio. + */ + t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); + t++; + + return min_t(unsigned long, t, MAX_PAUSE); +} + +static long bdi_min_pause(struct backing_dev_info *bdi, + long max_pause, + unsigned long task_ratelimit, + unsigned long dirty_ratelimit, + int *nr_dirtied_pause) +{ + long hi = ilog2(bdi->avg_write_bandwidth); + long lo = ilog2(bdi->dirty_ratelimit); + long t; /* target pause */ + long pause; /* estimated next pause */ + int pages; /* target nr_dirtied_pause */ + + /* target for 10ms pause on 1-dd case */ + t = max(1, HZ / 100); /* * Scale up pause time for concurrent dirtiers in order to reduce CPU * overheads. * - * (N * 20ms) on 2^N concurrent tasks. + * (N * 10ms) on 2^N concurrent tasks. */ if (hi > lo) - t += (hi - lo) * (20 * HZ) / 1024; + t += (hi - lo) * (10 * HZ) / 1024; /* - * Limit pause time for small memory systems. If sleeping for too long - * time, a small pool of dirty/writeback pages may go empty and disk go - * idle. + * This is a bit convoluted. We try to base the next nr_dirtied_pause + * on the much more stable dirty_ratelimit. However the next pause time + * will be computed based on task_ratelimit and the two rate limits may + * depart considerably at some time. Especially if task_ratelimit goes + * below dirty_ratelimit/2 and the target pause is max_pause, the next + * pause time will be max_pause*2 _trimmed down_ to max_pause. As a + * result task_ratelimit won't be executed faithfully, which could + * eventually bring down dirty_ratelimit. * - * 8 serves as the safety ratio. + * We apply two rules to fix it up: + * 1) try to estimate the next pause time and if necessary, use a lower + * nr_dirtied_pause so as not to exceed max_pause. When this happens, + * nr_dirtied_pause will be "dancing" with task_ratelimit. + * 2) limit the target pause time to max_pause/2, so that the normal + * small fluctuations of task_ratelimit won't trigger rule (1) and + * nr_dirtied_pause will remain as stable as dirty_ratelimit. + */ + t = min(t, 1 + max_pause / 2); + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); + + /* + * Tiny nr_dirtied_pause is found to hurt I/O performance in the test + * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. + * When the 16 consecutive reads are often interrupted by some dirty + * throttling pause during the async writes, cfq will go into idles + * (deadline is fine). So push nr_dirtied_pause as high as possible + * until reaches DIRTY_POLL_THRESH=32 pages. + */ + if (pages < DIRTY_POLL_THRESH) { + t = max_pause; + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); + if (pages > DIRTY_POLL_THRESH) { + pages = DIRTY_POLL_THRESH; + t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; + } + } + + pause = HZ * pages / (task_ratelimit + 1); + if (pause > max_pause) { + t = max_pause; + pages = task_ratelimit * t / roundup_pow_of_two(HZ); + } + + *nr_dirtied_pause = pages; + /* + * The minimal pause time will normally be half the target pause time. + */ + return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; +} + +static inline void bdi_dirty_limits(struct backing_dev_info *bdi, + unsigned long dirty_thresh, + unsigned long background_thresh, + unsigned long *bdi_dirty, + unsigned long *bdi_thresh, + unsigned long *bdi_bg_thresh) +{ + unsigned long bdi_reclaimable; + + /* + * bdi_thresh is not treated as some limiting factor as + * dirty_thresh, due to reasons + * - in JBOD setup, bdi_thresh can fluctuate a lot + * - in a system with HDD and USB key, the USB key may somehow + * go into state (bdi_dirty >> bdi_thresh) either because + * bdi_dirty starts high, or because bdi_thresh drops low. + * In this case we don't want to hard throttle the USB key + * dirtiers for 100 seconds until bdi_dirty drops under + * bdi_thresh. Instead the auxiliary bdi control line in + * bdi_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down bdi_dirty. */ - t = min(t, bdi_dirty * HZ / (8 * bw + 1)); + *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + + if (bdi_bg_thresh) + *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh * + background_thresh, + dirty_thresh) : 0; /* - * The pause time will be settled within range (max_pause/4, max_pause). - * Apply a minimal value of 4 to get a non-zero max_pause/4. + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. */ - return clamp_val(t, 4, MAX_PAUSE); + if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { + bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat_sum(bdi, BDI_WRITEBACK); + } else { + bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat(bdi, BDI_WRITEBACK); + } } /* @@ -1009,23 +1342,30 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pages_dirtied) { unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ - unsigned long bdi_reclaimable; unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ - unsigned long bdi_dirty; - unsigned long freerun; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; - long pause = 0; - long uninitialized_var(max_pause); + long period; + long pause; + long max_pause; + long min_pause; + int nr_dirtied_pause; bool dirty_exceeded = false; unsigned long task_ratelimit; - unsigned long uninitialized_var(dirty_ratelimit); + unsigned long dirty_ratelimit; unsigned long pos_ratio; struct backing_dev_info *bdi = mapping->backing_dev_info; + bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { + unsigned long now = jiffies; + unsigned long uninitialized_var(bdi_thresh); + unsigned long thresh; + unsigned long uninitialized_var(bdi_dirty); + unsigned long dirty; + unsigned long bg_thresh; + /* * Unstable writes are a feature of certain networked * filesystems (i.e. NFS) in which data may have been @@ -1038,56 +1378,44 @@ static void balance_dirty_pages(struct address_space *mapping, global_dirty_limits(&background_thresh, &dirty_thresh); + if (unlikely(strictlimit)) { + bdi_dirty_limits(bdi, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, &bg_thresh); + + dirty = bdi_dirty; + thresh = bdi_thresh; + } else { + dirty = nr_dirty; + thresh = dirty_thresh; + bg_thresh = background_thresh; + } + /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up. + * when the bdi limits are ramping up in case of !strictlimit. + * + * In strictlimit case make decision based on the bdi counters + * and limits. Small writeouts when the bdi limits are ramping + * up are the price we consciously pay for strictlimit-ing. */ - freerun = dirty_freerun_ceiling(dirty_thresh, - background_thresh); - if (nr_dirty <= freerun) + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; + current->nr_dirtied_pause = + dirty_poll_interval(dirty, thresh); break; + } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); - /* - * bdi_thresh is not treated as some limiting factor as - * dirty_thresh, due to reasons - * - in JBOD setup, bdi_thresh can fluctuate a lot - * - in a system with HDD and USB key, the USB key may somehow - * go into state (bdi_dirty >> bdi_thresh) either because - * bdi_dirty starts high, or because bdi_thresh drops low. - * In this case we don't want to hard throttle the USB key - * dirtiers for 100 seconds until bdi_dirty drops under - * bdi_thresh. Instead the auxiliary bdi control line in - * bdi_position_ratio() will let the dirtier task progress - * at some rate <= (write_bw / 2) for bringing down bdi_dirty. - */ - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + if (!strictlimit) + bdi_dirty_limits(bdi, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, NULL); - /* - * In order to avoid the stacked BDI deadlock we need - * to ensure we accurately count the 'dirty' pages when - * the threshold is low. - * - * Otherwise it would be possible to get thresh+n pages - * reported dirty, even though there are thresh-m pages - * actually dirty; with m+n sitting in the percpu - * deltas. - */ - if (bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat_sum(bdi, BDI_WRITEBACK); - } else { - bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat(bdi, BDI_WRITEBACK); - } - - dirty_exceeded = (bdi_dirty > bdi_thresh) || - (nr_dirty > dirty_thresh); + dirty_exceeded = (bdi_dirty > bdi_thresh) && + ((nr_dirty > dirty_thresh) || strictlimit); if (dirty_exceeded && !bdi->dirty_exceeded) bdi->dirty_exceeded = 1; @@ -1095,20 +1423,34 @@ static void balance_dirty_pages(struct address_space *mapping, nr_dirty, bdi_thresh, bdi_dirty, start_time); - max_pause = bdi_max_pause(bdi, bdi_dirty); - dirty_ratelimit = bdi->dirty_ratelimit; pos_ratio = bdi_position_ratio(bdi, dirty_thresh, background_thresh, nr_dirty, bdi_thresh, bdi_dirty); task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT; + max_pause = bdi_max_pause(bdi, bdi_dirty); + min_pause = bdi_min_pause(bdi, max_pause, + task_ratelimit, dirty_ratelimit, + &nr_dirtied_pause); + if (unlikely(task_ratelimit == 0)) { + period = max_pause; pause = max_pause; goto pause; } - pause = HZ * pages_dirtied / task_ratelimit; - if (unlikely(pause <= 0)) { + period = HZ * pages_dirtied / task_ratelimit; + pause = period; + if (current->dirty_paused_when) + pause -= now - current->dirty_paused_when; + /* + * For less than 1s think time (ext3/4 may block the dirtier + * for up to 800ms from time to time on 1-HDD; so does xfs, + * however at much less frequency), try to compensate it in + * future periods by updating the virtual time; otherwise just + * do a reset, as it may be a light dirtier. + */ + if (pause < min_pause) { trace_balance_dirty_pages(bdi, dirty_thresh, background_thresh, @@ -1118,12 +1460,24 @@ static void balance_dirty_pages(struct address_space *mapping, dirty_ratelimit, task_ratelimit, pages_dirtied, - pause, + period, + min(pause, 0L), start_time); - pause = 1; /* avoid resetting nr_dirtied_pause below */ + if (pause < -HZ) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; + } else if (period) { + current->dirty_paused_when += period; + current->nr_dirtied = 0; + } else if (current->nr_dirtied_pause <= pages_dirtied) + current->nr_dirtied_pause += pages_dirtied; break; } - pause = min(pause, max_pause); + if (unlikely(pause > max_pause)) { + /* for occasional dropped task_ratelimit */ + now += min(pause - max_pause, max_pause); + pause = max_pause; + } pause: trace_balance_dirty_pages(bdi, @@ -1135,11 +1489,16 @@ pause: dirty_ratelimit, task_ratelimit, pages_dirtied, + period, pause, start_time); __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); + current->dirty_paused_when = now + pause; + current->nr_dirtied = 0; + current->nr_dirtied_pause = nr_dirtied_pause; + /* * This is typically equal to (nr_dirty < dirty_thresh) and can * also keep "1000+ dd on a slow USB stick" under control. @@ -1167,23 +1526,6 @@ pause: if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; - current->nr_dirtied = 0; - if (pause == 0) { /* in freerun area */ - current->nr_dirtied_pause = - dirty_poll_interval(nr_dirty, dirty_thresh); - } else if (pause <= max_pause / 4 && - pages_dirtied >= current->nr_dirtied_pause) { - current->nr_dirtied_pause = clamp_val( - dirty_ratelimit * (max_pause / 2) / HZ, - pages_dirtied + pages_dirtied / 8, - pages_dirtied * 4); - } else if (pause >= max_pause) { - current->nr_dirtied_pause = 1 | clamp_val( - dirty_ratelimit * (max_pause / 2) / HZ, - pages_dirtied / 4, - pages_dirtied - pages_dirtied / 8); - } - if (writeback_in_progress(bdi)) return; @@ -1202,9 +1544,9 @@ pause: bdi_start_background_writeback(bdi); } -void set_page_dirty_balance(struct page *page, int page_mkwrite) +void set_page_dirty_balance(struct page *page) { - if (set_page_dirty(page) || page_mkwrite) { + if (set_page_dirty(page)) { struct address_space *mapping = page_mapping(page); if (mapping) @@ -1214,10 +1556,25 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) static DEFINE_PER_CPU(int, bdp_ratelimits); +/* + * Normal tasks are throttled by + * loop { + * dirty tsk->nr_dirtied_pause pages; + * take a snap in balance_dirty_pages(); + * } + * However there is a worst case. If every task exit immediately when dirtied + * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be + * called to throttle the page dirties. The solution is to save the not yet + * throttled page dirties in dirty_throttle_leaks on task exit and charge them + * randomly into the running tasks. This works well for the above worst case, + * as the new task will pick up and accumulate the old task's leaked dirty + * count and eventually get throttled. + */ +DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; + /** - * balance_dirty_pages_ratelimited_nr - balance dirty memory state + * balance_dirty_pages_ratelimited - balance dirty memory state * @mapping: address_space which was dirtied - * @nr_pages_dirtied: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's @@ -1228,8 +1585,7 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ -void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, - unsigned long nr_pages_dirtied) +void balance_dirty_pages_ratelimited(struct address_space *mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; int ratelimit; @@ -1242,8 +1598,6 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, if (bdi->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); - current->nr_dirtied += nr_pages_dirtied; - preempt_disable(); /* * This prevents one CPU to accumulate too many dirtied pages without @@ -1251,22 +1605,31 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. */ - p = &__get_cpu_var(bdp_ratelimits); + p = this_cpu_ptr(&bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; - else { - *p += nr_pages_dirtied; - if (unlikely(*p >= ratelimit_pages)) { - *p = 0; - ratelimit = 0; - } + else if (unlikely(*p >= ratelimit_pages)) { + *p = 0; + ratelimit = 0; + } + /* + * Pick up the dirtied pages by the exited tasks. This avoids lots of + * short-lived tasks (eg. gcc invocations in a kernel build) escaping + * the dirty throttling and livelock other long-run dirtiers. + */ + p = this_cpu_ptr(&dirty_throttle_leaks); + if (*p > 0 && current->nr_dirtied < ratelimit) { + unsigned long nr_pages_dirtied; + nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); + *p -= nr_pages_dirtied; + current->nr_dirtied += nr_pages_dirtied; } preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) balance_dirty_pages(mapping, current->nr_dirtied); } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); +EXPORT_SYMBOL(balance_dirty_pages_ratelimited); void throttle_vm_writeout(gfp_t gfp_mask) { @@ -1275,6 +1638,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) for ( ; ; ) { global_dirty_limits(&background_thresh, &dirty_thresh); + dirty_thresh = hard_dirty_limit(dirty_thresh); /* * Boost the allowable dirty threshold a bit for page @@ -1300,11 +1664,10 @@ void throttle_vm_writeout(gfp_t gfp_mask) /* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ -int dirty_writeback_centisecs_handler(ctl_table *table, int write, +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - bdi_arm_supers_timer(); return 0; } @@ -1368,19 +1731,28 @@ void writeback_set_ratelimit(void) unsigned long background_thresh; unsigned long dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); + global_dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; } -static int __cpuinit -ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) +static int +ratelimit_handler(struct notifier_block *self, unsigned long action, + void *hcpu) { - writeback_set_ratelimit(); - return NOTIFY_DONE; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DEAD: + writeback_set_ratelimit(); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } } -static struct notifier_block __cpuinitdata ratelimit_nb = { +static struct notifier_block ratelimit_nb = { .notifier_call = ratelimit_handler, .next = NULL, }; @@ -1405,13 +1777,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { */ void __init page_writeback_init(void) { - int shift; - writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - shift = calc_period_shift(); - prop_descriptor_init(&vm_completions, shift); + fprop_global_init(&writeout_completions); } /** @@ -1735,23 +2104,33 @@ int __set_page_dirty_no_writeback(struct page *page) */ void account_page_dirtied(struct page *page, struct address_space *mapping) { + trace_writeback_dirty_page(page, mapping); + if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); + current->nr_dirtied++; + this_cpu_inc(bdp_ratelimits); } } EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for set_page_writeback family. + * + * The caller must hold mem_cgroup_begin/end_update_page_stat() lock + * while calling this function. + * See test_set_page_writeback for example. + * * NOTE: Unlike account_page_dirtied this does not rely on being atomic * wrt interrupts. */ void account_page_writeback(struct page *page) { + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } EXPORT_SYMBOL(account_page_writeback); @@ -1776,11 +2155,12 @@ int __set_page_dirty_nobuffers(struct page *page) if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); struct address_space *mapping2; + unsigned long flags; if (!mapping) return 1; - spin_lock_irq(&mapping->tree_lock); + spin_lock_irqsave(&mapping->tree_lock, flags); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); @@ -1789,7 +2169,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1968,7 +2348,10 @@ int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); int ret; + bool locked; + unsigned long memcg_flags; + mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -1989,17 +2372,22 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } + mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); return ret; } -int test_set_page_writeback(struct page *page) +int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); int ret; + bool locked; + unsigned long memcg_flags; + mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2017,19 +2405,21 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), - PAGECACHE_TAG_TOWRITE); + if (!keep_write) + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_TOWRITE); spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); } if (!ret) account_page_writeback(page); + mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); return ret; } -EXPORT_SYMBOL(test_set_page_writeback); +EXPORT_SYMBOL(__test_set_page_writeback); /* * Return true if any of the pages in the mapping are marked with the @@ -2040,3 +2430,23 @@ int mapping_tagged(struct address_space *mapping, int tag) return radix_tree_tagged(&mapping->page_tree, tag); } EXPORT_SYMBOL(mapping_tagged); + +/** + * wait_for_stable_page() - wait for writeback to finish, if necessary. + * @page: The page to wait on. + * + * This function determines if the given page is related to a backing device + * that requires page contents to be held stable during writeback. If so, then + * it will wait for any pending writeback to complete. + */ +void wait_for_stable_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + struct backing_dev_info *bdi = mapping->backing_dev_info; + + if (!bdi_cap_stable_pages_required(bdi)) + return; + + wait_on_page_writeback(page); +} +EXPORT_SYMBOL_GPL(wait_for_stable_page); |
