aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2010-08-31 09:45:21 +0200
committerIngo Molnar <mingo@elte.hu>2010-08-31 09:45:46 +0200
commitdaab7fc734a53fdeaf844b7c03053118ad1769da (patch)
tree575deb3cdcc6dda562acaed6f7c29bc81ae01cf2 /mm
parent774ea0bcb27f57b6fd521b3b6c43237782fed4b9 (diff)
parent2bfc96a127bc1cc94d26bfaa40159966064f9c8c (diff)
Merge commit 'v2.6.36-rc3' into x86/memblock
Conflicts: arch/x86/kernel/trampoline.c mm/memblock.c Merge reason: Resolve the conflicts, update to latest upstream. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c449
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/highmem.c7
-rw-r--r--mm/hugetlb.c110
-rw-r--r--mm/hwpoison-inject.c15
-rw-r--r--mm/init-mm.c6
-rw-r--r--mm/kmemleak.c100
-rw-r--r--mm/ksm.c71
-rw-r--r--mm/memcontrol.c462
-rw-r--r--mm/memory-failure.c153
-rw-r--r--mm/memory.c58
-rw-r--r--mm/mempolicy.c82
-rw-r--r--mm/migrate.c10
-rw-r--r--mm/mlock.c19
-rw-r--r--mm/mmap.c74
-rw-r--r--mm/nommu.c12
-rw-r--r--mm/oom_kill.c687
-rw-r--r--mm/page-writeback.c282
-rw-r--r--mm/page_alloc.c33
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/percpu.c85
-rw-r--r--mm/rmap.c203
-rw-r--r--mm/shmem.c139
-rw-r--r--mm/slab.c9
-rw-r--r--mm/slob.c14
-rw-r--r--mm/slub.c87
-rw-r--r--mm/swapfile.c100
-rw-r--r--mm/truncate.c38
-rw-r--r--mm/util.c11
-rw-r--r--mm/vmalloc.c13
-rw-r--r--mm/vmscan.c548
-rw-r--r--mm/vmstat.c8
32 files changed, 2369 insertions, 1520 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 123bcef13e5..eaa4a5bbe06 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
+#include <trace/events/writeback.h>
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
@@ -49,8 +50,6 @@ static struct timer_list sync_supers_timer;
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
-static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
-
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -65,31 +64,25 @@ static void bdi_debug_init(void)
static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
struct backing_dev_info *bdi = m->private;
- struct bdi_writeback *wb;
+ struct bdi_writeback *wb = &bdi->wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
struct inode *inode;
- /*
- * inode lock is enough here, the bdi->wb_list is protected by
- * RCU on the reader side
- */
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_lock);
- list_for_each_entry(wb, &bdi->wb_list, list) {
- nr_wb++;
- list_for_each_entry(inode, &wb->b_dirty, i_list)
- nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_list)
- nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_list)
- nr_more_io++;
- }
+ list_for_each_entry(inode, &wb->b_dirty, i_list)
+ nr_dirty++;
+ list_for_each_entry(inode, &wb->b_io, i_list)
+ nr_io++;
+ list_for_each_entry(inode, &wb->b_more_io, i_list)
+ nr_more_io++;
spin_unlock(&inode_lock);
- get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
@@ -98,19 +91,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"BdiDirtyThresh: %8lu kB\n"
"DirtyThresh: %8lu kB\n"
"BackgroundThresh: %8lu kB\n"
- "WritebackThreads: %8lu\n"
"b_dirty: %8lu\n"
"b_io: %8lu\n"
"b_more_io: %8lu\n"
"bdi_list: %8u\n"
- "state: %8lx\n"
- "wb_list: %8u\n",
+ "state: %8lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
K(bdi_thresh), K(dirty_thresh),
- K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
- !list_empty(&bdi->bdi_list), bdi->state,
- !list_empty(&bdi->wb_list));
+ K(background_thresh), nr_dirty, nr_io, nr_more_io,
+ !list_empty(&bdi->bdi_list), bdi->state);
#undef K
return 0;
@@ -247,7 +237,6 @@ static int __init default_bdi_init(void)
sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
BUG_ON(IS_ERR(sync_supers_tsk));
- init_timer(&sync_supers_timer);
setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
bdi_arm_supers_timer();
@@ -259,77 +248,6 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
-{
- memset(wb, 0, sizeof(*wb));
-
- wb->bdi = bdi;
- wb->last_old_flush = jiffies;
- INIT_LIST_HEAD(&wb->b_dirty);
- INIT_LIST_HEAD(&wb->b_io);
- INIT_LIST_HEAD(&wb->b_more_io);
-}
-
-static void bdi_task_init(struct backing_dev_info *bdi,
- struct bdi_writeback *wb)
-{
- struct task_struct *tsk = current;
-
- spin_lock(&bdi->wb_lock);
- list_add_tail_rcu(&wb->list, &bdi->wb_list);
- spin_unlock(&bdi->wb_lock);
-
- tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
- set_freezable();
-
- /*
- * Our parent may run at a different priority, just set us to normal
- */
- set_user_nice(tsk, 0);
-}
-
-static int bdi_start_fn(void *ptr)
-{
- struct bdi_writeback *wb = ptr;
- struct backing_dev_info *bdi = wb->bdi;
- int ret;
-
- /*
- * Add us to the active bdi_list
- */
- spin_lock_bh(&bdi_lock);
- list_add_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
-
- bdi_task_init(bdi, wb);
-
- /*
- * Clear pending bit and wakeup anybody waiting to tear us down
- */
- clear_bit(BDI_pending, &bdi->state);
- smp_mb__after_clear_bit();
- wake_up_bit(&bdi->state, BDI_pending);
-
- ret = bdi_writeback_task(wb);
-
- /*
- * Remove us from the list
- */
- spin_lock(&bdi->wb_lock);
- list_del_rcu(&wb->list);
- spin_unlock(&bdi->wb_lock);
-
- /*
- * Flush any work that raced with us exiting. No new work
- * will be added, since this bdi isn't discoverable anymore.
- */
- if (!list_empty(&bdi->work_list))
- wb_do_writeback(wb, 1);
-
- wb->task = NULL;
- return ret;
-}
-
int bdi_has_dirty_io(struct backing_dev_info *bdi)
{
return wb_has_dirty_io(&bdi->wb);
@@ -348,10 +266,10 @@ static void bdi_flush_io(struct backing_dev_info *bdi)
}
/*
- * kupdated() used to do this. We cannot do it from the bdi_forker_task()
+ * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
* or we risk deadlocking on ->s_umount. The longer term solution would be
* to implement sync_supers_bdi() or similar and simply do it from the
- * bdi writeback tasks individually.
+ * bdi writeback thread individually.
*/
static int bdi_sync_supers(void *unused)
{
@@ -387,144 +305,198 @@ static void sync_supers_timer_fn(unsigned long unused)
bdi_arm_supers_timer();
}
-static int bdi_forker_task(void *ptr)
+static void wakeup_timer_fn(unsigned long data)
+{
+ struct backing_dev_info *bdi = (struct backing_dev_info *)data;
+
+ spin_lock_bh(&bdi->wb_lock);
+ if (bdi->wb.task) {
+ trace_writeback_wake_thread(bdi);
+ wake_up_process(bdi->wb.task);
+ } else {
+ /*
+ * When bdi tasks are inactive for long time, they are killed.
+ * In this case we have to wake-up the forker thread which
+ * should create and run the bdi thread.
+ */
+ trace_writeback_wake_forker_thread(bdi);
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
+ spin_unlock_bh(&bdi->wb_lock);
+}
+
+/*
+ * This function is used when the first inode for this bdi is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
+ * by delaying the wake-up.
+ */
+void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+{
+ unsigned long timeout;
+
+ timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+ mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
+}
+
+/*
+ * Calculate the longest interval (jiffies) bdi threads are allowed to be
+ * inactive.
+ */
+static unsigned long bdi_longest_inactive(void)
+{
+ unsigned long interval;
+
+ interval = msecs_to_jiffies(dirty_writeback_interval * 10);
+ return max(5UL * 60 * HZ, interval);
+}
+
+static int bdi_forker_thread(void *ptr)
{
struct bdi_writeback *me = ptr;
- bdi_task_init(me->bdi, me);
+ current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ set_freezable();
+
+ /*
+ * Our parent may run at a different priority, just set us to normal
+ */
+ set_user_nice(current, 0);
for (;;) {
- struct backing_dev_info *bdi, *tmp;
- struct bdi_writeback *wb;
+ struct task_struct *task = NULL;
+ struct backing_dev_info *bdi;
+ enum {
+ NO_ACTION, /* Nothing to do */
+ FORK_THREAD, /* Fork bdi thread */
+ KILL_THREAD, /* Kill inactive bdi thread */
+ } action = NO_ACTION;
/*
* Temporary measure, we want to make sure we don't see
* dirty data on the default backing_dev_info
*/
- if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list))
+ if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
+ del_timer(&me->wakeup_timer);
wb_do_writeback(me, 0);
+ }
spin_lock_bh(&bdi_lock);
+ set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Check if any existing bdi's have dirty data without
- * a thread registered. If so, set that up.
- */
- list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) {
- if (bdi->wb.task)
- continue;
- if (list_empty(&bdi->work_list) &&
- !bdi_has_dirty_io(bdi))
+ list_for_each_entry(bdi, &bdi_list, bdi_list) {
+ bool have_dirty_io;
+
+ if (!bdi_cap_writeback_dirty(bdi) ||
+ bdi_cap_flush_forker(bdi))
continue;
- bdi_add_default_flusher_task(bdi);
- }
+ WARN(!test_bit(BDI_registered, &bdi->state),
+ "bdi %p/%s is not registered!\n", bdi, bdi->name);
- set_current_state(TASK_INTERRUPTIBLE);
+ have_dirty_io = !list_empty(&bdi->work_list) ||
+ wb_has_dirty_io(&bdi->wb);
- if (list_empty(&bdi_pending_list)) {
- unsigned long wait;
+ /*
+ * If the bdi has work to do, but the thread does not
+ * exist - create it.
+ */
+ if (!bdi->wb.task && have_dirty_io) {
+ /*
+ * Set the pending bit - if someone will try to
+ * unregister this bdi - it'll wait on this bit.
+ */
+ set_bit(BDI_pending, &bdi->state);
+ action = FORK_THREAD;
+ break;
+ }
+
+ spin_lock(&bdi->wb_lock);
+
+ /*
+ * If there is no work to do and the bdi thread was
+ * inactive long enough - kill it. The wb_lock is taken
+ * to make sure no-one adds more work to this bdi and
+ * wakes the bdi thread up.
+ */
+ if (bdi->wb.task && !have_dirty_io &&
+ time_after(jiffies, bdi->wb.last_active +
+ bdi_longest_inactive())) {
+ task = bdi->wb.task;
+ bdi->wb.task = NULL;
+ spin_unlock(&bdi->wb_lock);
+ set_bit(BDI_pending, &bdi->state);
+ action = KILL_THREAD;
+ break;
+ }
+ spin_unlock(&bdi->wb_lock);
+ }
+ spin_unlock_bh(&bdi_lock);
- spin_unlock_bh(&bdi_lock);
- wait = msecs_to_jiffies(dirty_writeback_interval * 10);
- if (wait)
- schedule_timeout(wait);
+ /* Keep working if default bdi still has things to do */
+ if (!list_empty(&me->bdi->work_list))
+ __set_current_state(TASK_RUNNING);
+
+ switch (action) {
+ case FORK_THREAD:
+ __set_current_state(TASK_RUNNING);
+ task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
+ dev_name(bdi->dev));
+ if (IS_ERR(task)) {
+ /*
+ * If thread creation fails, force writeout of
+ * the bdi from the thread.
+ */
+ bdi_flush_io(bdi);
+ } else {
+ /*
+ * The spinlock makes sure we do not lose
+ * wake-ups when racing with 'bdi_queue_work()'.
+ */
+ spin_lock_bh(&bdi->wb_lock);
+ bdi->wb.task = task;
+ spin_unlock_bh(&bdi->wb_lock);
+ }
+ break;
+
+ case KILL_THREAD:
+ __set_current_state(TASK_RUNNING);
+ kthread_stop(task);
+ break;
+
+ case NO_ACTION:
+ if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
+ /*
+ * There are no dirty data. The only thing we
+ * should now care about is checking for
+ * inactive bdi threads and killing them. Thus,
+ * let's sleep for longer time, save energy and
+ * be friendly for battery-driven devices.
+ */
+ schedule_timeout(bdi_longest_inactive());
else
- schedule();
+ schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
try_to_freeze();
+ /* Back to the main loop */
continue;
}
- __set_current_state(TASK_RUNNING);
-
- /*
- * This is our real job - check for pending entries in
- * bdi_pending_list, and create the tasks that got added
- */
- bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
- bdi_list);
- list_del_init(&bdi->bdi_list);
- spin_unlock_bh(&bdi_lock);
-
- wb = &bdi->wb;
- wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
- dev_name(bdi->dev));
/*
- * If task creation fails, then readd the bdi to
- * the pending list and force writeout of the bdi
- * from this forker thread. That will free some memory
- * and we can try again.
+ * Clear pending bit and wakeup anybody waiting to tear us down.
*/
- if (IS_ERR(wb->task)) {
- wb->task = NULL;
-
- /*
- * Add this 'bdi' to the back, so we get
- * a chance to flush other bdi's to free
- * memory.
- */
- spin_lock_bh(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock_bh(&bdi_lock);
-
- bdi_flush_io(bdi);
- }
+ clear_bit(BDI_pending, &bdi->state);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&bdi->state, BDI_pending);
}
return 0;
}
-static void bdi_add_to_pending(struct rcu_head *head)
-{
- struct backing_dev_info *bdi;
-
- bdi = container_of(head, struct backing_dev_info, rcu_head);
- INIT_LIST_HEAD(&bdi->bdi_list);
-
- spin_lock(&bdi_lock);
- list_add_tail(&bdi->bdi_list, &bdi_pending_list);
- spin_unlock(&bdi_lock);
-
- /*
- * We are now on the pending list, wake up bdi_forker_task()
- * to finish the job and add us back to the active bdi_list
- */
- wake_up_process(default_backing_dev_info.wb.task);
-}
-
-/*
- * Add the default flusher task that gets created for any bdi
- * that has dirty data pending writeout
- */
-void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
-{
- if (!bdi_cap_writeback_dirty(bdi))
- return;
-
- if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
- printk(KERN_ERR "bdi %p/%s is not registered!\n",
- bdi, bdi->name);
- return;
- }
-
- /*
- * Check with the helper whether to proceed adding a task. Will only
- * abort if we two or more simultanous calls to
- * bdi_add_default_flusher_task() occured, further additions will block
- * waiting for previous additions to finish.
- */
- if (!test_and_set_bit(BDI_pending, &bdi->state)) {
- list_del_rcu(&bdi->bdi_list);
-
- /*
- * We must wait for the current RCU period to end before
- * moving to the pending list. So schedule that operation
- * from an RCU callback.
- */
- call_rcu(&bdi->rcu_head, bdi_add_to_pending);
- }
-}
-
/*
* Remove bdi from bdi_list, and ensure that it is no longer visible
*/
@@ -541,23 +513,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
va_list args;
- int ret = 0;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
- goto exit;
+ return 0;
va_start(args, fmt);
dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
va_end(args);
- if (IS_ERR(dev)) {
- ret = PTR_ERR(dev);
- goto exit;
- }
-
- spin_lock_bh(&bdi_lock);
- list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
bdi->dev = dev;
@@ -569,21 +534,21 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
if (bdi_cap_flush_forker(bdi)) {
struct bdi_writeback *wb = &bdi->wb;
- wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s",
+ wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
dev_name(dev));
- if (IS_ERR(wb->task)) {
- wb->task = NULL;
- ret = -ENOMEM;
-
- bdi_remove_from_list(bdi);
- goto exit;
- }
+ if (IS_ERR(wb->task))
+ return PTR_ERR(wb->task);
}
bdi_debug_register(bdi, dev_name(dev));
set_bit(BDI_registered, &bdi->state);
-exit:
- return ret;
+
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
+
+ trace_writeback_bdi_register(bdi);
+ return 0;
}
EXPORT_SYMBOL(bdi_register);
@@ -598,31 +563,29 @@ EXPORT_SYMBOL(bdi_register_dev);
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
- struct bdi_writeback *wb;
-
if (!bdi_cap_writeback_dirty(bdi))
return;
/*
- * If setup is pending, wait for that to complete first
+ * Make sure nobody finds us on the bdi_list anymore
*/
- wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
- TASK_UNINTERRUPTIBLE);
+ bdi_remove_from_list(bdi);
/*
- * Make sure nobody finds us on the bdi_list anymore
+ * If setup is pending, wait for that to complete first
*/
- bdi_remove_from_list(bdi);
+ wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
+ TASK_UNINTERRUPTIBLE);
/*
- * Finally, kill the kernel threads. We don't need to be RCU
+ * Finally, kill the kernel thread. We don't need to be RCU
* safe anymore, since the bdi is gone from visibility. Force
* unfreeze of the thread before calling kthread_stop(), otherwise
* it would never exet if it is currently stuck in the refrigerator.
*/
- list_for_each_entry(wb, &bdi->wb_list, list) {
- thaw_process(wb->task);
- kthread_stop(wb->task);
+ if (bdi->wb.task) {
+ thaw_process(bdi->wb.task);
+ kthread_stop(bdi->wb.task);
}
}
@@ -644,7 +607,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
+ del_timer_sync(&bdi->wb.wakeup_timer);
if (!bdi_cap_flush_forker(bdi))
bdi_wb_shutdown(bdi);
@@ -655,6 +620,18 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_unregister);
+static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+{
+ memset(wb, 0, sizeof(*wb));
+
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+ setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+}
+
int bdi_init(struct backing_dev_info *bdi)
{
int i, err;
@@ -665,9 +642,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->max_ratio = 100;
bdi->max_prop_frac = PROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
- INIT_RCU_HEAD(&bdi->rcu_head);
INIT_LIST_HEAD(&bdi->bdi_list);
- INIT_LIST_HEAD(&bdi->wb_list);
INIT_LIST_HEAD(&bdi->work_list);
bdi_wb_init(&bdi->wb, bdi);
diff --git a/mm/filemap.c b/mm/filemap.c
index 20e5642e9f9..3d4df44e422 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2238,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file,
do {
struct page *page;
- pgoff_t index; /* Pagecache index for current page */
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE - 1));
- index = pos >> PAGE_CACHE_SHIFT;
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_count(i));
diff --git a/mm/highmem.c b/mm/highmem.c
index 66baa20f78f..7a0aa1be499 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,6 +26,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
+#include <linux/kgdb.h>
#include <asm/tlbflush.h>
/*
@@ -470,6 +471,12 @@ void debug_kmap_atomic(enum km_type type)
warn_count--;
}
}
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) {
+ WARN_ON(1);
+ warn_count--;
+ }
+#endif /* CONFIG_KGDB_KDB */
}
#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009db..cc5be788a39 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,9 @@
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
(vma->vm_pgoff >> huge_page_order(h));
}
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
/*
* Return the size of the pages allocated when backing a VMA. In the majority
* cases this will be same size as used by the page table entries.
@@ -552,6 +561,7 @@ static void free_huge_page(struct page *page)
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
+ BUG_ON(page_mapcount(page));
INIT_LIST_HEAD(&page->lru);
spin_lock(&hugetlb_lock);
@@ -605,6 +615,8 @@ int PageHuge(struct page *page)
return dtor == free_huge_page;
}
+EXPORT_SYMBOL_GPL(PageHuge);
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -2129,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
+ page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
@@ -2140,6 +2153,19 @@ nomem:
return -ENOMEM;
}
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
+ return 1;
+ } else
+ return 0;
+}
+
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
@@ -2198,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
if (huge_pte_none(pte))
continue;
+ /*
+ * HWPoisoned hugepage is already unmapped and dropped reference
+ */
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+ continue;
+
page = pte_page(pte);
if (pte_dirty(pte))
set_page_dirty(page);
@@ -2207,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
flush_tlb_range(vma, start, end);
mmu_notifier_invalidate_range_end(mm, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
+ page_remove_rmap(page);
list_del(&page->lru);
put_page(page);
}
@@ -2272,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
return 1;
}
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ */
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
struct page *pagecache_page)
@@ -2286,8 +2322,13 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
- avoidcopy = (page_count(old_page) == 1);
+ avoidcopy = (page_mapcount(old_page) == 1);
if (avoidcopy) {
+ if (!trylock_page(old_page)) {
+ if (PageAnon(old_page))
+ page_move_anon_rmap(old_page, vma, address);
+ } else
+ unlock_page(old_page);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@@ -2338,6 +2379,13 @@ retry_avoidcopy:
return -PTR_ERR(new_page);
}
+ /*
+ * When the original hugepage is shared one, it does not have
+ * anon_vma prepared.
+ */
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+
copy_huge_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
@@ -2349,11 +2397,19 @@ retry_avoidcopy:
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
+ mmu_notifier_invalidate_range_start(mm,
+ address & huge_page_mask(h),
+ (address & huge_page_mask(h)) + huge_page_size(h));
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
+ page_remove_rmap(old_page);
+ hugepage_add_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
+ mmu_notifier_invalidate_range_end(mm,
+ address & huge_page_mask(h),
+ (address & huge_page_mask(h)) + huge_page_size(h));
}
page_cache_release(new_page);
page_cache_release(old_page);
@@ -2452,10 +2508,29 @@ retry:
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
+ page_dup_rmap(page);
} else {
lock_page(page);
- page->mapping = HUGETLB_POISON;
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+ hugepage_add_new_anon_rmap(page, vma, address);
}
+ } else {
+ page_dup_rmap(page);
+ }
+
+ /*
+ * Since memory error handler replaces pte into hwpoison swap entry
+ * at the time of error handling, a process which reserved but not have
+ * the mapping to the error hugepage does not have hwpoison swap entry.
+ * So we need to block accesses from such a process by checking
+ * PG_hwpoison bit here.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON;
+ goto backout_unlocked;
}
/*
@@ -2507,10 +2582,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *ptep;
pte_t entry;
int ret;
+ struct page *page = NULL;
struct page *pagecache_page = NULL;
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
+ ptep = huge_pte_offset(mm, address);
+ if (ptep) {
+ entry = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON;
+ }
+
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
if (!ptep)
return VM_FAULT_OOM;
@@ -2548,6 +2631,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma, address);
}
+ if (!pagecache_page) {
+ page = pte_page(entry);
+ lock_page(page);
+ }
+
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (unlikely(!pte_same(entry, h