diff options
author | Ingo Molnar <mingo@elte.hu> | 2010-08-31 09:45:21 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2010-08-31 09:45:46 +0200 |
commit | daab7fc734a53fdeaf844b7c03053118ad1769da (patch) | |
tree | 575deb3cdcc6dda562acaed6f7c29bc81ae01cf2 /mm | |
parent | 774ea0bcb27f57b6fd521b3b6c43237782fed4b9 (diff) | |
parent | 2bfc96a127bc1cc94d26bfaa40159966064f9c8c (diff) |
Merge commit 'v2.6.36-rc3' into x86/memblock
Conflicts:
arch/x86/kernel/trampoline.c
mm/memblock.c
Merge reason: Resolve the conflicts, update to latest upstream.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 449 | ||||
-rw-r--r-- | mm/filemap.c | 2 | ||||
-rw-r--r-- | mm/highmem.c | 7 | ||||
-rw-r--r-- | mm/hugetlb.c | 110 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 15 | ||||
-rw-r--r-- | mm/init-mm.c | 6 | ||||
-rw-r--r-- | mm/kmemleak.c | 100 | ||||
-rw-r--r-- | mm/ksm.c | 71 | ||||
-rw-r--r-- | mm/memcontrol.c | 462 | ||||
-rw-r--r-- | mm/memory-failure.c | 153 | ||||
-rw-r--r-- | mm/memory.c | 58 | ||||
-rw-r--r-- | mm/mempolicy.c | 82 | ||||
-rw-r--r-- | mm/migrate.c | 10 | ||||
-rw-r--r-- | mm/mlock.c | 19 | ||||
-rw-r--r-- | mm/mmap.c | 74 | ||||
-rw-r--r-- | mm/nommu.c | 12 | ||||
-rw-r--r-- | mm/oom_kill.c | 687 | ||||
-rw-r--r-- | mm/page-writeback.c | 282 | ||||
-rw-r--r-- | mm/page_alloc.c | 33 | ||||
-rw-r--r-- | mm/page_io.c | 2 | ||||
-rw-r--r-- | mm/percpu.c | 85 | ||||
-rw-r--r-- | mm/rmap.c | 203 | ||||
-rw-r--r-- | mm/shmem.c | 139 | ||||
-rw-r--r-- | mm/slab.c | 9 | ||||
-rw-r--r-- | mm/slob.c | 14 | ||||
-rw-r--r-- | mm/slub.c | 87 | ||||
-rw-r--r-- | mm/swapfile.c | 100 | ||||
-rw-r--r-- | mm/truncate.c | 38 | ||||
-rw-r--r-- | mm/util.c | 11 | ||||
-rw-r--r-- | mm/vmalloc.c | 13 | ||||
-rw-r--r-- | mm/vmscan.c | 548 | ||||
-rw-r--r-- | mm/vmstat.c | 8 |
32 files changed, 2369 insertions, 1520 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 123bcef13e5..eaa4a5bbe06 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> +#include <trace/events/writeback.h> static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); @@ -49,8 +50,6 @@ static struct timer_list sync_supers_timer; static int bdi_sync_supers(void *); static void sync_supers_timer_fn(unsigned long); -static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); - #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -65,31 +64,25 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; - struct bdi_writeback *wb; + struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; struct inode *inode; - /* - * inode lock is enough here, the bdi->wb_list is protected by - * RCU on the reader side - */ nr_wb = nr_dirty = nr_io = nr_more_io = 0; spin_lock(&inode_lock); - list_for_each_entry(wb, &bdi->wb_list, list) { - nr_wb++; - list_for_each_entry(inode, &wb->b_dirty, i_list) - nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_list) - nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_list) - nr_more_io++; - } + list_for_each_entry(inode, &wb->b_dirty, i_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_list) + nr_more_io++; spin_unlock(&inode_lock); - get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + global_dirty_limits(&background_thresh, &dirty_thresh); + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, @@ -98,19 +91,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" - "WritebackThreads: %8lu\n" "b_dirty: %8lu\n" "b_io: %8lu\n" "b_more_io: %8lu\n" "bdi_list: %8u\n" - "state: %8lx\n" - "wb_list: %8u\n", + "state: %8lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), K(bdi_thresh), K(dirty_thresh), - K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, - !list_empty(&bdi->bdi_list), bdi->state, - !list_empty(&bdi->wb_list)); + K(background_thresh), nr_dirty, nr_io, nr_more_io, + !list_empty(&bdi->bdi_list), bdi->state); #undef K return 0; @@ -247,7 +237,6 @@ static int __init default_bdi_init(void) sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); BUG_ON(IS_ERR(sync_supers_tsk)); - init_timer(&sync_supers_timer); setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); bdi_arm_supers_timer(); @@ -259,77 +248,6 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) -{ - memset(wb, 0, sizeof(*wb)); - - wb->bdi = bdi; - wb->last_old_flush = jiffies; - INIT_LIST_HEAD(&wb->b_dirty); - INIT_LIST_HEAD(&wb->b_io); - INIT_LIST_HEAD(&wb->b_more_io); -} - -static void bdi_task_init(struct backing_dev_info *bdi, - struct bdi_writeback *wb) -{ - struct task_struct *tsk = current; - - spin_lock(&bdi->wb_lock); - list_add_tail_rcu(&wb->list, &bdi->wb_list); - spin_unlock(&bdi->wb_lock); - - tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(tsk, 0); -} - -static int bdi_start_fn(void *ptr) -{ - struct bdi_writeback *wb = ptr; - struct backing_dev_info *bdi = wb->bdi; - int ret; - - /* - * Add us to the active bdi_list - */ - spin_lock_bh(&bdi_lock); - list_add_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - - bdi_task_init(bdi, wb); - - /* - * Clear pending bit and wakeup anybody waiting to tear us down - */ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); - - ret = bdi_writeback_task(wb); - - /* - * Remove us from the list - */ - spin_lock(&bdi->wb_lock); - list_del_rcu(&wb->list); - spin_unlock(&bdi->wb_lock); - - /* - * Flush any work that raced with us exiting. No new work - * will be added, since this bdi isn't discoverable anymore. - */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); - - wb->task = NULL; - return ret; -} - int bdi_has_dirty_io(struct backing_dev_info *bdi) { return wb_has_dirty_io(&bdi->wb); @@ -348,10 +266,10 @@ static void bdi_flush_io(struct backing_dev_info *bdi) } /* - * kupdated() used to do this. We cannot do it from the bdi_forker_task() + * kupdated() used to do this. We cannot do it from the bdi_forker_thread() * or we risk deadlocking on ->s_umount. The longer term solution would be * to implement sync_supers_bdi() or similar and simply do it from the - * bdi writeback tasks individually. + * bdi writeback thread individually. */ static int bdi_sync_supers(void *unused) { @@ -387,144 +305,198 @@ static void sync_supers_timer_fn(unsigned long unused) bdi_arm_supers_timer(); } -static int bdi_forker_task(void *ptr) +static void wakeup_timer_fn(unsigned long data) +{ + struct backing_dev_info *bdi = (struct backing_dev_info *)data; + + spin_lock_bh(&bdi->wb_lock); + if (bdi->wb.task) { + trace_writeback_wake_thread(bdi); + wake_up_process(bdi->wb.task); + } else { + /* + * When bdi tasks are inactive for long time, they are killed. + * In this case we have to wake-up the forker thread which + * should create and run the bdi thread. + */ + trace_writeback_wake_forker_thread(bdi); + wake_up_process(default_backing_dev_info.wb.task); + } + spin_unlock_bh(&bdi->wb_lock); +} + +/* + * This function is used when the first inode for this bdi is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + */ +void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); +} + +/* + * Calculate the longest interval (jiffies) bdi threads are allowed to be + * inactive. + */ +static unsigned long bdi_longest_inactive(void) +{ + unsigned long interval; + + interval = msecs_to_jiffies(dirty_writeback_interval * 10); + return max(5UL * 60 * HZ, interval); +} + +static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; - bdi_task_init(me->bdi, me); + current->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); for (;;) { - struct backing_dev_info *bdi, *tmp; - struct bdi_writeback *wb; + struct task_struct *task = NULL; + struct backing_dev_info *bdi; + enum { + NO_ACTION, /* Nothing to do */ + FORK_THREAD, /* Fork bdi thread */ + KILL_THREAD, /* Kill inactive bdi thread */ + } action = NO_ACTION; /* * Temporary measure, we want to make sure we don't see * dirty data on the default backing_dev_info */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { + del_timer(&me->wakeup_timer); wb_do_writeback(me, 0); + } spin_lock_bh(&bdi_lock); + set_current_state(TASK_INTERRUPTIBLE); - /* - * Check if any existing bdi's have dirty data without - * a thread registered. If so, set that up. - */ - list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { - if (bdi->wb.task) - continue; - if (list_empty(&bdi->work_list) && - !bdi_has_dirty_io(bdi)) + list_for_each_entry(bdi, &bdi_list, bdi_list) { + bool have_dirty_io; + + if (!bdi_cap_writeback_dirty(bdi) || + bdi_cap_flush_forker(bdi)) continue; - bdi_add_default_flusher_task(bdi); - } + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi %p/%s is not registered!\n", bdi, bdi->name); - set_current_state(TASK_INTERRUPTIBLE); + have_dirty_io = !list_empty(&bdi->work_list) || + wb_has_dirty_io(&bdi->wb); - if (list_empty(&bdi_pending_list)) { - unsigned long wait; + /* + * If the bdi has work to do, but the thread does not + * exist - create it. + */ + if (!bdi->wb.task && have_dirty_io) { + /* + * Set the pending bit - if someone will try to + * unregister this bdi - it'll wait on this bit. + */ + set_bit(BDI_pending, &bdi->state); + action = FORK_THREAD; + break; + } + + spin_lock(&bdi->wb_lock); + + /* + * If there is no work to do and the bdi thread was + * inactive long enough - kill it. The wb_lock is taken + * to make sure no-one adds more work to this bdi and + * wakes the bdi thread up. + */ + if (bdi->wb.task && !have_dirty_io && + time_after(jiffies, bdi->wb.last_active + + bdi_longest_inactive())) { + task = bdi->wb.task; + bdi->wb.task = NULL; + spin_unlock(&bdi->wb_lock); + set_bit(BDI_pending, &bdi->state); + action = KILL_THREAD; + break; + } + spin_unlock(&bdi->wb_lock); + } + spin_unlock_bh(&bdi_lock); - spin_unlock_bh(&bdi_lock); - wait = msecs_to_jiffies(dirty_writeback_interval * 10); - if (wait) - schedule_timeout(wait); + /* Keep working if default bdi still has things to do */ + if (!list_empty(&me->bdi->work_list)) + __set_current_state(TASK_RUNNING); + + switch (action) { + case FORK_THREAD: + __set_current_state(TASK_RUNNING); + task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s", + dev_name(bdi->dev)); + if (IS_ERR(task)) { + /* + * If thread creation fails, force writeout of + * the bdi from the thread. + */ + bdi_flush_io(bdi); + } else { + /* + * The spinlock makes sure we do not lose + * wake-ups when racing with 'bdi_queue_work()'. + */ + spin_lock_bh(&bdi->wb_lock); + bdi->wb.task = task; + spin_unlock_bh(&bdi->wb_lock); + } + break; + + case KILL_THREAD: + __set_current_state(TASK_RUNNING); + kthread_stop(task); + break; + + case NO_ACTION: + if (!wb_has_dirty_io(me) || !dirty_writeback_interval) + /* + * There are no dirty data. The only thing we + * should now care about is checking for + * inactive bdi threads and killing them. Thus, + * let's sleep for longer time, save energy and + * be friendly for battery-driven devices. + */ + schedule_timeout(bdi_longest_inactive()); else - schedule(); + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); try_to_freeze(); + /* Back to the main loop */ continue; } - __set_current_state(TASK_RUNNING); - - /* - * This is our real job - check for pending entries in - * bdi_pending_list, and create the tasks that got added - */ - bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, - bdi_list); - list_del_init(&bdi->bdi_list); - spin_unlock_bh(&bdi_lock); - - wb = &bdi->wb; - wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", - dev_name(bdi->dev)); /* - * If task creation fails, then readd the bdi to - * the pending list and force writeout of the bdi - * from this forker thread. That will free some memory - * and we can try again. + * Clear pending bit and wakeup anybody waiting to tear us down. */ - if (IS_ERR(wb->task)) { - wb->task = NULL; - - /* - * Add this 'bdi' to the back, so we get - * a chance to flush other bdi's to free - * memory. - */ - spin_lock_bh(&bdi_lock); - list_add_tail(&bdi->bdi_list, &bdi_pending_list); - spin_unlock_bh(&bdi_lock); - - bdi_flush_io(bdi); - } + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); } return 0; } -static void bdi_add_to_pending(struct rcu_head *head) -{ - struct backing_dev_info *bdi; - - bdi = container_of(head, struct backing_dev_info, rcu_head); - INIT_LIST_HEAD(&bdi->bdi_list); - - spin_lock(&bdi_lock); - list_add_tail(&bdi->bdi_list, &bdi_pending_list); - spin_unlock(&bdi_lock); - - /* - * We are now on the pending list, wake up bdi_forker_task() - * to finish the job and add us back to the active bdi_list - */ - wake_up_process(default_backing_dev_info.wb.task); -} - -/* - * Add the default flusher task that gets created for any bdi - * that has dirty data pending writeout - */ -void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) -{ - if (!bdi_cap_writeback_dirty(bdi)) - return; - - if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) { - printk(KERN_ERR "bdi %p/%s is not registered!\n", - bdi, bdi->name); - return; - } - - /* - * Check with the helper whether to proceed adding a task. Will only - * abort if we two or more simultanous calls to - * bdi_add_default_flusher_task() occured, further additions will block - * waiting for previous additions to finish. - */ - if (!test_and_set_bit(BDI_pending, &bdi->state)) { - list_del_rcu(&bdi->bdi_list); - - /* - * We must wait for the current RCU period to end before - * moving to the pending list. So schedule that operation - * from an RCU callback. - */ - call_rcu(&bdi->rcu_head, bdi_add_to_pending); - } -} - /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ @@ -541,23 +513,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { va_list args; - int ret = 0; struct device *dev; if (bdi->dev) /* The driver needs to use separate queues per device */ - goto exit; + return 0; va_start(args, fmt); dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); va_end(args); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - goto exit; - } - - spin_lock_bh(&bdi_lock); - list_add_tail_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); + if (IS_ERR(dev)) + return PTR_ERR(dev); bdi->dev = dev; @@ -569,21 +534,21 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, if (bdi_cap_flush_forker(bdi)) { struct bdi_writeback *wb = &bdi->wb; - wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", + wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", dev_name(dev)); - if (IS_ERR(wb->task)) { - wb->task = NULL; - ret = -ENOMEM; - - bdi_remove_from_list(bdi); - goto exit; - } + if (IS_ERR(wb->task)) + return PTR_ERR(wb->task); } bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); -exit: - return ret; + + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + trace_writeback_bdi_register(bdi); + return 0; } EXPORT_SYMBOL(bdi_register); @@ -598,31 +563,29 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct bdi_writeback *wb; - if (!bdi_cap_writeback_dirty(bdi)) return; /* - * If setup is pending, wait for that to complete first + * Make sure nobody finds us on the bdi_list anymore */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + bdi_remove_from_list(bdi); /* - * Make sure nobody finds us on the bdi_list anymore + * If setup is pending, wait for that to complete first */ - bdi_remove_from_list(bdi); + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); /* - * Finally, kill the kernel threads. We don't need to be RCU + * Finally, kill the kernel thread. We don't need to be RCU * safe anymore, since the bdi is gone from visibility. Force * unfreeze of the thread before calling kthread_stop(), otherwise * it would never exet if it is currently stuck in the refrigerator. */ - list_for_each_entry(wb, &bdi->wb_list, list) { - thaw_process(wb->task); - kthread_stop(wb->task); + if (bdi->wb.task) { + thaw_process(bdi->wb.task); + kthread_stop(bdi->wb.task); } } @@ -644,7 +607,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); + del_timer_sync(&bdi->wb.wakeup_timer); if (!bdi_cap_flush_forker(bdi)) bdi_wb_shutdown(bdi); @@ -655,6 +620,18 @@ void bdi_unregister(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_unregister); +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); +} + int bdi_init(struct backing_dev_info *bdi) { int i, err; @@ -665,9 +642,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; spin_lock_init(&bdi->wb_lock); - INIT_RCU_HEAD(&bdi->rcu_head); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->wb_list); INIT_LIST_HEAD(&bdi->work_list); bdi_wb_init(&bdi->wb, bdi); diff --git a/mm/filemap.c b/mm/filemap.c index 20e5642e9f9..3d4df44e422 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2238,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file, do { struct page *page; - pgoff_t index; /* Pagecache index for current page */ unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ void *fsdata; offset = (pos & (PAGE_CACHE_SIZE - 1)); - index = pos >> PAGE_CACHE_SHIFT; bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i)); diff --git a/mm/highmem.c b/mm/highmem.c index 66baa20f78f..7a0aa1be499 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/kgdb.h> #include <asm/tlbflush.h> /* @@ -470,6 +471,12 @@ void debug_kmap_atomic(enum km_type type) warn_count--; } } +#ifdef CONFIG_KGDB_KDB + if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) { + WARN_ON(1); + warn_count--; + } +#endif /* CONFIG_KGDB_KDB */ } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 54d42b009db..cc5be788a39 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,9 @@ #include <linux/bootmem.h> #include <linux/sysfs.h> #include <linux/slab.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } +pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} + /* * Return the size of the pages allocated when backing a VMA. In the majority * cases this will be same size as used by the page table entries. @@ -552,6 +561,7 @@ static void free_huge_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); + BUG_ON(page_mapcount(page)); INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); @@ -605,6 +615,8 @@ int PageHuge(struct page *page) return dtor == free_huge_page; } +EXPORT_SYMBOL_GPL(PageHuge); + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -2129,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); + page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -2140,6 +2153,19 @@ nomem: return -ENOMEM; } +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { + return 1; + } else + return 0; +} + void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -2198,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pte_none(pte)) continue; + /* + * HWPoisoned hugepage is already unmapped and dropped reference + */ + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + continue; + page = pte_page(pte); if (pte_dirty(pte)) set_page_dirty(page); @@ -2207,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { + page_remove_rmap(page); list_del(&page->lru); put_page(page); } @@ -2272,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, return 1; } +/* + * Hugetlb_cow() should be called with page lock of the original hugepage held. + */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) @@ -2286,8 +2322,13 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_count(old_page) == 1); + avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { + if (!trylock_page(old_page)) { + if (PageAnon(old_page)) + page_move_anon_rmap(old_page, vma, address); + } else + unlock_page(old_page); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2338,6 +2379,13 @@ retry_avoidcopy: return -PTR_ERR(new_page); } + /* + * When the original hugepage is shared one, it does not have + * anon_vma prepared. + */ + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + copy_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -2349,11 +2397,19 @@ retry_avoidcopy: ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ + mmu_notifier_invalidate_range_start(mm, + address & huge_page_mask(h), + (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); + page_remove_rmap(old_page); + hugepage_add_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; + mmu_notifier_invalidate_range_end(mm, + address & huge_page_mask(h), + (address & huge_page_mask(h)) + huge_page_size(h)); } page_cache_release(new_page); page_cache_release(old_page); @@ -2452,10 +2508,29 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); + page_dup_rmap(page); } else { lock_page(page); - page->mapping = HUGETLB_POISON; + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + hugepage_add_new_anon_rmap(page, vma, address); } + } else { + page_dup_rmap(page); + } + + /* + * Since memory error handler replaces pte into hwpoison swap entry + * at the time of error handling, a process which reserved but not have + * the mapping to the error hugepage does not have hwpoison swap entry. + * So we need to block accesses from such a process by checking + * PG_hwpoison bit here. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON; + goto backout_unlocked; } /* @@ -2507,10 +2582,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *page = NULL; struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + ptep = huge_pte_offset(mm, address); + if (ptep) { + entry = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON; + } + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; @@ -2548,6 +2631,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } + if (!pagecache_page) { + page = pte_page(entry); + lock_page(page); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, h |