aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2011-04-26 10:22:15 +0200
committerJiri Kosina <jkosina@suse.cz>2011-04-26 10:22:59 +0200
commit07f9479a40cc778bc1462ada11f95b01360ae4ff (patch)
tree0676cf38df3844004bb3ebfd99dfa67a4a8998f5 /mm
parent9d5e6bdb3013acfb311ab407eeca0b6a6a3dedbf (diff)
parentcd2e49e90f1cae7726c9a2c54488d881d7f1cd1c (diff)
Merge branch 'master' into for-next
Fast-forwarded to current state of Linus' tree as there are patches to be applied for files that didn't exist on the old branch.
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig.debug25
-rw-r--r--mm/backing-dev.c18
-rw-r--r--mm/bootmem.c8
-rw-r--r--mm/compaction.c65
-rw-r--r--mm/filemap.c211
-rw-r--r--mm/huge_memory.c69
-rw-r--r--mm/hugetlb.c16
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/internal.h2
-rw-r--r--mm/kmemleak.c6
-rw-r--r--mm/ksm.c25
-rw-r--r--mm/memblock.c241
-rw-r--r--mm/memcontrol.c669
-rw-r--r--mm/memory-failure.c16
-rw-r--r--mm/memory.c106
-rw-r--r--mm/memory_hotplug.c4
-rw-r--r--mm/mempolicy.c3
-rw-r--r--mm/migrate.c58
-rw-r--r--mm/mlock.c17
-rw-r--r--mm/mmap.c15
-rw-r--r--mm/mremap.c11
-rw-r--r--mm/nobootmem.c10
-rw-r--r--mm/nommu.c58
-rw-r--r--mm/oom_kill.c89
-rw-r--r--mm/page-writeback.c25
-rw-r--r--mm/page_alloc.c95
-rw-r--r--mm/page_cgroup.c140
-rw-r--r--mm/page_io.c2
-rw-r--r--mm/pagewalk.c24
-rw-r--r--mm/percpu.c13
-rw-r--r--mm/readahead.c18
-rw-r--r--mm/rmap.c85
-rw-r--r--mm/shmem.c11
-rw-r--r--mm/slab.c61
-rw-r--r--mm/slob.c6
-rw-r--r--mm/slub.c376
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c189
-rw-r--r--mm/swap_state.c5
-rw-r--r--mm/swapfile.c411
-rw-r--r--mm/truncate.c22
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmalloc.c158
-rw-r--r--mm/vmscan.c66
-rw-r--r--mm/vmstat.c27
45 files changed, 2123 insertions, 1359 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index af7cfb43d2f..8b1a477162d 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,27 +1,24 @@
config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
- depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
- depends on !HIBERNATION || !PPC && !SPARC
+ depends on DEBUG_KERNEL
+ depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
depends on !KMEMCHECK
+ select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
of memory corruption.
+ For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
+ fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages(). Additionally,
+ this option cannot be enabled in combination with hibernation as
+ that would result in incorrect warnings of memory corruption after
+ a resume because free pages are not saved to the suspend image.
+
config WANT_PAGE_DEBUG_FLAGS
bool
config PAGE_POISONING
- bool "Debug page memory allocations"
- depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
- depends on !HIBERNATION
- select DEBUG_PAGEALLOC
+ bool
select WANT_PAGE_DEBUG_FLAGS
- ---help---
- Fill the pages with poison patterns after free_pages() and verify
- the patterns before alloc_pages(). This results in a large slowdown,
- but helps to find certain types of memory corruption.
-
- This option cannot be enabled in combination with hibernation as
- that would result in incorrect warnings of memory corruption after
- a resume because free pages are not saved to the suspend image.
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 027100d3022..befc87531e4 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-EXPORT_SYMBOL(default_unplug_io_fn);
-
struct backing_dev_info default_backing_dev_info = {
.name = "default",
.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
.state = 0,
.capabilities = BDI_CAP_MAP_COPY,
- .unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
@@ -73,14 +67,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
struct inode *inode;
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (sb->s_bdi == bdi)
- sb->s_bdi = NULL;
+ sb->s_bdi = &default_backing_dev_info;
}
spin_unlock(&sb_lock);
}
@@ -682,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
}
bdi_unregister(bdi);
@@ -793,7 +787,7 @@ EXPORT_SYMBOL(congestion_wait);
* jiffies for either a BDI to exit congestion of the given @sync queue
* or a write to complete.
*
- * In the absense of zone congestion, cond_resched() is called to yield
+ * In the absence of zone congestion, cond_resched() is called to yield
* the processor if necessary but otherwise does not sleep.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 07aeb89e396..01d5a4b3dd0 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -34,14 +34,6 @@ unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
-#ifdef CONFIG_CRASH_DUMP
-/*
- * If we have booted due to a crash, max_pfn will be a very low value. We need
- * to know the amount of memory that the previous kernel used.
- */
-unsigned long saved_max_pfn;
-#endif
-
bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
diff --git a/mm/compaction.c b/mm/compaction.c
index 8be430b812d..021a2960ef9 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,8 +42,6 @@ struct compact_control {
unsigned int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
-
- int compact_mode;
};
static unsigned long release_freepages(struct list_head *freelist)
@@ -155,7 +153,6 @@ static void isolate_freepages(struct zone *zone,
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- spin_lock_irqsave(&zone->lock, flags);
for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
@@ -178,9 +175,19 @@ static void isolate_freepages(struct zone *zone,
if (!suitable_migration_target(page))
continue;
- /* Found a block suitable for isolating free pages from */
- isolated = isolate_freepages_block(zone, pfn, freelist);
- nr_freepages += isolated;
+ /*
+ * Found a block suitable for isolating free pages from. Now
+ * we disabled interrupts, double check things are ok and
+ * isolate the pages. This is to minimise the time IRQs
+ * are disabled
+ */
+ isolated = 0;
+ spin_lock_irqsave(&zone->lock, flags);
+ if (suitable_migration_target(page)) {
+ isolated = isolate_freepages_block(zone, pfn, freelist);
+ nr_freepages += isolated;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
/*
* Record the highest PFN we isolated pages from. When next
@@ -190,7 +197,6 @@ static void isolate_freepages(struct zone *zone,
if (isolated)
high_pfn = max(high_pfn, pfn);
}
- spin_unlock_irqrestore(&zone->lock, flags);
/* split_free_page does not map the pages */
list_for_each_entry(page, freelist, lru) {
@@ -271,9 +277,27 @@ static unsigned long isolate_migratepages(struct zone *zone,
}
/* Time to isolate some pages for migration */
+ cond_resched();
spin_lock_irq(&zone->lru_lock);
for (; low_pfn < end_pfn; low_pfn++) {
struct page *page;
+ bool locked = true;
+
+ /* give a chance to irqs before checking need_resched() */
+ if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+ spin_unlock_irq(&zone->lru_lock);
+ locked = false;
+ }
+ if (need_resched() || spin_is_contended(&zone->lru_lock)) {
+ if (locked)
+ spin_unlock_irq(&zone->lru_lock);
+ cond_resched();
+ spin_lock_irq(&zone->lru_lock);
+ if (fatal_signal_pending(current))
+ break;
+ } else if (!locked)
+ spin_lock_irq(&zone->lru_lock);
+
if (!pfn_valid_within(low_pfn))
continue;
nr_scanned++;
@@ -397,10 +421,7 @@ static int compact_finished(struct zone *zone,
return COMPACT_COMPLETE;
/* Compaction run is not finished if the watermark is not met */
- if (cc->compact_mode != COMPACT_MODE_KSWAPD)
- watermark = low_wmark_pages(zone);
- else
- watermark = high_wmark_pages(zone);
+ watermark = low_wmark_pages(zone);
watermark += (1 << cc->order);
if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
@@ -413,15 +434,6 @@ static int compact_finished(struct zone *zone,
if (cc->order == -1)
return COMPACT_CONTINUE;
- /*
- * Generating only one page of the right order is not enough
- * for kswapd, we must continue until we're above the high
- * watermark as a pool for high order GFP_ATOMIC allocations
- * too.
- */
- if (cc->compact_mode == COMPACT_MODE_KSWAPD)
- return COMPACT_CONTINUE;
-
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
/* Job done if page is free of the right migratetype */
@@ -508,12 +520,13 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
unsigned long nr_migrate, nr_remaining;
+ int err;
if (!isolate_migratepages(zone, cc))
continue;
nr_migrate = cc->nr_migratepages;
- migrate_pages(&cc->migratepages, compaction_alloc,
+ err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
cc->sync);
update_nr_listpages(cc);
@@ -527,7 +540,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_remaining);
/* Release LRU pages not migrated */
- if (!list_empty(&cc->migratepages)) {
+ if (err) {
putback_lru_pages(&cc->migratepages);
cc->nr_migratepages = 0;
}
@@ -543,8 +556,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
- bool sync,
- int compact_mode)
+ bool sync)
{
struct compact_control cc = {
.nr_freepages = 0,
@@ -553,7 +565,6 @@ unsigned long compact_zone_order(struct zone *zone,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
.sync = sync,
- .compact_mode = compact_mode,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -599,8 +610,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
nodemask) {
int status;
- status = compact_zone_order(zone, order, gfp_mask, sync,
- COMPACT_MODE_DIRECT_RECLAIM);
+ status = compact_zone_order(zone, order, gfp_mask, sync);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
@@ -631,7 +641,6 @@ static int compact_node(int nid)
.nr_freepages = 0,
.nr_migratepages = 0,
.order = -1,
- .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
};
zone = &pgdat->node_zones[zoneid];
diff --git a/mm/filemap.c b/mm/filemap.c
index 83a45d35468..c641edf553a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@
* ->i_mutex
* ->i_alloc_sem (various)
*
- * ->inode_lock
- * ->sb_lock (fs/fs-writeback.c)
+ * inode_wb_list_lock
+ * sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
* ->i_mmap_lock
@@ -98,8 +98,10 @@
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (zap_pte_range->set_page_dirty)
+ * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
+ * ->inode->i_lock (page_remove_rmap->set_page_dirty)
+ * inode_wb_list_lock (zap_pte_range->set_page_dirty)
+ * ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
* (code doesn't rely on that order, so you could switch it around)
@@ -108,11 +110,11 @@
*/
/*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold the mapping's tree_lock.
*/
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -137,7 +139,15 @@ void __remove_from_page_cache(struct page *page)
}
}
-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked. It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
void (*freepage)(struct page *);
@@ -146,54 +156,25 @@ void remove_from_page_cache(struct page *page)
freepage = mapping->a_ops->freepage;
spin_lock_irq(&mapping->tree_lock);
- __remove_from_page_cache(page);
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
if (freepage)
freepage(page);
+ page_cache_release(page);
}
-EXPORT_SYMBOL(remove_from_page_cache);
+EXPORT_SYMBOL(delete_from_page_cache);
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
{
- struct address_space *mapping;
- struct page *page;
-
- page = container_of((unsigned long *)word, struct page, flags);
-
- /*
- * page_mapping() is being called without PG_locked held.
- * Some knowledge of the state and use of the page is used to
- * reduce the requirements down to a memory barrier.
- * The danger here is of a stale page_mapping() return value
- * indicating a struct address_space different from the one it's
- * associated with when it is associated with one.
- * After smp_mb(), it's either the correct page_mapping() for
- * the page, or an old page_mapping() and the page's own
- * page_mapping() has gone NULL.
- * The ->sync_page() address_space operation must tolerate
- * page_mapping() going NULL. By an amazing coincidence,
- * this comes about because none of the users of the page
- * in the ->sync_page() methods make essential use of the
- * page_mapping(), merely passing the page down to the backing
- * device's unplug functions when it's non-NULL, which in turn
- * ignore it for all cases but swap, where only page_private(page) is
- * of interest. When page_mapping() does go NULL, the entire
- * call stack gracefully ignores the page and returns.
- * -- wli
- */
- smp_mb();
- mapping = page_mapping(page);
- if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
- mapping->a_ops->sync_page(page);
io_schedule();
return 0;
}
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
{
- sync_page(word);
+ sleep_on_page(word);
return fatal_signal_pending(current) ? -EINTR : 0;
}
@@ -387,6 +368,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
EXPORT_SYMBOL(filemap_write_and_wait_range);
/**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old: page to be replaced
+ * @new: page to replace with
+ * @gfp_mask: allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page. Both the old and new pages must be
+ * locked. This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic. The only way this function can fail is
+ * memory allocation failure.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+ int error;
+ struct mem_cgroup *memcg = NULL;
+
+ VM_BUG_ON(!PageLocked(old));
+ VM_BUG_ON(!PageLocked(new));
+ VM_BUG_ON(new->mapping);
+
+ /*
+ * This is not page migration, but prepare_migration and
+ * end_migration does enough work for charge replacement.
+ *
+ * In the longer term we probably want a specialized function
+ * for moving the charge from old to new in a more efficient
+ * manner.
+ */
+ error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
+ if (error)
+ return error;
+
+ error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (!error) {
+ struct address_space *mapping = old->mapping;
+ void (*freepage)(struct page *);
+
+ pgoff_t offset = old->index;
+ freepage = mapping->a_ops->freepage;
+
+ page_cache_get(new);
+ new->mapping = mapping;
+ new->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(old);
+ error = radix_tree_insert(&mapping->page_tree, offset, new);
+ BUG_ON(error);
+ mapping->nrpages++;
+ __inc_zone_page_state(new, NR_FILE_PAGES);
+ if (PageSwapBacked(new))
+ __inc_zone_page_state(new, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ if (freepage)
+ freepage(old);
+ page_cache_release(old);
+ mem_cgroup_end_migration(memcg, old, new, true);
+ } else {
+ mem_cgroup_end_migration(memcg, old, new, false);
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
+/**
* add_to_page_cache_locked - add a locked page to the pagecache
* @page: page to add
* @mapping: the page's address_space
@@ -479,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
EXPORT_SYMBOL(__page_cache_alloc);
#endif
-static int __sleep_on_page_lock(void *word)
-{
- io_schedule();
- return 0;
-}
-
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
@@ -512,7 +557,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
if (test_bit(bit_nr, &page->flags))
- __wait_on_bit(page_waitqueue(page), &wait, sync_page,
+ __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);
@@ -576,17 +621,12 @@ EXPORT_SYMBOL(end_page_writeback);
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait. However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
*/
void __lock_page(struct page *page)
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+ __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
@@ -596,24 +636,10 @@ int __lock_page_killable(struct page *page)
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
return __wait_on_bit_lock(page_waitqueue(page), &wait,
- sync_page_killable, TASK_KILLABLE);
+ sleep_on_page_killable, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
-{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
- TASK_UNINTERRUPTIBLE);
-}
-
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags)
{
@@ -621,8 +647,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
__lock_page(page);
return 1;
} else {
- up_read(&mm->mmap_sem);
- wait_on_page_locked(page);
+ if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ up_read(&mm->mmap_sem);
+ wait_on_page_locked(page);
+ }
return 0;
}
}
@@ -782,9 +810,13 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
+
+ /*
+ * This can only trigger when the entry at index 0 moves out
+ * of or back to the root: none yet gotten, safe to restart.
+ */
if (radix_tree_deref_retry(page)) {
- if (ret)
- start = pages[ret-1]->index;
+ WARN_ON(start | i);
goto restart;
}
@@ -800,6 +832,13 @@ repeat:
pages[ret] = page;
ret++;
}
+
+ /*
+ * If all entries were removed before we could secure them,
+ * try again, because callers stop trying once 0 is returned.
+ */
+ if (unlikely(!ret && nr_found))
+ goto restart;
rcu_read_unlock();
return ret;
}
@@ -834,6 +873,11 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
+
+ /*
+ * This can only trigger when the entry at index 0 moves out
+ * of or back to the root: none yet gotten, safe to restart.
+ */
if (radix_tree_deref_retry(page))
goto restart;
@@ -894,6 +938,11 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
+
+ /*
+ * This can only trigger when the entry at index 0 moves out
+ * of or back to the root: none yet gotten, safe to restart.
+ */
if (radix_tree_deref_retry(page))
goto restart;
@@ -909,6 +958,13 @@ repeat:
pages[ret] = page;
ret++;
}
+
+ /*
+ * If all entries were removed before we could secure them,
+ * try again, because callers stop trying once 0 is returned.
+ */
+ if (unlikely(!ret && nr_found))
+ goto restart;
rcu_read_unlock();
if (ret)
@@ -1298,12 +1354,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
+ struct blk_plug plug;
count = 0;
retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
if (retval)
return retval;
+ blk_start_plug(&plug);
+
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) {
loff_t size;
@@ -1376,6 +1435,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
break;
}
out:
+ blk_finish_plug(&plug);
return retval;
}
EXPORT_SYMBOL(generic_file_aio_read);
@@ -2487,11 +2547,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ struct blk_plug plug;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
+ blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
@@ -2502,6 +2564,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err < 0 && ret > 0)
ret = err;
}
+ blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(generic_file_aio_write);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 113e35c4750..470dcda10ad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -244,24 +244,28 @@ static ssize_t single_flag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf,
enum transparent_hugepage_flag flag)
{
- if (test_bit(flag, &transparent_hugepage_flags))
- return sprintf(buf, "[yes] no\n");
- else
- return sprintf(buf, "yes [no]\n");
+ return sprintf(buf, "%d\n",
+ !!test_bit(flag, &transparent_hugepage_flags));
}
+
static ssize_t single_flag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count,
enum transparent_hugepage_flag flag)
{
- if (!memcmp("yes", buf,
- min(sizeof("yes")-1, count))) {
+ unsigned long value;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &value);
+ if (ret < 0)
+ return ret;
+ if (value > 1)
+ return -EINVAL;
+
+ if (value)
set_bit(flag, &transparent_hugepage_flags);
- } else if (!memcmp("no", buf,
- min(sizeof("no")-1, count))) {
+ else
clear_bit(flag, &transparent_hugepage_flags);
- } else
- return -EINVAL;
return count;
}
@@ -643,23 +647,24 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,