aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorTony Luck <tony.luck@intel.com>2005-10-31 10:51:57 -0800
committerTony Luck <tony.luck@intel.com>2005-10-31 10:51:57 -0800
commitc7fb577e2a6cb04732541f2dc402bd46747f7558 (patch)
treedf3b1a1922ed13bfbcc45d08650c38beeb1a7bd1 /mm
parent9cec58dc138d6fcad9f447a19c8ff69f6540e667 (diff)
parent581c1b14394aee60aff46ea67d05483261ed6527 (diff)
manual update from upstream:
Applied Al's change 06a544971fad0992fe8b92c5647538d573089dd4 to new location of swiotlb.c Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig21
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bootmem.c1
-rw-r--r--mm/filemap.c24
-rw-r--r--mm/filemap_xip.c22
-rw-r--r--mm/fremap.c86
-rw-r--r--mm/highmem.c14
-rw-r--r--mm/hugetlb.c207
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memory.c993
-rw-r--r--mm/memory_hotplug.c138
-rw-r--r--mm/mempolicy.c463
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/mmap.c128
-rw-r--r--mm/mprotect.c19
-rw-r--r--mm/mremap.c193
-rw-r--r--mm/msync.c78
-rw-r--r--mm/nommu.c18
-rw-r--r--mm/page_alloc.c242
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/pdflush.c13
-rw-r--r--mm/rmap.c146
-rw-r--r--mm/shmem.c32
-rw-r--r--mm/slab.c13
-rw-r--r--mm/sparse.c99
-rw-r--r--mm/swap.c7
-rw-r--r--mm/swap_state.c11
-rw-r--r--mm/swapfile.c41
-rw-r--r--mm/thrash.c2
-rw-r--r--mm/tiny-shmem.c5
-rw-r--r--mm/truncate.c11
-rw-r--r--mm/vmalloc.c77
-rw-r--r--mm/vmscan.c14
33 files changed, 1748 insertions, 1382 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 391ffc54d13..1a4473fcb2c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -111,3 +111,24 @@ config SPARSEMEM_STATIC
config SPARSEMEM_EXTREME
def_bool y
depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# eventually, we can have this option just 'select SPARSEMEM'
+config MEMORY_HOTPLUG
+ bool "Allow for memory hot-add"
+ depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND
+
+comment "Memory hotplug is currently incompatible with Software Suspend"
+ depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+ int
+ default "4096" if ARM && !CPU_CACHE_VIPT
+ default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+ default "4"
diff --git a/mm/Makefile b/mm/Makefile
index 4cd69e3ce42..2fa6d2ca9f2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SHMEM) += shmem.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
-
+obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a58699b6579..e8c567177dc 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
if (j + 16 < BITS_PER_LONG)
prefetchw(page + j + 16);
__ClearPageReserved(page + j);
+ set_page_count(page + j, 0);
}
__free_pages(page, order);
i += BITS_PER_LONG;
diff --git a/mm/filemap.c b/mm/filemap.c
index b5346576e58..5d6e4c2000d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
*
* ->mmap_sem
* ->i_mmap_lock
- * ->page_table_lock (various places, mainly in mmap.c)
+ * ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
*
* ->mmap_sem
@@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
* ->anon_vma.lock (vma_adjust)
*
* ->anon_vma.lock
- * ->page_table_lock (anon_vma_prepare and various)
+ * ->page_table_lock or pte_lock (anon_vma_prepare and various)
*
- * ->page_table_lock
+ * ->page_table_lock or pte_lock
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
@@ -152,7 +152,7 @@ static int sync_page(void *word)
* in the ->sync_page() methods make essential use of the
* page_mapping(), merely passing the page down to the backing
* device's unplug functions when it's non-NULL, which in turn
- * ignore it for all cases but swap, where only page->private is
+ * ignore it for all cases but swap, where only page_private(page) is
* of interest. When page_mapping() does go NULL, the entire
* call stack gracefully ignores the page and returns.
* -- wli
@@ -377,7 +377,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
* This function does not add the page to the LRU. The caller must do that.
*/
int add_to_page_cache(struct page *page, struct address_space *mapping,
- pgoff_t offset, int gfp_mask)
+ pgoff_t offset, gfp_t gfp_mask)
{
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
@@ -401,7 +401,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
EXPORT_SYMBOL(add_to_page_cache);
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
- pgoff_t offset, int gfp_mask)
+ pgoff_t offset, gfp_t gfp_mask)
{
int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
if (ret == 0)
@@ -591,7 +591,7 @@ EXPORT_SYMBOL(find_lock_page);
* memory exhaustion.
*/
struct page *find_or_create_page(struct address_space *mapping,
- unsigned long index, unsigned int gfp_mask)
+ unsigned long index, gfp_t gfp_mask)
{
struct page *page, *cached_page = NULL;
int err;
@@ -683,7 +683,7 @@ struct page *
grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
struct page *page = find_get_page(mapping, index);
- unsigned int gfp_mask;
+ gfp_t gfp_mask;
if (page) {
if (!TestSetPageLocked(page))
@@ -1030,8 +1030,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
desc.error = 0;
do_generic_file_read(filp,ppos,&desc,file_read_actor);
retval += desc.written;
- if (!retval) {
- retval = desc.error;
+ if (desc.error) {
+ retval = retval ?: desc.error;
break;
}
}
@@ -1520,7 +1520,7 @@ repeat:
page_cache_release(page);
return err;
}
- } else {
+ } else if (vma->vm_flags & VM_NONLINEAR) {
/* No page was found just because we can't read it in now (being
* here implies nonblock != 0), but the page may exist, so set
* the PTE to fault it in later. */
@@ -1537,6 +1537,7 @@ repeat:
return 0;
}
+EXPORT_SYMBOL(filemap_populate);
struct vm_operations_struct generic_file_vm_ops = {
.nopage = filemap_nopage,
@@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
-EXPORT_SYMBOL(filemap_populate);
/*
* This is for filesystems which do not implement ->writepage.
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8c199f53773..9cf687e4a29 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,6 +174,8 @@ __xip_unmap (struct address_space * mapping,
unsigned long address;
pte_t *pte;
pte_t pteval;
+ spinlock_t *ptl;
+ struct page *page;
spin_lock(&mapping->i_mmap_lock);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -181,19 +183,17 @@ __xip_unmap (struct address_space * mapping,
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- /*
- * We need the page_table_lock to protect us from page faults,
- * munmap, fork, etc...
- */
- pte = page_check_address(ZERO_PAGE(address), mm,
- address);
- if (!IS_ERR(pte)) {
+ page = ZERO_PAGE(address);
+ pte = page_check_address(page, mm, address, &ptl);
+ if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
+ page_remove_rmap(page);
+ dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
- pte_unmap(pte);
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
+ page_cache_release(page);
}
}
spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +228,7 @@ xip_file_nopage(struct vm_area_struct * area,
page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
if (!IS_ERR(page)) {
- return page;
+ goto out;
}
if (PTR_ERR(page) != -ENODATA)
return NULL;
@@ -249,6 +249,8 @@ xip_file_nopage(struct vm_area_struct * area,
page = ZERO_PAGE(address);
}
+out:
+ page_cache_get(page);
return page;
}
diff --git a/mm/fremap.c b/mm/fremap.c
index ab23a0673c3..d862be3bc3e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -20,33 +20,32 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
+ struct page *page = NULL;
- if (pte_none(pte))
- return;
if (pte_present(pte)) {
unsigned long pfn = pte_pfn(pte);
-
flush_cache_page(vma, addr, pfn);
pte = ptep_clear_flush(vma, addr, ptep);
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
- if (!PageReserved(page)) {
- if (pte_dirty(pte))
- set_page_dirty(page);
- page_remove_rmap(page);
- page_cache_release(page);
- dec_mm_counter(mm, rss);
- }
+ if (unlikely(!pfn_valid(pfn))) {
+ print_bad_pte(vma, pte, addr);
+ goto out;
}
+ page = pfn_to_page(pfn);
+ if (pte_dirty(pte))
+ set_page_dirty(page);
+ page_remove_rmap(page);
+ page_cache_release(page);
} else {
if (!pte_file(pte))
free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear(mm, addr, ptep);
}
+out:
+ return !!page;
}
/*
@@ -64,21 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
pud_t *pud;
pgd_t *pgd;
pte_t pte_val;
+ spinlock_t *ptl;
+
+ BUG_ON(vma->vm_flags & VM_RESERVED);
pgd = pgd_offset(mm, addr);
- spin_lock(&mm->page_table_lock);
-
pud = pud_alloc(mm, pgd, addr);
if (!pud)
- goto err_unlock;
-
+ goto out;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
- goto err_unlock;
-
- pte = pte_alloc_map(mm, pmd, addr);
+ goto out;
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
- goto err_unlock;
+ goto out;
/*
* This page may have been truncated. Tell the
@@ -88,29 +86,27 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
inode = vma->vm_file->f_mapping->host;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (!page->mapping || page->index >= size)
- goto err_unlock;
+ goto unlock;
err = -ENOMEM;
if (page_mapcount(page) > INT_MAX/2)
- goto err_unlock;
+ goto unlock;
- zap_pte(mm, vma, addr, pte);
+ if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
+ inc_mm_counter(mm, file_rss);
- inc_mm_counter(mm,rss);
flush_icache_page(vma, page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
page_add_file_rmap(page);
pte_val = *pte;
- pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
-
err = 0;
-err_unlock:
- spin_unlock(&mm->page_table_lock);
+unlock:
+ pte_unmap_unlock(pte, ptl);
+out:
return err;
}
EXPORT_SYMBOL(install_page);
-
/*
* Install a file pte to a given virtual memory address, release any
* previously existing mapping.
@@ -124,37 +120,35 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
pud_t *pud;
pgd_t *pgd;
pte_t pte_val;
+ spinlock_t *ptl;
+
+ BUG_ON(vma->vm_flags & VM_RESERVED);
pgd = pgd_offset(mm, addr);
- spin_lock(&mm->page_table_lock);
-
pud = pud_alloc(mm, pgd, addr);
if (!pud)
- goto err_unlock;
-
+ goto out;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
- goto err_unlock;
-
- pte = pte_alloc_map(mm, pmd, addr);
+ goto out;
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
- goto err_unlock;
+ goto out;
- zap_pte(mm, vma, addr, pte);
+ if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, file_rss);
+ }
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
pte_val = *pte;
- pte_unmap(pte);
update_mmu_cache(vma, addr, pte_val);
- spin_unlock(&mm->page_table_lock);
- return 0;
-
-err_unlock:
- spin_unlock(&mm->page_table_lock);
+ pte_unmap_unlock(pte, ptl);
+ err = 0;
+out:
return err;
}
-
/***
* sys_remap_file_pages - remap arbitrary pages of a shared backing store
* file within an existing vma.
diff --git a/mm/highmem.c b/mm/highmem.c
index 90e1861e2da..ce2e7e8bbfa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,11 +30,9 @@
static mempool_t *page_pool, *isa_page_pool;
-static void *page_pool_alloc(gfp_t gfp_mask, void *data)
+static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data)
{
- unsigned int gfp = gfp_mask | (unsigned int) (long) data;
-
- return alloc_page(gfp);
+ return alloc_page(gfp_mask | GFP_DMA);
}
static void page_pool_free(void *page, void *data)
@@ -51,6 +49,12 @@ static void page_pool_free(void *page, void *data)
* n means that there are (n-1) current users of it.
*/
#ifdef CONFIG_HIGHMEM
+
+static void *page_pool_alloc(gfp_t gfp_mask, void *data)
+{
+ return alloc_page(gfp_mask);
+}
+
static int pkmap_count[LAST_PKMAP];
static unsigned int last_pkmap_nr;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
@@ -267,7 +271,7 @@ int init_emergency_isa_pool(void)
if (isa_page_pool)
return 0;
- isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA);
+ isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL);
if (!isa_page_pool)
BUG();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61d38067803..c9b43360fd3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -277,19 +277,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
unsigned long addr;
for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ src_pte = huge_pte_offset(src, addr);
+ if (!src_pte)
+ continue;
dst_pte = huge_pte_alloc(dst, addr);
if (!dst_pte)
goto nomem;
+ spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
- src_pte = huge_pte_offset(src, addr);
- if (src_pte && !pte_none(*src_pte)) {
+ if (!pte_none(*src_pte)) {
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
- add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+ add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
+ spin_unlock(&dst->page_table_lock);
}
return 0;
@@ -310,12 +314,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);
+ spin_lock(&mm->page_table_lock);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
- if (! ptep)
- /* This can happen on truncate, or if an
- * mmap() is aborted due to an error before
- * the prefault */
+ if (!ptep)
continue;
pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -324,96 +330,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
page = pte_page(pte);
put_page(page);
- add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE));
+ add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
}
- flush_tlb_range(vma, start, end);
-}
-
-void zap_hugepage_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long length)
-{
- struct mm_struct *mm = vma->vm_mm;
- spin_lock(&mm->page_table_lock);
- unmap_hugepage_range(vma, start, start + length);
spin_unlock(&mm->page_table_lock);
+ flush_tlb_range(vma, start, end);
}
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+ unsigned long idx)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
- BUG_ON(vma->vm_start & ~HPAGE_MASK);
- BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
- hugetlb_prefault_arch_hook(mm);
-
- spin_lock(&mm->page_table_lock);
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
- unsigned long idx;
- pte_t *pte = huge_pte_alloc(mm, addr);
- struct page *page;
-
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
+ struct page *page;
+ int err;
+ struct inode *inode = mapping->host;
+ unsigned long size;
+
+retry:
+ page = find_lock_page(mapping, idx);
+ if (page)
+ goto out;
+
+ /* Check to make sure the mapping hasn't been truncated */
+ size = i_size_read(inode) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto out;
+
+ if (hugetlb_get_quota(mapping))
+ goto out;
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ goto out;
+ }
- idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- page = find_get_page(mapping, idx);
- if (!page) {
- /* charge the fs quota first */
- if (hugetlb_get_quota(mapping)) {
- ret = -ENOMEM;
- goto out;
- }
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- ret = -ENOMEM;
- goto out;
- }
- ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
- if (! ret) {
- unlock_page(page);
- } else {
- hugetlb_put_quota(mapping);
- free_huge_page(page);
- goto out;
- }
- }
- add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ page = NULL;
}
out:
- spin_unlock(&mm->page_table_lock);
- return ret;
+ return page;
}
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully. Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access)
{
int ret = VM_FAULT_SIGBUS;
+ unsigned long idx;
+ unsigned long size;
pte_t *pte;
+ struct page *page;
+ struct address_space *mapping;
+
+ pte = huge_pte_alloc(mm, address);
+ if (!pte)
+ goto out;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+ /*
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
+ */
+ page = find_lock_huge_page(mapping, idx);
+ if (!page)
+ goto out;
spin_lock(&mm->page_table_lock);
- pte = huge_pte_offset(mm, address);
- if (pte && !pte_none(*pte))
- ret = VM_FAULT_MINOR;
+ size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto backout;
+
+ ret = VM_FAULT_MINOR;
+ if (!pte_none(*pte))
+ goto backout;
+
+ add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+out:
return ret;
+
+backout:
+ spin_unlock(&mm->page_table_lock);
+ hugetlb_put_quota(mapping);
+ unlock_page(page);
+ put_page(page);
+ goto out;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -423,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vpfn, vaddr = *position;
int remainder = *length;
- BUG_ON(!is_vm_hugetlb_page(vma));
-
vpfn = vaddr/PAGE_SIZE;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
+ pte_t *pte;
+ struct page *page;
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- /* Some archs (sparc64, sh*) have multiple
- * pte_ts to each hugepage. We have to make
- * sure we get the first, for the page
- * indexing below to work. */
- pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
- /* the hugetlb file might have been truncated */
- if (!pte || pte_none(*pte)) {
- remainder = 0;
- if (!i)
- i = -EFAULT;
- break;
- }
+ /*
+ * Some archs (sparc64, sh*) have multiple pte_ts to
+ * each hugepage. We have to make * sure we get the
+ * first, for the page indexing below to work.
+ */
+ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+ if (!pte || pte_none(*pte)) {
+ int ret;
- WARN_ON(!PageCompound(page));
+ spin_unlock(&mm->page_table_lock);
+ ret = hugetlb_fault(mm, vma, vaddr, 0);
+ spin_lock(&mm->page_table_lock);
+ if (ret == VM_FAULT_MINOR)
+ continue;
+
+ remainder = 0;
+ if (!i)
+ i = -EFAULT;
+ break;
+ }
+ if (pages) {
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
get_page(page);
pages[i] = page;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 20e075d1c64..17aaf3e1644 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
unsigned long start, unsigned long end)
{
*prev = vma;
- if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
return -EINVAL;
if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index 1db40e935e5..0f60baf6f69 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
struct page *page = pmd_page(*pmd);
pmd_clear(pmd);
+ pte_lock_deinit(page);
pte_free_tlb(tlb, page);
dec_page_state(nr_page_table_pages);
tlb->mm->nr_ptes--;
@@ -249,7 +250,7 @@ void free_pgd_range(struct mmu_gather **tlb,
free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
- if (!tlb_is_full_mm(*tlb))
+ if (!(*tlb)->fullmm)
flush_tlb_pgtables((*tlb)->mm, start, end);
}
@@ -260,6 +261,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start;
+ /*
+ * Hide vma from rmap and vmtruncate before freeing pgtables
+ */
+ anon_vma_unlink(vma);
+ unlink_file_vma(vma);
+
if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
@@ -272,6 +279,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
HPAGE_SIZE)) {
vma = next;
next = vma->vm_next;
+ anon_vma_unlink(vma);
+ unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
@@ -280,72 +289,78 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
}
}
-pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
- if (!pmd_present(*pmd)) {
- struct page *new;
-
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pmd_present(*pmd)) {
- pte_free(new);
- goto out;
- }
+ struct page *new = pte_alloc_one(mm, address);
+ if (!new)
+ return -ENOMEM;
+
+ pte_lock_init(new);
+ spin_lock(&mm->page_table_lock);
+ if (pmd_present(*pmd)) { /* Another has populated it */
+ pte_lock_deinit(new);
+ pte_free(new);
+ } else {
mm->nr_ptes++;
inc_page_state(nr_page_table_pages);
pmd_populate(mm, pmd, new);
}
-out:
- return pte_offset_map(pmd, address);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
}
-pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
{
- if (!pmd_present(*pmd)) {
- pte_t *new;
+ pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+ if (!new)
+ return -ENOMEM;
- spin_unlock(&mm->page_table_lock);
- new = pte_alloc_one_kernel(mm, address);
- spin_lock(&mm->page_table_lock);
- if (!new)
- return NULL;
+ spin_lock(&init_mm.page_table_lock);
+ if (pmd_present(*pmd)) /* Another has populated it */
+ pte_free_kernel(new);
+ else
+ pmd_populate_kernel(&init_mm, pmd, new);
+ spin_unlock(&init_mm.page_table_lock);
+ return 0;
+}
- /*
- * Because we dropped the lock, we should re-check the
- * entry, as somebody else could have populated it..
- */
- if (pmd_present(*pmd)) {
- pte_free_kernel(new);
- goto out;
- }
- pmd_populate_kernel(mm, pmd, new);
- }
-out:
- return pte_offset_kernel(pmd, address);
+static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+{
+ if (file_rss)
+ add_mm_counter(mm, file_rss, file_rss);
+ if (anon_rss)
+ add_mm_counter(mm, anon_rss, anon_rss);
+}
+
+/*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error.
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+ printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+ "vm_flags = %lx, vaddr = %lx\n",
+ (long long)pte_val(pte),
+ (vma->vm_mm == current->mm ? current->comm : "???"),
+ vma->vm_flags, vaddr);
+ dump_stack();
}
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
* covered by this vma.
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
*/