From d32311fed70d12f14e585feb4653571b1e2b0e6d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 17 Sep 2005 14:41:40 +1000
Subject: [PATCH] Introduce sg_set_buf

sg_init_one is a nice tool for the block layer.  However, users
of struct scatterlist in other subsystems don't usually need the
DMA attributes.  For them it's a waste of time and space to
initialise the whole struct scatterlist structure.

Therefore this patch adds a new function sg_set_buf to initialise
a scatterlist without zeroing the DMA attributes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/scatterlist.h | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 7f717e95ae3..66ff545552f 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -1,14 +1,23 @@
 #ifndef _LINUX_SCATTERLIST_H
 #define _LINUX_SCATTERLIST_H
 
-static inline void sg_init_one(struct scatterlist *sg,
-			       u8 *buf, unsigned int buflen)
-{
-	memset(sg, 0, sizeof(*sg));
+#include <asm/scatterlist.h>
+#include <linux/mm.h>
+#include <linux/string.h>
 
+static inline void sg_set_buf(struct scatterlist *sg, void *buf,
+			      unsigned int buflen)
+{
 	sg->page = virt_to_page(buf);
 	sg->offset = offset_in_page(buf);
 	sg->length = buflen;
 }
 
+static inline void sg_init_one(struct scatterlist *sg, void *buf,
+			       unsigned int buflen)
+{
+	memset(sg, 0, sizeof(*sg));
+	sg_set_buf(sg, buf, buflen);
+}
+
 #endif /* _LINUX_SCATTERLIST_H */
-- 
cgit v1.2.3-18-g5258


From 930fc45a49ddebe7555cc5c837d82b9c27e65ff4 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sat, 29 Oct 2005 18:15:41 -0700
Subject: [PATCH] vmalloc_node

This patch adds

vmalloc_node(size, node)	-> Allocate necessary memory on the specified node

and

get_vm_area_node(size, flags, node)

and the other functions that it depends on.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/vmalloc.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3701a0673d2..1d5577b2b75 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -32,10 +32,14 @@ struct vm_struct {
  *	Highlevel APIs for driver use
  */
 extern void *vmalloc(unsigned long size);
+extern void *vmalloc_node(unsigned long size, int node);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
-extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot);
+extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
+				pgprot_t prot);
+extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask,
+				pgprot_t prot, int node);
 extern void vfree(void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
@@ -48,6 +52,8 @@ extern void vunmap(void *addr);
 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
 extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 					unsigned long start, unsigned long end);
+extern struct vm_struct *get_vm_area_node(unsigned long size,
+					unsigned long flags, int node);
 extern struct vm_struct *remove_vm_area(void *addr);
 extern struct vm_struct *__remove_vm_area(void *addr);
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
-- 
cgit v1.2.3-18-g5258


From 63f324cf0792ed69089b79d6921ba3aaea97af50 Mon Sep 17 00:00:00 2001
From: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Date: Sat, 29 Oct 2005 18:15:43 -0700
Subject: [PATCH] fix alpha breakage

barrier.h uses barrier() in non-SMP case.  And doesn't include compiler.h.

Cc: Al Viro <viro@ftp.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/barrier.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/asm-alpha/barrier.h b/include/asm-alpha/barrier.h
index 229c83fe77c..681ff581afa 100644
--- a/include/asm-alpha/barrier.h
+++ b/include/asm-alpha/barrier.h
@@ -1,6 +1,8 @@
 #ifndef __BARRIER_H
 #define __BARRIER_H
 
+#include <asm/compiler.h>
+
 #define mb() \
 __asm__ __volatile__("mb": : :"memory")
 
-- 
cgit v1.2.3-18-g5258


From eb92f4ef320b738e41ad43476a5d05c8a20d5cc7 Mon Sep 17 00:00:00 2001
From: Rik Van Riel <riel@redhat.com>
Date: Sat, 29 Oct 2005 18:15:44 -0700
Subject: [PATCH] add sem_is_read/write_locked()

Add sem_is_read/write_locked functions to the read/write semaphores, along the
same lines of the *_is_locked spinlock functions.  The swap token tuning patch
uses sem_is_read_locked; sem_is_write_locked is added for completeness.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/rwsem.h      | 5 +++++
 include/asm-i386/rwsem.h       | 5 +++++
 include/asm-ia64/rwsem.h       | 5 +++++
 include/asm-ppc/rwsem.h        | 5 +++++
 include/asm-ppc64/rwsem.h      | 5 +++++
 include/asm-s390/rwsem.h       | 5 +++++
 include/asm-sh/rwsem.h         | 5 +++++
 include/asm-sparc64/rwsem.h    | 5 +++++
 include/asm-x86_64/rwsem.h     | 5 +++++
 include/linux/rwsem-spinlock.h | 5 +++++
 10 files changed, 50 insertions(+)

(limited to 'include')

diff --git a/include/asm-alpha/rwsem.h b/include/asm-alpha/rwsem.h
index 8e058a67c9a..fafdd4f7010 100644
--- a/include/asm-alpha/rwsem.h
+++ b/include/asm-alpha/rwsem.h
@@ -262,5 +262,10 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem)
 #endif
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ALPHA_RWSEM_H */
diff --git a/include/asm-i386/rwsem.h b/include/asm-i386/rwsem.h
index 7625a675852..be4ab859238 100644
--- a/include/asm-i386/rwsem.h
+++ b/include/asm-i386/rwsem.h
@@ -284,5 +284,10 @@ LOCK_PREFIX	"xadd %0,(%2)"
 	return tmp+delta;
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _I386_RWSEM_H */
diff --git a/include/asm-ia64/rwsem.h b/include/asm-ia64/rwsem.h
index e18b5ab0cb7..1327c91ea39 100644
--- a/include/asm-ia64/rwsem.h
+++ b/include/asm-ia64/rwsem.h
@@ -186,4 +186,9 @@ __downgrade_write (struct rw_semaphore *sem)
 #define rwsem_atomic_add(delta, sem)	atomic64_add(delta, (atomic64_t *)(&(sem)->count))
 #define rwsem_atomic_update(delta, sem)	atomic64_add_return(delta, (atomic64_t *)(&(sem)->count))
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* _ASM_IA64_RWSEM_H */
diff --git a/include/asm-ppc/rwsem.h b/include/asm-ppc/rwsem.h
index 3e738f483c1..3501ea72f88 100644
--- a/include/asm-ppc/rwsem.h
+++ b/include/asm-ppc/rwsem.h
@@ -168,5 +168,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _PPC_RWSEM_XADD_H */
diff --git a/include/asm-ppc64/rwsem.h b/include/asm-ppc64/rwsem.h
index bd5c2f09357..7a647fae376 100644
--- a/include/asm-ppc64/rwsem.h
+++ b/include/asm-ppc64/rwsem.h
@@ -163,5 +163,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _PPC_RWSEM_XADD_H */
diff --git a/include/asm-s390/rwsem.h b/include/asm-s390/rwsem.h
index 8c0cebbfc03..0422a085dd5 100644
--- a/include/asm-s390/rwsem.h
+++ b/include/asm-s390/rwsem.h
@@ -351,5 +351,10 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 	return new;
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _S390_RWSEM_H */
diff --git a/include/asm-sh/rwsem.h b/include/asm-sh/rwsem.h
index 1be4337f525..0262d3d1e5e 100644
--- a/include/asm-sh/rwsem.h
+++ b/include/asm-sh/rwsem.h
@@ -166,5 +166,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_SH_RWSEM_H */
diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h
index 4568ee4022d..cef5e827042 100644
--- a/include/asm-sparc64/rwsem.h
+++ b/include/asm-sparc64/rwsem.h
@@ -56,6 +56,11 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
 	atomic_add(delta, (atomic_t *)(&sem->count));
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _SPARC64_RWSEM_H */
diff --git a/include/asm-x86_64/rwsem.h b/include/asm-x86_64/rwsem.h
index c002175b6e8..46077e9c191 100644
--- a/include/asm-x86_64/rwsem.h
+++ b/include/asm-x86_64/rwsem.h
@@ -274,5 +274,10 @@ LOCK_PREFIX	"xaddl %0,(%2)"
 	return tmp+delta;
 }
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->count != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _X8664_RWSEM_H */
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index b52a2af25f1..f30f805080a 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -61,5 +61,10 @@ extern void FASTCALL(__up_read(struct rw_semaphore *sem));
 extern void FASTCALL(__up_write(struct rw_semaphore *sem));
 extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem));
 
+static inline int rwsem_is_locked(struct rw_semaphore *sem)
+{
+	return (sem->activity != 0);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_RWSEM_SPINLOCK_H */
-- 
cgit v1.2.3-18-g5258


From dfcd3c0dc426bb75770c34b40e14f2da8845ea62 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 29 Oct 2005 18:15:48 -0700
Subject: [PATCH] Convert mempolicies to nodemask_t

The NUMA policy code predated nodemask_t so it used open coded bitmaps.
Convert everything to nodemask_t.  Big patch, but shouldn't have any actual
behaviour changes (except I removed one unnecessary check against
node_online_map and one unnecessary BUG_ON)

Signed-off-by: "Andi Kleen" <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 58385ee1c0a..38e60a09939 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -27,10 +27,10 @@
 
 #include <linux/config.h>
 #include <linux/mmzone.h>
-#include <linux/bitmap.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
+#include <linux/nodemask.h>
 
 struct vm_area_struct;
 
@@ -63,7 +63,7 @@ struct mempolicy {
 	union {
 		struct zonelist  *zonelist;	/* bind */
 		short 		 preferred_node; /* preferred */
-		DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */
+		nodemask_t	 nodes;		/* interleave */
 		/* undefined for default */
 	} v;
 };
-- 
cgit v1.2.3-18-g5258


From ab50b8ed818016cfecd747d6d4bb9139986bc029 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:56 -0700
Subject: [PATCH] mm: vm_stat_account unshackled

The original vm_stat_account has fallen into disuse, with only one user, and
only one user of vm_stat_unaccount.  It's easier to keep track if we convert
them all to __vm_stat_account, then free it from its __shackles.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e1649578fb0..376a466743b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -928,26 +928,14 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long,
 		unsigned long, unsigned long, pgprot_t);
 
 #ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
+void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
 #else
-static inline void __vm_stat_account(struct mm_struct *mm,
+static inline void vm_stat_account(struct mm_struct *mm,
 			unsigned long flags, struct file *file, long pages)
 {
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void vm_stat_account(struct vm_area_struct *vma)
-{
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-							vma_pages(vma));
-}
-
-static inline void vm_stat_unaccount(struct vm_area_struct *vma)
-{
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-							-vma_pages(vma));
-}
-
 /* update per process rss and vm hiwater data */
 extern void update_mem_hiwater(struct task_struct *tsk);
 
-- 
cgit v1.2.3-18-g5258


From a8fb5618dab7e45c8990f3155628d772a9ed45f9 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:57 -0700
Subject: [PATCH] mm: unlink_file_vma, remove_vma

Divide remove_vm_struct into two parts: first anon_vma_unlink plus
unlink_file_vma, to unlink the vma from the list and tree by which rmap or
vmtruncate might find it; then remove_vma to close, fput and free.

The intention here is to do the anon_vma_unlink and unlink_file_vma earlier,
in free_pgtables before freeing any page tables: so we can be sure that any
page tables traversed by rmap and vmtruncate are stable (and other, ordinary
cases are stabilized by holding mmap_sem).

This will be crucial to traversing pgd,pud,pmd without page_table_lock.  But
testing the split-out patch showed that lifting the page_table_lock is
symbiotically necessary to make this change - the lock ordering is wrong to
move those unlinks into free_pgtables while it's under ptlock.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 376a466743b..0c64484d8ae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -834,6 +834,7 @@ extern int split_vma(struct mm_struct *,
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
+extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff);
 extern void exit_mmap(struct mm_struct *);
-- 
cgit v1.2.3-18-g5258


From 65500d234e74fc4e8f18e1a429bc24e51e75de4a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:59 -0700
Subject: [PATCH] mm: page fault handlers tidyup

Impose a little more consistency on the page fault handlers do_wp_page,
do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their
arguments in the same order, called the same names?

break_cow is all very well, but what it did was inlined elsewhere: easier to
compare if it's brought back into do_wp_page.

do_file_page's fallback to do_no_page dates from a time when we were testing
pte_file by using it wherever possible: currently it's peculiar to nonlinear
vmas, so just check that.  BUG_ON if not?  Better not, it's probably page
table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's
use that for do_wp_page's invalid pfn too.

Hah!  Someone in the ppc64 world noticed pte_ERROR was unused so removed it:
restored (and say "pud" not "pmd" in its pud_ERROR).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ppc64/pgtable.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h
index c83679c9d2b..2eb1778a3a1 100644
--- a/include/asm-ppc64/pgtable.h
+++ b/include/asm-ppc64/pgtable.h
@@ -478,10 +478,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
 
+#define pte_ERROR(e) \
+	printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pmd_ERROR(e) \
 	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
 #define pud_ERROR(e) \
-	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e))
+	printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pgd_ERROR(e) \
 	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
-- 
cgit v1.2.3-18-g5258


From 15a23ffa2fc91cebdac44d4aee994f59d5c28dc0 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:01 -0700
Subject: [PATCH] mm: tlb_gather_mmu get_cpu_var

tlb_gather_mmu dates from before kernel preemption was allowed, and uses
smp_processor_id or __get_cpu_var to find its per-cpu mmu_gather.  That works
because it's currently only called after getting page_table_lock, which is not
dropped until after the matching tlb_finish_mmu.  But don't rely on that, it
will soon change: now disable preemption internally by proper get_cpu_var in
tlb_gather_mmu, put_cpu_var in tlb_finish_mmu.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-arm/tlb.h     |  5 +++--
 include/asm-arm26/tlb.h   |  7 ++++---
 include/asm-generic/tlb.h | 10 +++++-----
 include/asm-ia64/tlb.h    |  6 ++++--
 include/asm-sparc64/tlb.h |  4 +++-
 5 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h
index 9bb325c5464..da41df20928 100644
--- a/include/asm-arm/tlb.h
+++ b/include/asm-arm/tlb.h
@@ -39,8 +39,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	int cpu = smp_processor_id();
-	struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu);
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
 	tlb->freed = 0;
@@ -65,6 +64,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
 }
 
 static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb)
diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h
index 1316352a58f..8486b00a679 100644
--- a/include/asm-arm26/tlb.h
+++ b/include/asm-arm26/tlb.h
@@ -17,13 +17,12 @@ struct mmu_gather {
         unsigned int            avoided_flushes;
 };
 
-extern struct mmu_gather mmu_gathers[NR_CPUS];
+DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-        int cpu = smp_processor_id();
-        struct mmu_gather *tlb = &mmu_gathers[cpu];
+        struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
         tlb->mm = mm;
         tlb->freed = 0;
@@ -52,6 +51,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
         /* keep the page table cache within bounds */
         check_pgt_cache();
+
+        put_cpu_var(mmu_gathers);
 }
 
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 7d0298347ee..c8232622c8d 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -35,9 +35,7 @@
 #endif
 
 /* struct mmu_gather is an opaque type used by the mm code for passing around
- * any data needed by arch specific code for tlb_remove_page.  This structure
- * can be per-CPU or per-MM as the page table lock is held for the duration of
- * TLB shootdown.
+ * any data needed by arch specific code for tlb_remove_page.
  */
 struct mmu_gather {
 	struct mm_struct	*mm;
@@ -57,7 +55,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id());
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
 
@@ -85,7 +83,7 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 /* tlb_finish_mmu
  *	Called at the end of the shootdown operation to free up any resources
- *	that were required.  The page table lock is still held at this point.
+ *	that were required.
  */
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
@@ -101,6 +99,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
 }
 
 static inline unsigned int
diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h
index 3a9a6d1be75..1b82299d7c1 100644
--- a/include/asm-ia64/tlb.h
+++ b/include/asm-ia64/tlb.h
@@ -129,7 +129,7 @@ ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long e
 static inline struct mmu_gather *
 tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &__get_cpu_var(mmu_gathers);
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
 	/*
@@ -154,7 +154,7 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 
 /*
  * Called at the end of the shootdown operation to free up any resources that were
- * collected.  The page table lock is still held at this point.
+ * collected.
  */
 static inline void
 tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
@@ -174,6 +174,8 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
 }
 
 static inline unsigned int
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h
index 9baf57db01d..169309bdbf8 100644
--- a/include/asm-sparc64/tlb.h
+++ b/include/asm-sparc64/tlb.h
@@ -44,7 +44,7 @@ extern void flush_tlb_pending(void);
 
 static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
+	struct mmu_gather *mp = &get_cpu_var(mmu_gathers);
 
 	BUG_ON(mp->tlb_nr);
 
@@ -97,6 +97,8 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
+
+	put_cpu_var(mmu_gathers);
 }
 
 static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp)
-- 
cgit v1.2.3-18-g5258


From 4d6ddfa9242bc3d27fb0f7248f6fdee0299c731f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:02 -0700
Subject: [PATCH] mm: tlb_is_full_mm was obscure

tlb_is_full_mm?  What does that mean?  The TLB is full?  No, it means that the
mm's last user has gone and the whole mm is being torn down.  And it's an
inline function because sparc64 uses a different (slightly better)
"tlb_frozen" name for the flag others call "fullmm".

And now the ptep_get_and_clear_full macro used in zap_pte_range refers
directly to tlb->fullmm, which would be wrong for sparc64.  Rather than
correct that, I'd prefer to scrap tlb_is_full_mm altogether, and change
sparc64 to just use the same poor name as everyone else - is that okay?

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-arm/tlb.h     |  5 -----
 include/asm-arm26/tlb.h   |  7 -------
 include/asm-generic/tlb.h |  6 ------
 include/asm-ia64/tlb.h    |  6 ------
 include/asm-sparc64/tlb.h | 13 ++++---------
 5 files changed, 4 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h
index da41df20928..a35ab0f2e25 100644
--- a/include/asm-arm/tlb.h
+++ b/include/asm-arm/tlb.h
@@ -68,11 +68,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 	put_cpu_var(mmu_gathers);
 }
 
-static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb)
-{
-	return tlb->fullmm;
-}
-
 #define tlb_remove_tlb_entry(tlb,ptep,address)	do { } while (0)
 
 /*
diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h
index 8486b00a679..c7d54ca0a23 100644
--- a/include/asm-arm26/tlb.h
+++ b/include/asm-arm26/tlb.h
@@ -55,13 +55,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
         put_cpu_var(mmu_gathers);
 }
 
-
-static inline unsigned int
-tlb_is_full_mm(struct mmu_gather *tlb)
-{
-     return tlb->fullmm;
-}
-
 #define tlb_remove_tlb_entry(tlb,ptep,address)  do { } while (0)
 //#define tlb_start_vma(tlb,vma)                  do { } while (0)
 //FIXME - ARM32 uses this now that things changed in the kernel. seems like it may be pointless on arm26, however to get things compiling...
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index c8232622c8d..5d352a70f00 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -103,12 +103,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 	put_cpu_var(mmu_gathers);
 }
 
-static inline unsigned int
-tlb_is_full_mm(struct mmu_gather *tlb)
-{
-	return tlb->fullmm;
-}
-
 /* tlb_remove_page
  *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
  *	handling the additional races in SMP caused by other CPUs caching valid
diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h
index 1b82299d7c1..0bbd79f6a79 100644
--- a/include/asm-ia64/tlb.h
+++ b/include/asm-ia64/tlb.h
@@ -178,12 +178,6 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 	put_cpu_var(mmu_gathers);
 }
 
-static inline unsigned int
-tlb_is_full_mm(struct mmu_gather *tlb)
-{
-     return tlb->fullmm;
-}
-
 /*
  * Logically, this routine frees PAGE.  On MP machines, the actual freeing of the page
  * must be delayed until after the TLB has been flushed (see comments at the beginning of
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h
index 169309bdbf8..5d194eae870 100644
--- a/include/asm-sparc64/tlb.h
+++ b/include/asm-sparc64/tlb.h
@@ -25,7 +25,7 @@ struct mmu_gather {
 	struct mm_struct *mm;
 	unsigned int pages_nr;
 	unsigned int need_flush;
-	unsigned int tlb_frozen;
+	unsigned int fullmm;
 	unsigned int tlb_nr;
 	unsigned long freed;
 	unsigned long vaddrs[TLB_BATCH_NR];
@@ -50,7 +50,7 @@ static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned i
 
 	mp->mm = mm;
 	mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U;
-	mp->tlb_frozen = full_mm_flush;
+	mp->fullmm = full_mm_flush;
 	mp->freed = 0;
 
 	return mp;
@@ -88,10 +88,10 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un
 
 	tlb_flush_mmu(mp);
 
-	if (mp->tlb_frozen) {
+	if (mp->fullmm) {
 		if (CTX_VALID(mm->context))
 			do_flush_tlb_mm(mm);
-		mp->tlb_frozen = 0;
+		mp->fullmm = 0;
 	} else
 		flush_tlb_pending();
 
@@ -101,11 +101,6 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un
 	put_cpu_var(mmu_gathers);
 }
 
-static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp)
-{
-	return mp->tlb_frozen;
-}
-
 static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page)
 {
 	mp->need_flush = 1;
-- 
cgit v1.2.3-18-g5258


From fc2acab31be8e869b2d5f6de12f557f6f054f19c Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:03 -0700
Subject: [PATCH] mm: tlb_finish_mmu forget rss

zap_pte_range has been counting the pages it frees in tlb->freed, then
tlb_finish_mmu has used that to update the mm's rss.  That got stranger when I
added anon_rss, yet updated it by a different route; and stranger when rss and
anon_rss became mm_counters with special access macros.  And it would no
longer be viable if we're relying on page_table_lock to stabilize the
mm_counter, but calling tlb_finish_mmu outside that lock.

Remove the mmu_gather's freed field, let tlb_finish_mmu stick to its own
business, just decrement the rss mm_counter in zap_pte_range (yes, there was
some point to batching the update, and a subsequent patch restores that).  And
forget the anal paranoia of first reading the counter to avoid going negative
- if rss does go negative, just fix that bug.

Remove the mmu_gather's flushes and avoided_flushes from arm and arm26: no use
was being made of them.  But arm26 alone was actually using the freed, in the
way some others use need_flush: give it a need_flush.  arm26 seems to prefer
spaces to tabs here: respect that.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-arm/tlb.h     | 15 +--------------
 include/asm-arm26/tlb.h   | 35 +++++++++++++----------------------
 include/asm-generic/tlb.h |  9 ---------
 include/asm-ia64/tlb.h    |  9 ---------
 include/asm-sparc64/tlb.h | 14 ++------------
 5 files changed, 16 insertions(+), 66 deletions(-)

(limited to 'include')

diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h
index a35ab0f2e25..f49bfb78c22 100644
--- a/include/asm-arm/tlb.h
+++ b/include/asm-arm/tlb.h
@@ -27,11 +27,7 @@
  */
 struct mmu_gather {
 	struct mm_struct	*mm;
-	unsigned int		freed;
 	unsigned int		fullmm;
-
-	unsigned int		flushes;
-	unsigned int		avoided_flushes;
 };
 
 DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -42,7 +38,6 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
-	tlb->freed = 0;
 	tlb->fullmm = full_mm_flush;
 
 	return tlb;
@@ -51,16 +46,8 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	struct mm_struct *mm = tlb->mm;
-	unsigned long freed = tlb->freed;
-	int rss = get_mm_counter(mm, rss);
-
-	if (rss < freed)
-		freed = rss;
-	add_mm_counter(mm, rss, -freed);
-
 	if (tlb->fullmm)
-		flush_tlb_mm(mm);
+		flush_tlb_mm(tlb->mm);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h
index c7d54ca0a23..08ddd85b8d3 100644
--- a/include/asm-arm26/tlb.h
+++ b/include/asm-arm26/tlb.h
@@ -10,11 +10,8 @@
  */
 struct mmu_gather {
         struct mm_struct        *mm;
-        unsigned int            freed;
-	unsigned int            fullmm;
-
-        unsigned int            flushes;
-        unsigned int            avoided_flushes;
+        unsigned int            need_flush;
+        unsigned int            fullmm;
 };
 
 DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -25,8 +22,8 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
         struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
         tlb->mm = mm;
-        tlb->freed = 0;
-	tlb->fullmm = full_mm_flush;
+        tlb->need_flush = 0;
+        tlb->fullmm = full_mm_flush;
 
         return tlb;
 }
@@ -34,20 +31,8 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-        struct mm_struct *mm = tlb->mm;
-        unsigned long freed = tlb->freed;
-        int rss = get_mm_counter(mm, rss);
-
-        if (rss < freed)
-                freed = rss;
-        add_mm_counter(mm, rss, -freed);
-
-        if (freed) {
-                flush_tlb_mm(mm);
-                tlb->flushes++;
-        } else {
-                tlb->avoided_flushes++;
-        }
+        if (tlb->need_flush)
+                flush_tlb_mm(tlb->mm);
 
         /* keep the page table cache within bounds */
         check_pgt_cache();
@@ -65,7 +50,13 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
         } while (0)
 #define tlb_end_vma(tlb,vma)                    do { } while (0)
 
-#define tlb_remove_page(tlb,page)       free_page_and_swap_cache(page)
+static inline void
+tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+        tlb->need_flush = 1;
+        free_page_and_swap_cache(page);
+}
+
 #define pte_free_tlb(tlb,ptep)          pte_free(ptep)
 #define pmd_free_tlb(tlb,pmdp)          pmd_free(pmdp)
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 5d352a70f00..cdd4145243c 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -42,7 +42,6 @@ struct mmu_gather {
 	unsigned int		nr;	/* set to ~0U means fast mode */
 	unsigned int		need_flush;/* Really unmapped some ptes? */
 	unsigned int		fullmm; /* non-zero means full mm flush */
-	unsigned long		freed;
 	struct page *		pages[FREE_PTE_NR];
 };
 
@@ -63,7 +62,6 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 	tlb->nr = num_online_cpus() > 1 ? 0U : ~0U;
 
 	tlb->fullmm = full_mm_flush;
-	tlb->freed = 0;
 
 	return tlb;
 }
@@ -88,13 +86,6 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 static inline void
 tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	int freed = tlb->freed;
-	struct mm_struct *mm = tlb->mm;
-	int rss = get_mm_counter(mm, rss);
-
-	if (rss < freed)
-		freed = rss;
-	add_mm_counter(mm, rss, -freed);
 	tlb_flush_mmu(tlb, start, end);
 
 	/* keep the page table cache within bounds */
diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h
index 0bbd79f6a79..834370b9dea 100644
--- a/include/asm-ia64/tlb.h
+++ b/include/asm-ia64/tlb.h
@@ -60,7 +60,6 @@ struct mmu_gather {
 	unsigned int		nr;		/* == ~0U => fast mode */
 	unsigned char		fullmm;		/* non-zero means full mm flush */
 	unsigned char		need_flush;	/* really unmapped some PTEs? */
-	unsigned long		freed;		/* number of pages freed */
 	unsigned long		start_addr;
 	unsigned long		end_addr;
 	struct page 		*pages[FREE_PTE_NR];
@@ -147,7 +146,6 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 	 */
 	tlb->nr = (num_online_cpus() == 1) ? ~0U : 0;
 	tlb->fullmm = full_mm_flush;
-	tlb->freed = 0;
 	tlb->start_addr = ~0UL;
 	return tlb;
 }
@@ -159,13 +157,6 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush)
 static inline void
 tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end)
 {
-	unsigned long freed = tlb->freed;
-	struct mm_struct *mm = tlb->mm;
-	unsigned long rss = get_mm_counter(mm, rss);
-
-	if (rss < freed)
-		freed = rss;
-	add_mm_counter(mm, rss, -freed);
 	/*
 	 * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and
 	 * tlb->end_addr.
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h
index 5d194eae870..66138d959df 100644
--- a/include/asm-sparc64/tlb.h
+++ b/include/asm-sparc64/tlb.h
@@ -27,7 +27,6 @@ struct mmu_gather {
 	unsigned int need_flush;
 	unsigned int fullmm;
 	unsigned int tlb_nr;
-	unsigned long freed;
 	unsigned long vaddrs[TLB_BATCH_NR];
 	struct page *pages[FREE_PTE_NR];
 };
@@ -51,7 +50,6 @@ static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned i
 	mp->mm = mm;
 	mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U;
 	mp->fullmm = full_mm_flush;
-	mp->freed = 0;
 
 	return mp;
 }
@@ -78,19 +76,11 @@ extern void smp_flush_tlb_mm(struct mm_struct *mm);
 
 static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, unsigned long end)
 {
-	unsigned long freed = mp->freed;
-	struct mm_struct *mm = mp->mm;
-	unsigned long rss = get_mm_counter(mm, rss);
-
-	if (rss < freed)
-		freed = rss;
-	add_mm_counter(mm, rss, -freed);
-
 	tlb_flush_mmu(mp);
 
 	if (mp->fullmm) {
-		if (CTX_VALID(mm->context))
-			do_flush_tlb_mm(mm);
+		if (CTX_VALID(mp->mm->context))
+			do_flush_tlb_mm(mp->mm);
 		mp->fullmm = 0;
 	} else
 		flush_tlb_pending();
-- 
cgit v1.2.3-18-g5258


From 4294621f41a85497019fae64341aa5351a1921b7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:05 -0700
Subject: [PATCH] mm: rss = file_rss + anon_rss

I was lazy when we added anon_rss, and chose to change as few places as
possible.  So currently each anonymous page has to be counted twice, in rss
and in anon_rss.  Which won't be so good if those are atomic counts in some
configurations.

Change that around: keep file_rss and anon_rss separately, and add them
together (with get_mm_rss macro) when the total is needed - reading two
atomics is much cheaper than updating two atomics.  And update anon_rss
upfront, typically in memory.c, not tucked away in page_add_anon_rmap.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 27519df0f98..afcaac66cbd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -254,6 +254,8 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
 #define inc_mm_counter(mm, member) (mm)->_##member++
 #define dec_mm_counter(mm, member) (mm)->_##member--
+#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
+
 typedef unsigned long mm_counter_t;
 
 struct mm_struct {
@@ -286,7 +288,7 @@ struct mm_struct {
 	unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
 
 	/* Special counters protected by the page_table_lock */
-	mm_counter_t _rss;
+	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
-- 
cgit v1.2.3-18-g5258


From b5810039a54e5babf428e9a1e89fc1940fabff11 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 29 Oct 2005 18:16:12 -0700
Subject: [PATCH] core remove PageReserved

Remove PageReserved() calls from core code by tightening VM_RESERVED
handling in mm/ to cover PageReserved functionality.

PageReserved special casing is removed from get_page and put_page.

All setting and clearing of PageReserved is retained, and it is now flagged
in the page_alloc checks to help ensure we don't introduce any refcount
based freeing of Reserved pages.

MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being
deprecated.  We never completely handled it correctly anyway, and is be
reintroduced in future if required (Hugh has a proof of concept).

Once PageReserved() calls are removed from kernel/power/swsusp.c, and all
arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can
be trivially removed.

Last real user of PageReserved is swsusp, which uses PageReserved to
determine whether a struct page points to valid memory or not.  This still
needs to be addressed (a generic page_is_ram() should work).

A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and
thus mapcounted and count towards shared rss).  These writes to the struct
page could cause excessive cacheline bouncing on big systems.  There are a
number of ways this could be addressed if it is an issue.

Signed-off-by: Nick Piggin <npiggin@suse.de>

Refcount bug fix for filemap_xip.c

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0c64484d8ae..da42093250c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY	0x00020000      /* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
-#define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
+#define VM_RESERVED	0x00080000	/* Pages managed in a special way */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
 #define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
@@ -338,7 +338,7 @@ static inline void get_page(struct page *page)
 
 static inline void put_page(struct page *page)
 {
-	if (!PageReserved(page) && put_page_testzero(page))
+	if (put_page_testzero(page))
 		__page_cache_release(page);
 }
 
@@ -723,6 +723,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
-- 
cgit v1.2.3-18-g5258


From 365e9c87a982c03d0af3886e29d877f581b59611 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:18 -0700
Subject: [PATCH] mm: update_hiwaters just in time

update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability.  Originally it was called whenever rss or
total_vm got raised.  Then many of those callsites were replaced by a timer
tick call from account_system_time.  Now Frank van Maarseveen reports that to
be found inadequate.  How about this?  Works for Frank.

Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm.  Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths.  Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit.  Handle
mm->hiwater_vm in the same way, though it's much less of an issue.  Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.

And there has been no collector of these hiwater statistics in the tree.  The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).

There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high.  A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.

What locking?  None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact.  But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h    |  3 ---
 include/linux/sched.h | 10 ++++++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index da42093250c..7d4552fe086 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -938,9 +938,6 @@ static inline void vm_stat_account(struct mm_struct *mm,
 }
 #endif /* CONFIG_PROC_FS */
 
-/* update per process rss and vm hiwater data */
-extern void update_mem_hiwater(struct task_struct *tsk);
-
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index afcaac66cbd..a9c0b7d2630 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 #define dec_mm_counter(mm, member) (mm)->_##member--
 #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
 
+#define update_hiwater_rss(mm)	do {			\
+	unsigned long _rss = get_mm_rss(mm);		\
+	if ((mm)->hiwater_rss < _rss)			\
+		(mm)->hiwater_rss = _rss;		\
+} while (0)
+#define update_hiwater_vm(mm)	do {			\
+	if ((mm)->hiwater_vm < (mm)->total_vm)		\
+		(mm)->hiwater_vm = (mm)->total_vm;	\
+} while (0)
+
 typedef unsigned long mm_counter_t;
 
 struct mm_struct {
-- 
cgit v1.2.3-18-g5258


From f449952bc8bde7fbc73c6d20dff92b627a21f8b9 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:19 -0700
Subject: [PATCH] mm: mm_struct hiwaters moved

Slight and timid rearrangement of mm_struct: hiwater_rss and hiwater_vm were
tacked on the end, but it seems better to keep them near _file_rss, _anon_rss
and total_vm, in the same cacheline on those arches verified.

There are likely to be more profitable rearrangements, but less obvious (is it
good or bad that saved_auxv[AT_VECTOR_SIZE] isolates cpu_vm_mask and context
from many others?), needing serious instrumentation.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a9c0b7d2630..292cb57ce38 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -291,16 +291,19 @@ struct mm_struct {
 						 * by mmlist_lock
 						 */
 
-	unsigned long start_code, end_code, start_data, end_data;
-	unsigned long start_brk, brk, start_stack;
-	unsigned long arg_start, arg_end, env_start, env_end;
-	unsigned long total_vm, locked_vm, shared_vm;
-	unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
-
 	/* Special counters protected by the page_table_lock */
 	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
 
+	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
+	unsigned long hiwater_vm;	/* High-water virtual memory usage */
+
+	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
+	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
+	unsigned long start_code, end_code, start_data, end_data;
+	unsigned long start_brk, brk, start_stack;
+	unsigned long arg_start, arg_end, env_start, env_end;
+
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
 
 	unsigned dumpable:2;
@@ -320,11 +323,7 @@ struct mm_struct {
 	/* aio bits */
 	rwlock_t		ioctx_list_lock;
 	struct kioctx		*ioctx_list;
-
 	struct kioctx		default_kioctx;
-
-	unsigned long hiwater_rss;	/* High-water RSS usage */
-	unsigned long hiwater_vm;	/* High-water virtual memory usage */
 };
 
 struct sighand_struct {
-- 
cgit v1.2.3-18-g5258


From 46dea3d092d23a58b42499cc8a21de0fad079f4a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:20 -0700
Subject: [PATCH] mm: ia64 use expand_upwards

ia64 has expand_backing_store function for growing its Register Backing Store
vma upwards.  But more complete code for this purpose is found in the
CONFIG_STACK_GROWSUP part of mm/mmap.c.  Uglify its #ifdefs further to provide
expand_upwards for ia64 as well as expand_stack for parisc.

The Register Backing Store vma should be marked VM_ACCOUNT.  Implement the
intention of growing it only a page at a time, instead of passing an address
outside of the vma to handle_mm_fault, with unknown consequences.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7d4552fe086..89398032bc4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -896,7 +896,8 @@ void handle_ra_miss(struct address_space *mapping,
 unsigned long max_sane_readahead(unsigned long nr);
 
 /* Do stack extension */
-extern int expand_stack(struct vm_area_struct * vma, unsigned long address);
+extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
+extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
-- 
cgit v1.2.3-18-g5258


From 872fec16d9a0ed3b75b8893aa217e49cca575ee5 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:21 -0700
Subject: [PATCH] mm: init_mm without ptlock

First step in pushing down the page_table_lock.  init_mm.page_table_lock has
been used throughout the architectures (usually for ioremap): not to serialize
kernel address space allocation (that's usually vmlist_lock), but because
pud_alloc,pmd_alloc,pte_alloc_kernel expect caller holds it.

Reverse that: don't lock or unlock init_mm.page_table_lock in any of the
architectures; instead rely on pud_alloc,pmd_alloc,pte_alloc_kernel to take
and drop it when allocating a new one, to check lest a racing task already
did.  Similarly no page_table_lock in vmalloc's map_vm_area.

Some temporary ugliness in __pud_alloc and __pmd_alloc: since they also handle
user mms, which are converted only by a later patch, for now they have to lock
differently according to whether or not it's init_mm.

If sources get muddled, there's a danger that an arch source taking
init_mm.page_table_lock will be mixed with common source also taking it (or
neither take it).  So break the rules and make another change, which should
break the build for such a mismatch: remove the redundant mm arg from
pte_alloc_kernel (ppc64 scrapped its distinct ioremap_mm in 2.6.13).

Exceptions: arm26 used pte_alloc_kernel on user mm, now pte_alloc_map; ia64
used pte_alloc_map on init_mm, now pte_alloc_kernel; parisc had bad args to
pmd_alloc and pte_alloc_kernel in unused USE_HPPA_IOREMAP code; ppc64
map_io_page forgot to unlock on failure; ppc mmu_mapin_ram and ppc64 im_free
took page_table_lock for no good reason.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 89398032bc4..b9fa82b96d9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -706,7 +706,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 extern int vmtruncate(struct inode * inode, loff_t offset);
 extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
 extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(pte_alloc_kernel(pmd_t *pmd, unsigned long address));
 extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
-- 
cgit v1.2.3-18-g5258


From 1bb3630e89cb8a7b3d3807629c20c5bad88290ff Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:22 -0700
Subject: [PATCH] mm: ptd_alloc inline and out

It seems odd to me that, whereas pud_alloc and pmd_alloc test inline, only
calling out-of-line __pud_alloc __pmd_alloc if allocation needed,
pte_alloc_map and pte_alloc_kernel are entirely out-of-line.  Though it does
add a little to kernel size, change them to macros testing inline, calling
__pte_alloc or __pte_alloc_kernel to allocate out-of-line.  Mark none of them
as fastcalls, leave that to CONFIG_REGPARM or not.

It also seems more natural for the out-of-line functions to leave the offset
calculation and map to the inline, which has to do it anyway for the common
case.  At least mremap move wants __pte_alloc without _map.

Macros rather than inline functions, certainly to avoid the header file issues
which arise from CONFIG_HIGHPTE needing kmap_types.h, but also in case any
architectures I haven't built would have other such problems.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/4level-fixup.h | 11 +++--------
 include/linux/mm.h                 | 38 +++++++++++++++++++-------------------
 2 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index c20ec257ecc..68c6fea994d 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -10,14 +10,9 @@
 
 #define pud_t				pgd_t
 
-#define pmd_alloc(mm, pud, address)			\
-({	pmd_t *ret;					\
-	if (pgd_none(*pud))				\
- 		ret = __pmd_alloc(mm, pud, address);	\
- 	else						\
-		ret = pmd_offset(pud, address);		\
- 	ret;						\
-})
+#define pmd_alloc(mm, pud, address) \
+	((unlikely(pgd_none(*(pud))) && __pmd_alloc(mm, pud, address))? \
+ 		NULL: pmd_offset(pud, address))
 
 #define pud_alloc(mm, pgd, address)	(pgd)
 #define pud_offset(pgd, start)		(pgd)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b9fa82b96d9..22c2d6922c0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -704,10 +704,6 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
 }
 
 extern int vmtruncate(struct inode * inode, loff_t offset);
-extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
-extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc_kernel(pmd_t *pmd, unsigned long address));
-extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
 extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
@@ -760,32 +756,36 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
-/*
- * On a two-level or three-level page table, this ends up being trivial. Thus
- * the inlining and the symmetry break with pte_alloc_map() that does all
- * of this out-of-line.
- */
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
+int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
+
 /*
  * The following ifdef needed to get the 4level-fixup.h header to work.
  * Remove it when 4level-fixup.h has been removed.
  */
-#ifdef CONFIG_MMU
-#ifndef __ARCH_HAS_4LEVEL_HACK 
+#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
 static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
-	if (pgd_none(*pgd))
-		return __pud_alloc(mm, pgd, address);
-	return pud_offset(pgd, address);
+	return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
+		NULL: pud_offset(pgd, address);
 }
 
 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
-	if (pud_none(*pud))
-		return __pmd_alloc(mm, pud, address);
-	return pmd_offset(pud, address);
+	return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
+		NULL: pmd_offset(pud, address);
 }
-#endif
-#endif /* CONFIG_MMU */
+#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
+
+#define pte_alloc_map(mm, pmd, address)			\
+	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
+		NULL: pte_offset_map(pmd, address))
+
+#define pte_alloc_kernel(pmd, address)			\
+	((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
+		NULL: pte_offset_kernel(pmd, address))
 
 extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
-- 
cgit v1.2.3-18-g5258


From c74df32c724a1652ad8399b4891bb02c9d43743a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:23 -0700
Subject: [PATCH] mm: ptd_alloc take ptlock

Second step in pushing down the page_table_lock.  Remove the temporary
bridging hack from __pud_alloc, __pmd_alloc, __pte_alloc: expect callers not
to hold page_table_lock, whether it's on init_mm or a user mm; take
page_table_lock internally to check if a racing task already allocated.

Convert their callers from common code.  But avoid coming back to change them
again later: instead of moving the spin_lock(&mm->page_table_lock) down,
switch over to new macros pte_alloc_map_lock and pte_unmap_unlock, which
encapsulate the mapping+locking and unlocking+unmapping together, and in the
end may use alternatives to the mm page_table_lock itself.

These callers all hold mmap_sem (some exclusively, some not), so at no level
can a page table be whipped away from beneath them; and pte_alloc uses the
"atomic" pmd_present to test whether it needs to allocate.  It appears that on
all arches we can safely descend without page_table_lock.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 22c2d6922c0..d4c3512e7db 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -779,10 +779,28 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
+#define pte_offset_map_lock(mm, pmd, address, ptlp)	\
+({							\
+	spinlock_t *__ptl = &(mm)->page_table_lock;	\
+	pte_t *__pte = pte_offset_map(pmd, address);	\
+	*(ptlp) = __ptl;				\
+	spin_lock(__ptl);				\
+	__pte;						\
+})
+
+#define pte_unmap_unlock(pte, ptl)	do {		\
+	spin_unlock(ptl);				\
+	pte_unmap(pte);					\
+} while (0)
+
 #define pte_alloc_map(mm, pmd, address)			\
 	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
 		NULL: pte_offset_map(pmd, address))
 
+#define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
+	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
+		NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
+
 #define pte_alloc_kernel(pmd, address)			\
 	((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
 		NULL: pte_offset_kernel(pmd, address))
-- 
cgit v1.2.3-18-g5258


From 705e87c0c3c38424f7f30556c85bc20e808d2f59 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:27 -0700
Subject: [PATCH] mm: pte_offset_map_lock loops

Convert those common loops using page_table_lock on the outside and
pte_offset_map within to use just pte_offset_map_lock within instead.

These all hold mmap_sem (some exclusively, some not), so at no level can a
page table be whipped away from beneath them.  But whereas pte_alloc loops
tested with the "atomic" pmd_present, these loops are testing with pmd_none,
which on i386 PAE tests both lower and upper halves.

That's now unsafe, so add a cast into pmd_none to test only the vital lower
half: we lose a little sensitivity to a corrupt middle directory, but not
enough to worry about.  It appears that i386 and UML were the only
architectures vulnerable in this way, and pgd and pud no problem.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/pgtable.h | 3 ++-
 include/asm-um/pgtable.h   | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index d101ac414f0..0e3ec809352 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -203,7 +203,8 @@ extern unsigned long pg0[];
 #define pte_present(x)	((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 
-#define pmd_none(x)	(!pmd_val(x))
+/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
+#define pmd_none(x)	(!(unsigned long)pmd_val(x))
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 #define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h
index 616d02b57ea..ac64eb95586 100644
--- a/include/asm-um/pgtable.h
+++ b/include/asm-um/pgtable.h
@@ -138,7 +138,7 @@ extern unsigned long pg0[1024];
 
 #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE))
 
-#define pmd_none(x)	(!(pmd_val(x) & ~_PAGE_NEWPAGE))
+#define pmd_none(x)	(!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE))
 #define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
 #define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
 #define pmd_clear(xp)	do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0)
-- 
cgit v1.2.3-18-g5258


From 663b97f7efd001b0c56bd5fce059c5272725b86f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:28 -0700
Subject: [PATCH] mm: flush_tlb_range outside ptlock

There was one small but very significant change in the previous patch:
mprotect's flush_tlb_range fell outside the page_table_lock: as it is in 2.4,
but that doesn't prove it safe in 2.6.

On some architectures flush_tlb_range comes to the same as flush_tlb_mm, which
has always been called from outside page_table_lock in dup_mmap, and is so
proved safe.  Others required a deeper audit: I could find no reliance on
page_table_lock in any; but in ia64 and parisc found some code which looks a
bit as if it might want preemption disabled.  That won't do any actual harm,
so pending a decision from the maintainers, disable preemption there.

Remove comments on page_table_lock from flush_tlb_mm, flush_tlb_range and
flush_tlb_page entries in cachetlb.txt: they were rather misleading (what
generic code does is different from what usually happens), the rules are now
changing, and it's not yet clear where we'll end up (will the generic
tlb_flush_mmu happen always under lock?  never under lock?  or sometimes under
and sometimes not?).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-parisc/tlbflush.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-parisc/tlbflush.h b/include/asm-parisc/tlbflush.h
index 84af4ab1fe5..e97aa8d1eff 100644
--- a/include/asm-parisc/tlbflush.h
+++ b/include/asm-parisc/tlbflush.h
@@ -88,7 +88,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 	if (npages >= 512)  /* 2MB of space: arbitrary, should be tuned */
 		flush_tlb_all();
 	else {
-
+		preempt_disable();
 		mtsp(vma->vm_mm->context,1);
 		purge_tlb_start();
 		if (split_tlb) {
@@ -102,6 +102,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 				pdtlb(start);
 				start += PAGE_SIZE;
 			}
+		preempt_enable();
 		}
 		purge_tlb_end();
 	}
-- 
cgit v1.2.3-18-g5258


From 508034a32b819a2d40aa7ac0dbc8cd2e044c2de6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:30 -0700
Subject: [PATCH] mm: unmap_vmas with inner ptlock

Remove the page_table_lock from around the calls to unmap_vmas, and replace
the pte_offset_map in zap_pte_range by pte_offset_map_lock: all callers are
now safe to descend without page_table_lock.

Don't attempt fancy locking for hugepages, just take page_table_lock in
unmap_hugepage_range.  Which makes zap_hugepage_range, and the hugetlb test in
zap_page_range, redundant: unmap_vmas calls unmap_hugepage_range anyway.  Nor
does unmap_vmas have much use for its mm arg now.

The tlb_start_vma and tlb_end_vma in unmap_page_range are now called without
page_table_lock: if they're implemented at all, they typically come down to
flush_cache_range (usually done outside page_table_lock) and flush_tlb_range
(which we already audited for the mprotect case).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hugetlb.h | 2 --
 include/linux/mm.h      | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d664330d900..0cea162b08c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -16,7 +16,6 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
-void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
 void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
@@ -87,7 +86,6 @@ static inline unsigned long hugetlb_total_pages(void)
 #define follow_huge_addr(mm, addr, write)	ERR_PTR(-EINVAL)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
-#define zap_hugepage_range(vma, start, len)	BUG()
 #define unmap_hugepage_range(vma, start, end)	BUG()
 #define is_hugepage_mem_enough(size)		0
 #define hugetlb_report_meminfo(buf)		0
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d4c3512e7db..972e2ce8e07 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -682,7 +682,7 @@ struct zap_details {
 
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *);
-unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlb,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
-- 
cgit v1.2.3-18-g5258


From c0718806cf955d5eb51ea77bffb5b21d9bba4972 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:31 -0700
Subject: [PATCH] mm: rmap with inner ptlock

rmap's page_check_address descend without page_table_lock.  First just
pte_offset_map in case there's no pte present worth locking for, then take
page_table_lock for the full check, and pass ptl back to caller in the same
style as pte_offset_map_lock.  __xip_unmap, page_referenced_one and
try_to_unmap_one use pte_unmap_unlock.  try_to_unmap_cluster also.

page_check_address reformatted to avoid progressive indentation.  No use is
made of its one error code, return NULL when it fails.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rmap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e80fb7ee6ef..35b30e6c8cf 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -95,8 +95,8 @@ int try_to_unmap(struct page *);
 /*
  * Called from mm/filemap_xip.c to unmap empty zero page
  */
-pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long);
-
+pte_t *page_check_address(struct page *, struct mm_struct *,
+				unsigned long, spinlock_t **);
 
 /*
  * Used by swapoff to help locate where page is expected in vma.
-- 
cgit v1.2.3-18-g5258


From c34d1b4d165c67b966bca4aba026443d7ff161eb Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:32 -0700
Subject: [PATCH] mm: kill check_user_page_readable

check_user_page_readable is a problematic variant of follow_page.  It's used
only by oprofile's i386 and arm backtrace code, at interrupt time, to
establish whether a userspace stackframe is currently readable.

This is problematic, because we want to push the page_table_lock down inside
follow_page, and later split it; whereas oprofile is doing a spin_trylock on
it (in the i386 case, forgotten in the arm case), and needs that to pin
perhaps two pages spanned by the stackframe (which might be covered by
different locks when we split).

I think oprofile is going about this in the wrong way: it doesn't need to know
the area is readable (neither i386 nor arm uses read protection of user
pages), it doesn't need to pin the memory, it should simply
__copy_from_user_inatomic, and see if that succeeds or not.  Sorry, but I've
not got around to devising the sparse __user annotations for this.

Then we can eliminate check_user_page_readable, and return to a single
follow_page without the __follow_page variants.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 972e2ce8e07..aa8de20e2e8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -944,7 +944,6 @@ extern struct page * vmalloc_to_page(void *addr);
 extern unsigned long vmalloc_to_pfn(void *addr);
 extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
 		int write);
-extern int check_user_page_readable(struct mm_struct *mm, unsigned long address);
 int remap_pfn_range(struct vm_area_struct *, unsigned long,
 		unsigned long, unsigned long, pgprot_t);
 
-- 
cgit v1.2.3-18-g5258


From deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:33 -0700
Subject: [PATCH] mm: follow_page with inner ptlock

Final step in pushing down common core's page_table_lock.  follow_page no
longer wants caller to hold page_table_lock, uses pte_offset_map_lock itself;
and so no page_table_lock is taken in get_user_pages itself.

But get_user_pages (and get_futex_key) do then need follow_page to pin the
page for them: take Daniel's suggestion of bitflags to follow_page.

Need one for WRITE, another for TOUCH (it was the accessed flag before:
vanished along with check_user_page_readable, but surely get_numa_maps is
wrong to mark every page it finds as accessed), another for GET.

And another, ANON to dispose of untouched_anonymous_page: it seems silly for
that to descend a second time, let follow_page observe if there was no page
table and return ZERO_PAGE if so.  Fix minor bug in that: check VM_LOCKED -
make_pages_present ought to make readonly anonymous present.

Give get_numa_maps a cond_resched while we're there.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index aa8de20e2e8..e8d1424153b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
-extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
-
-extern struct page * vmalloc_to_page(void *addr);
-extern unsigned long vmalloc_to_pfn(void *addr);
-extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
-		int write);
-int remap_pfn_range(struct vm_area_struct *, unsigned long,
-		unsigned long, unsigned long, pgprot_t);
+struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+struct page *vmalloc_to_page(void *addr);
+unsigned long vmalloc_to_pfn(void *addr);
+int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+			unsigned long pfn, unsigned long size, pgprot_t);
+
+struct page *follow_page(struct mm_struct *, unsigned long address,
+			unsigned int foll_flags);
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_TOUCH	0x02	/* mark page accessed */
+#define FOLL_GET	0x04	/* do get_page on page */
+#define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
 
 #ifdef CONFIG_PROC_FS
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
-- 
cgit v1.2.3-18-g5258


From 92dc6fcc845d99e87d8168e0786796525832d130 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:36 -0700
Subject: [PATCH] mm: parisc pte atomicity

There's a worrying function translation_exists in parisc cacheflush.h,
unaffected by split ptlock since flush_dcache_page is using it on some other
mm, without any relevant lock.  Oh well, make it a slightly more robust by
factoring the pfn check within it.  And it looked liable to confuse a
camouflaged swap or file entry with a good pte: fix that too.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-parisc/cacheflush.h | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h
index aa592d8c0e3..1bc3c83ee74 100644
--- a/include/asm-parisc/cacheflush.h
+++ b/include/asm-parisc/cacheflush.h
@@ -100,30 +100,34 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
 
 /* Simple function to work out if we have an existing address translation
  * for a user space vma. */
-static inline pte_t *__translation_exists(struct mm_struct *mm,
-					  unsigned long addr)
+static inline int translation_exists(struct vm_area_struct *vma,
+				unsigned long addr, unsigned long pfn)
 {
-	pgd_t *pgd = pgd_offset(mm, addr);
+	pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
 	pmd_t *pmd;
-	pte_t *pte;
+	pte_t pte;
 
 	if(pgd_none(*pgd))
-		return NULL;
+		return 0;
 
 	pmd = pmd_offset(pgd, addr);
 	if(pmd_none(*pmd) || pmd_bad(*pmd))
-		return NULL;
+		return 0;
 
-	pte = pte_offset_map(pmd, addr);
+	/* We cannot take the pte lock here: flush_cache_page is usually
+	 * called with pte lock already held.  Whereas flush_dcache_page
+	 * takes flush_dcache_mmap_lock, which is lower in the hierarchy:
+	 * the vma itself is secure, but the pte might come or go racily.
+	 */
+	pte = *pte_offset_map(pmd, addr);
+	/* But pte_unmap() does nothing on this architecture */
 
-	/* The PA flush mappings show up as pte_none, but they're
-	 * valid none the less */
-	if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0))
-		return NULL;
-	return pte;
-}
-#define	translation_exists(vma, addr)	__translation_exists((vma)->vm_mm, addr)
+	/* Filter out coincidental file entries and swap entries */
+	if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT)))
+		return 0;
 
+	return pte_pfn(pte) == pfn;
+}
 
 /* Private function to flush a page from the cache of a non-current
  * process.  cr25 contains the Page Directory of the current user
@@ -175,9 +179,8 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long
 {
 	BUG_ON(!vma->vm_mm->context);
 
-	if(likely(translation_exists(vma, vmaddr)))
+	if (likely(translation_exists(vma, vmaddr, pfn)))
 		__flush_cache_page(vma, vmaddr);
 
 }
 #endif
-
-- 
cgit v1.2.3-18-g5258


From 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:40 -0700
Subject: [PATCH] mm: split page table lock

Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.

This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock.  (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)

In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.

Splitting the lock is not quite for free: another cacheline access.  Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS.  But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.

There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/buffer_head.h |  6 +++---
 include/linux/mm.h          | 46 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 88af42f5e04..c937d6e6550 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp)
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)					\
 	({							\
-		BUG_ON(!PagePrivate(page));		\
-		((struct buffer_head *)(page)->private);	\
+		BUG_ON(!PagePrivate(page));			\
+		((struct buffer_head *)page_private(page));	\
 	})
 #define page_has_buffers(page)	PagePrivate(page)
 
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page,
 {
 	page_cache_get(page);
 	SetPagePrivate(page);
-	page->private = (unsigned long)head;
+	set_page_private(page, (unsigned long)head);
 }
 
 static inline void get_bh(struct buffer_head *bh)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8d1424153b..8a514eca40d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -226,13 +226,18 @@ struct page {
 					 * to show when page is mapped
 					 * & limit reverse map searches.
 					 */
-	unsigned long private;		/* Mapping-private opaque data:
+	union {
+		unsigned long private;	/* Mapping-private opaque data:
 					 * usually used for buffer_heads
 					 * if PagePrivate set; used for
 					 * swp_entry_t if PageSwapCache
 					 * When page is free, this indicates
 					 * order in the buddy system.
 					 */
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+		spinlock_t ptl;
+#endif
+	} u;
 	struct address_space *mapping;	/* If low bit clear, points to
 					 * inode address_space, or NULL.
 					 * If page mapped as anonymous
@@ -260,6 +265,9 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+#define page_private(page)		((page)->u.private)
+#define set_page_private(page, v)	((page)->u.private = (v))
+
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *));
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-static inline int page_count(struct page *p)
+static inline int page_count(struct page *page)
 {
-	if (PageCompound(p))
-		p = (struct page *)p->private;
-	return atomic_read(&(p)->_count) + 1;
+	if (PageCompound(page))
+		page = (struct page *)page_private(page);
+	return atomic_read(&page->_count) + 1;
 }
 
 static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
-		page = (struct page *)page->private;
+		page = (struct page *)page_private(page);
 	atomic_inc(&page->_count);
 }
 
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page)
 static inline pgoff_t page_index(struct page *page)
 {
 	if (unlikely(PageSwapCache(page)))
-		return page->private;
+		return page_private(page);
 	return page->index;
 }
 
@@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page)	&((page)->u.ptl)
+#define pte_lock_init(_page)	do {					\
+	spin_lock_init(__pte_lockptr(_page));				\
+} while (0)
+#define pte_lock_deinit(page)	((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page)	do {} while (0)
+#define pte_lock_deinit(page)	do {} while (0)
+#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
 #define pte_offset_map_lock(mm, pmd, address, ptlp)	\
 ({							\
-	spinlock_t *__ptl = &(mm)->page_table_lock;	\
+	spinlock_t *__ptl = pte_lockptr(mm, pmd);	\
 	pte_t *__pte = pte_offset_map(pmd, address);	\
 	*(ptlp) = __ptl;				\
 	spin_lock(__ptl);				\
-- 
cgit v1.2.3-18-g5258


From f412ac08c9861b4791af0145934c22f1458686da Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:41 -0700
Subject: [PATCH] mm: fix rss and mmlist locking

A couple of oddities were guarded by page_table_lock, no longer properly
guarded when that is split.

The mm_counters of file_rss and anon_rss: make those an atomic_t, or an
atomic64_t if the architecture supports it, in such a case.  Definitions by
courtesy of Christoph Lameter: who spent considerable effort on more scalable
ways of counting, but found insufficient benefit in practice.

And adding an mm with swap to the mmlist for swapoff: the list is well-
guarded by its own lock, but the list_empty check now has to be repeated
inside it.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 42 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 292cb57ce38..1c30bc308ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -249,13 +249,47 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * The mm counters are not protected by its page_table_lock,
+ * so must be incremented atomically.
+ */
+#ifdef ATOMIC64_INIT
+#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member)
+typedef atomic64_t mm_counter_t;
+#else /* !ATOMIC64_INIT */
+/*
+ * The counters wrap back to 0 at 2^32 * PAGE_SIZE,
+ * that is, at 16TB if using 4kB page size.
+ */
+#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
+#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
+#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
+#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
+#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
+typedef atomic_t mm_counter_t;
+#endif /* !ATOMIC64_INIT */
+
+#else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+/*
+ * The mm counters are protected by its page_table_lock,
+ * so can be incremented directly.
+ */
 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
 #define get_mm_counter(mm, member) ((mm)->_##member)
 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
 #define inc_mm_counter(mm, member) (mm)->_##member++
 #define dec_mm_counter(mm, member) (mm)->_##member--
-#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
+typedef unsigned long mm_counter_t;
+
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
 
+#define get_mm_rss(mm)					\
+	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
 #define update_hiwater_rss(mm)	do {			\
 	unsigned long _rss = get_mm_rss(mm);		\
 	if ((mm)->hiwater_rss < _rss)			\
@@ -266,8 +300,6 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 		(mm)->hiwater_vm = (mm)->total_vm;	\
 } while (0)
 
-typedef unsigned long mm_counter_t;
-
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
@@ -291,7 +323,9 @@ struct mm_struct {
 						 * by mmlist_lock
 						 */
 
-	/* Special counters protected by the page_table_lock */
+	/* Special counters, in some configurations protected by the
+	 * page_table_lock, in other configurations by being atomic.
+	 */
 	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
 
-- 
cgit v1.2.3-18-g5258


From b8072f099b7829a6ff3eba618e1d079a81f753f8 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:41 -0700
Subject: [PATCH] mm: update comments to pte lock

Updated several references to page_table_lock in common code comments.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/pgtable.h | 2 +-
 include/linux/mempolicy.h     | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index ff28c8b31f5..7dca30a26c5 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -8,7 +8,7 @@
  *  - update the page tables
  *  - inform the TLB about the new one
  *
- * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock.
+ * We hold the mm semaphore for reading, and the pte lock.
  *
  * Note: the old pte is known to not be writable, so we don't need to
  * worry about dirty bits etc getting lost.
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 38e60a09939..7af8cb836e7 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -47,8 +47,7 @@ struct vm_area_struct;
  * Locking policy for interlave:
  * In process context there is no locking because only the process accesses
  * its own state. All vma manipulation is somewhat protected by a down_read on
- * mmap_sem. For allocating in the interleave policy the page_table_lock
- * must be also aquired to protect il_next.
+ * mmap_sem.
  *
  * Freeing policy:
  * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
-- 
cgit v1.2.3-18-g5258


From 2774812f417db562f0d659d2c1b5755ba35d2770 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:49 -0700
Subject: [PATCH] memory hotplug prep: kill local_mapnr

The following series implements memory hot-add for ppc64 and i386.  There are
x86_64 and ia64 implementations that will be submitted shortly as well,
through the normal maintainers.

This patch:

local_mapnr is unused, except for in an alpha header.  Keep the alpha one,
kill the rest.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/mmzone.h   | 6 ------
 include/asm-m32r/mmzone.h   | 6 ------
 include/asm-parisc/mmzone.h | 6 ------
 include/asm-ppc64/mmzone.h  | 3 ---
 4 files changed, 21 deletions(-)

(limited to 'include')

diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 348fe3a4879..620a90641ea 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -88,12 +88,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
 })
 
-#define local_mapnr(kvaddr)						\
-({									\
-	unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;		\
-	(__pfn - node_start_pfn(pfn_to_nid(__pfn)));			\
-})
-
 /* XXX: FIXME -- wli */
 #define kern_addr_valid(kaddr)	(0)
 
diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h
index d58878ec899..adc7970a77e 100644
--- a/include/asm-m32r/mmzone.h
+++ b/include/asm-m32r/mmzone.h
@@ -21,12 +21,6 @@ extern struct pglist_data *node_data[];
 	__pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1;	\
 })
 
-#define local_mapnr(kvaddr)						\
-({									\
-	unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;		\
-	(__pfn - node_start_pfn(pfn_to_nid(__pfn)));			\
-})
-
 #define pfn_to_page(pfn)						\
 ({									\
 	unsigned long __pfn = pfn;					\
diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h
index 595d3dce120..ae039f4fd71 100644
--- a/include/asm-parisc/mmzone.h
+++ b/include/asm-parisc/mmzone.h
@@ -27,12 +27,6 @@ extern struct node_map_data node_data[];
 })
 #define node_localnr(pfn, nid)		((pfn) - node_start_pfn(nid))
 
-#define local_mapnr(kvaddr)						\
-({									\
-	unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT;		\
-	(__pfn - node_start_pfn(pfn_to_nid(__pfn)));			\
-})
-
 #define pfn_to_page(pfn)						\
 ({									\
 	unsigned long __pfn = (pfn);					\
diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h
index ed473f4b015..80a708e7093 100644
--- a/include/asm-ppc64/mmzone.h
+++ b/include/asm-ppc64/mmzone.h
@@ -67,9 +67,6 @@ static inline int pa_to_nid(unsigned long pa)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
 
-#define local_mapnr(kvaddr) \
-	( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) 
-
 #ifdef CONFIG_DISCONTIGMEM
 
 /*
-- 
cgit v1.2.3-18-g5258


From 4ca644d970bf2542623228a4624af356d20ca267 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:51 -0700
Subject: [PATCH] memory hotplug prep: __section_nr helper

A little helper that we use in the hotplug code.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7519eb4191e..4674145bb63 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -509,6 +509,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr)
 		return NULL;
 	return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK];
 }
+extern int __section_nr(struct mem_section* ms);
 
 /*
  * We use the lower bits of the mem_map pointer to store
-- 
cgit v1.2.3-18-g5258


From 208d54e5513c0c02d85af0990901354c74364d5c Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:52 -0700
Subject: [PATCH] memory hotplug locking: node_size_lock

pgdat->node_size_lock is basically only neeeded in one place in the normal
code: show_mem(), which is the arch-specific sysrq-m printing function.

Strictly speaking, the architectures not doing memory hotplug do no need this
locking in show_mem().  However, they are all included for completeness.  This
should also make any future consolidation of all of the implementations a
little more straightforward.

This lock is also held in the sparsemem code during a memory removal, as
sections are invalidated.  This is the place there pfn_valid() is made false
for a memory area that's being removed.  The lock is only required when doing
pfn_valid() operations on memory which the user does not already have a
reference on the page, such as in show_mem().

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/memory_hotplug.h | 34 ++++++++++++++++++++++++++++++++++
 include/linux/mmzone.h         | 12 ++++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 include/linux/memory_hotplug.h

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
new file mode 100644
index 00000000000..e8103be9d52
--- /dev/null
+++ b/include/linux/memory_hotplug.h
@@ -0,0 +1,34 @@
+#ifndef __LINUX_MEMORY_HOTPLUG_H
+#define __LINUX_MEMORY_HOTPLUG_H
+
+#include <linux/mmzone.h>
+#include <linux/spinlock.h>
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * pgdat resizing functions
+ */
+static inline
+void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
+{
+	spin_lock_irqsave(&pgdat->node_size_lock, *flags);
+}
+static inline
+void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
+{
+	spin_lock_irqrestore(&pgdat->node_size_lock, *flags);
+}
+static inline
+void pgdat_resize_init(struct pglist_data *pgdat)
+{
+	spin_lock_init(&pgdat->node_size_lock);
+}
+#else /* ! CONFIG_MEMORY_HOTPLUG */
+/*
+ * Stub functions for when hotplug is off
+ */
+static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
+static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
+static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
+#endif
+#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4674145bb63..e050d68963a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -273,6 +273,16 @@ typedef struct pglist_data {
 	struct page *node_mem_map;
 #endif
 	struct bootmem_data *bdata;
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/*
+	 * Must be held any time you expect node_start_pfn, node_present_pages
+	 * or node_spanned_pages stay constant.  Holding this will also
+	 * guarantee that any pfn_valid() stays that way.
+	 *
+	 * Nests above zone->lock and zone->size_seqlock.
+	 */
+	spinlock_t node_size_lock;
+#endif
 	unsigned long node_start_pfn;
 	unsigned long node_present_pages; /* total number of physical pages */
 	unsigned long node_spanned_pages; /* total size of physical page
@@ -293,6 +303,8 @@ typedef struct pglist_data {
 #endif
 #define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
+#include <linux/memory_hotplug.h>
+
 extern struct pglist_data *pgdat_list;
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
-- 
cgit v1.2.3-18-g5258


From bdc8cb984576ab5b550c8b24c6fa111a873503e3 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:53 -0700
Subject: [PATCH] memory hotplug locking: zone span seqlock

See the "fixup bad_range()" patch for more information, but this actually
creates a the lock to protect things making assumptions about a zone's size
staying constant at runtime.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/memory_hotplug.h | 39 +++++++++++++++++++++++++++++++++++++--
 include/linux/mmzone.h         | 15 +++++++++++++++
 2 files changed, 52 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index e8103be9d52..4b08bc94757 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -16,13 +16,36 @@ void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
 static inline
 void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
 {
-	spin_lock_irqrestore(&pgdat->node_size_lock, *flags);
+	spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
 }
 static inline
 void pgdat_resize_init(struct pglist_data *pgdat)
 {
 	spin_lock_init(&pgdat->node_size_lock);
 }
+/*
+ * Zone resizing functions
+ */
+static inline unsigned zone_span_seqbegin(struct zone *zone)
+{
+	return read_seqbegin(&zone->span_seqlock);
+}
+static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
+{
+	return read_seqretry(&zone->span_seqlock, iv);
+}
+static inline void zone_span_writelock(struct zone *zone)
+{
+	write_seqlock(&zone->span_seqlock);
+}
+static inline void zone_span_writeunlock(struct zone *zone)
+{
+	write_sequnlock(&zone->span_seqlock);
+}
+static inline void zone_seqlock_init(struct zone *zone)
+{
+	seqlock_init(&zone->span_seqlock);
+}
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
  * Stub functions for when hotplug is off
@@ -30,5 +53,17 @@ void pgdat_resize_init(struct pglist_data *pgdat)
 static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
 static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
 static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
-#endif
+
+static inline unsigned zone_span_seqbegin(struct zone *zone)
+{
+	return 0;
+}
+static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
+{
+	return 0;
+}
+static inline void zone_span_writelock(struct zone *zone) {}
+static inline void zone_span_writeunlock(struct zone *zone) {}
+static inline void zone_seqlock_init(struct zone *zone) {}
+#endif /* ! CONFIG_MEMORY_HOTPLUG */
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e050d68963a..f5fa3082fd6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -12,6 +12,7 @@
 #include <linux/threads.h>
 #include <linux/numa.h>
 #include <linux/init.h>
+#include <linux/seqlock.h>
 #include <asm/atomic.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -137,6 +138,10 @@ struct zone {
 	 * free areas of different sizes
 	 */
 	spinlock_t		lock;
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* see spanned/present_pages for more description */
+	seqlock_t		span_seqlock;
+#endif
 	struct free_area	free_area[MAX_ORDER];
 
 
@@ -220,6 +225,16 @@ struct zone {
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
 
+	/*
+	 * zone_start_pfn, spanned_pages and present_pages are all
+	 * protected by span_seqlock.  It is a seqlock because it has
+	 * to be read outside of zone->lock, and it is done in the main
+	 * allocator path.  But, it is written quite infrequently.
+	 *
+	 * The lock is declared along with zone->lock because it is
+	 * frequently read in proximity to zone->lock.  It's good to
+	 * give them a chance of being in the same cacheline.
+	 */
 	unsigned long		spanned_pages;	/* total size, including holes */
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
 
-- 
cgit v1.2.3-18-g5258


From 3947be1969a9ce455ec30f60ef51efb10e4323d1 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 29 Oct 2005 18:16:54 -0700
Subject: [PATCH] memory hotplug: sysfs and add/remove functions

This adds generic memory add/remove and supporting functions for memory
hotplug into a new file as well as a memory hotplug kernel config option.

Individual architecture patches will follow.

For now, disable memory hotplug when swsusp is enabled.  There's a lot of
churn there right now.  We'll fix it up properly once it calms down.

Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/memory.h         | 94 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/memory_hotplug.h | 35 ++++++++++++++++
 include/linux/mm.h             |  1 +
 3 files changed, 130 insertions(+)
 create mode 100644 include/linux/memory.h

(limited to 'include')

diff --git a/include/linux/memory.h b/include/linux/memory.h
new file mode 100644
index 00000000000..0def328ab5c
--- /dev/null
+++ b/include/linux/memory.h
@@ -0,0 +1,94 @@
+/*
+ * include/linux/memory.h - generic memory definition
+ *
+ * This is mainly for topological representation. We define the
+ * basic "struct memory_block" here, which can be embedded in per-arch
+ * definitions or NUMA information.
+ *
+ * Basic handling of the devices is done in drivers/base/memory.c
+ * and system devices are handled in drivers/base/sys.c.
+ *
+ * Memory block are exported via sysfs in the class/memory/devices/
+ * directory.
+ *
+ */
+#ifndef _LINUX_MEMORY_H_
+#define _LINUX_MEMORY_H_
+
+#include <linux/sysdev.h>
+#include <linux/node.h>
+#include <linux/compiler.h>
+
+#include <asm/semaphore.h>
+
+struct memory_block {
+	unsigned long phys_index;
+	unsigned long state;
+	/*
+	 * This serializes all state change requests.  It isn't
+	 * held during creation because the control files are
+	 * created long after the critical areas during
+	 * initialization.
+	 */
+	struct semaphore state_sem;
+	int phys_device;		/* to which fru does this belong? */
+	void *hw;			/* optional pointer to fw/hw data */
+	int (*phys_callback)(struct memory_block *);
+	struct sys_device sysdev;
+};
+
+/* These states are exposed to userspace as text strings in sysfs */
+#define	MEM_ONLINE		(1<<0) /* exposed to userspace */
+#define	MEM_GOING_OFFLINE	(1<<1) /* exposed to userspace */
+#define	MEM_OFFLINE		(1<<2) /* exposed to userspace */
+
+/*
+ * All of these states are currently kernel-internal for notifying
+ * kernel components and architectures.
+ *
+ * For MEM_MAPPING_INVALID, all notifier chains with priority >0
+ * are called before pfn_to_page() becomes invalid.  The priority=0
+ * entry is reserved for the function that actually makes
+ * pfn_to_page() stop working.  Any notifiers that want to be called
+ * after that should have priority <0.
+ */
+#define	MEM_MAPPING_INVALID	(1<<3)
+
+#ifndef CONFIG_MEMORY_HOTPLUG
+static inline int memory_dev_init(void)
+{
+	return 0;
+}
+static inline int register_memory_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline void unregister_memory_notifier(struct notifier_block *nb)
+{
+}
+#else
+extern int register_memory(struct memory_block *, struct mem_section *section, struct node *);
+extern int register_new_memory(struct mem_section *);
+extern int unregister_memory_section(struct mem_section *);
+extern int memory_dev_init(void);
+extern int register_memory_notifier(struct notifier_block *nb);
+extern void unregister_memory_notifier(struct notifier_block *nb);
+
+#define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
+
+extern int invalidate_phys_mapping(unsigned long, unsigned long);
+struct notifier_block;
+
+extern int register_memory_notifier(struct notifier_block *nb);
+extern void unregister_memory_notifier(struct notifier_block *nb);
+
+extern struct sysdev_class memory_sysdev_class;
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#define hotplug_memory_notifier(fn, pri) {			\
+	static struct notifier_block fn##_mem_nb =		\
+		{ .notifier_call = fn, .priority = pri };	\
+	register_memory_notifier(&fn##_mem_nb);			\
+}
+
+#endif /* _LINUX_MEMORY_H_ */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4b08bc94757..01f03bc06ef 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -3,6 +3,8 @@
 
 #include <linux/mmzone.h>
 #include <linux/spinlock.h>
+#include <linux/mmzone.h>
+#include <linux/notifier.h>
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
@@ -46,6 +48,19 @@ static inline void zone_seqlock_init(struct zone *zone)
 {
 	seqlock_init(&zone->span_seqlock);
 }
+extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
+extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
+extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
+/* need some defines for these for archs that don't support it */
+extern void online_page(struct page *page);
+/* VM interface that may be used by firmware interface */
+extern int add_memory(u64 start, u64 size);
+extern int remove_memory(u64 start, u64 size);
+extern int online_pages(unsigned long, unsigned long);
+
+/* reasonably generic interface to expand the physical pages in a zone  */
+extern int __add_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages);
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
  * Stub functions for when hotplug is off
@@ -65,5 +80,25 @@ static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
 static inline void zone_span_writelock(struct zone *zone) {}
 static inline void zone_span_writeunlock(struct zone *zone) {}
 static inline void zone_seqlock_init(struct zone *zone) {}
+
+static inline int mhp_notimplemented(const char *func)
+{
+	printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func);
+	dump_stack();
+	return -ENOSYS;
+}
+
+static inline int __add_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages)
+{
+	return mhp_notimplemented(__FUNCTION__);
+}
 #endif /* ! CONFIG_MEMORY_HOTPLUG */
+static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
+	unsigned long nr_pages)
+{
+	printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__);
+	dump_stack();
+	return -ENOSYS;
+}
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8a514eca40d..5c1fb0a2e80 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -840,6 +840,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat,
 	unsigned long * zones_size, unsigned long zone_start_pfn, 
 	unsigned long *zholes_size);
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
+extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
 extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
-- 
cgit v1.2.3-18-g5258