20 files changed, 1332 insertions, 756 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index cd379936cac..4e9937ac352 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -89,3 +89,25 @@ config NEED_MULTIPLE_NODES
 config HAVE_MEMORY_PRESENT
 	def_bool y
 	depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
+
+#
+# SPARSEMEM_EXTREME (which is the default) does some bootmem
+# allocations when memory_present() is called.  If this can not
+# be done on your architecture, select this option.  However,
+# statically allocating the mem_section[] array can potentially
+# consume vast quantities of .bss, so be careful.
+#
+# This option will also potentially produce smaller runtime code
+# with gcc 3.4 and later.
+#
+config SPARSEMEM_STATIC
+	def_bool n
+
+#
+# Architectecture platforms which require a two level mem_section in SPARSEMEM
+# must select this option. This is usually for architecture platforms with
+# an extremely sparse physical address space.
+#
+config SPARSEMEM_EXTREME
+	def_bool y
+	depends on SPARSEMEM && !SPARSEMEM_STATIC
diff --git a/mm/filemap.c b/mm/filemap.c
index c11418dd94e..b5346576e58 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -37,6 +37,10 @@
 #include <asm/uaccess.h>
 #include <asm/mman.h>
 
+static ssize_t
+generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+	loff_t offset, unsigned long nr_segs);
+
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -54,9 +58,8 @@
  *
  *  ->i_mmap_lock		(vmtruncate)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
- *      ->swap_list_lock
- *        ->swap_device_lock	(exclusive_swap_page, others)
- *          ->mapping->tree_lock
+ *      ->swap_lock		(exclusive_swap_page, others)
+ *        ->mapping->tree_lock
  *
  *  ->i_sem
  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
@@ -86,7 +89,7 @@
  *    ->page_table_lock		(anon_vma_prepare and various)
  *
  *  ->page_table_lock
- *    ->swap_device_lock	(try_to_unmap_one)
+ *    ->swap_lock		(try_to_unmap_one)
  *    ->private_lock		(try_to_unmap_one)
  *    ->tree_lock		(try_to_unmap_one)
  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
@@ -302,8 +305,9 @@ EXPORT_SYMBOL(sync_page_range);
  * as it forces O_SYNC writers to different parts of the same file
  * to be serialised right until io completion.
  */
-int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
-			loff_t pos, size_t count)
+static int sync_page_range_nolock(struct inode *inode,
+				  struct address_space *mapping,
+				  loff_t pos, size_t count)
 {
 	pgoff_t start = pos >> PAGE_CACHE_SHIFT;
 	pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -318,7 +322,6 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
 		ret = wait_on_page_writeback_range(mapping, start, end);
 	return ret;
 }
-EXPORT_SYMBOL(sync_page_range_nolock);
 
 /**
  * filemap_fdatawait - walk the list of under-writeback pages of the given
@@ -1505,8 +1508,12 @@ repeat:
 		return -EINVAL;
 
 	page = filemap_getpage(file, pgoff, nonblock);
+
+	/* XXX: This is wrong, a filesystem I/O error may have happened. Fix that as
+	 * done in shmem_populate calling shmem_getpage */
 	if (!page && !nonblock)
 		return -ENOMEM;
+
 	if (page) {
 		err = install_page(mm, vma, addr, page, prot);
 		if (err) {
@@ -1514,6 +1521,9 @@ repeat:
 			return err;
 		}
 	} else {
+		/* No page was found just because we can't read it in now (being
+		 * here implies nonblock != 0), but the page may exist, so set
+		 * the PTE to fault it in later. */
 		err = install_file_pte(mm, vma, addr, pgoff, prot);
 		if (err)
 			return err;
@@ -2002,7 +2012,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 }
 EXPORT_SYMBOL(generic_file_buffered_write);
 
-ssize_t
+static ssize_t
 __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t *ppos)
 {
@@ -2102,7 +2112,7 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	return ret;
 }
 
-ssize_t
+static ssize_t
 __generic_file_write_nolock(struct file *file, const struct iovec *iov,
 				unsigned long nr_segs, loff_t *ppos)
 {
@@ -2223,7 +2233,7 @@ EXPORT_SYMBOL(generic_file_writev);
  * Called under i_sem for writes to S_ISREG files.   Returns -EIO if something
  * went wrong during pagecache shootdown.
  */
-ssize_t
+static ssize_t
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	loff_t offset, unsigned long nr_segs)
 {
@@ -2258,4 +2268,3 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	}
 	return retval;
 }
-EXPORT_SYMBOL_GPL(generic_file_direct_IO);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6bf720bc662..901ac523a1c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -360,8 +360,6 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 			ret = -ENOMEM;
 			goto out;
 		}
-		if (! pte_none(*pte))
-			hugetlb_clean_stale_pgtable(pte);
 
 		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
 			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
diff --git a/mm/madvise.c b/mm/madvise.c
index c8c01a12fea..4454936f87d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -37,7 +37,7 @@ static long madvise_behavior(struct vm_area_struct * vma,
 
 	if (new_flags == vma->vm_flags) {
 		*prev = vma;
-		goto success;
+		goto out;
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -62,6 +62,7 @@ static long madvise_behavior(struct vm_area_struct * vma,
 			goto out;
 	}
 
+success:
 	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 */
@@ -70,7 +71,6 @@ static long madvise_behavior(struct vm_area_struct * vma,
 out:
 	if (error == -ENOMEM)
 		error = -EAGAIN;
-success:
 	return error;
 }
 
@@ -237,8 +237,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
 	 * - different from the way of handling in mlock etc.
 	 */
 	vma = find_vma_prev(current->mm, start, &prev);
-	if (!vma && prev)
-		vma = prev->vm_next;
+	if (vma && start > vma->vm_start)
+		prev = vma;
+
 	for (;;) {
 		/* Still start < end. */
 		error = -ENOMEM;
diff --git a/mm/memory.c b/mm/memory.c
index a596c117224..ae8161f1f45 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -562,7 +562,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 				     page->index > details->last_index))
 					continue;
 			}
-			ptent = ptep_get_and_clear(tlb->mm, addr, pte);
+			ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
+							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
 				continue;
@@ -590,7 +591,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			continue;
 		if (!pte_file(ptent))
 			free_swap_and_cache(pte_to_swp_entry(ptent));
-		pte_clear(tlb->mm, addr, pte);
+		pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(pte - 1);
 }
@@ -1955,7 +1956,7 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
 	 * Fall back to the linear mapping if the fs does not support
 	 * ->populate:
 	 */
-	if (!vma->vm_ops || !vma->vm_ops->populate || 
+	if (!vma->vm_ops->populate ||
 			(write_access && !(vma->vm_flags & VM_SHARED))) {
 		pte_clear(mm, address, pte);
 		return do_no_page(mm, vma, address, write_access, pte, pmd);
@@ -2224,7 +2225,7 @@ void update_mem_hiwater(struct task_struct *tsk)
 #if !defined(__HAVE_ARCH_GATE_AREA)
 
 #if defined(AT_SYSINFO_EHDR)
-struct vm_area_struct gate_vma;
+static struct vm_area_struct gate_vma;
 
 static int __init gate_vma_init(void)
 {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b4eababc819..afa06e184d8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -88,7 +88,7 @@ static kmem_cache_t *sn_cache;
    policied. */
 static int policy_zone;
 
-static struct mempolicy default_policy = {
+struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
 	.policy = MPOL_DEFAULT,
 };
@@ -664,10 +664,10 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 #endif
 
 /* Return effective policy for a VMA */
-static struct mempolicy *
-get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
+struct mempolicy *
+get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
 {
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = task->mempolicy;
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -786,7 +786,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
 struct page *
 alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
 {
-	struct mempolicy *pol = get_vma_policy(vma, addr);
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
 	cpuset_update_current_mems_allowed();
 
@@ -908,7 +908,7 @@ void __mpol_free(struct mempolicy *p)
 /* Find first node suitable for an allocation */
 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
 {
-	struct mempolicy *pol = get_vma_policy(vma, addr);
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
@@ -928,7 +928,7 @@ int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
 /* Find secondary valid nodes for an allocation */
 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
 {
-	struct mempolicy *pol = get_vma_policy(vma, addr);
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
 	switch (pol->policy) {
 	case MPOL_PREFERRED:
diff --git a/mm/mmap.c b/mm/mmap.c
index 404319477e7..12334aecf8a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -61,7 +61,7 @@ pgprot_t protection_map[16] = {
 
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50;	/* default is 50% */
-int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 atomic_t vm_committed_space = ATOMIC_INIT(0);
 
 /*
@@ -203,13 +203,6 @@ static void remove_vm_struct(struct vm_area_struct *vma)
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
-/*
- *  sys_brk() for the most part doesn't need the global kernel
- *  lock, except when an application is doing something nasty
- *  like trying to un-brk an area that has already been mapped
- *  to a regular file.  in this case, the unmapping will need
- *  to invoke file system routines that need the global lock.
- */
 asmlinkage unsigned long sys_brk(unsigned long brk)
 {
 	unsigned long rlim, retval;
diff --git a/mm/mremap.c b/mm/mremap.c
index fc45dc9a617..a32fed454bd 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -141,6 +141,10 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
 			if (dst) {
 				pte_t pte;
 				pte = ptep_clear_flush(vma, old_addr, src);
+				/* ZERO_PAGE can be dependant on virtual addr */
+				if (pfn_valid(pte_pfn(pte)) &&
+					pte_page(pte) == ZERO_PAGE(old_addr))
+					pte = pte_wrprotect(mk_pte(ZERO_PAGE(new_addr), new_vma->vm_page_prot));
 				set_pte_at(mm, new_addr, dst, pte);
 			} else
 				error = -ENOMEM;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e56076672f..ac3bf33e537 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -6,8 +6,8 @@
  *	for goading me into coding this file...
  *
  *  The routines in this file are used to kill a process when
- *  we're seriously out of memory. This gets called from kswapd()
- *  in linux/mm/vmscan.c when we really run out of memory.
+ *  we're seriously out of memory. This gets called from __alloc_pages()
+ *  in mm/page_alloc.c when we really run out of memory.
  *
  *  Since we won't call these routines often (on a well-configured
  *  machine) this file will double as a 'coding guide' and a signpost
@@ -20,13 +20,14 @@
 #include <linux/swap.h>
 #include <linux/timex.h>
 #include <linux/jiffies.h>
+#include <linux/cpuset.h>
 
 /* #define DEBUG */
 
 /**
  * oom_badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
- * @p: current uptime in seconds
+ * @uptime: current uptime in seconds
  *
  * The formula used is relatively simple and documented inline in the
  * function. The main rationale is that we want to select a good task
@@ -57,9 +58,9 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 
 	/*
 	 * Processes which fork a lot of child processes are likely
-	 * a good choice. We add the vmsize of the childs if they
+	 * a good choice. We add the vmsize of the children if they
 	 * have an own mm. This prevents forking servers to flood the
-	 * machine with an endless amount of childs
+	 * machine with an endless amount of children
 	 */
 	list_for_each(tsk, &p->children) {
 		struct task_struct *chld;
@@ -143,28 +144,36 @@ static struct task_struct * select_bad_process(void)
 	struct timespec uptime;
 
 	do_posix_clock_monotonic_gettime(&uptime);
-	do_each_thread(g, p)
+	do_each_thread(g, p) {
+		unsigned long points;
+		int releasing;
+
 		/* skip the init task with pid == 1 */
-		if (p->pid > 1 && p->oomkilladj != OOM_DISABLE) {
-			unsigned long points;
-
-			/*
-			 * This is in the process of releasing memory so wait it
-			 * to finish before killing some other task by mistake.
-			 */
-			if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&
-			    !(p->flags & PF_DEAD))
-				return ERR_PTR(-1UL);
-			if (p->flags & PF_SWAPOFF)
-				return p;
-
-			points = badness(p, uptime.tv_sec);
-			if (points > maxpoints || !chosen) {
-				chosen = p;
-				maxpoints = points;
-			}
+		if (p->pid == 1)
+			continue;
+		if (p->oomkilladj == OOM_DISABLE)
+			continue;
+		/* If p's nodes don't overlap ours, it won't help to kill p. */
+		if (!cpuset_excl_nodes_overlap(p))
+			continue;
+
+		/*
+		 * This is in the process of releasing memory so for wait it
+		 * to finish before killing some other task by mistake.
+		 */
+		releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
+						p->flags & PF_EXITING;
+		if (releasing && !(p->flags & PF_DEAD))
+			return ERR_PTR(-1UL);
+		if (p->flags & PF_SWAPOFF)
+			return p;
+
+		points = badness(p, uptime.tv_sec);
+		if (points > maxpoints || !chosen) {
+			chosen = p;
+			maxpoints = points;
 		}
-	while_each_thread(g, p);
+	} while_each_thread(g, p);
 	return chosen;
 }
 
@@ -189,7 +198,8 @@ static void __oom_kill_task(task_t *p)
 		return;
 	}
 	task_unlock(p);
-	printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
+	printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n",
+							p->pid, p->comm);
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
@@ -290,6 +300,5 @@ retry:
 	 * Give "p" a good chance of killing itself before we
 	 * retry to allocate memory.
 	 */
-	__set_current_state(TASK_INTERRUPTIBLE);
-	schedule_timeout(1);
+	schedule_timeout_interruptible(1);
 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a6329fa8f86..0166ea15c9e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -368,10 +368,8 @@ int wakeup_pdflush(long nr_pages)
 static void wb_timer_fn(unsigned long unused);
 static void laptop_timer_fn(unsigned long unused);
 
-static struct timer_list wb_timer =
-			TIMER_INITIALIZER(wb_timer_fn, 0, 0);
-static struct timer_list laptop_mode_wb_timer =
-			TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
+static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
+static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 
 /*
  * Periodic writeback of "old" data.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d088371196..c5823c395f7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -42,13 +42,13 @@
  * MCD - HACK: Find somewhere to initialize this EARLY, or make this
  * initializer cleaner
  */
-nodemask_t node_online_map = { { [0] = 1UL } };
+nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
-nodemask_t node_possible_map = NODE_MASK_ALL;
+nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
-struct pglist_data *pgdat_list;
-unsigned long totalram_pages;
-unsigned long totalhigh_pages;
+struct pglist_data *pgdat_list __read_mostly;
+unsigned long totalram_pages __read_mostly;
+unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
 
 /*
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
  */
-struct zone *zone_table[1 << ZONETABLE_SHIFT];
+struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -329,13 +329,13 @@ static inline void free_pages_check(const char *function, struct page *page)
 			1 << PG_writeback )))
 		bad_page(function, page);
 	if (PageDirty(page))
-		ClearPageDirty(page);
+		__ClearPageDirty(page);
 }
 
 /*
  * Frees a list of pages. 
  * Assumes all pages on list are in same zone, and of same order.
- * count is the number of pages to free, or 0 for all on the list.
+ * count is the number of pages to free.
  *
  * If the zone was previously in an "all pages pinned" state then look to
  * see if this freeing clears that state.
@@ -806,11 +806,14 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 	classzone_idx = zone_idx(zones[0]);
 
 restart:
-	/* Go through the zonelist once, looking for a zone with enough free */
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 			continue;
 
 		/*
@@ -845,6 +848,7 @@ zone_reclaim_retry:
 	 *
 	 * This is the last chance, in general, before the goto nopage.
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
@@ -852,7 +856,7 @@ zone_reclaim_retry:
 				       gfp_mask & __GFP_HIGH))
 			continue;
 
-		if (wait && !cpuset_zone_allowed(z))
+		if (wait && !cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -867,7 +871,7 @@ zone_reclaim_retry:
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 			/* go through the zonelist yet again, ignoring mins */
 			for (i = 0; (z = zones[i]) != NULL; i++) {
-				if (!cpuset_zone_allowed(z))
+				if (!cpuset_zone_allowed(z, gfp_mask))
 					continue;
 				page = buffered_rmqueue(z, order, gfp_mask);
 				if (page)
@@ -903,7 +907,7 @@ rebalance:
 					       gfp_mask & __GFP_HIGH))
 				continue;
 
-			if (!cpuset_zone_allowed(z))
+			if (!cpuset_zone_allowed(z, gfp_mask))
 				continue;
 
 			page = buffered_rmqueue(z, order, gfp_mask);
@@ -922,7 +926,7 @@ rebalance:
 					       classzone_idx, 0, 0))
 				continue;
 
-			if (!cpuset_zone_allowed(z))
+			if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 				continue;
 
 			page = buffered_rmqueue(z, order, gfp_mask);
@@ -1130,19 +1134,20 @@ EXPORT_SYMBOL(nr_pagecache);
 DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 #endif
 
-void __get_page_state(struct page_state *ret, int nr)
+void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
 	int cpu = 0;
 
 	memset(ret, 0, sizeof(*ret));
+	cpus_and(*cpumask, *cpumask, cpu_online_map);
 
-	cpu = first_cpu(cpu_online_map);
+	cpu = first_cpu(*cpumask);
 	while (cpu < NR_CPUS) {
 		unsigned long *in, *out, off;
 
 		in = (unsigned long *)&per_cpu(page_states, cpu);
 
-		cpu = next_cpu(cpu, cpu_online_map);
+		cpu = next_cpu(cpu, *cpumask);
 
 		if (cpu < NR_CPUS)
 			prefetch(&per_cpu(page_states, cpu));
@@ -1153,19 +1158,33 @@ void __get_page_state(struct page_state *ret, int nr)
 	}
 }
 
+void get_page_state_node(struct page_state *ret, int node)
+{
+	int nr;
+	cpumask_t mask = node_to_cpumask(node);
+
+	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
+	nr /= sizeof(unsigned long);
+
+	__get_page_state(ret, nr+1, &mask);
+}
+
 void get_page_state(struct page_state *ret)
 {
 	int nr;
+	cpumask_t mask = CPU_MASK_ALL;
 
 	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
 	nr /= sizeof(unsigned long);
 
-	__get_page_state(ret, nr + 1);
+	__get_page_state(ret, nr + 1, &mask);
 }
 
 void get_full_page_state(struct page_state *ret)
 {
-	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
+	cpumask_t mask = CPU_MASK_ALL;
+
+	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
 }
 
 unsigned long __read_page_state(unsigned long offset)
@@ -1909,7 +1928,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		zone->nr_scan_inactive = 0;
 		zone->nr_active = 0;
 		zone->nr_inactive = 0;
-		atomic_set(&zone->reclaim_in_progress, -1);
+		atomic_set(&zone->reclaim_in_progress, 0);
 		if (!size)
 			continue;
 
diff --git a/mm/readahead.c b/mm/readahead.c
index b840e7c6ea7..d0b50034e24 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -540,6 +540,7 @@ void handle_ra_miss(struct address_space *mapping,
 {
 	ra->flags |= RA_FLAG_MISS;
 	ra->flags &= ~RA_FLAG_INCACHE;
+	ra->cache_hit = 0;
 }
 
 /*
diff --git a/mm/rmap.c b/mm/rmap.c
index 08ac5c7fa91..450f5241b5a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -34,9 +34,8 @@
  *       anon_vma->lock
  *         mm->page_table_lock
  *           zone->lru_lock (in mark_page_accessed)
- *           swap_list_lock (in swap_free etc's swap_info_get)
+ *           swap_lock (in swap_duplicate, swap_info_get)
  *             mmlist_lock (in mmput, drain_mmlist and others)
- *             swap_device_lock (in swap_duplicate, swap_info_get)
  *             mapping->private_lock (in __set_page_dirty_buffers)
  *             inode_lock (in set_page_dirty's __mark_inode_dirty)
  *               sb_lock (within inode_lock in fs/fs-writeback.c)
@@ -290,8 +289,6 @@ static int page_referenced_one(struct page *page,
 	pte_t *pte;
 	int referenced = 0;
 
-	if (!get_mm_counter(mm, rss))
-		goto out;
 	address = vma_address(page, vma);
 	if (address == -EFAULT)
 		goto out;
@@ -442,22 +439,19 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
-	struct anon_vma *anon_vma = vma->anon_vma;
-	pgoff_t index;
-
 	BUG_ON(PageReserved(page));
-	BUG_ON(!anon_vma);
 
 	inc_mm_counter(vma->vm_mm, anon_rss);
 
-	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
-	index = (address - vma->vm_start) >> PAGE_SHIFT;
-	index += vma->vm_pgoff;
-	index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
-
 	if (atomic_inc_and_test(&page->_mapcount)) {
-		page->index = index;
+		struct anon_vma *anon_vma = vma->anon_vma;
+
+		BUG_ON(!anon_vma);
+		anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 		page->mapping = (struct address_space *) anon_vma;
+
+		page->index = linear_page_index(vma, address);
+
 		inc_page_state(nr_mapped);
 	}
 	/* else checking page index and mapping is racy */
@@ -518,8 +512,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	pte_t pteval;
 	int ret = SWAP_AGAIN;
 
-	if (!get_mm_counter(mm, rss))
-		goto out;
 	address = vma_address(page, vma);
 	if (address == -EFAULT)
 		goto out;
@@ -532,6 +524,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	 * If the page is mlock()d, we cannot swap it out.
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
+	 *
+	 * Pages belonging to VM_RESERVED regions should not happen here.
 	 */
 	if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
 			ptep_clear_flush_young(vma, address, pte)) {
@@ -767,8 +761,7 @@ static int try_to_unmap_file(struct page *page)
 			if (vma->vm_flags & (VM_LOCKED|VM_RESERVED))
 				continue;
 			cursor = (unsigned long) vma->vm_private_data;
-			while (get_mm_counter(vma->vm_mm, rss) &&
-				cursor < max_nl_cursor &&
+			while ( cursor < max_nl_cursor &&
 				cursor < vma->vm_end - vma->vm_start) {
 				try_to_unmap_cluster(cursor, &mapcount, vma);
 				cursor += CLUSTER_SIZE;
diff --git a/mm/shmem.c b/mm/shmem.c
index 5a81b1ee4f7..1f7aeb210c7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,7 +45,6 @@
 #include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
-#include <linux/xattr.h>
 #include <asm/uaccess.h>
 #include <asm/div64.h>
 #include <asm/pgtable.h>
@@ -179,10 +178,9 @@ static struct address_space_operations shmem_aops;
 static struct file_operations shmem_file_operations;
 static struct inode_operations shmem_inode_operations;
 static struct inode_operations shmem_dir_inode_operations;
-static struct inode_operations shmem_special_inode_operations;
 static struct vm_operations_struct shmem_vm_ops;
 
-static struct backing_dev_info shmem_backing_dev_info = {
+static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
 	.unplug_io_fn	= default_unplug_io_fn,
@@ -668,6 +666,7 @@ static void shmem_delete_inode(struct inode *inode)
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	if (inode->i_op->truncate == shmem_truncate) {
+		truncate_inode_pages(inode->i_mapping, 0);
 		shmem_unacct_size(info->flags, inode->i_size);
 		inode->i_size = 0;
 		shmem_truncate(inode);
@@ -1195,6 +1194,7 @@ static int shmem_populate(struct vm_area_struct *vma,
 		err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
 		if (err)
 			return err;
+		/* Page may still be null, but only if nonblock was set. */
 		if (page) {
 			mark_page_accessed(page);
 			err = install_page(mm, vma, addr, page, prot);
@@ -1202,7 +1202,10 @@ static int shmem_populate(struct vm_area_struct *vma,
 				page_cache_release(page);
 				return err;
 			}
-		} else if (nonblock) {
+		} else {
+			/* No page was found just because we can't read it in
+			 * now (being here implies nonblock != 0), but the page
+			 * may exist, so set the PTE to fault it in later. */
     			err = install_file_pte(mm, vma, addr, pgoff, prot);
 			if (err)
 	    			return err;
@@ -1296,7 +1299,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 
 		switch (mode & S_IFMT) {
 		default:
-			inode->i_op = &shmem_special_inode_operations;
 			init_special_inode(inode, mode, dev);
 			break;
 		case S_IFREG:
@@ -1606,6 +1608,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	int error = -ENOSPC;
 
 	if (inode) {
+		error = security_inode_init_security(inode, dir, NULL, NULL,
+						     NULL);
+		if (error) {
+			if (error != -EOPNOTSUPP) {
+				iput(inode);
+				return error;
+			}
+			error = 0;
+		}
 		if (dir->i_mode & S_ISGID) {
 			inode->i_gid = dir->i_gid;
 			if (S_ISDIR(mode))
@@ -1615,7 +1626,6 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 		d_instantiate(dentry, inode);
 		dget(dentry); /* Extra count - pin the dentry in core */
-		error = 0;
 	}
 	return error;
 }
@@ -1745,6 +1755,16 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 	if (!inode)
 		return -ENOSPC;
 
+	error = security_inode_init_security(inode, dir, NULL, NULL,
+					     NULL);
+	if (error) {
+		if (error != -EOPNOTSUPP) {
+			iput(inode);
+			return error;
+		}
+		error = 0;
+	}
+
 	info = SHMEM_I(inode);
 	inode->i_size = len-1;
 	if (len <= (char *)inode - (char *)info) {
@@ -1800,12 +1820,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
 static struct inode_operations shmem_symlink_inline_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= shmem_follow_link_inline,
-#ifdef CONFIG_TMPFS_XATTR
-	.setxattr       = generic_setxattr,
-	.getxattr       = generic_getxattr,
-	.listxattr      = generic_listxattr,
-	.removexattr    = generic_removexattr,
-#endif
 };
 
 static struct inode_operations shmem_symlink_inode_operations = {
@@ -1813,12 +1827,6 @@ static struct inode_operations shmem_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= shmem_follow_link,
 	.put_link	= shmem_put_link,
-#ifdef CONFIG_TMPFS_XATTR
-	.setxattr       = generic_setxattr,
-	.getxattr       = generic_getxattr,
-	.listxattr      = generic_listxattr,
-	.removexattr    = generic_removexattr,
-#endif
 };
 
 static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
@@ -1938,12 +1946,6 @@ static void shmem_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
-#ifdef CONFIG_TMPFS_XATTR
-static struct xattr_handler *shmem_xattr_handlers[];
-#else
-#define shmem_xattr_handlers NULL
-#endif
-
 static int shmem_fill_super(struct super_block *sb,
 			    void *data, int silent)
 {
@@ -1994,7 +1996,6 @@ static int shmem_fill_super(struct super_block *sb,
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 	sb->s_magic = TMPFS_MAGIC;
 	sb->s_op = &shmem_ops;
-	sb->s_xattr = shmem_xattr_handlers;
 
 	inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
 	if (!inode)
@@ -2083,12 +2084,6 @@ static struct file_operations shmem_file_operations = {
 static struct inode_operations shmem_inode_operations = {
 	.truncate	= shmem_truncate,
 	.setattr	= shmem_notify_change,
-#ifdef CONFIG_TMPFS_XATTR
-	.setxattr       = generic_setxattr,
-	.getxattr       = generic_getxattr,
-	.listxattr      = generic_listxattr,
-	.removexattr    = generic_removexattr,
-#endif
 };
 
 static struct inode_operations shmem_dir_inode_operations = {
@@ -2102,21 +2097,6 @@ static struct inode_operations shmem_dir_inode_operations = {
 	.rmdir		= shmem_rmdir,
 	.mknod		= shmem_mknod,
 	.rename		= shmem_rename,
-#ifdef CONFIG_TMPFS_XATTR
-	.setxattr       = generic_setxattr,
-	.getxattr       = generic_getxattr,
-	.listxattr      = generic_listxattr,
-	.removexattr    = generic_removexattr,
-#endif
-#endif
-};
-
-static struct inode_operations shmem_special_inode_operations = {
-#ifdef CONFIG_TMPFS_XATTR
-	.setxattr	= generic_setxattr,
-	.getxattr	= generic_getxattr,
-	.listxattr	= generic_listxattr,
-	.removexattr	= generic_removexattr,
 #endif
 };
 
@@ -2142,48 +2122,6 @@ static struct vm_operations_struct shmem_vm_ops = {
 };
 
 
-#ifdef CONFIG_TMPFS_SECURITY
-
-static size_t shmem_xattr_security_list(struct inode *inode, char *list, size_t list_len,
-					const char *name, size_t name_len)
-{
-	return security_inode_listsecurity(inode, list, list_len);
-}
-
-static int shmem_xattr_security_get(struct inode *inode, const char *name, void *buffer, size_t size)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	return security_inode_getsecurity(inode, name, buffer, size);
-}
-
-static int shmem_xattr_security_set(struct inode *inode, const char *name, const void *value, size_t size, int flags)
-{
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-	return security_inode_setsecurity(inode, name, value, size, flags);
-}
-
-static struct xattr_handler shmem_xattr_security_handler = {
-	.prefix	= XATTR_SECURITY_PREFIX,
-	.list	= shmem_xattr_security_list,
-	.get	= shmem_xattr_security_get,
-	.set	= shmem_xattr_security_set,
-};
-
-#endif	/* CONFIG_TMPFS_SECURITY */
-
-#ifdef CONFIG_TMPFS_XATTR
-
-static struct xattr_handler *shmem_xattr_handlers[] = {
-#ifdef CONFIG_TMPFS_SECURITY
-	&shmem_xattr_security_handler,
-#endif
-	NULL
-};
-
-#endif	/* CONFIG_TMPFS_XATTR */
-
 static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
diff --git a/mm/slab.c b/mm/slab.c
index c9e706db463..9e876d6dfad 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -75,6 +75,15 @@
  *
  *	At present, each engine can be growing a cache.  This should be blocked.
  *
+ * 15 March 2005. NUMA slab allocator.
+ *	Shai Fultheim <shai@scalex86.org>.
+ *	Shobhit Dayal <shobhit@calsoftinc.com>
+ *	Alok N Kataria <alokk@calsoftinc.com>
+ *	Christoph Lameter <christoph@lameter.com>
+ *
+ *	Modified the slab allocator to be node aware on NUMA systems.
+ *	Each node has its own list of partial, free and full slabs.
+ *	All object allocations for a node occur from node specific slab lists.
  */
 
 #include	<linux/config.h>
@@ -93,6 +102,7 @@
 #include	<linux/module.h>
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
+#include	<linux/nodemask.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
@@ -189,6 +199,7 @@
  * is less than 512 (PAGE_SIZE<<3), but greater than 256.
  */
 
+typedef unsigned int kmem_bufctl_t;
 #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
 #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
 #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)
@@ -211,6 +222,7 @@ struct slab {
 	void			*s_mem;		/* including colour offset */
 	unsigned int		inuse;		/* num of objs active in slab */
 	kmem_bufctl_t		free;
+	unsigned short          nodeid;
 };
 
 /*
@@ -238,7 +250,6 @@ struct slab_rcu {
 /*
  * struct array_cache
  *
- * Per cpu structures
  * Purpose:
  * - LIFO ordering, to hand out cache-warm objects from _alloc
  * - reduce the number of linked list operations
@@ -253,6 +264,13 @@ struct array_cache {
 	unsigned int limit;
 	unsigned int batchcount;
 	unsigned int touched;
+	spinlock_t lock;
+	void *entry[0];		/*
+				 * Must have this definition in here for the proper
+				 * alignment of array_cache. Also simplifies accessing
+				 * the entries.
+				 * [0] is for gcc 2.95. It should really be [].
+				 */
 };
 
 /* bootstrap: The caches do not work without cpuarrays anymore,
@@ -265,34 +283,83 @@ struct arraycache_init {
 };
 
 /*
- * The slab lists of all objects.
- * Hopefully reduce the internal fragmentation
- * NUMA: The spinlock could be moved from the kmem_cache_t
- * into this structure, too. Figure out what causes
- * fewer cross-node spinlock operations.
+ * The slab lists for all objects.
  */
 struct kmem_list3 {
 	struct list_head	slabs_partial;	/* partial list first, better asm code */
 	struct list_head	slabs_full;
 	struct list_head	slabs_free;
 	unsigned long	free_objects;
-	int		free_touched;
 	unsigned long	next_reap;
-	struct array_cache	*shared;
+	int		free_touched;
+	unsigned int 	free_limit;
+	spinlock_t      list_lock;
+	struct array_cache	*shared;	/* shared per node */
+	struct array_cache	**alien;	/* on other nodes */
 };
 
-#define LIST3_INIT(parent) \
-	{ \
-		.slabs_full	= LIST_HEAD_INIT(parent.slabs_full), \
-		.slabs_partial	= LIST_HEAD_INIT(parent.slabs_partial), \
-		.slabs_free	= LIST_HEAD_INIT(parent.slabs_free) \
+/*
+ * Need this for bootstrapping a per node allocator.
+ */
+#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
+struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
+#define	CACHE_CACHE 0
+#define	SIZE_AC 1
+#define	SIZE_L3 (1 + MAX_NUMNODES)
+
+/*
+ * This function may be completely optimized away if
+ * a constant is passed to it. Mostly the same as
+ * what is in linux/slab.h except it returns an
+ * index.
+ */
+static inline int index_of(const size_t size)
+{
+	if (__builtin_constant_p(size)) {
+		int i = 0;
+
+#define CACHE(x) \
+	if (size <=x) \
+		return i; \
+	else \
+		i++;
+#include "linux/kmalloc_sizes.h"
+#undef CACHE
+		{
+			extern void __bad_size(void);
+			__bad_size();
+		}
 	}
-#define list3_data(cachep) \
-	(&(cachep)->lists)
+	return 0;
+}
+
+#define INDEX_AC index_of(sizeof(struct arraycache_init))
+#define INDEX_L3 index_of(sizeof(struct kmem_list3))
+
+static inline void kmem_list3_init(struct kmem_list3 *parent)
+{
+	INIT_LIST_HEAD(&parent->slabs_full);
+	INIT_LIST_HEAD(&parent->slabs_partial);
+	INIT_LIST_HEAD(&parent->slabs_free);
+	parent->shared = NULL;
+	parent->alien = NULL;
+	spin_lock_init(&parent->list_lock);
+	parent->free_objects = 0;
+	parent->free_touched = 0;
+}
 
-/* NUMA: per-node */
-#define list3_data_ptr(cachep, ptr) \
-		list3_data(cachep)
+#define MAKE_LIST(cachep, listp, slab, nodeid)	\
+	do {	\
+		INIT_LIST_HEAD(listp);		\
+		list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
+	} while (0)
+
+#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)			\
+	do {					\
+	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
+	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
+	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
+	} while (0)
 
 /*
  * kmem_cache_t
@@ -305,13 +372,12 @@ struct kmem_cache_s {
 	struct array_cache	*array[NR_CPUS];
 	unsigned int		batchcount;
 	unsigned int		limit;
-/* 2) touched by every alloc & free from the backend */
-	struct kmem_list3	lists;
-	/* NUMA: kmem_3list_t	*nodelists[MAX_NUMNODES] */
+	unsigned int 		shared;
 	unsigned int		objsize;
+/* 2) touched by every alloc & free from the backend */
+	struct kmem_list3	*nodelists[MAX_NUMNODES];
 	unsigned int	 	flags;	/* constant flags */
 	unsigned int		num;	/* # of objs per slab */
-	unsigned int		free_limit; /* upper limit of objects in the lists */
 	spinlock_t		spinlock;
 
 /* 3) cache_grow/shrink */
@@ -348,6 +414,7 @@ struct kmem_cache_s {
 	unsigned long 		errors;
 	unsigned long		max_freeable;
 	unsigned long		node_allocs;
+	unsigned long		node_frees;
 	atomic_t		allochit;
 	atomic_t		allocmiss;
 	atomic_t		freehit;
@@ -383,6 +450,7 @@ struct kmem_cache_s {
 				} while (0)
 #define	STATS_INC_ERR(x)	((x)->errors++)
 #define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
+#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
 #define	STATS_SET_FREEABLE(x, i) \
 				do { if ((x)->max_freeable < i) \
 					(x)->max_freeable = i; \
@@ -401,6 +469,7 @@ struct kmem_cache_s {
 #define	STATS_SET_HIGH(x)	do { } while (0)
 #define	STATS_INC_ERR(x)	do { } while (0)
 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
+#define	STATS_INC_NODEFREES(x)	do { } while (0)
 #define	STATS_SET_FREEABLE(x, i) \
 				do { } while (0)
 
@@ -533,9 +602,9 @@ static struct arraycache_init initarray_generic =
 
 /* internal cache of cache description objs */
 static kmem_cache_t cache_cache = {
-	.lists		= LIST3_INIT(cache_cache.lists),
 	.batchcount	= 1,
 	.limit		= BOOT_CPUCACHE_ENTRIES,
+	.shared		= 1,
 	.objsize	= sizeof(kmem_cache_t),
 	.flags		= SLAB_NO_REAP,
 	.spinlock	= SPIN_LOCK_UNLOCKED,
@@ -556,7 +625,6 @@ static struct list_head cache_chain;
  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
  */
 atomic_t slab_reclaim_pages;
-EXPORT_SYMBOL(slab_reclaim_pages);
 
 /*
  * chicken and egg problem: delay the per-cpu array allocation
@@ -564,7 +632,8 @@ EXPORT_SYMBOL(slab_reclaim_pages);
  */
 static enum {
 	NONE,
-	PARTIAL,
+	PARTIAL_AC,
+	PARTIAL_L3,
 	FULL
 } g_cpucache_up;
 
@@ -573,11 +642,7 @@ static DEFINE_PER_CPU(struct work_struct, reap_work);
 static void free_block(kmem_cache_t* cachep, void** objpp, int len);
 static void enable_cpucache (kmem_cache_t *cachep);
 static void cache_reap (void *unused);
-
-static inline void **ac_entry(struct array_cache *ac)
-{
-	return (void**)(ac+1);
-}
+static int __node_shrink(kmem_cache_t *cachep, int node);
 
 static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 {
@@ -600,7 +665,7 @@ static inline kmem_cache_t *__find_general_cachep(size_t size,
 		csizep++;
 
 	/*
-	 * Really subtile: The last entry with cs->cs_size==ULONG_MAX
+	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
 	 * has cs_{dma,}cachep==NULL. Thus no special case
 	 * for large kmalloc calls required.
 	 */
@@ -675,48 +740,160 @@ static void __devinit start_cpu_timer(int cpu)
 	}
 }
 
-static struct array_cache *alloc_arraycache(int cpu, int entries,
+static struct array_cache *alloc_arraycache(int node, int entries,
 						int batchcount)
 {
 	int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
 	struct array_cache *nc = NULL;
 
-	if (cpu == -1)
-		nc = kmalloc(memsize, GFP_KERNEL);
-	else
-		nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
-
+	nc = kmalloc_node(memsize, GFP_KERNEL, node);
 	if (nc) {
 		nc->avail = 0;
 		nc->limit = entries;
 		nc->batchcount = batchcount;
 		nc->touched = 0;
+		spin_lock_init(&nc->lock);
 	}
 	return nc;
 }
 
+#ifdef CONFIG_NUMA
+static inline struct array_cache **alloc_alien_cache(int node, int limit)
+{
+	struct array_cache **ac_ptr;
+	int memsize = sizeof(void*)*MAX_NUMNODES;
+	int i;
+
+	if (limit > 1)
+		limit = 12;
+	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+	if (ac_ptr) {
+		for_each_node(i) {
+			if (i == node || !node_online(i)) {
+				ac_ptr[i] = NULL;
+				continue;
+			}
+			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
+			if (!ac_ptr[i]) {
+				for (i--; i <=0; i--)
+					kfree(ac_ptr[i]);
+				kfree(ac_ptr);
+				return NULL;
+			}
+		}
+	}
+	return ac_ptr;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+	int i;
+
+	if (!ac_ptr)
+		return;
+
+	for_each_node(i)
+		kfree(ac_ptr[i]);
+
+	kfree(ac_ptr);
+}
+
+static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
+{
+	struct kmem_list3 *rl3 = cachep->nodelists[node];
+
+	if (ac->avail) {
+		spin_lock(&rl3->list_lock);
+		free_block(cachep, ac->entry, ac->avail);
+		ac->avail = 0;
+		spin_unlock(&rl3->list_lock);
+	}
+}
+
+static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
+{
+	int i=0;
+	struct array_cache *ac;
+	unsigned long flags;
+
+	for_each_online_node(i) {
+		ac = l3->alien[i];
+		if (ac) {
+			spin_lock_irqsave(&ac->lock, flags);
+			__drain_alien_cache(cachep, ac, i);
+			spin_unlock_irqrestore(&ac->lock, flags);
+		}
+	}
+}
+#else
+#define alloc_alien_cache(node, limit) do { } while (0)
+#define free_alien_cache(ac_ptr) do { } while (0)
+#define drain_alien_cache(cachep, l3) do { } while (0)
+#endif
+
 static int __devinit cpuup_callback(struct notifier_block *nfb,
 				  unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
 	kmem_cache_t* cachep;
+	struct kmem_list3 *l3 = NULL;
+	int node = cpu_to_node(cpu);
+	int memsize = sizeof(struct kmem_list3);
+	struct array_cache *nc = NULL;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
 		down(&cache_chain_sem);
+		/* we need to do this right in the beginning since
+		 * alloc_arraycache's are going to use this list.
+		 * kmalloc_node allows us to add the slab to the right
+		 * kmem_list3 and not this cpu's kmem_list3
+		 */
+
 		list_for_each_entry(cachep, &cache_chain, next) {
-			struct array_cache *nc;
+			/* setup the size64 kmemlist for cpu before we can
+			 * begin anything. Make sure some other cpu on this
+			 * node has not already allocated this
+			 */
+			if (!cachep->nodelists[node]) {
+				if (!(l3 = kmalloc_node(memsize,
+						GFP_KERNEL, node)))
+					goto bad;
+				kmem_list3_init(l3);
+				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+				  ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+
+				cachep->nodelists[node] = l3;
+			}
 
-			nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
+			spin_lock_irq(&cachep->nodelists[node]->list_lock);
+			cachep->nodelists[node]->free_limit =
+				(1 + nr_cpus_node(node)) *
+				cachep->batchcount + cachep->num;
+			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+		}
+
+		/* Now we can go ahead with allocating the shared array's
+		  & array cache's */
+		list_for_each_entry(cachep, &cache_chain, next) {
+			nc = alloc_arraycache(node, cachep->limit,
+					cachep->batchcount);
 			if (!nc)
 				goto bad;
-
-			spin_lock_irq(&cachep->spinlock);
 			cachep->array[cpu] = nc;
-			cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
-						+ cachep->num;
-			spin_unlock_irq(&cachep->spinlock);
 
+			l3 = cachep->nodelists[node];
+			BUG_ON(!l3);
+			if (!l3->shared) {
+				if (!(nc = alloc_arraycache(node,
+					cachep->shared*cachep->batchcount,
+					0xbaadf00d)))
+					goto  bad;
+
+				/* we are serialised from CPU_DEAD or
+				  CPU_UP_CANCELLED by the cpucontrol lock */
+				l3->shared = nc;
+			}
 		}
 		up(&cache_chain_sem);
 		break;
@@ -731,13 +908,51 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
+			cpumask_t mask;
 
+			mask = node_to_cpumask(node);
 			spin_lock_irq(&cachep->spinlock);
 			/* cpu is dead; no one can alloc from it. */
 			nc = cachep->array[cpu];
 			cachep->array[cpu] = NULL;
-			cachep->free_limit -= cachep->batchcount;
-			free_block(cachep, ac_entry(nc), nc->avail);
+			l3 = cachep->nodelists[node];
+
+			if (!l3)
+				goto unlock_cache;
+
+			spin_lock(&l3->list_lock);
+
+			/* Free limit for this kmem_list3 */
+			l3->free_limit -= cachep->batchcount;
+			if (nc)
+				free_block(cachep, nc->entry, nc->avail);
+
+			if (!cpus_empty(mask)) {
+                                spin_unlock(&l3->list_lock);
+                                goto unlock_cache;
+                        }
+
+			if (l3->shared) {
+				free_block(cachep, l3->shared->entry,
+						l3->shared->avail);
+				kfree(l3->shared);
+				l3->shared = NULL;
+			}
+			if (l3->alien) {
+				drain_alien_cache(cachep, l3);
+				free_alien_cache(l3->alien);
+				l3->alien = NULL;
+			}
+
+			/* free slabs belonging to this node */
+			if (__node_shrink(cachep, node)) {
+				cachep->nodelists[node] = NULL;
+				spin_unlock(&l3->list_lock);
+				kfree(l3);
+			} else {
+				spin_unlock(&l3->list_lock);
+			}
+unlock_cache:
 			spin_unlock_irq(&cachep->spinlock);
 			kfree(nc);
 		}
@@ -753,6 +968,25 @@ bad:
 
 static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
 
+/*
+ * swap the static kmem_list3 with kmalloced memory
+ */
+static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
+		int nodeid)
+{
+	struct kmem_list3 *ptr;
+
+	BUG_ON(cachep->nodelists[nodeid] != list);
+	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+	BUG_ON(!ptr);
+
+	local_irq_disable();
+	memcpy(ptr, list, sizeof(struct kmem_list3));
+	MAKE_ALL_LISTS(cachep, ptr, nodeid);
+	cachep->nodelists[nodeid] = ptr;
+	local_irq_enable();
+}
+
 /* Initialisation.
  * Called after the gfp() functions have been enabled, and before smp_init().
  */
@@ -761,6 +995,13 @@ void __init kmem_cache_init(void)
 	size_t left_over;
 	struct cache_sizes *sizes;
 	struct cache_names *names;
+	int i;
+
+	for (i = 0; i < NUM_INIT_LISTS; i++) {
+		kmem_list3_init(&initkmem_list3[i]);
+		if (i < MAX_NUMNODES)
+			cache_cache.nodelists[i] = NULL;
+	}
 
 	/*
 	 * Fragmentation resistance on low memory - only use bigger
@@ -769,21 +1010,24 @@ void __init kmem_cache_init(void)
 	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
 		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
 
-	
 	/* Bootstrap is tricky, because several objects are allocated
 	 * from caches that do not exist yet:
 	 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
 	 *    structures of all caches, except cache_cache itself: cache_cache
 	 *    is statically allocated.
-	 *    Initially an __init data area is used for the head array, it's
-	 *    replaced with a kmalloc allocated array at the end of the bootstrap.
+	 *    Initially an __init data area is used for the head array and the
+	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
+	 *    array at the end of the bootstrap.
 	 * 2) Create the first kmalloc cache.
-	 *    The kmem_cache_t for the new cache is allocated normally. An __init
-	 *    data area is used for the head array.
-	 * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
+	 *    The kmem_cache_t for the new cache is allocated normally.
+	 *    An __init data area is used for the head array.
+	 * 3) Create the remaining kmalloc caches, with minimally sized
+	 *    head arrays.
 	 * 4) Replace the __init data head arrays for cache_cache and the first
 	 *    kmalloc cache with kmalloc allocated arrays.
-	 * 5) Resize the head arrays of the kmalloc caches to their final sizes.
+	 * 5) Replace the __init data for kmem_list3 for cache_cache and
+	 *    the other cache's with kmalloc allocated memory.
+	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
 	 */
 
 	/* 1) create the cache_cache */
@@ -792,6 +1036,7 @@ void __init kmem_cache_init(void)
 	list_add(&cache_cache.next, &cache_chain);
 	cache_cache.colour_off = cache_line_size();
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
+	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
 
 	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
 
@@ -809,15 +1054,33 @@ void __init kmem_cache_init(void)
 	sizes = malloc_sizes;
 	names = cache_names;
 
+	/* Initialize the caches that provide memory for the array cache
+	 * and the kmem_list3 structures first.
+	 * Without this, further allocations will bug
+	 */
+
+	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
+				sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
+				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+
+	if (INDEX_AC != INDEX_L3)
+		sizes[INDEX_L3].cs_cachep =
+			kmem_cache_create(names[INDEX_L3].name,
+				sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
+				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+
 	while (sizes->cs_size != ULONG_MAX) {
-		/* For performance, all the general caches are L1 aligned.
+		/*
+		 * For performance, all the general caches are L1 aligned.
 		 * This should be particularly beneficial on SMP boxes, as it
 		 * eliminates "false sharing".
 		 * Note for systems short on memory removing the alignment will
-		 * allow tighter packing of the smaller caches. */
-		sizes->cs_cachep = kmem_cache_create(names->name,
-			sizes->cs_size, ARCH_KMALLOC_MINALIGN,
-			(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+		 * allow tighter packing of the smaller caches.
+		 */
+		if(!sizes->cs_cachep)
+			sizes->cs_cachep = kmem_cache_create(names->name,
+				sizes->cs_size, ARCH_KMALLOC_MINALIGN,
+				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
 
 		/* Inc off-slab bufctl limit until the ceiling is hit. */
 		if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -836,24 +1099,47 @@ void __init kmem_cache_init(void)
 	/* 4) Replace the bootstrap head arrays */
 	{
 		void * ptr;
-		
+
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
 		local_irq_disable();
 		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
-		memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
+		memcpy(ptr, ac_data(&cache_cache),
+				sizeof(struct arraycache_init));
 		cache_cache.array[smp_processor_id()] = ptr;
 		local_irq_enable();
-	
+
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
 		local_irq_disable();
-		BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
-		memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
+		BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
+				!= &initarray_generic.cache);
+		memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
 				sizeof(struct arraycache_init));
-		malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
+		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
+						ptr;
 		local_irq_enable();
 	}
+	/* 5) Replace the bootstrap kmem_list3's */
+	{
+		int node;
+		/* Replace the static kmem_list3 structures for the boot cpu */
+		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
+				numa_node_id());
+
+		for_each_online_node(node) {
+			init_list(malloc_sizes[INDEX_AC].cs_cachep,
+					&initkmem_list3[SIZE_AC+node], node);
+
+			if (INDEX_AC != INDEX_L3) {
+				init_list(malloc_sizes[INDEX_L3].cs_cachep,
+						&initkmem_list3[SIZE_L3+node],
+						node);
+			}
+		}
+	}
 
-	/* 5) resize the head arrays to their final sizes */
+	/* 6) resize the head arrays to their final sizes */
 	{
 		kmem_cache_t *cachep;
 		down(&cache_chain_sem);
@@ -869,7 +1155,6 @@ void __init kmem_cache_init(void)
 	 * that initializes ac_data for all new cpus
 	 */
 	register_cpu_notifier(&cpucache_notifier);
-	
 
 	/* The reap timers are started later, with a module init call:
 	 * That part of the kernel is not yet operational.
@@ -884,10 +1169,8 @@ static int __init cpucache_init(void)
 	 * Register the timers that return unneeded
 	 * pages to gfp.
 	 */
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		if (cpu_online(cpu))
-			start_cpu_timer(cpu);
-	}
+	for_each_online_cpu(cpu)
+		start_cpu_timer(cpu);
 
 	return 0;
 }
@@ -1166,6 +1449,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
 	}
 }
 
+/* For setting up all the kmem_list3s for cache whose objsize is same
+   as size of kmem_list3. */
+static inline void set_up_list3s(kmem_cache_t *cachep, int index)
+{
+	int node;
+
+	for_each_online_node(node) {
+		cachep->nodelists[node] = &initkmem_list3[index+node];
+		cachep->nodelists[node]->next_reap = jiffies +
+			REAPTIMEOUT_LIST3 +
+			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+	}
+}
+
 /**
  * kmem_cache_create - Create a cache.
  * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1319,7 +1616,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		size += BYTES_PER_WORD;
 	}
 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
-	if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
+	if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
 		cachep->dbghead += PAGE_SIZE - size;
 		size = PAGE_SIZE;
 	}
@@ -1421,13 +1718,9 @@ next:
 		cachep->gfpflags |= GFP_DMA;
 	spin_lock_init(&cachep->spinlock);
 	cachep->objsize = size;
-	/* NUMA */
-	INIT_LIST_HEAD(&cachep->lists.slabs_full);
-	INIT_LIST_HEAD(&cachep->lists.slabs_partial);
-	INIT_LIST_HEAD(&cachep->lists.slabs_free);
 
 	if (flags & CFLGS_OFF_SLAB)
-		cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
+		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
 	cachep->ctor = ctor;
 	cachep->dtor = dtor;
 	cachep->name = name;
@@ -1443,11 +1736,43 @@ next:
 			 * the cache that's used by kmalloc(24), otherwise
 			 * the creation of further caches will BUG().
 			 */
-			cachep->array[smp_processor_id()] = &initarray_generic.cache;
-			g_cpucache_up = PARTIAL;
+			cachep->array[smp_processor_id()] =
+				&initarray_generic.cache;
+
+			/* If the cache that's used by
+			 * kmalloc(sizeof(kmem_list3)) is the first cache,
+			 * then we need to set up all its list3s, otherwise
+			 * the creation of further caches will BUG().
+			 */
+			set_up_list3s(cachep, SIZE_AC);
+			if (INDEX_AC == INDEX_L3)
+				g_cpucache_up = PARTIAL_L3;
+			else
+				g_cpucache_up = PARTIAL_AC;
 		} else {
-			cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
+			cachep->array[smp_processor_id()] =
+				kmalloc(sizeof(struct arraycache_init),
+						GFP_KERNEL);
+
+			if (g_cpucache_up == PARTIAL_AC) {
+				set_up_list3s(cachep, SIZE_L3);
+				g_cpucache_up = PARTIAL_L3;
+			} else {
+				int node;
+				for_each_online_node(node) {
+
+					cachep->nodelists[node] =
+						kmalloc_node(sizeof(struct kmem_list3),
+								GFP_KERNEL, node);
+					BUG_ON(!cachep->nodelists[node]);
+					kmem_list3_init(cachep->nodelists[node]);
+				}
+			}
 		}
+		cachep->nodelists[numa_node_id()]->next_reap =
+			jiffies + REAPTIMEOUT_LIST3 +
+			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+
 		BUG_ON(!ac_data(cachep));
 		ac_data(cachep)->avail = 0;
 		ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
@@ -1455,13 +1780,8 @@ next:
 		ac_data(cachep)->touched = 0;
 		cachep->batchcount = 1;
 		cachep->limit = BOOT_CPUCACHE_ENTRIES;
-		cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
-					+ cachep->num;
 	} 
 
-	cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
-					((unsigned long)cachep)%REAPTIMEOUT_LIST3;
-
 	/* Need the semaphore to access the chain. */
 	down(&cache_chain_sem);
 	{
@@ -1518,13 +1838,23 @@ static void check_spinlock_acquired(kmem_cache_t *cachep)
 {
 #ifdef CONFIG_SMP
 	check_irq_off();
-	BUG_ON(spin_trylock(&cachep->spinlock));
+	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
+#endif
+}
+
+static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
+{
+#ifdef CONFIG_SMP
+	check_irq_off();
+	assert_spin_locked(&cachep->nodelists[node]->list_lock);
 #endif
 }
+
 #else
 #define check_irq_off()	do { } while(0)
 #define check_irq_on()	do { } while(0)
 #define check_spinlock_acquired(x) do { } while(0)
+#define check_spinlock_acquired_node(x, y) do { } while(0)
 #endif
 
 /*
@@ -1546,7 +1876,7 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
 }
 
 static void drain_array_locked(kmem_cache_t* cachep,
-				struct array_cache *ac, int force);
+				struct array_cache *ac, int force, int node);
 
 static void do_drain(void *arg)
 {
@@ -1555,59 +1885,82 @@ static void do_drain(void *arg)
 
 	check_irq_off();
 	ac = ac_data(cachep);
-	spin_lock(&cachep->spinlock);
-	free_block(cachep, &ac_entry(ac)[0], ac->avail);
-	spin_unlock(&cachep->spinlock);
+	spin_lock(&cachep->nodelists[numa_node_id()]->list_lock);
+	free_block(cachep, ac->entry, ac->avail);
+	spin_unlock(&cachep->nodelists[numa_node_id()]->list_lock);
 	ac->avail = 0;
 }
 
 static void drain_cpu_caches(kmem_cache_t *cachep)
 {
+	struct kmem_list3 *l3;
+	int node;
+
 	smp_call_function_all_cpus(do_drain, cachep);
 	check_irq_on();
 	spin_lock_irq(&cachep->spinlock);
-	if (cachep->lists.shared)
-		drain_array_locked(cachep, cachep->lists.shared, 1);
+	for_each_online_node(node)  {
+		l3 = cachep->nodelists[node];
+		if (l3) {
+			spin_lock(&l3->list_lock);
+			drain_array_locked(cachep, l3->shared, 1, node);
+			spin_unlock(&l3->list_lock);
+			if (l3->alien)
+				drain_alien_cache(cachep, l3);
+		}
+	}
 	spin_unlock_irq(&cachep->spinlock);
 }
 
-
-/* NUMA shrink all list3s */
-static int __cache_shrink(kmem_cache_t *cachep)
+static int __node_shrink(kmem_cache_t *cachep, int node)
 {
 	struct slab *slabp;
+	struct kmem_list3 *l3 = cachep->nodelists[node];
 	int ret;
 
-	drain_cpu_caches(cachep);
-
-	check_irq_on();
-	spin_lock_irq(&cachep->spinlock);
-
-	for(;;) {
+	for (;;) {
 		struct list_head *p;
 
-		p = cachep->lists.slabs_free.prev;
-		if (p == &cachep->lists.slabs_free)
+		p = l3->slabs_free.prev;
+		if (p == &l3->slabs_free)
 			break;
 
-		slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list);
+		slabp = list_entry(l3->slabs_free.prev, struct slab, list);
 #if DEBUG
 		if (slabp->inuse)
 			BUG();
 #endif
 		list_del(&slabp->list);
 
-		cachep->lists.free_objects -= cachep->num;
-		spin_unlock_irq(&cachep->spinlock);
+		l3->free_objects -= cachep->num;
+		spin_unlock_irq(&l3->list_lock);
 		slab_destroy(cachep, slabp);
-		spin_lock_irq(&cachep->spinlock);
+		spin_lock_irq(&l3->list_lock);
 	}
-	ret = !list_empty(&cachep->lists.slabs_full) ||
-		!list_empty(&cachep->lists.slabs_partial);
-	spin_unlock_irq(&cachep->spinlock);
+	ret = !list_empty(&l3->slabs_full) ||
+		!list_empty(&l3->slabs_partial);
 	return ret;
 }
 
+static int __cache_shrink(kmem_cache_t *cachep)
+{
+	int ret = 0, i = 0;
+	struct kmem_list3 *l3;
+
+	drain_cpu_caches(cachep);
+
+	check_irq_on();
+	for_each_online_node(i) {
+		l3 = cachep->nodelists[i];
+		if (l3) {
+			spin_lock_irq(&l3->list_lock);
+			ret += __node_shrink(cachep, i);
+			spin_unlock_irq(&l3->list_lock);
+		}
+	}
+	return (ret ? 1 : 0);
+}
+
 /**
  * kmem_cache_shrink - Shrink a cache.
  * @cachep: The cache to shrink.
@@ -1644,6 +1997,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
 int kmem_cache_destroy(kmem_cache_t * cachep)
 {
 	int i;
+	struct kmem_list3 *l3;
 
 	if (!cachep || in_interrupt())
 		BUG();
@@ -1671,15 +2025,17 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
 		synchronize_rcu();
 
-	/* no cpu_online check required here since we clear the percpu
-	 * array on cpu offline and set this to NULL.
-	 */
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_online_cpu(i)
 		kfree(cachep->array[i]);
 
 	/* NUMA: free the list3 structures */
-	kfree(cachep->lists.shared);
-	cachep->lists.shared = NULL;
+	for_each_online_node(i) {
+		if ((l3 = cachep->nodelists[i])) {
+			kfree(l3->shared);
+			free_alien_cache(l3->alien);
+			kfree(l3);
+		}
+	}
 	kmem_cache_free(&cache_cache, cachep);
 
 	unlock_cpu_hotplug();
@@ -1689,8 +2045,8 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
 EXPORT_SYMBOL(kmem_cache_destroy);
 
 /* Get the memory for a slab management obj. */
-static struct slab* alloc_slabmgmt(kmem_cache_t *cachep,
-			void *objp, int colour_off, unsigned int __nocast local_flags)
+static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
+			int colour_off, unsigned int __nocast local_flags)
 {
 	struct slab *slabp;
 	
@@ -1721,7 +2077,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
 	int i;
 
 	for (i = 0; i < cachep->num; i++) {
-		void* objp = slabp->s_mem+cachep->objsize*i;
+		void *objp = slabp->s_mem+cachep->objsize*i;
 #if DEBUG
 		/* need to poison the objs? */
 		if (cachep->flags & SLAB_POISON)
@@ -1798,6 +2154,7 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
 	size_t		 offset;
 	unsigned int	 local_flags;
 	unsigned long	 ctor_flags;
+	struct kmem_list3 *l3;
 
 	/* Be lazy and only check for valid flags here,
  	 * keeping it out of the critical path in kmem_cache_alloc().
@@ -1829,6 +2186,7 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
 
 	spin_unlock(&cachep->spinlock);
 
+	check_irq_off();
 	if (local_flags & __GFP_WAIT)
 		local_irq_enable();
 
@@ -1840,8 +2198,9 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
 	 */
 	kmem_flagcheck(cachep, flags);
 
-
-	/* Get mem for the objs. */
+	/* Get mem for the objs.
+	 * Attempt to allocate a physical page from 'nodeid',
+	 */
 	if (!(objp = kmem_getpages(cachep, flags, nodeid)))
 		goto failed;
 
@@ -1849,6 +2208,7 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
 	if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
 		goto opps1;
 
+	slabp->nodeid = nodeid;
 	set_slab_attr(cachep, slabp, objp);
 
 	cache_init_objs(cachep, slabp, ctor_flags);
@@ -1856,13 +2216,14 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	check_irq_off();
-	spin_lock(&cachep->spinlock);
+	l3 = cachep->nodelists[nodeid];
+	spin_lock(&l3->list_lock);
 
 	/* Make slab active. */
-	list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
+	list_add_tail(&slabp->list, &(l3->slabs_free));
 	STATS_INC_GROWN(cachep);
-	list3_data(cachep)->free_objects += cachep->num;
-	spin_unlock(&cachep->spinlock);
+	l3->free_objects += cachep->num;
+	spin_unlock(&l3->list_lock);
 	return 1;
 opps1:
 	kmem_freepages(cachep, objp);
@@ -1968,7 +2329,6 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
 	kmem_bufctl_t i;
 	int entries = 0;
 	
-	check_spinlock_acquired(cachep);
 	/* Check slab's freelist to see if this obj is there. */
 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
 		entries++;
@@ -2011,10 +2371,11 @@ retry:
 		 */
 		batchcount = BATCHREFILL_LIMIT;
 	}
-	l3 = list3_data(cachep);
+	l3 = cachep->nodelists[numa_node_id()];
+
+	BUG_ON(ac->avail > 0 || !l3);
+	spin_lock(&l3->list_lock);
 
-	BUG_ON(ac->avail > 0);
-	spin_lock(&cachep->spinlock);
 	if (l3->shared) {
 		struct array_cache *shared_array = l3->shared;
 		if (shared_array->avail) {
@@ -2022,8 +2383,9 @@ retry:
 				batchcount = shared_array->avail;
 			shared_array->avail -= batchcount;
 			ac->avail = batchcount;
-			memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
-					sizeof(void*)*batchcount);
+			memcpy(ac->entry,
+				&(shared_array->entry[shared_array->avail]),
+				sizeof(void*)*batchcount);
 			shared_array->touched = 1;
 			goto alloc_done;
 		}
@@ -2050,7 +2412,8 @@ retry:
 			STATS_SET_HIGH(cachep);
 
 			/* get obj pointer */
-			ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;
+			ac->entry[ac->avail++] = slabp->s_mem +
+				slabp->free*cachep->objsize;
 
 			slabp->inuse++;
 			next = slab_bufctl(slabp)[slabp->free];
@@ -2072,12 +2435,12 @@ retry:
 must_grow:
 	l3->free_objects -= ac->avail;
 alloc_done:
-	spin_unlock(&cachep->spinlock);
+	spin_unlock(&l3->list_lock);
 
 	if (unlikely(!ac->avail)) {
 		int x;
-		x = cache_grow(cachep, flags, -1);
-		
+		x = cache_grow(cachep, flags, numa_node_id());
+
 		// cache_grow can reenable interrupts, then ac could change.
 		ac = ac_data(cachep);
 		if (!x && ac->avail == 0)	// no objects in sight? abort
@@ -2087,7 +2450,7 @@ alloc_done:
 			goto retry;
 	}
 	ac->touched = 1;
-	return ac_entry(ac)[--ac->avail];
+	return ac->entry[--ac->avail];
 }
 
 static inline void
@@ -2159,43 +2522,116 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast fl
 	if (likely(ac->avail)) {
 		STATS_INC_ALLOCHIT(cachep);
 		ac->touched = 1;
-		objp = ac_entry(ac)[--ac->avail];
+		objp = ac->entry[--ac->avail];
 	} else {
 		STATS_INC_ALLOCMISS(cachep);
 		objp = cache_alloc_refill(cachep, flags);
 	}
 	local_irq_restore(save_flags);
-	objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0));
+	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
+					__builtin_return_address(0));
+	prefetchw(objp);
 	return objp;
 }
 
-/* 
- * NUMA: different approach needed if the spinlock is moved into
- * the l3 structure
+#ifdef CONFIG_NUMA
+/*
+ * A interface to enable slab creation on nodeid
  */
+static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
+{
+	struct list_head *entry;
+ 	struct slab *slabp;
+ 	struct kmem_list3 *l3;
+ 	void *obj;
+ 	kmem_bufctl_t next;
+ 	int x;
 
+ 	l3 = cachep->nodelists[nodeid];
+ 	BUG_ON(!l3);
+
+retry:
+ 	spin_lock(&l3->list_lock);
+ 	entry = l3->slabs_partial.next;
+ 	if (entry == &l3->slabs_partial) {
+ 		l3->free_touched = 1;
+ 		entry = l3->slabs_free.next;
+ 		if (entry == &l3->slabs_free)
+ 			goto must_grow;
+ 	}
+
+ 	slabp = list_entry(entry, struct slab, list);
+ 	check_spinlock_acquired_node(cachep, nodeid);
+ 	check_slabp(cachep, slabp);
+
+ 	STATS_INC_NODEALLOCS(cachep);
+ 	STATS_INC_ACTIVE(cachep);
+ 	STATS_SET_HIGH(cachep);
+
+ 	BUG_ON(slabp->inuse == cachep->num);
+
+ 	/* get obj pointer */
+ 	obj =  slabp->s_mem + slabp->free*cachep->objsize;
+ 	slabp->inuse++;
+ 	next = slab_bufctl(slabp)[slabp->free];
+#if DEBUG
+ 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+#endif
+ 	slabp->free = next;
+ 	check_slabp(cachep, slabp);
+ 	l3->free_objects--;
+ 	/* move slabp to correct slabp list: */
+ 	list_del(&slabp->list);
+
+ 	if (slabp->free == BUFCTL_END) {
+ 		list_add(&slabp->list, &l3->slabs_full);
+ 	} else {
+ 		list_add(&slabp->list, &l3->slabs_partial);
+ 	}
+
+ 	spin_unlock(&l3->list_lock);
+ 	goto done;
+
+must_grow:
+ 	spin_unlock(&l3->list_lock);
+ 	x = cache_grow(cachep, flags, nodeid);
+
+ 	if (!x)
+ 		return NULL;
+
+ 	goto retry;
+done:
+ 	return obj;
+}
+#endif
+
+/*
+ * Caller needs to acquire correct kmem_list's list_lock
+ */
 static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
 {
 	int i;
-
-	check_spinlock_acquired(cachep);
-
-	/* NUMA: move add into loop */
-	cachep->lists.free_objects += nr_objects;
+	struct kmem_list3 *l3;
 
 	for (i = 0; i < nr_objects; i++) {
 		void *objp = objpp[i];
 		struct slab *slabp;
 		unsigned int objnr;
+		int nodeid = 0;
 
 		slabp = GET_PAGE_SLAB(virt_to_page(objp));
+		nodeid = slabp->nodeid;
+		l3 = cachep->nodelists[nodeid];
 		list_del(&slabp->list);
 		objnr = (objp - slabp->s_mem) / cachep->objsize;
+		check_spinlock_acquired_node(cachep, nodeid);
 		check_slabp(cachep, slabp);
+
+
 #if DEBUG
 		if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
-			printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n",
-						cachep->name, objp);
+			printk(KERN_ERR "slab: double free detected in cache "
+					"'%s', objp %p\n", cachep->name, objp);
 			BUG();
 		}
 #endif
@@ -2203,24 +2639,23 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
 		slabp->free = objnr;
 		STATS_DEC_ACTIVE(cachep);
 		slabp->inuse--;
+		l3->free_objects++;
 		check_slabp(cachep, slabp);
 
 		/* fixup slab chains */
 		if (slabp->inuse == 0) {
-			if (cachep->lists.free_objects > cachep->free_limit) {
-				cachep->lists.free_objects -= cachep->num;
+			if (l3->free_objects > l3->free_limit) {
+				l3->free_objects -= cachep->num;
 				slab_destroy(cachep, slabp);
 			} else {
-				list_add(&slabp->list,
-				&list3_data_ptr(cachep, objp)->slabs_free);
+				list_add(&slabp->list, &l3->slabs_free);
 			}
 		} else {
 			/* Unconditionally move a slab to the end of the
 			 * partial list on free - maximum time for the
 			 * other objects to be freed, too.
 			 */
-			list_add_tail(&slabp->list,
-				&list3_data_ptr(cachep, objp)->slabs_partial);
+			list_add_tail(&slabp->list, &l3->slabs_partial);
 		}
 	}
 }
@@ -2228,36 +2663,38 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
 static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
 {
 	int batchcount;
+	struct kmem_list3 *l3;
 
 	batchcount = ac->batchcount;
 #if DEBUG
 	BUG_ON(!batchcount || batchcount > ac->avail);
 #endif
 	check_irq_off();
-	spin_lock(&cachep->spinlock);
-	if (cachep->lists.shared) {
-		struct array_cache *shared_array = cachep->lists.shared;
+	l3 = cachep->nodelists[numa_node_id()];
+	spin_lock(&l3->list_lock);
+	if (l3->shared) {
+		struct array_cache *shared_array = l3->shared;
 		int max = shared_array->limit-shared_array->avail;
 		if (max) {
 			if (batchcount > max)
 				batchcount = max;
-			memcpy(&ac_entry(shared_array)[shared_array->avail],
-					&ac_entry(ac)[0],
+			memcpy(&(shared_array->entry[shared_array->avail]),
+					ac->entry,
 					sizeof(void*)*batchcount);
 			shared_array->avail += batchcount;
 			goto free_done;
 		}
 	}
 
-	free_block(cachep, &ac_entry(ac)[0], batchcount);
+	free_block(cachep, ac->entry, batchcount);
 free_done:
 #if STATS
 	{
 		int i = 0;
 		struct list_head *p;
 
-		p = list3_data(cachep)->slabs_free.next;
-		while (p != &(list3_data(cachep)->slabs_free)) {
+		p = l3->slabs_free.next;
+		while (p != &(l3->slabs_free)) {
 			struct slab *slabp;
 
 			slabp = list_entry(p, struct slab, list);
@@ -2269,12 +2706,13 @@ free_done:
 		STATS_SET_FREEABLE(cachep, i);
 	}
 #endif
-	spin_unlock(&cachep->spinlock);
+	spin_unlock(&l3->list_lock);
 	ac->avail -= batchcount;
-	memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
+	memmove(ac->entry, &(ac->entry[batchcount]),
 			sizeof(void*)*ac->avail);
 }
 
+
 /*
  * __cache_free
  * Release an obj back to its cache. If the obj has a constructed
@@ -2289,14 +2727,46 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
 	check_irq_off();
 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 
+	/* Make sure we are not freeing a object from another
+	 * node to the array cache on this cpu.
+	 */
+#ifdef CONFIG_NUMA
+	{
+		struct slab *slabp;
+		slabp = GET_PAGE_SLAB(virt_to_page(objp));
+		if (unlikely(slabp->nodeid != numa_node_id())) {
+			struct array_cache *alien = NULL;
+			int nodeid = slabp->nodeid;
+			struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()];
+
+			STATS_INC_NODEFREES(cachep);
+			if (l3->alien && l3->alien[nodeid]) {
+				alien = l3->alien[nodeid];
+				spin_lock(&alien->lock);
+				if (unlikely(alien->avail == alien->limit))
+					__drain_alien_cache(cachep,
+							alien, nodeid);
+				alien->entry[alien->avail++] = objp;
+				spin_unlock(&alien->lock);
+			} else {
+				spin_lock(&(cachep->nodelists[nodeid])->
+						list_lock);
+				free_block(cachep, &objp, 1);
+				spin_unlock(&(cachep->nodelists[nodeid])->
+						list_lock);
+			}
+			return;
+		}
+	}
+#endif
 	if (likely(ac->avail < ac->limit)) {
 		STATS_INC_FREEHIT(cachep);
-		ac_entry(ac)[ac->avail++] = objp;
+		ac->entry[ac->avail++] = objp;
 		return;
 	} else {
 		STATS_INC_FREEMISS(cachep);
 		cache_flusharray(cachep, ac);
-		ac_entry(ac)[ac->avail++] = objp;
+		ac->entry[ac->avail++] = objp;
 	}
 }
 
@@ -2366,81 +2836,30 @@ out:
  * Identical to kmem_cache_alloc, except that this function is slow
  * and can sleep. And it will allocate memory on the given node, which
  * can improve the performance for cpu bound structures.
+ * New and improved: it will now make sure that the object gets
+ * put on the correct node list so that there is no false sharing.
  */
-void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
+void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
 {
-	int loop;
-	void *objp;
-	struct slab *slabp;
-	kmem_bufctl_t next;
-
-	if (nodeid == -1)
-		return kmem_cache_alloc(cachep, flags);
-
-	for (loop = 0;;loop++) {
-		struct list_head *q;
-
-		objp = NULL;
-		check_irq_on();
-		spin_lock_irq(&cachep->spinlock);
-		/* walk through all partial and empty slab and find one
-		 * from the right node */
-		list_for_each(q,&cachep->lists.slabs_partial) {
-			slabp = list_entry(q, struct slab, list);
-
-			if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
-					loop > 2)
-				goto got_slabp;
-		}
-		list_for_each(q, &cachep->lists.slabs_free) {
-			slabp = list_entry(q, struct slab, list);
+	unsigned long save_flags;
+	void *ptr;
 
-			if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
-					loop > 2)
-				goto got_slabp;
-		}
-		spin_unlock_irq(&cachep->spinlock);
+	if (nodeid == numa_node_id() || nodeid == -1)
+		return __cache_alloc(cachep, flags);
 
-		local_irq_disable();
-		if (!cache_grow(cachep, flags, nodeid)) {
-			local_irq_enable();
-			return NULL;
-		}
-		local_irq_enable();
+	if (unlikely(!cachep->nodelists[nodeid])) {
+		/* Fall back to __cache_alloc if we run into trouble */
+		printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name);
+		return __cache_alloc(cachep,flags);
 	}
-got_slabp:
-	/* found one: allocate object */
-	check_slabp(cachep, slabp);
-	check_spinlock_acquired(cachep);
-
-	STATS_INC_ALLOCED(cachep);
-	STATS_INC_ACTIVE(cachep);
-	STATS_SET_HIGH(cachep);
-	STATS_INC_NODEALLOCS(cachep);
-
-	objp = slabp->s_mem + slabp->free*cachep->objsize;
-
-	slabp->inuse++;
-	next = slab_bufctl(slabp)[slabp->free];
-#if DEBUG
-	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
-#endif
-	slabp->free = next;
-	check_slabp(cachep, slabp);
 
-	/* move slabp to correct slabp list: */
-	list_del(&slabp->list);
-	if (slabp->free == BUFCTL_END)
-		list_add(&slabp->list, &cachep->lists.slabs_full);
-	else
-		list_add(&slabp->list, &cachep->lists.slabs_partial);
-
-	list3_data(cachep)->free_objects--;
-	spin_unlock_irq(&cachep->spinlock);
+	cache_alloc_debugcheck_before(cachep, flags);
+	local_irq_save(save_flags);
+	ptr = __cache_alloc_node(cachep, flags, nodeid);
+	local_irq_restore(save_flags);
+	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));
 
-	objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
-					__builtin_return_address(0));
-	return objp;
+	return ptr;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
@@ -2510,11 +2929,18 @@ void *__alloc_percpu(size_t size, size_t align)
 	if (!pdata)
 		return NULL;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
-		pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL,
-						cpu_to_node(i));
+	/*
+	 * Cannot use for_each_online_cpu since a cpu may come online
+	 * and we have no way of figuring out how to fix the array
+	 * that we have allocated then....
+	 */
+	for_each_cpu(i) {
+		int node = cpu_to_node(i);
+
+		if (node_online(node))
+			pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
+		else
+			pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
 
 		if (!pdata->ptrs[i])
 			goto unwind_oom;
@@ -2555,29 +2981,25 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp)
 EXPORT_SYMBOL(kmem_cache_free);
 
 /**
- * kcalloc - allocate memory for an array. The memory is set to zero.
- * @n: number of elements.
- * @size: element size.
+ * kzalloc - allocate memory. The memory is set to zero.
+ * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
  */
-void *kcalloc(size_t n, size_t size, unsigned int __nocast flags)
+void *kzalloc(size_t size, unsigned int __nocast flags)
 {
-	void *ret = NULL;
-
-	if (n != 0 && size > INT_MAX / n)
-		return ret;
-
-	ret = kmalloc(n * size, flags);
+	void *ret = kmalloc(size, flags);
 	if (ret)
-		memset(ret, 0, n * size);
+		memset(ret, 0, size);
 	return ret;
 }
-EXPORT_SYMBOL(kcalloc);
+EXPORT_SYMBOL(kzalloc);
 
 /**
  * kfree - free previously allocated memory
  * @objp: pointer returned by kmalloc.
  *
+ * If @objp is NULL, no operation is performed.
+ *
  * Don't free memory not originally allocated by kmalloc()
  * or you will run into trouble.
  */
@@ -2610,11 +3032,11 @@ free_percpu(const void *objp)
 	int i;
 	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_possible(i))
-			continue;
+	/*
+	 * We allocate for all cpus so we cannot use for online cpu here.
+	 */
+	for_each_cpu(i)
 		kfree(p->ptrs[i]);
-	}
 	kfree(p);
 }
 EXPORT_SYMBOL(free_percpu);
@@ -2632,6 +3054,64 @@ const char *kmem_cache_name(kmem_cache_t *cachep)
 }
 EXPORT_SYMBOL_GPL(kmem_cache_name);
 
+/*
+ * This initializes kmem_list3 for all nodes.
+ */
+static int alloc_kmemlist(kmem_cache_t *cachep)
+{
+	int node;
+	struct kmem_list3 *l3;
+	int err = 0;
+
+	for_each_online_node(node) {
+		struct array_cache *nc = NULL, *new;
+		struct array_cache **new_alien = NULL;
+#ifdef CONFIG_NUMA
+		if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
+			goto fail;
+#endif
+		if (!(new = alloc_arraycache(node, (cachep->shared*
+				cachep->batchcount), 0xbaadf00d)))
+			goto fail;
+		if ((l3 = cachep->nodelists[node])) {
+
+			spin_lock_irq(&l3->list_lock);
+
+			if ((nc = cachep->nodelists[node]->shared))
+				free_block(cachep, nc->entry,
+							nc->avail);
+
+			l3->shared = new;
+			if (!cachep->nodelists[node]->alien) {
+				l3->alien = new_alien;
+				new_alien = NULL;
+			}
+			l3->free_limit = (1 + nr_cpus_node(node))*
+				cachep->batchcount + cachep->num;
+			spin_unlock_irq(&l3->list_lock);
+			kfree(nc);
+			free_alien_cache(new_alien);
+			continue;
+		}
+		if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
+						GFP_KERNEL, node)))
+			goto fail;
+
+		kmem_list3_init(l3);
+		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+		l3->shared = new;
+		l3->alien = new_alien;
+		l3->free_limit = (1 + nr_cpus_node(node))*
+			cachep->batchcount + cachep->num;
+		cachep->nodelists[node] = l3;
+	}
+	return err;
+fail:
+	err = -ENOMEM;
+	return err;
+}
+
 struct ccupdate_struct {
 	kmem_cache_t *cachep;
 	struct array_cache *new[NR_CPUS];
@@ -2644,7 +3124,7 @@ static void do_ccupdate_local(void *info)
 
 	check_irq_off();
 	old = ac_data(new->cachep);
-	
+
 	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
 	new->new[smp_processor_id()] = old;
 }
@@ -2654,54 +3134,43 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
 				int shared)
 {
 	struct ccupdate_struct new;
-	struct array_cache *new_shared;
-	int i;
+	int i, err;
 
 	memset(&new.new,0,sizeof(new.new));
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_online(i)) {
-			new.new[i] = alloc_arraycache(i, limit, batchcount);
-			if (!new.new[i]) {
-				for (i--; i >= 0; i--) kfree(new.new[i]);
-				return -ENOMEM;
-			}
-		} else {
-			new.new[i] = NULL;
+	for_each_online_cpu(i) {
+		new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount);
+		if (!new.new[i]) {
+			for (i--; i >= 0; i--) kfree(new.new[i]);
+			return -ENOMEM;
 		}
 	}
 	new.cachep = cachep;
 
 	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
-	
+
 	check_irq_on();
 	spin_lock_irq(&cachep->spinlock);
 	cachep->batchcount = batchcount;
 	cachep->limit = limit;
-	cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
+	cachep->shared = shared;
 	spin_unlock_irq(&cachep->spinlock);
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_online_cpu(i) {
 		struct array_cache *ccold = new.new[i];
 		if (!ccold)
 			continue;
-		spin_lock_irq(&cachep->spinlock);
-		free_block(cachep, ac_entry(ccold), ccold->avail);
-		spin_unlock_irq(&cachep->spinlock);
+		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+		free_block(cachep, ccold->entry, ccold->avail);
+		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
 		kfree(ccold);
 	}
-	new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
-	if (new_shared) {
-		struct array_cache *old;
 
-		spin_lock_irq(&cachep->spinlock);
-		old = cachep->lists.shared;
-		cachep->lists.shared = new_shared;
-		if (old)
-			free_block(cachep, ac_entry(old), old->avail);
-		spin_unlock_irq(&cachep->spinlock);
-		kfree(old);
+	err = alloc_kmemlist(cachep);
+	if (err) {
+		printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
+				cachep->name, -err);
+		BUG();
 	}
-
 	return 0;
 }
 
@@ -2759,11 +3228,11 @@ static void enable_cpucache(kmem_cache_t *cachep)
 }
 
 static void drain_array_locked(kmem_cache_t *cachep,
-				struct array_cache *ac, int force)
+				struct array_cache *ac, int force, int node)
 {
 	int tofree;
 
-	check_spinlock_acquired(cachep);
+	check_spinlock_acquired_node(cachep, node);
 	if (ac->touched && !force) {
 		ac->touched = 0;
 	} else if (ac->avail) {
@@ -2771,9 +3240,9 @@ static void drain_array_locked(kmem_cache_t *cachep,
 		if (tofree > ac->avail) {
 			tofree = (ac->avail+1)/2;
 		}
-		free_block(cachep, ac_entry(ac), tofree);
+		free_block(cachep, ac->entry, tofree);
 		ac->avail -= tofree;
-		memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
+		memmove(ac->entry, &(ac->entry[tofree]),
 					sizeof(void*)*ac->avail);
 	}
 }
@@ -2792,6 +3261,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
 static void cache_reap(void *unused)
 {
 	struct list_head *walk;
+	struct kmem_list3 *l3;
 
 	if (down_trylock(&cache_chain_sem)) {
 		/* Give up. Setup the next iteration. */
@@ -2812,27 +3282,32 @@ static void cache_reap(void *unused)
 
 		check_irq_on();
 
-		spin_lock_irq(&searchp->spinlock);
+		l3 = searchp->nodelists[numa_node_id()];
+		if (l3->alien)
+			drain_alien_cache(searchp, l3);
+		spin_lock_irq(&l3->list_lock);
 
-		drain_array_locked(searchp, ac_data(searchp), 0);
+		drain_array_locked(searchp, ac_data(searchp), 0,
+				numa_node_id());
 
-		if(time_after(searchp->lists.next_reap, jiffies))
+		if (time_after(l3->next_reap, jiffies))
 			goto next_unlock;
 
-		searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
+		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
 
-		if (searchp->lists.shared)
-			drain_array_locked(searchp, searchp->lists.shared, 0);
+		if (l3->shared)
+			drain_array_locked(searchp, l3->shared, 0,
+				numa_node_id());
 
-		if (searchp->lists.free_touched) {
-			searchp->lists.free_touched = 0;
+		if (l3->free_touched) {
+			l3->free_touched = 0;
 			goto next_unlock;
 		}
 
-		tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
+		tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num);
 		do {
-			p = list3_data(searchp)->slabs_free.next;
-			if (p == &(list3_data(searchp)->slabs_free))
+			p = l3->slabs_free.next;
+			if (p == &(l3->slabs_free))
 				break;
 
 			slabp = list_entry(p, struct slab, list);
@@ -2845,13 +3320,13 @@ static void cache_reap(void *unused)
 			 * searchp cannot disappear, we hold
 			 * cache_chain_lock
 			 */
-			searchp->lists.free_objects -= searchp->num;
-			spin_unlock_irq(&searchp->spinlock);
+			l3->free_objects -= searchp->num;
+			spin_unlock_irq(&l3->list_lock);
 			slab_destroy(searchp, slabp);
-			spin_lock_irq(&searchp->spinlock);
+			spin_lock_irq(&l3->list_lock);
 		} while(--tofree > 0);
 next_unlock:
-		spin_unlock_irq(&searchp->spinlock);
+		spin_unlock_irq(&l3->list_lock);
 next:
 		cond_resched();
 	}
@@ -2885,7 +3360,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
 #if STATS
 		seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
-				" <error> <maxfreeable> <freelimit> <nodeallocs>");
+				" <error> <maxfreeable> <nodeallocs> <remotefrees>");
 		seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
 #endif
 		seq_putc(m, '\n');
@@ -2920,39 +3395,53 @@ static int s_show(struct seq_file *m, void *p)
 	unsigned long	active_objs;
 	unsigned long	num_objs;
 	unsigned long	active_slabs = 0;
-	unsigned long	num_slabs;
-	const char *name; 
+	unsigned long	num_slabs, free_objects = 0, shared_avail = 0;
+	const char *name;
 	char *error = NULL;
+	int node;
+	struct kmem_list3 *l3;
 
 	check_irq_on();
 	spin_lock_irq(&cachep->spinlock);
 	active_objs = 0;
 	num_slabs = 0;
-	list_for_each(q,&cachep->lists.slabs_full) {
-		slabp = list_entry(q, struct slab, list);
-		if (slabp->inuse != cachep->num && !error)
-			error = "slabs_full accounting error";
-		active_objs += cachep->num;
-		active_slabs++;
-	}
-	list_for_each(q,&cachep->lists.slabs_partial) {
-		slabp = list_entry(q, struct slab, list);
-		if (slabp->inuse == cachep->num && !error)
-			error = "slabs_partial inuse accounting error";
-		if (!slabp->inuse && !error)
-			error = "slabs_partial/inuse accounting error";
-		active_objs += slabp->inuse;
-		active_slabs++;
-	}
-	list_for_each(q,&cachep->lists.slabs_free) {
-		slabp = list_entry(q, struct slab, list);
-		if (slabp->inuse && !error)
-			error = "slabs_free/inuse accounting error";
-		num_slabs++;
+	for_each_online_node(node) {
+		l3 = cachep->nodelists[node];
+		if (!l3)
+			continue;
+
+		spin_lock(&l3->list_lock);
+
+		list_for_each(q,&l3->slabs_full) {
+			slabp = list_entry(q, struct slab, list);
+			if (slabp->inuse != cachep->num && !error)
+				error = "slabs_full accounting error";
+			active_objs += cachep->num;
+			active_slabs++;
+		}
+		list_for_each(q,&l3->slabs_partial) {
+			slabp = list_entry(q, struct slab, list);
+			if (slabp->inuse == cachep->num && !error)
+				error = "slabs_partial inuse accounting error";
+			if (!slabp->inuse && !error)
+				error = "slabs_partial/inuse accounting error";
+			active_objs += slabp->inuse;
+			active_slabs++;
+		}
+		list_for_each(q,&l3->slabs_free) {
+			slabp = list_entry(q, struct slab, list);
+			if (slabp->inuse && !error)
+				error = "slabs_free/inuse accounting error";
+			num_slabs++;
+		}
+		free_objects += l3->free_objects;
+		shared_avail += l3->shared->avail;
+
+		spin_unlock(&l3->list_lock);
 	}
 	num_slabs+=active_slabs;
 	num_objs = num_slabs*cachep->num;
-	if (num_objs - active_objs != cachep->lists.free_objects && !error)
+	if (num_objs - active_objs != free_objects && !error)
 		error = "free_objects accounting error";
 
 	name = cachep->name; 
@@ -2964,9 +3453,9 @@ static int s_show(struct seq_file *m, void *p)
 		cachep->num, (1<<cachep->gfporder));
 	seq_printf(m, " : tunables %4u %4u %4u",
 			cachep->limit, cachep->batchcount,
-			cachep->lists.shared->limit/cachep->batchcount);
-	seq_printf(m, " : slabdata %6lu %6lu %6u",
-			active_slabs, num_slabs, cachep->lists.shared->avail);
+			cachep->shared);
+	seq_printf(m, " : slabdata %6lu %6lu %6lu",
+			active_slabs, num_slabs, shared_avail);
 #if STATS
 	{	/* list3 stats */
 		unsigned long high = cachep->high_mark;
@@ -2975,12 +3464,13 @@ static int s_show(struct seq_file *m, void *p)
 		unsigned long reaped = cachep->reaped;
 		unsigned long errors = cachep->errors;
 		unsigned long max_freeable = cachep->max_freeable;
-		unsigned long free_limit = cachep->free_limit;
 		unsigned long node_allocs = cachep->node_allocs;
+		unsigned long node_frees = cachep->node_frees;
 
-		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
-				allocs, high, grown, reaped, errors, 
-				max_freeable, free_limit, node_allocs);
+		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
+				%4lu %4lu %4lu %4lu",
+				allocs, high, grown, reaped, errors,
+				max_freeable, node_allocs, node_frees);
 	}
 	/* cpu stats */
 	{
@@ -3059,9 +3549,10 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 			    batchcount < 1 ||
 			    batchcount > limit ||
 			    shared < 0) {
-				res = -EINVAL;
+				res = 0;
 			} else {
-				res = do_tune_cpucache(cachep, limit, batchcount, shared);
+				res = do_tune_cpucache(cachep, limit,
+							batchcount, shared);
 			}
 			break;
 		}
@@ -3073,20 +3564,24 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 }
 #endif
 
+/**
+ * ksize - get the actual amount of memory allocated for a given object
+ * @objp: Pointer to the object
+ *
+ * kmalloc may internally round up allocations and return more memory
+ * than requested. ksize() can be used to determine the actual amount of
+ * memory allocated. The caller may use this additional memory, even though
+ * a smaller amount of memory was initially specified with the kmalloc call.
+ * The caller must guarantee that objp points to a valid object previously
+ * allocated with either kmalloc() or kmem_cache_alloc(). The object
+ * must not be freed during the duration of the call.
+ */
 unsigned int ksize(const void *objp)
 {
-	kmem_cache_t *c;
-	unsigned long flags;
-	unsigned int size = 0;
-
-	if (likely(objp != NULL)) {
-		local_irq_save(flags);
-		c = GET_PAGE_CACHE(virt_to_page(objp));
-		size = kmem_cache_size(c);
-		local_irq_restore(flags);
-	}
+	if (unlikely(objp == NULL))
+		return 0;
 
-	return size;
+	return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp)));
 }
 
 
diff --git a/mm/sparse.c b/mm/sparse.c
index b54e304df4a..347249a4917 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -6,6 +6,7 @@
 #include <linux/mmzone.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
+#include <linux/spinlock.h>
 #include <asm/dma.h>
 
 /*
@@ -13,9 +14,64 @@
  *
  * 1) mem_section	- memory sections, mem_map's for valid memory
  */
-struct mem_section mem_section[NR_MEM_SECTIONS];
+#ifdef CONFIG_SPARSEMEM_EXTREME
+struct mem_section *mem_section[NR_SECTION_ROOTS]
+	____cacheline_maxaligned_in_smp;
+#else
+struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
+	____cacheline_maxaligned_in_smp;
+#endif
 EXPORT_SYMBOL(mem_section);
 
+#ifdef CONFIG_SPARSEMEM_EXTREME
+static struct mem_section *sparse_index_alloc(int nid)
+{
+	struct mem_section *section = NULL;
+	unsigned long array_size = SECTIONS_PER_ROOT *
+				   sizeof(struct mem_section);
+
+	section = alloc_bootmem_node(NODE_DATA(nid), array_size);
+
+	if (section)
+		memset(section, 0, array_size);
+
+	return section;
+}
+
+static int sparse_index_init(unsigned long section_nr, int nid)
+{
+	static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED;
+	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
+	struct mem_section *section;
+	int ret = 0;
+
+	if (mem_section[root])
+		return -EEXIST;
+
+	section = sparse_index_alloc(nid);
+	/*
+	 * This lock keeps two different sections from
+	 * reallocating for the same index
+	 */
+	spin_lock(&index_init_lock);
+
+	if (mem_section[root]) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	mem_section[root] = section;
+out:
+	spin_unlock(&index_init_lock);
+	return ret;
+}
+#else /* !SPARSEMEM_EXTREME */
+static inline int sparse_index_init(unsigned long section_nr, int nid)
+{
+	return 0;
+}
+#endif
+
 /* Record a memory area against a node. */
 void memory_present(int nid, unsigned long start, unsigned long end)
 {
@@ -24,8 +80,13 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 	start &= PAGE_SECTION_MASK;
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
 		unsigned long section = pfn_to_section_nr(pfn);
-		if (!mem_section[section].section_mem_map)
-			mem_section[section].section_mem_map = SECTION_MARKED_PRESENT;
+		struct mem_section *ms;
+
+		sparse_index_init(section, nid);
+
+		ms = __nr_to_section(section);
+		if (!ms->section_mem_map)
+			ms->section_mem_map = SECTION_MARKED_PRESENT;
 	}
 }
 
@@ -85,6 +146,7 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 {
 	struct page *map;
 	int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
+	struct mem_section *ms = __nr_to_section(pnum);
 
 	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
 	if (map)
@@ -96,7 +158,7 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
 		return map;
 
 	printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
-	mem_section[pnum].section_mem_map = 0;
+	ms->section_mem_map = 0;
 	return NULL;
 }
 
@@ -114,8 +176,9 @@ void sparse_init(void)
 			continue;
 
 		map = sparse_early_mem_map_alloc(pnum);
-		if (map)
-			sparse_init_one_section(&mem_section[pnum], pnum, map);
+		if (!map)
+			continue;
+		sparse_init_one_section(__nr_to_section(pnum), pnum, map);
 	}
 }
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4f251775ef9..adbc2b426c2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -67,8 +67,8 @@ void show_swap_cache_info(void)
  * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
  * but sets SwapCache flag and private instead of mapping and index.
  */
-static int __add_to_swap_cache(struct page *page,
-		swp_entry_t entry, int gfp_mask)
+static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
+			       unsigned int __nocast gfp_mask)
 {
 	int error;
 
@@ -124,6 +124,7 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!PageSwapCache(page));
 	BUG_ON(PageWriteback(page));
+	BUG_ON(PagePrivate(page));
 
 	radix_tree_delete(&swapper_space.page_tree, page->private);
 	page->private = 0;
@@ -196,11 +197,6 @@ void delete_from_swap_cache(struct page *page)
 {
 	swp_entry_t entry;
 
-	BUG_ON(!PageSwapCache(page));
-	BUG_ON(!PageLocked(page));
-	BUG_ON(PageWriteback(page));
-	BUG_ON(PagePrivate(page));
-  
 	entry.val = page->private;
 
 	write_lock_irq(&swapper_space.tree_lock);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 60cd24a5520..0184f510aac 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,7 +31,7 @@
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
 
-DEFINE_SPINLOCK(swaplock);
+DEFINE_SPINLOCK(swap_lock);
 unsigned int nr_swapfiles;
 long total_swap_pages;
 static int swap_overflow;
@@ -51,13 +51,11 @@ static DECLARE_MUTEX(swapon_sem);
 
 /*
  * We need this because the bdev->unplug_fn can sleep and we cannot
- * hold swap_list_lock while calling the unplug_fn. And swap_list_lock
+ * hold swap_lock while calling the unplug_fn. And swap_lock
  * cannot be turned into a semaphore.
  */
 static DECLARE_RWSEM(swap_unplug_sem);
 
-#define SWAPFILE_CLUSTER 256
-
 void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 {
 	swp_entry_t entry;
@@ -84,116 +82,135 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 	up_read(&swap_unplug_sem);
 }
 
-static inline int scan_swap_map(struct swap_info_struct *si)
+#define SWAPFILE_CLUSTER	256
+#define LATENCY_LIMIT		256
+
+static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 {
-	unsigned long offset;
+	unsigned long offset, last_in_cluster;
+	int latency_ration = LATENCY_LIMIT;
+
 	/* 
-	 * We try to cluster swap pages by allocating them
-	 * sequentially in swap.  Once we've allocated
-	 * SWAPFILE_CLUSTER pages this way, however, we resort to
-	 * first-free allocation, starting a new cluster.  This
-	 * prevents us from scattering swap pages all over the entire
-	 * swap partition, so that we reduce overall disk seek times
-	 * between swap pages.  -- sct */
-	if (si->cluster_nr) {
-		while (si->cluster_next <= si->highest_bit) {
-			offset = si->cluster_next++;
+	 * We try to cluster swap pages by allocating them sequentially
+	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
+	 * way, however, we resort to first-free allocation, starting
+	 * a new cluster.  This prevents us from scattering swap pages
+	 * all over the entire swap partition, so that we reduce
+	 * overall disk seek times between swap pages.  -- sct
+	 * But we do now try to find an empty cluster.  -Andrea
+	 */
+
+	si->flags += SWP_SCANNING;
+	if (unlikely(!si->cluster_nr)) {
+		si->cluster_nr = SWAPFILE_CLUSTER - 1;
+		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER)
+			goto lowest;
+		spin_unlock(&swap_lock);
+
+		offset = si->lowest_bit;
+		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
+
+		/* Locate the first empty (unaligned) cluster */
+		for (; last_in_cluster <= si->highest_bit; offset++) {
 			if (si->swap_map[offset])
-				continue;
-			si->cluster_nr--;
-			goto got_page;
-		}
-	}
-	si->cluster_nr = SWAPFILE_CLUSTER;
-
-	/* try to find an empty (even not aligned) cluster. */
-	offset = si->lowest_bit;
- check_next_cluster:
-	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
-	{
-		unsigned long nr;
-		for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
-			if (si->swap_map[nr])
-			{
-				offset = nr+1;
-				goto check_next_cluster;
+				last_in_cluster = offset + SWAPFILE_CLUSTER;
+			else if (offset == last_in_cluster) {
+				spin_lock(&swap_lock);
+				si->cluster_next = offset-SWAPFILE_CLUSTER-1;
+				goto cluster;
 			}
-		/* We found a completly empty cluster, so start
-		 * using it.
-		 */
-		goto got_page;
+			if (unlikely(--latency_ration < 0)) {
+				cond_resched();
+				latency_ration = LATENCY_LIMIT;
+			}
+		}
+		spin_lock(&swap_lock);
+		goto lowest;
 	}
-	/* No luck, so now go finegrined as usual. -Andrea */
-	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
-		if (si->swap_map[offset])
-			continue;
-		si->lowest_bit = offset+1;
-	got_page:
+
+	si->cluster_nr--;
+cluster:
+	offset = si->cluster_next;
+	if (offset > si->highest_bit)
+lowest:		offset = si->lowest_bit;
+checks:	if (!(si->flags & SWP_WRITEOK))
+		goto no_page;
+	if (!si->highest_bit)
+		goto no_page;
+	if (!si->swap_map[offset]) {
 		if (offset == si->lowest_bit)
 			si->lowest_bit++;
 		if (offset == si->highest_bit)
 			si->highest_bit--;
-		if (si->lowest_bit > si->highest_bit) {
+		si->inuse_pages++;
+		if (si->inuse_pages == si->pages) {
 			si->lowest_bit = si->max;
 			si->highest_bit = 0;
 		}
 		si->swap_map[offset] = 1;
-		si->inuse_pages++;
-		nr_swap_pages--;
-		si->cluster_next = offset+1;
+		si->cluster_next = offset + 1;
+		si->flags -= SWP_SCANNING;
 		return offset;
 	}
-	si->lowest_bit = si->max;
-	si->highest_bit = 0;
+
+	spin_unlock(&swap_lock);
+	while (++offset <= si->highest_bit) {
+		if (!si->swap_map[offset]) {
+			spin_lock(&swap_lock);
+			goto checks;
+		}
+		if (unlikely(--latency_ration < 0)) {
+			cond_resched();
+			latency_ration = LATENCY_LIMIT;
+		}
+	}
+	spin_lock(&swap_lock);
+	goto lowest;
+
+no_page:
+	si->flags -= SWP_SCANNING;
 	return 0;
 }
 
 swp_entry_t get_swap_page(void)
 {
-	struct swap_info_struct * p;
-	unsigned long offset;
-	swp_entry_t entry;
-	int type, wrapped = 0;
+	struct swap_info_struct *si;
+	pgoff_t offset;
+	int type, next;
+	int wrapped = 0;
 
-	entry.val = 0;	/* Out of memory */
-	swap_list_lock();
-	type = swap_list.next;
-	if (type < 0)
-		goto out;
+	spin_lock(&swap_lock);
 	if (nr_swap_pages <= 0)
-		goto out;
+		goto noswap;
+	nr_swap_pages--;
+
+	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
+		si = swap_info + type;
+		next = si->next;
+		if (next < 0 ||
+		    (!wrapped && si->prio != swap_info[next].prio)) {
+			next = swap_list.head;
+			wrapped++;
+		}
 
-	while (1) {
-		p = &swap_info[type];
-		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
-			swap_device_lock(p);
-			offset = scan_swap_map(p);
-			swap_device_unlock(p);
-			if (offset) {
-				entry = swp_entry(type,offset);
-				type = swap_info[type].next;
-				if (type < 0 ||
-					p->prio != swap_info[type].prio) {
-						swap_list.next = swap_list.head;
-				} else {
-					swap_list.next = type;
-				}
-				goto out;
-			}
+		if (!si->highest_bit)
+			continue;
+		if (!(si->flags & SWP_WRITEOK))
+			continue;
+
+		swap_list.next = next;
+		offset = scan_swap_map(si);
+		if (offset) {
+			spin_unlock(&swap_lock);
+			return swp_entry(type, offset);
 		}
-		type = p->next;
-		if (!wrapped) {
-			if (type < 0 || p->prio != swap_info[type].prio) {
-				type = swap_list.head;
-				wrapped = 1;
-			}
-		} else
-			if (type < 0)
-				goto out;	/* out of swap space */
+		next = swap_list.next;
 	}
-out:
-	swap_list_unlock();
-	return entry;
+
+	nr_swap_pages++;
+noswap:
+	spin_unlock(&swap_lock);
+	return (swp_entry_t) {0};
 }
 
 static struct swap_info_struct * swap_info_get(swp_entry_t entry)
@@ -214,10 +231,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry)
 		goto bad_offset;
 	if (!p->swap_map[offset])
 		goto bad_free;
-	swap_list_lock();
-	if (p->prio > swap_info[swap_list.next].prio)
-		swap_list.next = type;
-	swap_device_lock(p);
+	spin_lock(&swap_lock);
 	return p;
 
 bad_free:
@@ -235,12 +249,6 @@ out:
 	return NULL;
 }	
 
-static void swap_info_put(struct swap_info_struct * p)
-{
-	swap_device_unlock(p);
-	swap_list_unlock();
-}
-
 static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
 {
 	int count = p->swap_map[offset];
@@ -253,6 +261,8 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
 				p->lowest_bit = offset;
 			if (offset > p->highest_bit)
 				p->highest_bit = offset;
+			if (p->prio > swap_info[swap_list.next].prio)
+				swap_list.next = p - swap_info;
 			nr_swap_pages++;
 			p->inuse_pages--;
 		}
@@ -271,7 +281,7 @@ void swap_free(swp_entry_t entry)
 	p = swap_info_get(entry);
 	if (p) {
 		swap_entry_free(p, swp_offset(entry));
-		swap_info_put(p);
+		spin_unlock(&swap_lock);
 	}
 }
 
@@ -289,7 +299,7 @@ static inline int page_swapcount(struct page *page)
 	if (p) {
 		/* Subtract the 1 for the swap cache itself */
 		count = p->swap_map[swp_offset(entry)] - 1;
-		swap_info_put(p);
+		spin_unlock(&swap_lock);
 	}
 	return count;
 }
@@ -346,7 +356,7 @@ int remove_exclusive_swap_page(struct page *page)
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
 	}
-	swap_info_put(p);
+	spin_unlock(&swap_lock);
 
 	if (retval) {
 		swap_free(entry);
@@ -369,7 +379,7 @@ void free_swap_and_cache(swp_entry_t entry)
 	if (p) {
 		if (swap_entry_free(p, swp_offset(entry)) == 1)
 			page = find_trylock_page(&swapper_space, entry.val);
-		swap_info_put(p);
+		spin_unlock(&swap_lock);
 	}
 	if (page) {
 		int one_user;
@@ -531,17 +541,18 @@ static int unuse_mm(struct mm_struct *mm,
  * Scan swap_map from current position to next entry still in use.
  * Recycle to start on reaching the end, returning 0 when empty.
  */
-static int find_next_to_unuse(struct swap_info_struct *si, int prev)
+static unsigned int find_next_to_unuse(struct swap_info_struct *si,
+					unsigned int prev)
 {
-	int max = si->max;
-	int i = prev;
+	unsigned int max = si->max;
+	unsigned int i = prev;
 	int count;
 
 	/*
-	 * No need for swap_device_lock(si) here: we're just looking
+	 * No need for swap_lock here: we're just looking
 	 * for whether an entry is in use, not modifying it; false
 	 * hits are okay, and sys_swapoff() has already prevented new
-	 * allocations from this area (while holding swap_list_lock()).
+	 * allocations from this area (while holding swap_lock).
 	 */
 	for (;;) {
 		if (++i >= max) {
@@ -577,7 +588,7 @@ static int try_to_unuse(unsigned int type)
 	unsigned short swcount;
 	struct page *page;
 	swp_entry_t entry;
-	int i = 0;
+	unsigned int i = 0;
 	int retval = 0;
 	int reset_overflow = 0;
 	int shmem;
@@ -731,9 +742,9 @@ static int try_to_unuse(unsigned int type)
 		 * report them; but do report if we reset SWAP_MAP_MAX.
 		 */
 		if (*swap_map == SWAP_MAP_MAX) {
-			swap_device_lock(si);
+			spin_lock(&swap_lock);
 			*swap_map = 1;
-			swap_device_unlock(si);
+			spin_unlock(&swap_lock);
 			reset_overflow = 1;
 		}
 
@@ -797,9 +808,9 @@ static int try_to_unuse(unsigned int type)
 }
 
 /*
- * After a successful try_to_unuse, if no swap is now in use, we know we
- * can empty the mmlist.  swap_list_lock must be held on entry and exit.
- * Note that mmlist_lock nests inside swap_list_lock, and an mm must be
+ * After a successful try_to_unuse, if no swap is now in use, we know
+ * we can empty the mmlist.  swap_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_lock, and an mm must be
  * added to the mmlist just after page_duplicate - before would be racy.
  */
 static void drain_mmlist(void)
@@ -832,9 +843,9 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
 				offset < (se->start_page + se->nr_pages)) {
 			return se->start_block + (offset - se->start_page);
 		}
-		lh = se->list.prev;
+		lh = se->list.next;
 		if (lh == &sis->extent_list)
-			lh = lh->prev;
+			lh = lh->next;
 		se = list_entry(lh, struct swap_extent, list);
 		sis->curr_swap_extent = se;
 		BUG_ON(se == start_se);		/* It *must* be present */
@@ -854,15 +865,13 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
 		list_del(&se->list);
 		kfree(se);
 	}
-	sis->nr_extents = 0;
 }
 
 /*
  * Add a block range (and the corresponding page range) into this swapdev's
- * extent list.  The extent list is kept sorted in block order.
+ * extent list.  The extent list is kept sorted in page order.
  *
- * This function rather assumes that it is called in ascending sector_t order.
- * It doesn't look for extent coalescing opportunities.
+ * This function rather assumes that it is called in ascending page order.
  */
 static int
 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
@@ -872,16 +881,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 	struct swap_extent *new_se;
 	struct list_head *lh;
 
-	lh = sis->extent_list.next;	/* The highest-addressed block */
-	while (lh != &sis->extent_list) {
+	lh = sis->extent_list.prev;	/* The highest page extent */
+	if (lh != &sis->extent_list) {
 		se = list_entry(lh, struct swap_extent, list);
-		if (se->start_block + se->nr_pages == start_block &&
-		    se->start_page  + se->nr_pages == start_page) {
+		BUG_ON(se->start_page + se->nr_pages != start_page);
+		if (se->start_block + se->nr_pages == start_block) {
 			/* Merge it */
 			se->nr_pages += nr_pages;
 			return 0;
 		}
-		lh = lh->next;
 	}
 
 	/*
@@ -894,16 +902,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 	new_se->nr_pages = nr_pages;
 	new_se->start_block = start_block;
 
-	lh = sis->extent_list.prev;	/* The lowest block */
-	while (lh != &sis->extent_list) {
-		se = list_entry(lh, struct swap_extent, list);
-		if (se->start_block > start_block)
-			break;
-		lh = lh->prev;
-	}
-	list_add_tail(&new_se->list, lh);
-	sis->nr_extents++;
-	return 0;
+	list_add_tail(&new_se->list, &sis->extent_list);
+	return 1;
 }
 
 /*
@@ -926,7 +926,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
  * requirements, they are simply tossed out - we will never use those blocks
  * for swapping.
  *
- * For S_ISREG swapfiles we hold i_sem across the life of the swapon.  This
+ * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon.  This
  * prevents root from shooting her foot off by ftruncating an in-use swapfile,
  * which will scribble on the fs.
  *
@@ -937,7 +937,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
  * This is extremely effective.  The average number of iterations in
  * map_swap_page() has been measured at about 0.3 per page.  - akpm.
  */
-static int setup_swap_extents(struct swap_info_struct *sis)
+static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 {
 	struct inode *inode;
 	unsigned blocks_per_page;
@@ -945,11 +945,15 @@ static int setup_swap_extents(struct swap_info_struct *sis)
 	unsigned blkbits;
 	sector_t probe_block;
 	sector_t last_block;
+	sector_t lowest_block = -1;
+	sector_t highest_block = 0;
+	int nr_extents = 0;
 	int ret;
 
 	inode = sis->swap_file->f_mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		ret = add_swap_extent(sis, 0, sis->max, 0);
+		*span = sis->pages;
 		goto done;
 	}
 
@@ -994,22 +998,32 @@ static int setup_swap_extents(struct swap_info_struct *sis)
 			}
 		}
 
+		first_block >>= (PAGE_SHIFT - blkbits);
+		if (page_no) {	/* exclude the header page */
+			if (first_block < lowest_block)
+				lowest_block = first_block;
+			if (first_block > highest_block)
+				highest_block = first_block;
+		}
+
 		/*
 		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
 		 */
-		ret = add_swap_extent(sis, page_no, 1,
-				first_block >> (PAGE_SHIFT - blkbits));
-		if (ret)
+		ret = add_swap_extent(sis, page_no, 1, first_block);
+		if (ret < 0)
 			goto out;
+		nr_extents += ret;
 		page_no++;
 		probe_block += blocks_per_page;
 reprobe:
 		continue;
 	}
-	ret = 0;
+	ret = nr_extents;
+	*span = 1 + highest_block - lowest_block;
 	if (page_no == 0)
-		ret = -EINVAL;
+		page_no = 1;	/* force Empty message */
 	sis->max = page_no;
+	sis->pages = page_no - 1;
 	sis->highest_bit = page_no - 1;
 done:
 	sis->curr_swap_extent = list_entry(sis->extent_list.prev,
@@ -1069,7 +1083,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 
 	mapping = victim->f_mapping;
 	prev = -1;
-	swap_list_lock();
+	spin_lock(&swap_lock);
 	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
 		p = swap_info + type;
 		if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
@@ -1080,14 +1094,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	}
 	if (type < 0) {
 		err = -EINVAL;
-		swap_list_unlock();
+		spin_unlock(&swap_lock);
 		goto out_dput;
 	}
 	if (!security_vm_enough_memory(p->pages))
 		vm_unacct_memory(p->pages);
 	else {
 		err = -ENOMEM;
-		swap_list_unlock();
+		spin_unlock(&swap_lock);
 		goto out_dput;
 	}
 	if (prev < 0) {
@@ -1102,18 +1116,15 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	nr_swap_pages -= p->pages;
 	total_swap_pages -= p->pages;
 	p->flags &= ~SWP_WRITEOK;
-	swap_list_unlock();
+	spin_unlock(&swap_lock);
+
 	current->flags |= PF_SWAPOFF;
 	err = try_to_unuse(type);
 	current->flags &= ~PF_SWAPOFF;
 
-	/* wait for any unplug function to finish */
-	down_write(&swap_unplug_sem);
-	up_write(&swap_unplug_sem);
-
 	if (err) {
 		/* re-insert swap space back into swap_list */
-		swap_list_lock();
+		spin_lock(&swap_lock);
 		for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
 			if (p->prio >= swap_info[i].prio)
 				break;
@@ -1125,22 +1136,34 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 		nr_swap_pages += p->pages;
 		total_swap_pages += p->pages;
 		p->flags |= SWP_WRITEOK;
-		swap_list_unlock();
+		spin_unlock(&swap_lock);
 		goto out_dput;
 	}
+
+	/* wait for any unplug function to finish */
+	down_write(&swap_unplug_sem);
+	up_write(&swap_unplug_sem);
+
+	destroy_swap_extents(p);
 	down(&swapon_sem);
-	swap_list_lock();
+	spin_lock(&swap_lock);
 	drain_mmlist();
-	swap_device_lock(p);
+
+	/* wait for anyone still in scan_swap_map */
+	p->highest_bit = 0;		/* cuts scans short */
+	while (p->flags >= SWP_SCANNING) {
+		spin_unlock(&swap_lock);
+		schedule_timeout_uninterruptible(1);
+		spin_lock(&swap_lock);
+	}
+
 	swap_file = p->swap_file;
 	p->swap_file = NULL;
 	p->max = 0;
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
 	p->flags = 0;
-	destroy_swap_extents(p);
-	swap_device_unlock(p);
-	swap_list_unlock();
+	spin_unlock(&swap_lock);
 	up(&swapon_sem);
 	vfree(swap_map);
 	inode = mapping->host;
@@ -1213,7 +1236,7 @@ static int swap_show(struct seq_file *swap, void *v)
 
 	file = ptr->swap_file;
 	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
-	seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n",
+	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
 		       len < 40 ? 40 - len : 1, " ",
 		       S_ISBLK(file->f_dentry->d_inode->i_mode) ?
 				"partition" : "file\t",
@@ -1272,7 +1295,9 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	static int least_priority;
 	union swap_header *swap_header = NULL;
 	int swap_header_version;
-	int nr_good_pages = 0;
+	unsigned int nr_good_pages = 0;
+	int nr_extents = 0;
+	sector_t span;
 	unsigned long maxpages = 1;
 	int swapfilesize;
 	unsigned short *swap_map;
@@ -1282,7 +1307,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
-	swap_list_lock();
+	spin_lock(&swap_lock);
 	p = swap_info;
 	for (type = 0 ; type < nr_swapfiles ; type++,p++)
 		if (!(p->flags & SWP_USED))
@@ -1301,14 +1326,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	 * swp_entry_t or the architecture definition of a swap pte.
 	 */
 	if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
-		swap_list_unlock();
+		spin_unlock(&swap_lock);
 		goto out;
 	}
 	if (type >= nr_swapfiles)
 		nr_swapfiles = type+1;
 	INIT_LIST_HEAD(&p->extent_list);
 	p->flags = SWP_USED;
-	p->nr_extents = 0;
 	p->swap_file = NULL;
 	p->old_block_size = 0;
 	p->swap_map = NULL;
@@ -1316,7 +1340,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	p->highest_bit = 0;
 	p->cluster_nr = 0;
 	p->inuse_pages = 0;
-	spin_lock_init(&p->sdev_lock);
 	p->next = -1;
 	if (swap_flags & SWAP_FLAG_PREFER) {
 		p->prio =
@@ -1324,7 +1347,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	} else {
 		p->prio = --least_priority;
 	}
-	swap_list_unlock();
+	spin_unlock(&swap_lock);
 	name = getname(specialfile);
 	error = PTR_ERR(name);
 	if (IS_ERR(name)) {
@@ -1426,6 +1449,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		}
 
 		p->lowest_bit  = 1;
+		p->cluster_next = 1;
+
 		/*
 		 * Find out how many pages are allowed for a single swap
 		 * device. There are two limiting factors: 1) the number of
@@ -1446,6 +1471,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		p->highest_bit = maxpages - 1;
 
 		error = -EINVAL;
+		if (!maxpages)
+			goto bad_swap;
+		if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
+			goto bad_swap;
 		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
 			goto bad_swap;
 		
@@ -1470,35 +1499,40 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		if (error) 
 			goto bad_swap;
 	}
-	
+
 	if (swapfilesize && maxpages > swapfilesize) {
 		printk(KERN_WARNING
 		       "Swap area shorter than signature indicates\n");
 		error = -EINVAL;
 		goto bad_swap;
 	}
+	if (nr_good_pages) {
+		p->swap_map[0] = SWAP_MAP_BAD;
+		p->max = maxpages;
+		p->pages = nr_good_pages;
+		nr_extents = setup_swap_extents(p, &span);
+		if (nr_extents < 0) {
+			error = nr_extents;
+			goto bad_swap;
+		}
+		nr_good_pages = p->pages;
+	}
 	if (!nr_good_pages) {
 		printk(KERN_WARNING "Empty swap-file\n");
 		error = -EINVAL;
 		goto bad_swap;
 	}
-	p->swap_map[0] = SWAP_MAP_BAD;
-	p->max = maxpages;
-	p->pages = nr_good_pages;
-
-	error = setup_swap_extents(p);
-	if (error)
-		goto bad_swap;
 
 	down(&swapon_sem);
-	swap_list_lock();
-	swap_device_lock(p);
+	spin_lock(&swap_lock);
 	p->flags = SWP_ACTIVE;
 	nr_swap_pages += nr_good_pages;
 	total_swap_pages += nr_good_pages;
-	printk(KERN_INFO "Adding %dk swap on %s.  Priority:%d extents:%d\n",
-		nr_good_pages<<(PAGE_SHIFT-10), name,
-		p->prio, p->nr_extents);
+
+	printk(KERN_INFO "Adding %uk swap on %s.  "
+			"Priority:%d extents:%d across:%lluk\n",
+		nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
+		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10));
 
 	/* insert swap space into swap_list: */
 	prev = -1;
@@ -1514,8 +1548,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	} else {
 		swap_info[prev].next = p - swap_info;
 	}
-	swap_device_unlock(p);
-	swap_list_unlock();
+	spin_unlock(&swap_lock);
 	up(&swapon_sem);
 	error = 0;
 	goto out;
@@ -1524,16 +1557,16 @@ bad_swap:
 		set_blocksize(bdev, p->old_block_size);
 		bd_release(bdev);
 	}
+	destroy_swap_extents(p);
 bad_swap_2:
-	swap_list_lock();
+	spin_lock(&swap_lock);
 	swap_map = p->swap_map;
 	p->swap_file = NULL;
 	p->swap_map = NULL;
 	p->flags = 0;
 	if (!(swap_flags & SWAP_FLAG_PREFER))
 		++least_priority;
-	swap_list_unlock();
-	destroy_swap_extents(p);
+	spin_unlock(&swap_lock);
 	vfree(swap_map);
 	if (swap_file)
 		filp_close(swap_file, NULL);
@@ -1557,7 +1590,7 @@ void si_swapinfo(struct sysinfo *val)
 	unsigned int i;
 	unsigned long nr_to_be_unused = 0;
 
-	swap_list_lock();
+	spin_lock(&swap_lock);
 	for (i = 0; i < nr_swapfiles; i++) {
 		if (!(swap_info[i].flags & SWP_USED) ||
 		     (swap_info[i].flags & SWP_WRITEOK))
@@ -1566,7 +1599,7 @@ void si_swapinfo(struct sysinfo *val)
 	}
 	val->freeswap = nr_swap_pages + nr_to_be_unused;
 	val->totalswap = total_swap_pages + nr_to_be_unused;
-	swap_list_unlock();
+	spin_unlock(&swap_lock);
 }
 
 /*
@@ -1587,7 +1620,7 @@ int swap_duplicate(swp_entry_t entry)
 	p = type + swap_info;
 	offset = swp_offset(entry);
 
-	swap_device_lock(p);
+	spin_lock(&swap_lock);
 	if (offset < p->max && p->swap_map[offset]) {
 		if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
 			p->swap_map[offset]++;
@@ -1599,7 +1632,7 @@ int swap_duplicate(swp_entry_t entry)
 			result = 1;
 		}
 	}
-	swap_device_unlock(p);
+	spin_unlock(&swap_lock);
 out:
 	return result;
 
@@ -1615,7 +1648,7 @@ get_swap_info_struct(unsigned type)
 }
 
 /*
- * swap_device_lock prevents swap_map being freed. Don't grab an extra
+ * swap_lock prevents swap_map being freed. Don't grab an extra
  * reference on the swaphandle, it doesn't matter if it becomes unused.
  */
 int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
@@ -1631,7 +1664,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 		toff++, i--;
 	*offset = toff;
 
-	swap_device_lock(swapdev);
+	spin_lock(&swap_lock);
 	do {
 		/* Don't read-ahead past the end of the swap area */
 		if (toff >= swapdev->max)
@@ -1644,6 +1677,6 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 		toff++;
 		ret++;
 	} while (--i);
-	swap_device_unlock(swapdev);
+	spin_unlock(&swap_lock);
 	return ret;
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8ff16a1eee6..13c3d82968a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -158,8 +158,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 	return err;
 }
 
-#define IOREMAP_MAX_ORDER	(7 + PAGE_SHIFT)	/* 128 pages */
-
 struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 				unsigned long start, unsigned long end)
 {
@@ -334,9 +332,10 @@ void __vunmap(void *addr, int deallocate_pages)
  *	@addr:		memory base address
  *
  *	Free the virtually contiguous memory area starting at @addr, as
- *	obtained from vmalloc(), vmalloc_32() or __vmalloc().
+ *	obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
+ *	NULL, no operation is performed.
  *
- *	May not be called in interrupt context.
+ *	Must not be called in interrupt context.
  */
 void vfree(void *addr)
 {
@@ -354,7 +353,7 @@ EXPORT_SYMBOL(vfree);
  *	Free the virtually contiguous memory area starting at @addr,
  *	which was created from the page array passed to vmap().
  *
- *	May not be called in interrupt context.
+ *	Must not be called in interrupt context.
  */
 void vunmap(void *addr)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cfffe5098d5..a740778f688 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -822,6 +822,8 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 	unsigned long nr_active;
 	unsigned long nr_inactive;
 
+	atomic_inc(&zone->reclaim_in_progress);
+
 	/*
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
 	 * slowly sift through the active list.
@@ -861,6 +863,8 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 	}
 
 	throttle_vm_writeout();
+
+	atomic_dec(&zone->reclaim_in_progress);
 }
 
 /*
@@ -890,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 		if (zone->present_pages == 0)
 			continue;
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = sc->priority;
@@ -900,9 +904,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 		if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
-		atomic_inc(&zone->reclaim_in_progress);
 		shrink_zone(zone, sc);
-		atomic_dec(&zone->reclaim_in_progress);
 	}
 }
  
@@ -938,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = DEF_PRIORITY;
@@ -984,7 +986,7 @@ out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->prev_priority = zone->temp_priority;
@@ -1254,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	if (!cpuset_zone_allowed(zone))
+	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 		return;
 	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
 		return;
@@ -1358,14 +1360,13 @@ int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
 		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 
 	/* Don't reclaim the zone if there are other reclaimers active */
-	if (!atomic_inc_and_test(&zone->reclaim_in_progress))
+	if (atomic_read(&zone->reclaim_in_progress) > 0)
 		goto out;
 
 	shrink_zone(zone, &sc);
 	total_reclaimed = sc.nr_reclaimed;
 
  out:
-	atomic_dec(&zone->reclaim_in_progress);
 	return total_reclaimed;
 }
 
@@ -1375,6 +1376,9 @@ asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
 	struct zone *z;
 	int i;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
 	if (node >= MAX_NUMNODES || !node_online(node))
 		return -EINVAL;