From 18b48d5873a6fc8e0e6044ddb572fcda26988f19 Mon Sep 17 00:00:00 2001
From: Wen Congyang
Date: Mon, 17 Sep 2012 14:08:55 -0700
Subject: memory hotplug: reset pgdat->kswapd to NULL if creating kernel thread fails

If kthread_run() fails, pgdat->kswapd contains an errno value. When we
stop this thread, we only check whether pgdat->kswapd is NULL before
accessing it. If it holds an errno value instead of a valid pointer,
the access causes a page fault. Resetting pgdat->kswapd to NULL when
creating the kernel thread fails avoids this problem.

Signed-off-by: Wen Congyang
Reviewed-by: Minchan Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/vmscan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8d01243d956..99b434b674c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3102,6 +3102,7 @@ int kswapd_run(int nid)
                 /* failure at boot is fatal */
                 BUG_ON(system_state == SYSTEM_BOOTING);
                 printk("Failed to start kswapd on node %d\n",nid);
+                pgdat->kswapd = NULL;
                 ret = -1;
         }
         return ret;
-- cgit v1.2.3-18-g5258
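A point worth spelling out about the bug above: kthread_run() reports
failure through an ERR_PTR-encoded errno, never through NULL, so a
NULL-only test on pgdat->kswapd passes a failed pointer through. A
minimal userspace sketch of that encoding, with ERR_PTR()/IS_ERR()
re-created here purely for illustration and -12 standing in for -ENOMEM:

#include <stdio.h>

#define MAX_ERRNO 4095

/* Userspace re-creation of the kernel's error-pointer encoding. */
static void *ERR_PTR(long error)
{
        return (void *)error;
}

static int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
        void *kswapd = ERR_PTR(-12);    /* what a failed kthread_run() yields */

        if (kswapd != NULL)             /* the buggy NULL-only check passes */
                printf("non-NULL, would be dereferenced: %p\n", kswapd);
        if (IS_ERR(kswapd))             /* the encoded errno is detectable */
                printf("really an error pointer: errno %ld\n", -(long)kswapd);
        return 0;
}

Dereferencing the "valid-looking" pointer in the first branch is exactly
the page fault the patch prevents by parking NULL in pgdat->kswapd.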
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 53cc13c82cb..b17587d9412 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3666,11 +3666,12 @@ F: Documentation/networking/README.ipw2200 F: drivers/net/wireless/ipw2x00/ INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT) -M: Joseph Cihula +M: Richard L Maliszewski +M: Gang Wei M: Shane Wang L: tboot-devel@lists.sourceforge.net W: http://tboot.sourceforge.net -T: Mercurial http://www.bughost.org/repos.hg/tboot.hg +T: hg http://tboot.hg.sourceforge.net:8000/hgroot/tboot/tboot S: Supported F: Documentation/intel_txt.txt F: include/linux/tboot.h -- cgit v1.2.3-18-g5258 From fded4e090c60100d709318896c79816d68d5b47d Mon Sep 17 00:00:00 2001 From: Paul Clements Date: Mon, 17 Sep 2012 14:09:02 -0700 Subject: nbd: clear waiting_queue on shutdown Fix a serious but uncommon bug in nbd which occurs when there is heavy I/O going to the nbd device while, at the same time, a failure (server, network) or manual disconnect of the nbd connection occurs. There is a small window between the time that the nbd_thread is stopped and the socket is shutdown where requests can continue to be queued to nbd's internal waiting_queue. When this happens, those requests are never completed or freed. The fix is to clear the waiting_queue on shutdown of the nbd device, in the same way that the nbd request queue (queue_head) is already being cleared. Signed-off-by: Paul Clements Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/nbd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index d07c9f7fded..0c03411c59e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -449,6 +449,14 @@ static void nbd_clear_que(struct nbd_device *nbd) req->errors++; nbd_end_request(req); } + + while (!list_empty(&nbd->waiting_queue)) { + req = list_entry(nbd->waiting_queue.next, struct request, + queuelist); + list_del_init(&req->queuelist); + req->errors++; + nbd_end_request(req); + } } @@ -598,6 +606,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd->file = NULL; nbd_clear_que(nbd); BUG_ON(!list_empty(&nbd->queue_head)); + BUG_ON(!list_empty(&nbd->waiting_queue)); if (file) fput(file); return 0; -- cgit v1.2.3-18-g5258 From 30c29bea6af2d3b6ffc8865864de7fc08cadb5df Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 17 Sep 2012 14:09:03 -0700 Subject: slab: do ClearSlabPfmemalloc() for all pages of slab Right now, we call ClearSlabPfmemalloc() for first page of slab when we clear SlabPfmemalloc flag. This is fine for most swap-over-network use cases as it is expected that order-0 pages are in use. Unfortunately it is possible that that __ac_put_obj() checks SlabPfmemalloc on a tail page and while this is harmless, it is sloppy. This patch ensures that the head page is always used. This problem was originally identified by Joonsoo Kim. 
From e9b7d7c81d9bdb41a897a2983ae3386a5fd4a1e3 Mon Sep 17 00:00:00 2001
From: Gang Wei
Date: Mon, 17 Sep 2012 14:08:59 -0700
Subject: MAINTAINERS: fix TXT maintainer list and source repo path

Signed-off-by: Gang Wei
Cc: Richard L Maliszewski
Cc: Gang Wei
Cc: Shane Wang
Cc: Ingo Molnar
Cc: "H. Peter Anvin"
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 MAINTAINERS | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 53cc13c82cb..b17587d9412 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3666,11 +3666,12 @@ F: Documentation/networking/README.ipw2200
 F: drivers/net/wireless/ipw2x00/

 INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT)
-M: Joseph Cihula
+M: Richard L Maliszewski
+M: Gang Wei
 M: Shane Wang
 L: tboot-devel@lists.sourceforge.net
 W: http://tboot.sourceforge.net
-T: Mercurial http://www.bughost.org/repos.hg/tboot.hg
+T: hg http://tboot.hg.sourceforge.net:8000/hgroot/tboot/tboot
 S: Supported
 F: Documentation/intel_txt.txt
 F: include/linux/tboot.h
-- cgit v1.2.3-18-g5258

From fded4e090c60100d709318896c79816d68d5b47d Mon Sep 17 00:00:00 2001
From: Paul Clements
Date: Mon, 17 Sep 2012 14:09:02 -0700
Subject: nbd: clear waiting_queue on shutdown

Fix a serious but uncommon bug in nbd which occurs when there is heavy
I/O going to the nbd device while, at the same time, a failure (server,
network) or manual disconnect of the nbd connection occurs.

There is a small window between the time that the nbd_thread is stopped
and the socket is shut down where requests can continue to be queued to
nbd's internal waiting_queue. When this happens, those requests are
never completed or freed.

The fix is to clear the waiting_queue on shutdown of the nbd device, in
the same way that the nbd request queue (queue_head) is already being
cleared.

Signed-off-by: Paul Clements
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/block/nbd.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d07c9f7fded..0c03411c59e 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -449,6 +449,14 @@ static void nbd_clear_que(struct nbd_device *nbd)
                 req->errors++;
                 nbd_end_request(req);
         }
+
+        while (!list_empty(&nbd->waiting_queue)) {
+                req = list_entry(nbd->waiting_queue.next, struct request,
+                                 queuelist);
+                list_del_init(&req->queuelist);
+                req->errors++;
+                nbd_end_request(req);
+        }
 }


@@ -598,6 +606,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
                 nbd->file = NULL;
                 nbd_clear_que(nbd);
                 BUG_ON(!list_empty(&nbd->queue_head));
+                BUG_ON(!list_empty(&nbd->waiting_queue));
                 if (file)
                         fput(file);
                 return 0;
-- cgit v1.2.3-18-g5258
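A short aside on the pattern: the loop added to nbd_clear_que() is the
standard kernel idiom for draining an intrusive list, popping the first
entry until the list reads empty. A generic, hedged sketch of the idiom;
struct item and complete_as_error() are placeholders, not nbd symbols
(nbd itself completes each request with req->errors++ and
nbd_end_request()):

#include <linux/list.h>

struct item {
        struct list_head node;
        /* payload ... */
};

static void complete_as_error(struct item *it);  /* placeholder */

static void drain_list(struct list_head *queue)
{
        struct item *it;

        while (!list_empty(queue)) {
                it = list_entry(queue->next, struct item, node);
                list_del_init(&it->node);
                complete_as_error(it);
        }
}

Using list_del_init() rather than plain list_del() leaves each removed
entry in a self-pointing state, so any later list_empty() test on the
entry itself remains well-defined.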
From 30c29bea6af2d3b6ffc8865864de7fc08cadb5df Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Mon, 17 Sep 2012 14:09:03 -0700
Subject: slab: do ClearSlabPfmemalloc() for all pages of slab

Right now, we call ClearSlabPfmemalloc() for the first page of a slab
when we clear the SlabPfmemalloc flag. This is fine for most
swap-over-network use cases as it is expected that order-0 pages are in
use. Unfortunately it is possible that __ac_put_obj() checks
SlabPfmemalloc on a tail page, and while this is harmless, it is sloppy.
This patch ensures that the head page is always used.

This problem was originally identified by Joonsoo Kim.

[js1304@gmail.com: Original implementation and problem identification]
Signed-off-by: Mel Gorman
Cc: David Miller
Cc: Chuck Lever
Cc: Joonsoo Kim
Cc: David Rientjes
Cc: Pekka Enberg
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/slab.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 811af03a14e..d34a9034f92 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1000,7 +1000,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
                 l3 = cachep->nodelists[numa_mem_id()];
                 if (!list_empty(&l3->slabs_free) && force_refill) {
                         struct slab *slabp = virt_to_slab(objp);
-                        ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
+                        ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem));
                         clear_obj_pfmemalloc(&objp);
                         recheck_pfmemalloc_active(cachep, ac);
                         return objp;
@@ -1032,7 +1032,7 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
 {
         if (unlikely(pfmemalloc_active)) {
                 /* Some pfmemalloc slabs exist, check if this is one */
-                struct page *page = virt_to_page(objp);
+                struct page *page = virt_to_head_page(objp);
                 if (PageSlabPfmemalloc(page))
                         set_obj_pfmemalloc(&objp);
         }
-- cgit v1.2.3-18-g5258
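For background on the head/tail distinction above: page flags such as
SlabPfmemalloc are kept on the head page of a high-order (compound)
allocation, so a lookup through a tail page consults the wrong struct
page. A hedged kernel-style sketch; object_is_pfmemalloc() is
illustrative, not an mm/ function:

#include <linux/mm.h>

static bool object_is_pfmemalloc(const void *objp)
{
        /*
         * virt_to_page(objp) may return a tail page for an object in a
         * high-order slab; the SlabPfmemalloc bit is set on the head
         * page, so the test must be done there.
         */
        struct page *head = virt_to_head_page(objp);

        return PageSlabPfmemalloc(head);
}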
From d014dc2ed4fae84cb92509416c8bfc9078d4f0d9 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Mon, 17 Sep 2012 14:09:06 -0700
Subject: slab: fix starting index for finding another object

In the array cache there is an object at index 0 as well, so check it
too.

Signed-off-by: Joonsoo Kim
Signed-off-by: Mel Gorman
Cc: David Miller
Cc: Chuck Lever
Cc: David Rientjes
Cc: Pekka Enberg
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/slab.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/slab.c b/mm/slab.c
index d34a9034f92..c6854759bcf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -983,7 +983,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
         }

         /* The caller cannot use PFMEMALLOC objects, find another one */
-        for (i = 1; i < ac->avail; i++) {
+        for (i = 0; i < ac->avail; i++) {
                 /* If a !PFMEMALLOC object is found, swap them */
                 if (!is_obj_pfmemalloc(ac->entry[i])) {
                         objp = ac->entry[i];
-- cgit v1.2.3-18-g5258

From 8ba00bb68a067c221cc3ea3a0293e8fcbdcb7ba1 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Mon, 17 Sep 2012 14:09:09 -0700
Subject: slub: consider pfmemalloc_match() in get_partial_node()

get_partial() is currently not checking pfmemalloc_match(), meaning that
it is possible for pfmemalloc pages to leak to non-pfmemalloc users.
This is a problem in the following situation.

Assume that there is a request from a normal allocation and there are no
objects in the per-cpu cache and no node-partial slab. In this case,
slab_alloc() enters the slow path and new_slab_objects() is called,
which may return a PFMEMALLOC page. As the current user is not allowed
to access the PFMEMALLOC page, deactivate_slab() is called ([5091b74a:
mm: slub: optimise the SLUB fast path to avoid pfmemalloc checks]) and
returns an object from the PFMEMALLOC page.

Next time, when we get another request from a normal allocation,
slab_alloc() enters the slow path and calls new_slab_objects(). In
new_slab_objects(), we call get_partial() and get a partial slab which
was just deactivated but is a pfmemalloc page. We extract one object
from it and re-deactivate. "deactivate -> re-get in get_partial ->
re-deactivate" occurs repeatedly. As a result, access to the PFMEMALLOC
page is not properly restricted, and the frequent deactivation can cause
a performance degradation.

This patch changes get_partial_node() to take pfmemalloc_match() into
account and prevents the "deactivate -> re-get in get_partial()"
scenario. Instead, new_slab() is called.

Signed-off-by: Joonsoo Kim
Acked-by: David Rientjes
Signed-off-by: Mel Gorman
Cc: David Miller
Cc: Chuck Lever
Cc: Pekka Enberg
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/slub.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 8f78e257703..2fdd96f9e99 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1524,12 +1524,13 @@ static inline void *acquire_slab(struct kmem_cache *s,
 }

 static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);

 /*
  * Try to allocate a partial slab from a specific node.
  */
-static void *get_partial_node(struct kmem_cache *s,
-                struct kmem_cache_node *n, struct kmem_cache_cpu *c)
+static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
+                struct kmem_cache_cpu *c, gfp_t flags)
 {
         struct page *page, *page2;
         void *object = NULL;
@@ -1545,9 +1546,13 @@ static void *get_partial_node(struct kmem_cache *s,
         spin_lock(&n->list_lock);
         list_for_each_entry_safe(page, page2, &n->partial, lru) {
-                void *t = acquire_slab(s, n, page, object == NULL);
+                void *t;
                 int available;

+                if (!pfmemalloc_match(page, flags))
+                        continue;
+
+                t = acquire_slab(s, n, page, object == NULL);
                 if (!t)
                         break;
@@ -1614,7 +1619,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
                 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
                                 n->nr_partial > s->min_partial) {
-                        object = get_partial_node(s, n, c);
+                        object = get_partial_node(s, n, c, flags);
                         if (object) {
                                 /*
                                  * Return the object even if
@@ -1643,7 +1648,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
         void *object;
         int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;

-        object = get_partial_node(s, get_node(s, searchnode), c);
+        object = get_partial_node(s, get_node(s, searchnode), c, flags);
         if (object || node != NUMA_NO_NODE)
                 return object;
-- cgit v1.2.3-18-g5258

From 35c448a8a3471b95ebc0ebcf91eb1183401b4274 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Mon, 17 Sep 2012 14:09:11 -0700
Subject: include/net/sock.h: squelch compiler warning in sk_rmem_schedule()

This warning:

In file included from linux/include/linux/tcp.h:227:0,
                 from linux/include/linux/ipv6.h:221,
                 from linux/include/net/ipv6.h:16,
                 from linux/include/linux/sunrpc/clnt.h:26,
                 from linux/net/sunrpc/stats.c:22:
linux/include/net/sock.h: In function `sk_rmem_schedule':
linux/nfs-2.6/include/net/sock.h:1339:13: warning: comparison between signed and unsigned integer expressions [-Wsign-compare]

is seen with gcc (GCC) 4.6.3 20120306 (Red Hat 4.6.3-2) using the
-Wextra option.

Commit c76562b6709f ("netvm: prevent a stream-specific deadlock")
accidentally replaced the "size" parameter of sk_rmem_schedule() with an
unsigned int. This changes the semantics of the comparison in the
return statement.

In sk_wmem_schedule() we have syntactically the same comparison, but
"size" is a signed integer. In addition, __sk_mem_schedule() takes a
signed integer for its "size" parameter, so there is an implicit type
conversion in sk_rmem_schedule() anyway.

Revert the "size" parameter back to a signed integer so that the
semantics of the expressions in both sk_[rw]mem_schedule() are exactly
the same.

Signed-off-by: Chuck Lever
Signed-off-by: Mel Gorman
Cc: David Miller
Cc: Joonsoo Kim
Cc: David Rientjes
Cc: Pekka Enberg
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 72132aef53f..adb7da20b5a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1332,7 +1332,7 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size)
 }

 static inline bool
-sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size)
+sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
 {
         if (!sk_has_account(sk))
                 return true;
-- cgit v1.2.3-18-g5258
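A standalone illustration (not kernel code) of the semantic change that
-Wsign-compare is pointing at: once one operand is unsigned, a negative
size is converted before the comparison is made.

#include <stdio.h>

int main(void)
{
        int ssize = -1;         /* a size that has gone negative */
        unsigned int usize = 1;

        /*
         * In "ssize < usize" the usual arithmetic conversions promote
         * ssize to unsigned, turning -1 into UINT_MAX, so the
         * comparison is false. This is the semantic shift the commit
         * above reverts.
         */
        if (ssize < usize)
                printf("compared as signed\n");         /* not reached */
        else
                printf("-1 compared as %u\n", (unsigned int)ssize);
        return 0;
}

With both operands signed, as in sk_wmem_schedule(), the comparison
behaves as the surrounding accounting code expects.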
From 579035dc5ddd6d48fd8529e7358b03d911ab9d8a Mon Sep 17 00:00:00 2001
From: Andrew Vagin
Date: Mon, 17 Sep 2012 14:09:12 -0700
Subject: pid-namespace: limit value of ns_last_pid to (0, max_pid)

The kernel doesn't check the pid for negative values, so if you try to
write -2 to /proc/sys/kernel/ns_last_pid, you will get a kernel panic.

The crash happens because the next pid is -1, and alloc_pidmap() will
try to access a nonexistent pidmap:

  map = &pid_ns->pidmap[pid/BITS_PER_PAGE];

Signed-off-by: Andrew Vagin
Acked-by: Cyrill Gorcunov
Acked-by: Oleg Nesterov
Cc: Eric W. Biederman
Cc: Pavel Emelyanov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/pid_namespace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index b3c7fd55425..6144bab8fd8 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -232,15 +232,19 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
          */
         tmp.data = &current->nsproxy->pid_ns->last_pid;
-        return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+        return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 }

+extern int pid_max;
+static int zero = 0;
 static struct ctl_table pid_ns_ctl_table[] = {
         {
                 .procname = "ns_last_pid",
                 .maxlen = sizeof(int),
                 .mode = 0666, /* permissions are checked in the handler */
                 .proc_handler = pid_ns_ctl_handler,
+                .extra1 = &zero,
+                .extra2 = &pid_max,
         },
         { }
 };
-- cgit v1.2.3-18-g5258

From 9a858dc7cebce01a7bb616bebb85087fa2b40871 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Mon, 17 Sep 2012 14:09:15 -0700
Subject: compiler.h: add __visible

gcc 4.6+ has support for an externally_visible attribute that prevents
the optimizer from optimizing unused symbols away. Add a __visible
macro to use it with that compiler version or later.

This is used (at least) by the "Link Time Optimization" patchset.

Signed-off-by: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/compiler-gcc4.h | 7 +++++++
 include/linux/compiler.h      | 4 ++++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h
index 2f4079175af..934bc34d5f9 100644
--- a/include/linux/compiler-gcc4.h
+++ b/include/linux/compiler-gcc4.h
@@ -49,6 +49,13 @@
 #endif
 #endif

+#if __GNUC_MINOR__ >= 6
+/*
+ * Tell the optimizer that something else uses this function or variable.
+ */
+#define __visible __attribute__((externally_visible))
+#endif
+
 #if __GNUC_MINOR__ > 0
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
 #endif

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 923d093c9ce..f430e4162f4 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -278,6 +278,10 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 # define __section(S) __attribute__ ((__section__(#S)))
 #endif

+#ifndef __visible
+#define __visible
+#endif
+
 /* Are two types/vars the same type (ignoring qualifiers)? */
 #ifndef __same_type
 # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
-- cgit v1.2.3-18-g5258
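A sketch of how the new annotation is meant to be used; the symbols here
are hypothetical, and the version gate mirrors the compiler-gcc4.h hunk
above. Under -flto or -fwhole-program, the compiler may otherwise
discard symbols it believes nothing references, such as entry points
called only from assembly:

#if defined(__GNUC__) && (__GNUC__ > 4 || \
        (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
#define __visible __attribute__((externally_visible))
#else
#define __visible
#endif

/* Hypothetical symbols, referenced only from an assembly stub. */
__visible int boot_flag = 1;

__visible void asm_entry_hook(void)
{
        /* ... */
}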
From 8dcebaa9a0ae8a0487f4342f3d56d2cb1c980860 Mon Sep 17 00:00:00 2001
From: Kevin Hilman
Date: Mon, 17 Sep 2012 14:09:17 -0700
Subject: drivers/rtc/rtc-twl.c: ensure all interrupts are disabled during probe

On some platforms, bootloaders are known to do some interesting RTC
programming. Without going into the obscurities as to why this may be
the case, suffice it to say that the driver should not make any
assumptions about the state of the RTC when the driver loads. In
particular, the driver probe should be sure that all interrupts are
disabled until otherwise programmed.

This was discovered when finding bursty I2C traffic every second on
Overo platforms. This I2C overhead was keeping the SoC from hitting
deep power states. The cause was found to be the RTC firing every
second on the I2C-connected TWL PMIC.

Special thanks to Felipe Balbi for suggesting to look for a rogue driver
as the source of the I2C traffic rather than the I2C driver itself.

Special thanks to Steve Sakoman for helping track down the source of the
continuous RTC interrupts on the Overo boards.

Signed-off-by: Kevin Hilman
Cc: Felipe Balbi
Tested-by: Steve Sakoman
Cc: Alessandro Zummo
Tested-by: Shubhrajyoti Datta
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/rtc/rtc-twl.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c
index c5d06fe83bb..9277d945bf4 100644
--- a/drivers/rtc/rtc-twl.c
+++ b/drivers/rtc/rtc-twl.c
@@ -495,6 +495,11 @@ static int __devinit twl_rtc_probe(struct platform_device *pdev)
         if (ret < 0)
                 goto out1;

+        /* ensure interrupts are disabled, bootloaders can be strange */
+        ret = twl_rtc_write_u8(0, REG_RTC_INTERRUPTS_REG);
+        if (ret < 0)
+                dev_warn(&pdev->dev, "unable to disable interrupt\n");
+
         /* init cached IRQ enable bits */
         ret = twl_rtc_read_u8(&rtc_irq_bits, REG_RTC_INTERRUPTS_REG);
         if (ret < 0)
-- cgit v1.2.3-18-g5258

From 0ba8f2d59304dfe69b59c034de723ad80f7ab9ac Mon Sep 17 00:00:00 2001
From: Li Haifeng
Date: Mon, 17 Sep 2012 14:09:21 -0700
Subject: mm/page_alloc: fix the page address of higher page's buddy calculation

The heuristic method for buddy has been introduced since commit
43506fad21ca ("mm/page_alloc.c: simplify calculation of combined index
of adjacent buddy lists"). But the page address of the higher page's
buddy was wrongly calculated, which would lead page_is_buddy() to fail
forever. In other words, the wrong page address of the higher page's
buddy effectively disabled the heuristic.

The page address of the higher page's buddy should be calculated from
higher_page, offset by the difference between the buddy's index and the
higher page's index.

Signed-off-by: Haifeng Li
Signed-off-by: Gavin Shan
Reviewed-by: Michal Hocko
Cc: KyongHo Cho
Cc: Mel Gorman
Cc: Minchan Kim
Cc: Johannes Weiner
Cc: [2.6.38+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c66fb875104..c13ea753889 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -584,7 +584,7 @@ static inline void __free_one_page(struct page *page,
                 combined_idx = buddy_idx & page_idx;
                 higher_page = page + (combined_idx - page_idx);
                 buddy_idx = __find_buddy_index(combined_idx, order + 1);
-                higher_buddy = page + (buddy_idx - combined_idx);
+                higher_buddy = higher_page + (buddy_idx - combined_idx);
                 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
                         list_add_tail(&page->lru,
                                 &zone->free_area[order].free_list[migratetype]);
-- cgit v1.2.3-18-g5258

From f14851af0ebb32745c6c5a2e400aa0549f9d20df Mon Sep 17 00:00:00 2001
From: qiuxishi
Date: Mon, 17 Sep 2012 14:09:24 -0700
Subject: memory hotplug: fix section info double registration bug

There may be a bug when registering section info. For example, on my
Itanium platform, the pfn range of node0 includes the other nodes, so
other nodes' section info will be double registered, and memmap's page
count will equal 3.

node0: start_pfn=0x100,    spanned_pfn=0x20fb00, present_pfn=0x7f8a3 => 0x000100-0x20fc00
node1: start_pfn=0x80000,  spanned_pfn=0x80000,  present_pfn=0x80000 => 0x080000-0x100000
node2: start_pfn=0x100000, spanned_pfn=0x80000,  present_pfn=0x80000 => 0x100000-0x180000
node3: start_pfn=0x180000, spanned_pfn=0x80000,  present_pfn=0x80000 => 0x180000-0x200000

free_all_bootmem_node()
  register_page_bootmem_info_node()
    register_page_bootmem_info_section()

When hot-removing memory, we can't free the memmap's page because
page_count() is 2 after put_page_bootmem():

sparse_remove_one_section()
  free_section_usemap()
    free_map_bootmem()
      put_page_bootmem()

[akpm@linux-foundation.org: add code comment]
Signed-off-by: Xishi Qiu
Signed-off-by: Jiang Liu
Acked-by: Mel Gorman
Cc: "Luck, Tony"
Cc: Yasuaki Ishimatsu
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory_hotplug.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3ad25f9d1fc..6a5b90d0cfd 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -126,9 +126,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
         struct mem_section *ms;
         struct page *page, *memmap;

-        if (!pfn_valid(start_pfn))
-                return;
-
         section_nr = pfn_to_section_nr(start_pfn);
         ms = __nr_to_section(section_nr);

@@ -187,9 +184,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
         end_pfn = pfn + pgdat->node_spanned_pages;

         /* register_section info */
-        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
-                register_page_bootmem_info_section(pfn);
-
+        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+                /*
+                 * Some platforms can assign the same pfn to multiple nodes - on
+                 * node0 as well as nodeN. To avoid registering a pfn against
+                 * multiple nodes we check that this pfn does not already
+                 * reside in some other node.
+                 */
+                if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+                        register_page_bootmem_info_section(pfn);
+        }
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-- cgit v1.2.3-18-g5258
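A closing footnote on the page_alloc buddy fix above: the index
arithmetic can be checked in isolation. A standalone sketch, assuming
the XOR form of __find_buddy_index() used by kernels of this era
(buddy_idx = page_idx ^ (1 << order)):

#include <stdio.h>

/* Assumed XOR form of __find_buddy_index() from kernels of this era. */
static unsigned long find_buddy_index(unsigned long idx, unsigned int order)
{
        return idx ^ (1UL << order);
}

int main(void)
{
        unsigned long page_idx = 12, order = 2;

        unsigned long buddy_idx = find_buddy_index(page_idx, order);          /* 8 */
        unsigned long combined_idx = buddy_idx & page_idx;                    /* 8 */
        unsigned long higher_idx = find_buddy_index(combined_idx, order + 1); /* 0 */

        /* Correct: apply the offset to the merged block at combined_idx. */
        printf("correct higher_buddy index: %lu\n",
               combined_idx + (higher_idx - combined_idx));                   /* 0 */
        /* Buggy: the same offset applied to the original page's index. */
        printf("buggy   higher_buddy index: %lu\n",
               page_idx + (higher_idx - combined_idx));                       /* 4 */
        return 0;
}

With the buggy base, page_is_buddy() is asked about index 4 instead of
index 0, so the tail-placement heuristic in __free_one_page() never
fires; that silent failure is what the one-line change repairs.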