From d5b562330ec766292a3ac54ae5e0673610bd5b3d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 8 Feb 2009 20:56:58 +0000 Subject: mm: fix error case in mlock downgrade reversion Commit 27421e211a39784694b597dbf35848b88363c248, Manually revert "mlock: downgrade mmap sem while populating mlocked regions", has introduced its own regression: __mlock_vma_pages_range() may report an error (for example, -EFAULT from trying to lock down pages from beyond EOF), but mlock_vma_pages_range() must hide that from its callers as before. Reported-by: Sami Farin Signed-off-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/mlock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 028ec482fdd..037161d61b4 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -311,7 +311,10 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - return __mlock_vma_pages_range(vma, start, end, 1); + __mlock_vma_pages_range(vma, start, end, 1); + + /* Hide errors from mmap() and other callers */ + return 0; } /* -- cgit v1.2.3-18-g5258 From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Feb 2009 14:02:27 +0000 Subject: Do not account for the address space used by hugetlbfs using VM_ACCOUNT When overcommit is disabled, the core VM accounts for pages used by anonymous shared, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE. Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. It avoids overcommiting on both shared and private mappings using reservation counters that are checked and updated during mmap(). This ensures (within limits) that hugepages exist in the future when faults occurs or it is too easy to applications to be SIGKILLed. As hugetlbfs makes its own reservations of a different unit to the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request no reserves be made for hugetlbfs at the risk of getting killed later. With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs, can trigger an OOM storm when hugepage pools are too small lockups and corrupted counters otherwise are used. This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set. 
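For illustration only (not part of the patch), a minimal user-space sketch of the two reservation behaviours described above; the hugetlbfs mount at /mnt/huge and the 2MB huge page size are assumptions, and error handling is reduced to perror(). With the default reservation, an undersized pool makes the mmap() itself fail; with MAP_NORESERVE the mapping is created, but touching it may raise SIGBUS when no huge page is free at fault time.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)   /* assumed huge page size */
#define LENGTH     (8 * HPAGE_SIZE)

int main(void)
{
        /* assumes a hugetlbfs mount at /mnt/huge */
        int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);
        if (fd < 0) { perror("open"); return 1; }

        /* Default: huge pages are reserved at mmap() time, so a pool
         * that is too small makes this call fail with ENOMEM. */
        char *a = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
                       MAP_SHARED, fd, 0);
        if (a == MAP_FAILED)
                perror("mmap (reserved)");

        /* MAP_NORESERVE: no reservation is taken; the mapping is
         * created, but touching it may raise SIGBUS if no huge page
         * is available at fault time. */
        char *b = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_NORESERVE, fd, 0);
        if (b == MAP_FAILED)
                perror("mmap (noreserve)");
        else
                memset(b, 0, LENGTH);    /* may SIGBUS on a small pool */

        close(fd);
        unlink("/mnt/huge/demo");
        return 0;
}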
Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/fremap.c | 2 +- mm/hugetlb.c | 39 +++++++++++++++++++++++++-------------- mm/mmap.c | 38 ++++++++++++++++++++++---------------- mm/mprotect.c | 5 +++-- 4 files changed, 51 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 736ba7f3306..b6ec85abbb3 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; get_file(file); addr = mmap_region(file, start, size, - flags, vma->vm_flags, pgoff, 1); + flags, vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 618e9830408..20746420954 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int acctflag) { - long ret, chg; + long ret = 0, chg; struct hstate *h = hstate_inode(inode); - if (vma && vma->vm_flags & VM_NORESERVE) - return 0; - /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need @@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - if (!resv_map) - return -ENOMEM; - + else chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - if (chg < 0) return chg; if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; + + /* + * Only apply hugepage reservation if asked. We still have to + * take the filesystem quota because it is an upper limit + * defined for the mount and not necessarily memory as a whole + */ + if (acctflag & VM_NORESERVE) { + reset_vma_resv_huge_pages(vma); + return 0; + } + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); @@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode, } if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); + else { + struct resv_map *resv_map = resv_map_alloc(); + + if (!resv_map) + return -ENOMEM; + + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 214b6a258ee..eb1270bebe6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, struct inode *inode; unsigned int vm_flags; int error; - int accountable = 1; unsigned long reqprot = prot; /* @@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EPERM; vm_flags &= ~VM_MAYEXEC; } - if (is_file_hugepages(file)) - accountable = 0; if (!file->f_op || !file->f_op->mmap) return -ENODEV; @@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (error) return error; - return mmap_region(file, addr, len, flags, vm_flags, pgoff, - accountable); + return mmap_region(file, addr, len, flags, vm_flags, pgoff); } EXPORT_SYMBOL(do_mmap_pgoff); @@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma) /* * We account for memory if it's a private writeable mapping, - * and VM_NORESERVE wasn't set. 
+ * not hugepages and VM_NORESERVE wasn't set. */ -static inline int accountable_mapping(unsigned int vm_flags) +static inline int accountable_mapping(struct file *file, unsigned int vm_flags) { + /* + * hugetlb has its own accounting separate from the core VM + * VM_HUGETLB may not be set yet so we cannot check for that flag. + */ + if (file && is_file_hugepages(file)) + return 0; + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff, - int accountable) + unsigned int vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1128,18 +1130,22 @@ munmap_back: /* * Set 'VM_NORESERVE' if we should not account for the - * memory use of this mapping. We only honor MAP_NORESERVE - * if we're allowed to overcommit memory. + * memory use of this mapping. */ - if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) - vm_flags |= VM_NORESERVE; - if (!accountable) - vm_flags |= VM_NORESERVE; + if ((flags & MAP_NORESERVE)) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; + } /* * Private writable mapping: check memory availability */ - if (accountable_mapping(vm_flags)) { + if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; if (security_vm_enough_memory(charged)) return -ENOMEM; diff --git a/mm/mprotect.c b/mm/mprotect.c index abe2694e13f..258197b76fb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. + * make it unwritable again. hugetlb mapping were accounted for + * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) -- cgit v1.2.3-18-g5258 From 9f339e7028e2855717af3193c938f9960ad13b38 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Wed, 11 Feb 2009 15:10:27 +0100 Subject: x86, ptrace, mm: fix double-free on race Ptrace_detach() races with __ptrace_unlink() if the traced task is reaped while detaching. This might cause a double-free of the BTS buffer. Change the ptrace_detach() path to only do the memory accounting in ptrace_bts_detach() and leave the buffer free to ptrace_bts_untrace() which will be called from __ptrace_unlink(). The fix follows a proposal from Oleg Nesterov. 
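For illustration (not part of the patch), a self-contained user-space sketch of the pattern the fix relies on: the accounting step is split out from the free itself, so the path that races (detach) only adjusts the counter and exactly one path (unlink) ever frees the buffer. All names and the locked_bytes counter here are made up.

#include <stdio.h>
#include <stdlib.h>

static size_t locked_bytes;

static void *alloc_locked(size_t size)
{
        void *p = malloc(size);
        if (p)
                locked_bytes += size;
        return p;
}

/* accounting only, safe to call from the path that must not free */
static void release_locked(size_t size)
{
        locked_bytes -= size;
}

/* accounting plus the free itself, used by the normal teardown path */
static void free_locked(void *p, size_t size)
{
        release_locked(size);
        free(p);
}

int main(void)
{
        void *a = alloc_locked(4096);
        void *b = alloc_locked(4096);

        /* normal teardown: one call does both steps */
        free_locked(a, 4096);

        /* teardown split across two paths: "detach" only adjusts the
         * accounting, the later "unlink" is the single place that frees */
        release_locked(4096);
        free(b);

        printf("locked_bytes=%zu\n", locked_bytes);
        return 0;
}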
Reported-by: Oleg Nesterov Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- mm/mlock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 028ec482fdd..2b57f7e6039 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -657,7 +657,7 @@ void *alloc_locked_buffer(size_t size) return buffer; } -void free_locked_buffer(void *buffer, size_t size) +void release_locked_buffer(void *buffer, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; @@ -667,6 +667,11 @@ void free_locked_buffer(void *buffer, size_t size) current->mm->locked_vm -= pgsz; up_write(¤t->mm->mmap_sem); +} + +void free_locked_buffer(void *buffer, size_t size) +{ + release_locked_buffer(buffer, size); kfree(buffer); } -- cgit v1.2.3-18-g5258 From 17c9d12e126cb0de8d535dc1908c4819d712bc68 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 11 Feb 2009 16:34:16 +0000 Subject: Do not account for hugetlbfs quota at mmap() time if mapping [SHM|MAP]_NORESERVE Commit 5a6fe125950676015f5108fb71b2a67441755003 brought hugetlbfs more in line with the core VM by obeying VM_NORESERVE and not reserving hugepages for both shared and private mappings when [SHM|MAP]_NORESERVE are specified. However, it is still taking filesystem quota unconditionally. At fault time, if there are no reserves and attempt is made to allocate the page and account for filesystem quota. If either fail, the fault fails. The impact is that quota is getting accounted for twice. This patch partially reverts 5a6fe125950676015f5108fb71b2a67441755003. To help prevent this mistake happening again, it improves the documentation of hugetlb_reserve_pages() Reported-by: Andy Whitcroft Signed-off-by: Mel Gorman Acked-by: Andy Whitcroft Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 20746420954..107da3d809a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2272,9 +2272,17 @@ int hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma, int acctflag) { - long ret = 0, chg; + long ret, chg; struct hstate *h = hstate_inode(inode); + /* + * Only apply hugepage reservation if asked. At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page + * and filesystem quota without using reserves + */ + if (acctflag & VM_NORESERVE) + return 0; + /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need @@ -2283,42 +2291,47 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else + else { + struct resv_map *resv_map = resv_map_alloc(); + if (!resv_map) + return -ENOMEM; + chg = to - from; + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + if (chg < 0) return chg; + /* There must be enough filesystem quota for the mapping */ if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; /* - * Only apply hugepage reservation if asked. We still have to - * take the filesystem quota because it is an upper limit - * defined for the mount and not necessarily memory as a whole + * Check enough hugepages are available for the reservation. 
+ * Hand back the quota if there are not */ - if (acctflag & VM_NORESERVE) { - reset_vma_resv_huge_pages(vma); - return 0; - } - ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; } + + /* + * Account for the reservations made. Shared mappings record regions + * that have reservations as they are shared by multiple VMAs. + * When the last VMA disappears, the region map says how much + * the reservation was and the page cache tells how much of + * the reservation was consumed. Private mappings are per-VMA and + * only the consumed reservations are tracked. When the VMA + * disappears, the original reservation is the VMA size and the + * consumed reservations are stored in the map. Hence, nothing + * else has to be done for private mappings here + */ if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - - if (!resv_map) - return -ENOMEM; - - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - return 0; } -- cgit v1.2.3-18-g5258 From 1001c9fb8721ab395e21f571ed2aaa523cdd1e29 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 11 Feb 2009 13:04:18 -0800 Subject: migration: migrate_vmas should check "vma" migrate_vmas() should check "vma" not "vma->vm_next" for for-loop condition. Signed-off-by: Daisuke Nishimura Cc: Christoph Lameter Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 2bb4e1d6352..a9eff3f092f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1129,7 +1129,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, struct vm_area_struct *vma; int err = 0; - for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { + for (vma = mm->mmap; vma && !err; vma = vma->vm_next) { if (vma->vm_ops && vma->vm_ops->migrate) { err = vma->vm_ops->migrate(vma, to, from, flags); if (err) -- cgit v1.2.3-18-g5258 From fc3501d411d34823fb9be248a95a0c44f945866f Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 11 Feb 2009 13:04:23 -0800 Subject: mm: fix dirty_bytes/dirty_background_bytes sysctls on 64bit arches We need to pass an unsigned long as the minimum, because it gets casted to an unsigned long in the sysctl handler. If we pass an int, we'll access four more bytes on 64bit arches, resulting in a random minimum value. 
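A small user-space sketch (illustrative only, not kernel code) of why the width mismatch matters on 64-bit: the handler copies sizeof(unsigned long) bytes, so whatever happens to sit next to the int becomes part of the "minimum" it sees.

#include <stdio.h>
#include <string.h>

int main(void)
{
        struct {
                int min;            /* what was actually provided */
                int neighbour;      /* whatever happens to follow */
        } s = { 0, 0x12345678 };

        /* What a handler expecting an unsigned long ends up reading.
         * On 64-bit this pulls in s.neighbour as well; on 32-bit the
         * sizes match and the value stays 0. */
        unsigned long as_seen_by_handler;
        memcpy(&as_seen_by_handler, &s, sizeof(as_seen_by_handler));

        printf("sizeof(int)=%zu sizeof(unsigned long)=%zu\n",
               sizeof(int), sizeof(unsigned long));
        printf("intended minimum: 0, handler saw: %#lx\n",
               as_seen_by_handler);
        return 0;
}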
[rientjes@google.com: fix type of `old_bytes'] Signed-off-by: Sven Wegener Cc: Peter Zijlstra Cc: Dave Chinner Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dc32dae01e5..c17005e7397 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -209,7 +209,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int old_bytes = vm_dirty_bytes; + unsigned long old_bytes = vm_dirty_bytes; int ret; ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); -- cgit v1.2.3-18-g5258 From 508b9f8efdad123b202b228f71f59feba51e4fb5 Mon Sep 17 00:00:00 2001 From: MinChan Kim Date: Wed, 11 Feb 2009 13:04:27 -0800 Subject: mm: fix mlocked page counter mismatch When I tested following program, I found that the mlocked counter is strange. It cannot free some mlocked pages. It is because try_to_unmap_file() doesn't check real page mappings in vmas. That is because the goal of an address_space for a file is to find all processes into which the file's specific interval is mapped. It is related to the file's interval, not to pages. Even if the page isn't really mapped by the vma, it returns SWAP_MLOCK since the vma has VM_LOCKED, then calls try_to_mlock_page. After this the mlocked counter is increased again. COWed anon page in a file-backed vma could be a such case. This patch resolves it. -- my test program -- int main() { mlockall(MCL_CURRENT); return 0; } -- before -- root@barrios-target-linux:~# cat /proc/meminfo | egrep 'Mlo|Unev' Unevictable: 0 kB Mlocked: 0 kB -- after -- root@barrios-target-linux:~# cat /proc/meminfo | egrep 'Mlo|Unev' Unevictable: 8 kB Mlocked: 8 kB Signed-off-by: MinChan Kim Acked-by: Lee Schermerhorn Acked-by: KOSAKI Motohiro Tested-by: Lee Schermerhorn Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index ac4af8cffbf..16521664010 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1072,7 +1072,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (MLOCK_PAGES && unlikely(unlock)) { - if (!(vma->vm_flags & VM_LOCKED)) + if (!((vma->vm_flags & VM_LOCKED) && + page_mapped_in_vma(page, vma))) continue; /* must visit all vmas */ ret = SWAP_MLOCK; } else { -- cgit v1.2.3-18-g5258 From 2e9c23724328ae4e56c42a35a717a956d7d3001d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 11 Feb 2009 13:04:29 -0800 Subject: memcg: use __GFP_NOWARN in page cgroup allocation page_cgroup's page allocation at init/memory hotplug uses kmalloc() and vmalloc(). If kmalloc() failes, vmalloc() is used. This is because vmalloc() is very limited resource on 32bit systems. We want to use kmalloc() first. But in this kind of call, __GFP_NOWARN should be specified. 
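As a rough user-space analogue (not the kernel API), the try-the-cheap-allocator-first pattern looks like the sketch below; the point is that failure of the first attempt is expected and handled by the fallback, which is what __GFP_NOWARN communicates to the page allocator, so only the final failure is worth reporting.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

static void *alloc_table(size_t size)
{
        /* first attempt: plain malloc(); a failure here is not an
         * error worth reporting because the fallback handles it */
        void *p = malloc(size);
        if (p)
                return p;

        /* fallback: anonymous mmap(), the user-space stand-in for
         * vmalloc() */
        p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");          /* only the final failure matters */
                return NULL;
        }
        return p;
}

int main(void)
{
        void *t = alloc_table(1 << 20);
        if (!t)
                return 1;
        memset(t, 0, 1 << 20);
        puts("allocated");
        return 0;
}

A real version would also have to remember which path produced the pointer before freeing it, the same reason the kernel pairs kfree() with kmalloc() and vfree() with vmalloc().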
Reported-by: Heiko Carstens Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 7006a11350c..ceecfbb143f 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -114,7 +114,8 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; if (slab_is_available()) { - base = kmalloc_node(table_size, GFP_KERNEL, nid); + base = kmalloc_node(table_size, + GFP_KERNEL | __GFP_NOWARN, nid); if (!base) base = vmalloc_node(table_size, nid); } else { -- cgit v1.2.3-18-g5258 From 89e1219004b3657cc014521663eeef0744f1c99d Mon Sep 17 00:00:00 2001 From: Federico Cuello Date: Wed, 11 Feb 2009 13:04:39 -0800 Subject: writeback: fix break condition Commit dcf6a79dda5cc2a2bec183e50d829030c0972aaa ("write-back: fix nr_to_write counter") fixed nr_to_write counter, but didn't set the break condition properly. If nr_to_write == 0 after being decremented it will loop one more time before setting done = 1 and breaking the loop. [akpm@linux-foundation.org: coding-style fixes] Cc: Artem Bityutskiy Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c17005e7397..6106a5c7ed4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1051,20 +1051,23 @@ continue_unlock: } } - if (nr_to_write > 0) + if (nr_to_write > 0) { nr_to_write--; - else if (wbc->sync_mode == WB_SYNC_NONE) { - /* - * We stop writing back only if we are not - * doing integrity sync. In case of integrity - * sync we have to keep going because someone - * may be concurrently dirtying pages, and we - * might have synced a lot of newly appeared - * dirty pages, but have not synced all of the - * old dirty pages. - */ - done = 1; - break; + if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are + * not doing integrity sync. In case of + * integrity sync we have to keep going + * because someone may be concurrently + * dirtying pages, and we might have + * synced a lot of newly appeared dirty + * pages, but have not synced all of the + * old dirty pages. + */ + done = 1; + break; + } } if (wbc->nonblocking && bdi_write_congested(bdi)) { -- cgit v1.2.3-18-g5258 From 9480c53e9b2aa13a06283ffb96bb8f1873ac4e9a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 11 Feb 2009 13:04:41 -0800 Subject: mm: rearrange exit_mmap() to unlock before arch_exit_mmap Christophe Saout reported [in precursor to: http://marc.info/?l=linux-kernel&m=123209902707347&w=4]: > Note that I also some a different issue with CONFIG_UNEVICTABLE_LRU. > Seems like Xen tears down current->mm early on process termination, so > that __get_user_pages in exit_mmap causes nasty messages when the > process had any mlocked pages. (in fact, it somehow manages to get into > the swapping code and produces a null pointer dereference trying to get > a swap token) Jeremy explained: Yes. In the normal case under Xen, an in-use pagetable is "pinned", meaning that it is RO to the kernel, and all updates must go via hypercall (or writes are trapped and emulated, which is much the same thing). 
An unpinned pagetable is not currently in use by any process, and can be directly accessed as normal RW pages. As an optimisation at process exit time, we unpin the pagetable as early as possible (switching the process to init_mm), so that all the normal pagetable teardown can happen with direct memory accesses. This happens in exit_mmap() -> arch_exit_mmap(). The munlocking happens a few lines below. The obvious thing to do would be to move arch_exit_mmap() to below the munlock code, but I think we'd want to call it even if mm->mmap is NULL, just to be on the safe side. Thus, this patch: exit_mmap() needs to unlock any locked vmas before calling arch_exit_mmap, as the latter may switch the current mm to init_mm, which would cause the former to fail. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Lee Schermerhorn Cc: Christophe Saout Cc: Keir Fraser Cc: Christophe Saout Cc: Alex Williamson Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index eb1270bebe6..00ced3ee49a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2084,12 +2084,8 @@ void exit_mmap(struct mm_struct *mm) unsigned long end; /* mm's last user has gone, and its about to be pulled down */ - arch_exit_mmap(mm); mmu_notifier_release(mm); - if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */ - return; - if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -2098,7 +2094,13 @@ void exit_mmap(struct mm_struct *mm) vma = vma->vm_next; } } + + arch_exit_mmap(mm); + vma = mm->mmap; + if (!vma) /* Can happen if dup_mmap() received an OOM */ + return; + lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); -- cgit v1.2.3-18-g5258 From b1aabecd55931ee754f6a913969516b26a0e682e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2009 15:21:44 +0200 Subject: mm: Export symbol ksize() Commit 7b2cd92adc5430b0c1adeb120971852b4ea1ab08 ("crypto: api - Fix zeroing on free") added modular user of ksize(). Export that to fix crypto.ko compilation. Cc: Herbert Xu Signed-off-by: Kirill A. Shutemov Signed-off-by: Pekka Enberg --- mm/slab.c | 1 + mm/slob.c | 1 + mm/slub.c | 1 + 3 files changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index ddc41f337d5..4d00855629c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4457,3 +4457,4 @@ size_t ksize(const void *objp) return obj_size(virt_to_cache(objp)); } +EXPORT_SYMBOL(ksize); diff --git a/mm/slob.c b/mm/slob.c index bf7e8fc3aed..52bc8a2bd9e 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -521,6 +521,7 @@ size_t ksize(const void *block) } else return sp->page.private; } +EXPORT_SYMBOL(ksize); struct kmem_cache { unsigned int size, align; diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a2..0280eee6cf3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2736,6 +2736,7 @@ size_t ksize(const void *object) */ return s->size; } +EXPORT_SYMBOL(ksize); void kfree(const void *x) { -- cgit v1.2.3-18-g5258 From 3a4c6800f31ea8395628af5e7e490270ee5d0585 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 12 Feb 2009 04:34:23 +0100 Subject: Fix page writeback thinko, causing Berkeley DB slowdown A bug was introduced into write_cache_pages cyclic writeout by commit 31a12666d8f0c22235297e1c1575f82061480029 ("mm: write_cache_pages cyclic fix"). The intention (and comments) is that we should cycle back and look for more dirty pages at the beginning of the file if there is no more work to be done. 
But the !done condition was dropped from the test. This means that any time the page writeout loop breaks (eg. due to nr_to_write == 0), we will set index to 0, then goto again. This will set done_index to index, then find done is set, so will proceed to the end of the function. When updating mapping->writeback_index for cyclic writeout, we now use done_index == 0, so we're always cycling back to 0. This seemed to be causing random mmap writes (slapadd and iozone) to start writing more pages from the LRU and writeout would slowdown, and caused bugzilla entry http://bugzilla.kernel.org/show_bug.cgi?id=12604 about Berkeley DB slowing down dramatically. With this patch, iozone random write performance is increased nearly 5x on my system (iozone -B -r 4k -s 64k -s 512m -s 1200m on ext2). Signed-off-by: Nick Piggin Reported-and-tested-by: Jan Kara Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6106a5c7ed4..3c84128596b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1079,7 +1079,7 @@ continue_unlock: pagevec_release(&pvec); cond_resched(); } - if (!cycled) { + if (!cycled && !done) { /* * range_cyclic: * We hit the last page and there is more work to be done: wrap -- cgit v1.2.3-18-g5258 From 93dbb393503d53cd226e5e1f0088fe8f4dbaa2b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 16 Feb 2009 10:25:40 +0100 Subject: block: fix bad definition of BIO_RW_SYNC We can't OR shift values, so get rid of BIO_RW_SYNC and use BIO_RW_SYNCIO and BIO_RW_UNPLUG explicitly. This brings back the behaviour from before 213d9417fec62ef4c3675621b9364a667954d4dd. Signed-off-by: Jens Axboe --- mm/page_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index dc6ce0afbde..3023c475e04 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -111,7 +111,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= (1 << BIO_RW_SYNC); + rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); -- cgit v1.2.3-18-g5258 From c296861291669f305deef19b78042330d7135017 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 18 Feb 2009 14:48:12 -0800 Subject: vmalloc: add __get_vm_area_caller() We have get_vm_area_caller() and __get_vm_area() but not __get_vm_area_caller() On powerpc, I use __get_vm_area() to separate the ranges of addresses given to vmalloc vs. ioremap (various good reasons for that) so in order to be able to implement the new caller tracking in /proc/vmallocinfo, I need a "_caller" variant of it. 
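The caller-tracking idea behind the _caller variants can be sketched in plain C (illustrative only, names made up, relying on the GCC/Clang __builtin_return_address() builtin): the exported wrapper records its own return address and hands it to the worker, while the _caller variant accepts one explicitly so diagnostics can attribute each area to its real caller.

#include <stdio.h>
#include <stdlib.h>

struct area {
        void *ptr;
        size_t size;
        void *caller;
};

static struct area last_area;

static void *get_area_caller(size_t size, void *caller)
{
        void *p = malloc(size);
        last_area.ptr = p;
        last_area.size = size;
        last_area.caller = caller;
        return p;
}

static void *get_area(size_t size)
{
        /* like get_vm_area(): fill in the caller automatically */
        return get_area_caller(size, __builtin_return_address(0));
}

int main(void)
{
        void *p = get_area(4096);
        printf("%p allocated %zu bytes, caller %p\n",
               last_area.ptr, last_area.size, last_area.caller);
        free(p);
        return 0;
}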
(akpm: needed for ongoing powerpc development, so merge it early) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Benjamin Herrenschmidt Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 75f49d312e8..4dd2636d0b9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1106,6 +1106,14 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, } EXPORT_SYMBOL_GPL(__get_vm_area); +struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end, + void *caller) +{ + return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + caller); +} + /** * get_vm_area - reserve a contiguous kernel virtual area * @size: size of the area -- cgit v1.2.3-18-g5258 From 1cf6e7d83bf334cc5916137862c920a97aabc018 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Feb 2009 14:48:18 -0800 Subject: mm: task dirty accounting fix YAMAMOTO-san noticed that task_dirty_inc doesn't seem to be called properly for cases where set_page_dirty is not used to dirty a page (eg. mark_buffer_dirty). Additionally, there is some inconsistency about when task_dirty_inc is called. It is used for dirty balancing, however it even gets called for __set_page_dirty_no_writeback. So rather than increment it in a set_page_dirty wrapper, move it down to exactly where the dirty page accounting stats are incremented. Cc: YAMAMOTO Takashi Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3c84128596b..74dc57c7434 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -240,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi) } EXPORT_SYMBOL_GPL(bdi_writeout_inc); -static inline void task_dirty_inc(struct task_struct *tsk) +void task_dirty_inc(struct task_struct *tsk) { prop_inc_single(&vm_dirties, &tsk->dirties); } @@ -1230,6 +1230,7 @@ int __set_page_dirty_nobuffers(struct page *page) __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, @@ -1262,7 +1263,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage); * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. 
*/ -static int __set_page_dirty(struct page *page) +int set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -1280,14 +1281,6 @@ static int __set_page_dirty(struct page *page) } return 0; } - -int set_page_dirty(struct page *page) -{ - int ret = __set_page_dirty(page); - if (ret) - task_dirty_inc(current); - return ret; -} EXPORT_SYMBOL(set_page_dirty); /* -- cgit v1.2.3-18-g5258 From f2dbcfa738368c8a40d4a5f0b65dc9879577cb21 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 18 Feb 2009 14:48:32 -0800 Subject: mm: clean up for early_pfn_to_nid() What's happening is that the assertion in mm/page_alloc.c:move_freepages() is triggering: BUG_ON(page_zone(start_page) != page_zone(end_page)); Once I knew this is what was happening, I added some annotations: if (unlikely(page_zone(start_page) != page_zone(end_page))) { printk(KERN_ERR "move_freepages: Bogus zones: " "start_page[%p] end_page[%p] zone[%p]\n", start_page, end_page, zone); printk(KERN_ERR "move_freepages: " "start_zone[%p] end_zone[%p]\n", page_zone(start_page), page_zone(end_page)); printk(KERN_ERR "move_freepages: " "start_pfn[0x%lx] end_pfn[0x%lx]\n", page_to_pfn(start_page), page_to_pfn(end_page)); printk(KERN_ERR "move_freepages: " "start_nid[%d] end_nid[%d]\n", page_to_nid(start_page), page_to_nid(end_page)); ... And here's what I got: move_freepages: Bogus zones: start_page[2207d0000] end_page[2207dffc0] zone[fffff8103effcb00] move_freepages: start_zone[fffff8103effcb00] end_zone[fffff8003fffeb00] move_freepages: start_pfn[0x81f600] end_pfn[0x81f7ff] move_freepages: start_nid[1] end_nid[0] My memory layout on this box is: [ 0.000000] Zone PFN ranges: [ 0.000000] Normal 0x00000000 -> 0x0081ff5d [ 0.000000] Movable zone start PFN for each node [ 0.000000] early_node_map[8] active PFN ranges [ 0.000000] 0: 0x00000000 -> 0x00020000 [ 0.000000] 1: 0x00800000 -> 0x0081f7ff [ 0.000000] 1: 0x0081f800 -> 0x0081fe50 [ 0.000000] 1: 0x0081fed1 -> 0x0081fed8 [ 0.000000] 1: 0x0081feda -> 0x0081fedb [ 0.000000] 1: 0x0081fedd -> 0x0081fee5 [ 0.000000] 1: 0x0081fee7 -> 0x0081ff51 [ 0.000000] 1: 0x0081ff59 -> 0x0081ff5d So it's a block move in that 0x81f600-->0x81f7ff region which triggers the problem. This patch: Declaration of early_pfn_to_nid() is scattered over per-arch include files, and it seems it's complicated to know when the declaration is used. I think it makes fix-for-memmap-init not easy. This patch moves all declaration to include/linux/mm.h After this, if !CONFIG_NODES_POPULATES_NODE_MAP && !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -> Use static definition in include/linux/mm.h else if !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -> Use generic definition in mm/page_alloc.c else -> per-arch back end function will be called. 
Signed-off-by: KAMEZAWA Hiroyuki Tested-by: KOSAKI Motohiro Reported-by: David Miller Cc: Mel Gorman Cc: Heiko Carstens Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5675b307385..c5dd74602ef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2989,7 +2989,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) * was used and there are no special requirements, this is a convenient * alternative */ -int __meminit early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn) { int i; @@ -3005,6 +3005,12 @@ int __meminit early_pfn_to_nid(unsigned long pfn) } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + return __early_pfn_to_nid(pfn); +} + + /* Basic iterator support to walk early_node_map[] */ #define for_each_active_range_index_in_nid(i, nid) \ for (i = first_active_region_index_in_nid(nid); i != -1; \ -- cgit v1.2.3-18-g5258 From cc2559bccc72767cb446f79b071d96c30c26439b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 18 Feb 2009 14:48:33 -0800 Subject: mm: fix memmap init for handling memory hole Now, early_pfn_in_nid(PFN, NID) may returns false if PFN is a hole. and memmap initialization was not done. This was a trouble for sparc boot. To fix this, the PFN should be initialized and marked as PG_reserved. This patch changes early_pfn_in_nid() return true if PFN is a hole. Signed-off-by: KAMEZAWA Hiroyuki Reported-by: David Miller Tested-by: KOSAKI Motohiro Cc: Mel Gorman Cc: Heiko Carstens Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5dd74602ef..5c44ed49ca9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3000,16 +3000,33 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) if (start_pfn <= pfn && pfn < end_pfn) return early_node_map[i].nid; } - - return 0; + /* This is a memory hole */ + return -1; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ int __meminit early_pfn_to_nid(unsigned long pfn) { - return __early_pfn_to_nid(pfn); + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0) + return nid; + /* just returns 0 */ + return 0; } +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0 && nid != node) + return false; + return true; +} +#endif /* Basic iterator support to walk early_node_map[] */ #define for_each_active_range_index_in_nid(i, nid) \ -- cgit v1.2.3-18-g5258 From 3ef0e5ba467366125f04b423f4638baca54a4fc1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 20 Feb 2009 15:38:41 -0800 Subject: slab: introduce kzfree() kzfree() is a wrapper for kfree() that additionally zeroes the underlying memory before releasing it to the slab allocator. Currently there is code which memset()s the memory region of an object before releasing it back to the slab allocator to make sure security-sensitive data are really zeroed out after use. These callsites can then just use kzfree() which saves some code, makes users greppable and allows for a stupid destructor that isn't necessarily aware of the actual object size. 
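A user-space sketch of the same idea (illustrative only): malloc_usable_size(), a GNU extension, stands in for ksize(). Note that in user space an optimizing compiler may drop a memset() that immediately precedes free(), so explicit_bzero() would be the robust choice there; plain memset() keeps the sketch readable.

#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* zero the whole usable allocation before freeing, so the caller
 * does not need to know the object size */
static void zfree(void *p)
{
        if (!p)
                return;
        memset(p, 0, malloc_usable_size(p));
        free(p);
}

int main(void)
{
        char *secret = malloc(32);
        if (!secret)
                return 1;
        strcpy(secret, "hunter2");
        zfree(secret);          /* wiped, then freed */
        puts("done");
        return 0;
}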
Signed-off-by: Johannes Weiner Reviewed-by: Pekka Enberg Cc: Matt Mackall Acked-by: Christoph Lameter Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index cb00b748ce4..37eaccdf305 100644 --- a/mm/util.c +++ b/mm/util.c @@ -129,6 +129,26 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) } EXPORT_SYMBOL(krealloc); +/** + * kzfree - like kfree but zero memory + * @p: object to free memory of + * + * The memory of the object @p points to is zeroed before freed. + * If @p is %NULL, kzfree() does nothing. + */ +void kzfree(const void *p) +{ + size_t ks; + void *mem = (void *)p; + + if (unlikely(ZERO_OR_NULL_PTR(mem))) + return; + ks = ksize(mem); + memset(mem, 0, ks); + kfree(mem); +} +EXPORT_SYMBOL(kzfree); + /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate -- cgit v1.2.3-18-g5258 From f6fcba7014f9cc535fa75ef98c008b24e49e2212 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 15:38:48 -0800 Subject: vmalloc: call flush_cache_vunmap() from unmap_kernel_range() Impact: proper vcache flush on unmap_kernel_range() flush_cache_vunmap() should be called before pages are unmapped. Add a call to it in unmap_kernel_range(). Signed-off-by: Tejun Heo Acked-by: Nick Piggin Acked-by: David S. Miller Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4dd2636d0b9..903cad46e79 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1012,6 +1012,8 @@ void __init vmalloc_init(void) void unmap_kernel_range(unsigned long addr, unsigned long size) { unsigned long end = addr + size; + + flush_cache_vunmap(addr, end); vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } -- cgit v1.2.3-18-g5258 From a1bb7d61233ba5fb5cd865f907a9ddcc8f8c02bd Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Sat, 14 Feb 2009 02:01:14 +0100 Subject: PM/hibernate: fix "swap breaks after hibernation failures" http://bugzilla.kernel.org/show_bug.cgi?id=12239 The image writing code dropped a reference to the current swap device. This doesn't show up if the hibernation succeeds - because it doesn't affect the image which gets resumed. But it means multiple _failed_ hibernations end up freeing the swap device while it is still use! swsusp_write() finds the block device for the swap file using swap_type_of(). It then uses blkdev_get() / blkdev_put() to open and close the block device. Unfortunately, blkdev_get() assumes ownership of the inode of the block_device passed to it. So blkdev_put() calls iput() on the inode. This is by design and other callers expect this behaviour. The fix is for swap_type_of() to take a reference on the inode using bdget(). Signed-off-by: Alan Jenkins Signed-off-by: Rafael J. 
Wysocki Cc: Len Brown Cc: Greg KH Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 7e6304dfafa..312fafe0ab6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -635,7 +635,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) if (!bdev) { if (bdev_p) - *bdev_p = sis->bdev; + *bdev_p = bdget(sis->bdev->bd_dev); spin_unlock(&swap_lock); return i; @@ -647,7 +647,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) - *bdev_p = sis->bdev; + *bdev_p = bdget(sis->bdev->bd_dev); spin_unlock(&swap_lock); bdput(bdev); -- cgit v1.2.3-18-g5258 From 3049103ddfc9aac111916bd2f39ac6976c431517 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 14 Feb 2009 02:03:08 +0100 Subject: swsusp: dont fiddle with swappiness sc.swappiness is not used in the swsusp memory shrinking path, do not set it. Signed-off-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Signed-off-by: Rafael J. Wysocki Cc: Len Brown Cc: Greg KH Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 9a27c44aa32..550e8695070 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2112,7 +2112,6 @@ unsigned long shrink_all_memory(unsigned long nr_pages) .may_swap = 0, .swap_cluster_max = nr_pages, .may_writepage = 1, - .swappiness = vm_swappiness, .isolate_pages = isolate_pages_global, }; @@ -2146,10 +2145,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) int prio; /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) { + if (pass > 2) sc.may_swap = 1; - sc.swappiness = 100; - } for (prio = DEF_PRIORITY; prio >= 0; prio--) { unsigned long nr_to_scan = nr_pages - ret; -- cgit v1.2.3-18-g5258 From 0cb57258fe01e9b21076b6a15b6aec7a24168228 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 14 Feb 2009 02:04:10 +0100 Subject: swsusp: clean up shrink_all_zones() Move local variables to innermost possible scopes and use local variables to cache calculations/reads done more than once. No change in functionality (intended). Signed-off-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Signed-off-by: Rafael J. 
Wysocki Cc: Len Brown Cc: Greg KH Acked-by: Pavel Machek Signed-off-by: Linus Torvalds --- mm/vmscan.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 550e8695070..6177e3bcd66 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2057,31 +2057,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, int pass, struct scan_control *sc) { struct zone *zone; - unsigned long nr_to_scan, ret = 0; - enum lru_list l; + unsigned long ret = 0; for_each_zone(zone) { + enum lru_list l; if (!populated_zone(zone)) continue; - if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; for_each_evictable_lru(l) { + enum zone_stat_item ls = NR_LRU_BASE + l; + unsigned long lru_pages = zone_page_state(zone, ls); + /* For pass = 0, we don't shrink the active list */ - if (pass == 0 && - (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) + if (pass == 0 && (l == LRU_ACTIVE_ANON || + l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_scan += - (zone_page_state(zone, NR_LRU_BASE + l) - >> prio) + 1; + zone->lru[l].nr_scan += (lru_pages >> prio) + 1; if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + unsigned long nr_to_scan; + zone->lru[l].nr_scan = 0; - nr_to_scan = min(nr_pages, - zone_page_state(zone, - NR_LRU_BASE + l)); + nr_to_scan = min(nr_pages, lru_pages); ret += shrink_list(l, nr_to_scan, zone, sc, prio); if (ret >= nr_pages) @@ -2089,7 +2089,6 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, } } } - return ret; } -- cgit v1.2.3-18-g5258 From 0b0a0806b0d8635e046bf533225a25903b1cddce Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 24 Feb 2009 20:51:52 +0000 Subject: shmem: fix shared anonymous accounting Each time I exit Firefox, /proc/meminfo's Committed_AS goes down almost 400 kB: OVERCOMMIT_NEVER would be allowing overcommits it should prohibit. Commit fc8744adc870a8d4366908221508bb113d8b72ee "Stop playing silly games with the VM_ACCOUNT flag" changed shmem_file_setup() to set the shmem file's VM_ACCOUNT flag according to VM_NORESERVE not being set in the vma flags; but did so only _after_ the shmem_acct_size(flags, size) call which is expected to pre-account a shared anonymous object. It's all clearer if we switch shmem.c over to use VM_NORESERVE throughout in place of !VM_ACCOUNT. But I very nearly sent in a patch which mistakenly removed the accounting from tmpfs files: shmem_get_inode()'s memset was good for not setting VM_ACCOUNT, but now it needs to set VM_NORESERVE. Rather than setting that by default, then perhaps clearing it again in shmem_file_setup(), let's pass it as a flag to shmem_get_inode(): that allows us to remove the #ifdef CONFIG_SHMEM from shmem_file_setup(). Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/shmem.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 19d566ccdee..4103a239ce8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -169,13 +169,13 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { - return (flags & VM_ACCOUNT) ? - security_vm_enough_memory_kern(VM_ACCT(size)) : 0; + return (flags & VM_NORESERVE) ? 
+ 0 : security_vm_enough_memory_kern(VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { - if (flags & VM_ACCOUNT) + if (!(flags & VM_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } @@ -187,13 +187,13 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) */ static inline int shmem_acct_block(unsigned long flags) { - return (flags & VM_ACCOUNT) ? - 0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)); + return (flags & VM_NORESERVE) ? + security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { - if (!(flags & VM_ACCOUNT)) + if (flags & VM_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); } @@ -1515,8 +1515,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static struct inode * -shmem_get_inode(struct super_block *sb, int mode, dev_t dev) +static struct inode *shmem_get_inode(struct super_block *sb, int mode, + dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; @@ -1537,6 +1537,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); switch (mode & S_IFMT) { @@ -1779,9 +1780,10 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) static int shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev); + struct inode *inode; int error = -ENOSPC; + inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, NULL, NULL, NULL); @@ -1920,7 +1922,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (len > PAGE_CACHE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); + inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); if (!inode) return -ENOSPC; @@ -2332,7 +2334,7 @@ static int shmem_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif - inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0); + inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (!inode) goto failed; inode->i_uid = sbinfo->uid; @@ -2574,12 +2576,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page) return 0; } -#define shmem_file_operations ramfs_file_operations -#define shmem_vm_ops generic_file_vm_ops -#define shmem_get_inode ramfs_get_inode -#define shmem_acct_size(a, b) 0 -#define shmem_unacct_size(a, b) do {} while (0) -#define SHMEM_MAX_BYTES LLONG_MAX +#define shmem_vm_ops generic_file_vm_ops +#define shmem_file_operations ramfs_file_operations +#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) +#define shmem_acct_size(flags, size) 0 +#define shmem_unacct_size(flags, size) do {} while (0) +#define SHMEM_MAX_BYTES LLONG_MAX #endif /* CONFIG_SHMEM */ @@ -2589,7 +2591,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc//maps * @size: size to be set for the file - * @flags: vm_flags + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) { @@ -2623,13 +2625,10 @@ struct file *shmem_file_setup(char 
*name, loff_t size, unsigned long flags) goto put_dentry; error = -ENOSPC; - inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); + inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) goto close_file; -#ifdef CONFIG_SHMEM - SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT; -#endif d_instantiate(dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ -- cgit v1.2.3-18-g5258 From 7766970cc13e9071b356b1f2a48a9eb8675bfcce Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 27 Feb 2009 14:03:03 -0800 Subject: mm: vmap fix overflow The new vmap allocator can wrap the address and get confused in the case of large allocations or VMALLOC_END near the end of address space. Problem reported by Christoph Hellwig on a 32-bit XFS workload. Signed-off-by: Nick Piggin Reported-by: Christoph Hellwig Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 903cad46e79..ed3705e4b83 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -323,6 +323,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long addr; int purged = 0; + BUG_ON(!size); BUG_ON(size & ~PAGE_MASK); va = kmalloc_node(sizeof(struct vmap_area), @@ -334,6 +335,9 @@ retry: addr = ALIGN(vstart, align); spin_lock(&vmap_area_lock); + if (addr + size - 1 < addr) + goto overflow; + /* XXX: could have a last_hole cache */ n = vmap_area_root.rb_node; if (n) { @@ -365,6 +369,8 @@ retry: while (addr + size > first->va_start && addr + size <= vend) { addr = ALIGN(first->va_end + PAGE_SIZE, align); + if (addr + size - 1 < addr) + goto overflow; n = rb_next(&first->rb_node); if (n) @@ -375,6 +381,7 @@ retry: } found: if (addr + size > vend) { +overflow: spin_unlock(&vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); -- cgit v1.2.3-18-g5258 From cbb766766f3f2f6d9326c561b1020590642c6e39 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 27 Feb 2009 14:03:04 -0800 Subject: mm: fix lazy vmap purging (use-after-free error) I just got this new warning from kmemcheck: WARNING: kmemcheck: Caught 32-bit read from freed memory (c7806a60) a06a80c7ecde70c1a04080c700000000a06709c1000000000000000000000000 f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f f ^ Pid: 0, comm: swapper Not tainted (2.6.29-rc4 #230) EIP: 0060:[] EFLAGS: 00000286 CPU: 0 EIP is at __purge_vmap_area_lazy+0x117/0x140 EAX: 00070f43 EBX: c7806a40 ECX: c1677080 EDX: 00027b66 ESI: 00002001 EDI: c170df0c EBP: c170df00 ESP: c178830c DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 80050033 CR2: c7806b14 CR3: 01775000 CR4: 00000690 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: 00004000 DR7: 00000000 [] free_unmap_vmap_area_noflush+0x6e/0x70 [] remove_vm_area+0x2a/0x70 [] __vunmap+0x45/0xe0 [] vunmap+0x1e/0x30 [] text_poke+0x95/0x150 [] alternatives_smp_unlock+0x49/0x60 [] alternative_instructions+0x11b/0x124 [] check_bugs+0xbd/0xdc [] start_kernel+0x2ed/0x360 [] __init_begin+0x9e/0xa9 [] 0xffffffff It happened here: $ addr2line -e vmlinux -i c1096df7 mm/vmalloc.c:540 Code: list_for_each_entry(va, &valist, purge_list) __free_vmap_area(va); It's this instruction: mov 0x20(%ebx),%edx Which corresponds to a dereference of va->purge_list.next: (gdb) p ((struct vmap_area *) 0)->purge_list.next Cannot access memory at address 0x20 It seems that we should use "safe" list traversal here, as the element is freed inside the loop. 
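The bug and the fix in miniature, as a self-contained user-space sketch: when the loop body frees the current element, the next pointer has to be loaded before the free, which is exactly what the _safe list-traversal variant's extra cursor does.

#include <stdio.h>
#include <stdlib.h>

struct node {
        int val;
        struct node *next;
};

static void free_all(struct node *head)
{
        struct node *n, *next;

        for (n = head; n; n = next) {
                next = n->next;   /* read before n is freed */
                printf("freeing %d\n", n->val);
                free(n);
        }
}

int main(void)
{
        struct node *head = NULL;

        for (int i = 0; i < 3; i++) {
                struct node *n = malloc(sizeof(*n));
                if (!n)
                        return 1;
                n->val = i;
                n->next = head;
                head = n;
        }
        free_all(head);
        return 0;
}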
Please verify that this is the right fix. Acked-by: Nick Piggin Signed-off-by: Vegard Nossum Cc: Pekka Enberg Cc: Ingo Molnar Cc: "Paul E. McKenney" Cc: [2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ed3705e4b83..520a7598026 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -505,6 +505,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, static DEFINE_SPINLOCK(purge_lock); LIST_HEAD(valist); struct vmap_area *va; + struct vmap_area *n_va; int nr = 0; /* @@ -544,7 +545,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (nr) { spin_lock(&vmap_area_lock); - list_for_each_entry(va, &valist, purge_list) + list_for_each_entry_safe(va, n_va, &valist, purge_list) __free_vmap_area(va); spin_unlock(&vmap_area_lock); } -- cgit v1.2.3-18-g5258 From f272b7bc447553410dde691aa31fc531adf9c175 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 12 Mar 2009 14:31:36 -0700 Subject: memcg: use correct scan number at reclaim Even when page reclaim is under mem_cgroup, # of scan page is determined by status of global LRU. Fix that. Signed-off-by: KOSAKI Motohiro Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 6177e3bcd66..e8951714165 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1469,7 +1469,7 @@ static void shrink_zone(int priority, struct zone *zone, int file = is_file_lru(l); int scan; - scan = zone_page_state(zone, NR_LRU_BASE + l); + scan = zone_nr_pages(zone, sc, l); if (priority) { scan >>= priority; scan = (scan * percent[file]) / 100; -- cgit v1.2.3-18-g5258 From 1d885526f2f3fffacee2ecb541270bd00168adff Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Fri, 13 Mar 2009 13:52:00 -0700 Subject: vmscan: pgmoved should be cleared after updating recent_rotated pgmoved should be cleared after updating recent_rotated. Signed-off-by: Daisuke Nishimura Cc: Rik van Riel Cc: Lee Schermerhorn Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index e8951714165..56ddf41149e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1262,7 +1262,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * Move the pages to the [file or anon] inactive list. */ pagevec_init(&pvec, 1); - pgmoved = 0; lru = LRU_BASE + file * LRU_FILE; spin_lock_irq(&zone->lru_lock); @@ -1274,6 +1273,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, */ reclaim_stat->recent_rotated[!!file] += pgmoved; + pgmoved = 0; while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); -- cgit v1.2.3-18-g5258