diff options
author | Trond Myklebust <Trond.Myklebust@netapp.com> | 2006-06-24 08:41:41 -0400 |
---|---|---|
committer | Trond Myklebust <Trond.Myklebust@netapp.com> | 2006-06-24 13:07:53 -0400 |
commit | 816724e65c72a90a44fbad0ef0b59b186c85fa90 (patch) | |
tree | 421fa29aedff988e392f92780637553e275d37a0 /mm | |
parent | 70ac4385a13f78bc478f26d317511893741b05bd (diff) | |
parent | d384ea691fe4ea8c2dd5b9b8d9042eb181776f18 (diff) |
Merge branch 'master' of /home/trondmy/kernel/linux-2.6/
Conflicts:
fs/nfs/inode.c
fs/super.c
Fix conflicts between patch 'NFS: Split fs/nfs/inode.c' and patch
'VFS: Permit filesystem to override root dentry on mount'
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 4 | ||||
-rw-r--r-- | mm/filemap.c | 183 | ||||
-rw-r--r-- | mm/filemap.h | 6 | ||||
-rw-r--r-- | mm/fremap.c | 9 | ||||
-rw-r--r-- | mm/hugetlb.c | 282 | ||||
-rw-r--r-- | mm/memory.c | 125 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 27 | ||||
-rw-r--r-- | mm/mempolicy.c | 36 | ||||
-rw-r--r-- | mm/migrate.c | 1058 | ||||
-rw-r--r-- | mm/mmap.c | 12 | ||||
-rw-r--r-- | mm/mprotect.c | 37 | ||||
-rw-r--r-- | mm/msync.c | 3 | ||||
-rw-r--r-- | mm/oom_kill.c | 9 | ||||
-rw-r--r-- | mm/page-writeback.c | 3 | ||||
-rw-r--r-- | mm/page_alloc.c | 184 | ||||
-rw-r--r-- | mm/pdflush.c | 3 | ||||
-rw-r--r-- | mm/rmap.c | 107 | ||||
-rw-r--r-- | mm/shmem.c | 18 | ||||
-rw-r--r-- | mm/slab.c | 249 | ||||
-rw-r--r-- | mm/sparse.c | 22 | ||||
-rw-r--r-- | mm/swap.c | 42 | ||||
-rw-r--r-- | mm/swapfile.c | 43 | ||||
-rw-r--r-- | mm/truncate.c | 22 | ||||
-rw-r--r-- | mm/vmalloc.c | 122 | ||||
-rw-r--r-- | mm/vmscan.c | 240 |
25 files changed, 1816 insertions, 1030 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 332f5c29b53..66e65ab3942 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS # config MIGRATION bool "Page migration" - def_bool y if NUMA - depends on SWAP && NUMA + def_bool y + depends on NUMA help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful for diff --git a/mm/filemap.c b/mm/filemap.c index fd57442186c..807a463fd5e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/compiler.h> #include <linux/fs.h> +#include <linux/uaccess.h> #include <linux/aio.h> #include <linux/capability.h> #include <linux/kernel_stat.h> @@ -38,7 +39,6 @@ */ #include <linux/buffer_head.h> /* for generic_osync_inode */ -#include <asm/uaccess.h> #include <asm/mman.h> static ssize_t @@ -171,15 +171,17 @@ static int sync_page(void *word) } /** - * filemap_fdatawrite_range - start writeback against all of a mapping's - * dirty pages that lie within the byte offsets <start, end> + * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * + * Start writeback against all of a mapping's dirty pages that lie + * within the byte offsets <start, end> inclusive. + * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory * cleansing writeback. The difference between + * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. */ @@ -190,8 +192,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = mapping->nrpages * 2, - .start = start, - .end = end, + .range_start = start, + .range_end = end, }; if (!mapping_cap_writeback_dirty(mapping)) @@ -204,7 +206,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, static inline int __filemap_fdatawrite(struct address_space *mapping, int sync_mode) { - return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); + return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); } int filemap_fdatawrite(struct address_space *mapping) @@ -219,7 +221,10 @@ static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } -/* +/** + * filemap_flush - mostly a non-blocking flush + * @mapping: target address_space + * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. */ @@ -229,7 +234,12 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); -/* +/** + * wait_on_page_writeback_range - wait for writeback to complete + * @mapping: target address_space + * @start: beginning page index + * @end: ending page index + * * Wait for writeback to complete against pages indexed by start->end * inclusive */ @@ -276,7 +286,13 @@ int wait_on_page_writeback_range(struct address_space *mapping, return ret; } -/* +/** + * sync_page_range - write and wait on all pages in the passed range + * @inode: target inode + * @mapping: target address_space + * @pos: beginning offset in pages to write + * @count: number of bytes to write + * * Write and wait upon all the pages in the passed range. This is a "data * integrity" operation. It waits upon in-flight writeout before starting and * waiting upon new writeout. If there was an IO error, return it. @@ -305,7 +321,13 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, } EXPORT_SYMBOL(sync_page_range); -/* +/** + * sync_page_range_nolock + * @inode: target inode + * @mapping: target address_space + * @pos: beginning offset in pages to write + * @count: number of bytes to write + * * Note: Holding i_mutex across sync_page_range_nolock is not a good idea * as it forces O_SYNC writers to different parts of the same file * to be serialised right until io completion. @@ -329,10 +351,11 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, EXPORT_SYMBOL(sync_page_range_nolock); /** - * filemap_fdatawait - walk the list of under-writeback pages of the given - * address space and wait for all of them. - * + * filemap_fdatawait - wait for all under-writeback pages to complete * @mapping: address space structure to wait for + * + * Walk the list of under-writeback pages of the given address space + * and wait for all of them. */ int filemap_fdatawait(struct address_space *mapping) { @@ -368,7 +391,12 @@ int filemap_write_and_wait(struct address_space *mapping) } EXPORT_SYMBOL(filemap_write_and_wait); -/* +/** + * filemap_write_and_wait_range - write out & wait on a file range + * @mapping: the address_space for the pages + * @lstart: offset in bytes where the range starts + * @lend: offset in bytes where the range ends (inclusive) + * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that `lend' is inclusive (describes the last byte to be written) so @@ -394,8 +422,14 @@ int filemap_write_and_wait_range(struct address_space *mapping, return err; } -/* - * This function is used to add newly allocated pagecache pages: +/** + * add_to_page_cache - add newly allocated pagecache pages + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * + * This function is used to add newly allocated pagecache pages; * the page is new, so we can just run SetPageLocked() against it. * The other page state flags were set by rmqueue(). * @@ -422,7 +456,6 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, } return error; } - EXPORT_SYMBOL(add_to_page_cache); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, @@ -489,8 +522,7 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr) EXPORT_SYMBOL(wait_on_page_bit); /** - * unlock_page() - unlock a locked page - * + * unlock_page - unlock a locked page * @page: the page * * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). @@ -513,8 +545,9 @@ void fastcall unlock_page(struct page *page) } EXPORT_SYMBOL(unlock_page); -/* - * End writeback against a page. +/** + * end_page_writeback - end writeback against a page + * @page: the page */ void end_page_writeback(struct page *page) { @@ -527,10 +560,11 @@ void end_page_writeback(struct page *page) } EXPORT_SYMBOL(end_page_writeback); -/* - * Get a lock on the page, assuming we need to sleep to get it. +/** + * __lock_page - get a lock on the page, assuming we need to sleep to get it + * @page: the page to lock * - * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some + * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some * random driver's requestfn sets TASK_RUNNING, we could busywait. However * chances are that on the second loop, the block layer's plug list is empty, * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. @@ -544,8 +578,12 @@ void fastcall __lock_page(struct page *page) } EXPORT_SYMBOL(__lock_page); -/* - * a rather lightweight function, finding and getting a reference to a +/** + * find_get_page - find and get a page reference + * @mapping: the address_space to search + * @offset: the page index + * + * A rather lightweight function, finding and getting a reference to a * hashed page atomically. */ struct page * find_get_page(struct address_space *mapping, unsigned long offset) @@ -559,11 +597,14 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset) read_unlock_irq(&mapping->tree_lock); return page; } - EXPORT_SYMBOL(find_get_page); -/* - * Same as above, but trylock it instead of incrementing the count. +/** + * find_trylock_page - find and lock a page + * @mapping: the address_space to search + * @offset: the page index + * + * Same as find_get_page(), but trylock it instead of incrementing the count. */ struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) { @@ -576,12 +617,10 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs read_unlock_irq(&mapping->tree_lock); return page; } - EXPORT_SYMBOL(find_trylock_page); /** * find_lock_page - locate, pin and lock a pagecache page - * * @mapping: the address_space to search * @offset: the page index * @@ -617,12 +656,10 @@ repeat: read_unlock_irq(&mapping->tree_lock); return page; } - EXPORT_SYMBOL(find_lock_page); /** * find_or_create_page - locate or add a pagecache page - * * @mapping: the page's address_space * @index: the page's index into the mapping * @gfp_mask: page allocation mode @@ -663,7 +700,6 @@ repeat: page_cache_release(cached_page); return page; } - EXPORT_SYMBOL(find_or_create_page); /** @@ -729,9 +765,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return i; } -/* +/** + * find_get_pages_tag - find and return pages that match @tag + * @mapping: the address_space to search + * @index: the starting page index + * @tag: the tag index + * @nr_pages: the maximum number of pages + * @pages: where the resulting pages are placed + * * Like find_get_pages, except we only return pages which are tagged with - * `tag'. We update *index to index the next page for the traversal. + * @tag. We update @index to index the next page for the traversal. */ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages) @@ -750,7 +793,11 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, return ret; } -/* +/** + * grab_cache_page_nowait - returns locked page at given index in given cache + * @mapping: target address_space + * @index: the page index + * * Same as grab_cache_page, but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should @@ -779,19 +826,25 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) } return page; } - EXPORT_SYMBOL(grab_cache_page_nowait); -/* +/** + * do_generic_mapping_read - generic file read routine + * @mapping: address_space to be read + * @_ra: file's readahead state + * @filp: the file to read + * @ppos: current file position + * @desc: read_descriptor + * @actor: read method + * * This is a generic file read routine, and uses the - * mapping->a_ops->readpage() function for the actual low-level - * stuff. + * mapping->a_ops->readpage() function for the actual low-level stuff. * * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. * - * Note the struct file* is only passed for the use of readpage. It may be - * NULL. + * Note the struct file* is only passed for the use of readpage. + * It may be NULL. */ void do_generic_mapping_read(struct address_space *mapping, struct file_ra_state *_ra, @@ -1004,7 +1057,6 @@ out: if (filp) file_accessed(filp); } - EXPORT_SYMBOL(do_generic_mapping_read); int file_read_actor(read_descriptor_t *desc, struct page *page, @@ -1045,7 +1097,13 @@ success: return size; } -/* +/** + * __generic_file_aio_read - generic filesystem read routine + * @iocb: kernel I/O control block + * @iov: io vector request + * @nr_segs: number of segments in the iovec + * @ppos: current file position + * * This is the "read()" routine for all filesystems * that can use the page cache directly. */ @@ -1124,7 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, out: return retval; } - EXPORT_SYMBOL(__generic_file_aio_read); ssize_t @@ -1135,7 +1192,6 @@ generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t BUG_ON(iocb->ki_pos != pos); return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); } - EXPORT_SYMBOL(generic_file_aio_read); ssize_t @@ -1151,7 +1207,6 @@ generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppo ret = wait_on_sync_kiocb(&kiocb); return ret; } - EXPORT_SYMBOL(generic_file_read); int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) @@ -1192,7 +1247,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, return desc.written; return desc.error; } - EXPORT_SYMBOL(generic_file_sendfile); static ssize_t @@ -1228,11 +1282,15 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) } #ifdef CONFIG_MMU -/* +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +/** + * page_cache_read - adds requested page to the page cache if not already there + * @file: file to read + * @offset: page index + * * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); static int fastcall page_cache_read(struct file * file, unsigned long offset) { struct address_space *mapping = file->f_mapping; @@ -1259,7 +1317,12 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) #define MMAP_LOTSAMISS (100) -/* +/** + * filemap_nopage - read in file data for page fault handling + * @area: the applicable vm_area + * @address: target address to read in + * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL + * * filemap_nopage() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * @@ -1462,7 +1525,6 @@ page_not_uptodate: page_cache_release(page); return NULL; } - EXPORT_SYMBOL(filemap_nopage); static struct page * filemap_getpage(struct file *file, unsigned long pgoff, @@ -1716,7 +1778,13 @@ repeat: return page; } -/* +/** + * read_cache_page - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * * Read into the page cache. If a page already exists, * and PageUptodate() is not set, try to fill the page. */ @@ -1754,7 +1822,6 @@ retry: out: return page; } - EXPORT_SYMBOL(read_cache_page); /* @@ -1835,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr, int copy = min(bytes, iov->iov_len - base); base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); + left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); copied += copy; bytes -= copy; vaddr += copy; @@ -1854,7 +1921,7 @@ __filemap_copy_from_user_iovec(char *vaddr, /* * Performs necessary checks before doing a write * - * Can adjust writing position aor amount of bytes to write. + * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ diff --git a/mm/filemap.h b/mm/filemap.h index 13793ba0ce1..5683cde2205 100644 --- a/mm/filemap.h +++ b/mm/filemap.h @@ -13,7 +13,7 @@ #include <linux/highmem.h> #include <linux/uio.h> #include <linux/config.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> size_t __filemap_copy_from_user_iovec(char *vaddr, @@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset, int left; kaddr = kmap_atomic(page, KM_USER0); - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); kunmap_atomic(kaddr, KM_USER0); if (left != 0) { /* Do it the slow way */ kaddr = kmap(page); - left = __copy_from_user(kaddr + offset, buf, bytes); + left = __copy_from_user_nocache(kaddr + offset, buf, bytes); kunmap(page); } return bytes - left; diff --git a/mm/fremap.c b/mm/fremap.c index 9f381e58bf4..21b7d0cbc98 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -83,6 +83,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, page_add_file_rmap(page); pte_val = *pte; update_mmu_cache(vma, addr, pte_val); + lazy_mmu_prot_update(pte_val); err = 0; unlock: pte_unmap_unlock(pte, ptl); @@ -114,7 +115,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; - update_mmu_cache(vma, addr, pte_val); + /* + * We don't need to run update_mmu_cache() here because the "file pte" + * being installed by install_file_pte() is not a real pte - it's a + * non-present entry (like a swap entry), noting what file offset should + * be mapped there when there's a fault (in a non-linear vma where + * that's not obvious). + */ pte_unmap_unlock(pte, ptl); err = 0; out: diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 832f676ca03..df499973255 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,7 +22,7 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; +static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; @@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void) static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; struct page *page; - int use_reserve = 0; - unsigned long idx; spin_lock(&hugetlb_lock); - - if (vma->vm_flags & VM_MAYSHARE) { - - /* idx = radix tree index, i.e. offset into file in - * HPAGE_SIZE units */ - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - - /* The hugetlbfs specific inode info stores the number - * of "guaranteed available" (huge) pages. That is, - * the first 'prereserved_hpages' pages of the inode - * are either already instantiated, or have been - * pre-reserved (by hugetlb_reserve_for_inode()). Here - * we're in the process of instantiating the page, so - * we use this to determine whether to draw from the - * pre-reserved pool or the truly free pool. */ - if (idx < HUGETLBFS_I(inode)->prereserved_hpages) - use_reserve = 1; - } - - if (!use_reserve) { - if (free_huge_pages <= reserved_huge_pages) - goto fail; - } else { - BUG_ON(reserved_huge_pages == 0); - reserved_huge_pages--; - } + if (vma->vm_flags & VM_MAYSHARE) + resv_huge_pages--; + else if (free_huge_pages <= resv_huge_pages) + goto fail; page = dequeue_huge_page(vma, addr); if (!page) @@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); return page; - fail: - WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ +fail: spin_unlock(&hugetlb_lock); return NULL; } -/* hugetlb_extend_reservation() - * - * Ensure that at least 'atleast' hugepages are, and will remain, - * available to instantiate the first 'atleast' pages of the given - * inode. If the inode doesn't already have this many pages reserved - * or instantiated, set aside some hugepages in the reserved pool to - * satisfy later faults (or fail now if there aren't enough, rather - * than getting the SIGBUS later). - */ -int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, - unsigned long atleast) -{ - struct inode *inode = &info->vfs_inode; - unsigned long change_in_reserve = 0; - int ret = 0; - - spin_lock(&hugetlb_lock); - read_lock_irq(&inode->i_mapping->tree_lock); - - if (info->prereserved_hpages >= atleast) - goto out; - - /* Because we always call this on shared mappings, none of the - * pages beyond info->prereserved_hpages can have been - * instantiated, so we need to reserve all of them now. */ - change_in_reserve = atleast - info->prereserved_hpages; - - if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { - ret = -ENOMEM; - goto out; - } - - reserved_huge_pages += change_in_reserve; - info->prereserved_hpages = atleast; - - out: - read_unlock_irq(&inode->i_mapping->tree_lock); - spin_unlock(&hugetlb_lock); - - return ret; -} - -/* hugetlb_truncate_reservation() - * - * This returns pages reserved for the given inode to the general free - * hugepage pool. If the inode has any pages prereserved, but not - * instantiated, beyond offset (atmost << HPAGE_SIZE), then release - * them. - */ -void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, - unsigned long atmost) -{ - struct inode *inode = &info->vfs_inode; - struct address_space *mapping = inode->i_mapping; - unsigned long idx; - unsigned long change_in_reserve = 0; - struct page *page; - - spin_lock(&hugetlb_lock); - read_lock_irq(&inode->i_mapping->tree_lock); - - if (info->prereserved_hpages <= atmost) - goto out; - - /* Count pages which were reserved, but not instantiated, and - * which we can now release. */ - for (idx = atmost; idx < info->prereserved_hpages; idx++) { - page = radix_tree_lookup(&mapping->page_tree, idx); - if (!page) - /* Pages which are already instantiated can't - * be unreserved (and in fact have already - * been removed from the reserved pool) */ - change_in_reserve++; - } - - BUG_ON(reserved_huge_pages < change_in_reserve); - reserved_huge_pages -= change_in_reserve; - info->prereserved_hpages = atmost; - - out: - read_unlock_irq(&inode->i_mapping->tree_lock); - spin_unlock(&hugetlb_lock); -} - static int __init hugetlb_init(void) { unsigned long i; @@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count) return nr_huge_pages; spin_lock(&hugetlb_lock); - count = max(count, reserved_huge_pages); + count = max(count, resv_huge_pages); try_to_free_low(count); while (count < nr_huge_pages) { struct page *page = dequeue_huge_page(NULL, 0); @@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf) return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" + "HugePages_Rsvd: %5lu\n" "Hugepagesize: %5lu kB\n", nr_huge_pages, free_huge_pages, - reserved_huge_pages, + resv_huge_pages, HPAGE_SIZE/1024); } @@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } +struct file_region { + struct list_head link; + long from; + long to; +}; + +static long region_add(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg, *trg; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* If this area reaches higher then extend our area to + * include it completely. If this is not the first area + * which we intend to reuse, free it. */ + if (rg->to > t) + t = rg->to; + if (rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + nrg->from = f; + nrg->to = t; + return 0; +} + +static long region_chg(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg; + long chg = 0; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* If we are below the current region then a new region is required. + * Subtle, allocate a new region at the position but make it zero + * size such that we can guarentee to record the reservation. */ + if (&rg->link == head || t < rg->from) { + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (nrg == 0) + return -ENOMEM; + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + list_add(&nrg->link, rg->link.prev); + + return t - f; + } + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + list_for_each_entry(rg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + return chg; + + /* We overlap with this area, if it extends futher than + * us then we must extend ourselves. Account for its + * existing reservation. */ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + } + return chg; +} + +static long region_truncate(struct list_head *head, long end) +{ + struct file_region *rg, *trg; + long chg = 0; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (end <= rg->to) + break; + if (&rg->link == head) + return 0; + + /* If we are in the middle of a region then adjust it. */ + if (end > rg->from) { + chg = rg->to - end; + rg->to = end; + rg = list_entry(rg->link.next, typeof(*rg), link); + } + + /* Drop any remaining regions. */ + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + chg += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + } + return chg; +} + +static int hugetlb_acct_memory(long delta) +{ + int ret = -ENOMEM; + + spin_lock(&hugetlb_lock); + if ((delta + resv_huge_pages) <= free_huge_pages) { + resv_huge_pages += delta; + ret = 0; + } + spin_unlock(&hugetlb_lock); + return ret; +} + +int hugetlb_reserve_pages(struct inode *inode, long from, long to) +{ + long ret, chg; + + chg = region_chg(&inode->i_mapping->private_list, from, to); + if (chg < 0) + return chg; + ret = hugetlb_acct_memory(chg); + if (ret < 0) + return ret; + region_add(&inode->i_mapping->private_list, from, to); + return 0; +} + +void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) +{ + long chg = region_truncate(&inode->i_mapping->private_list, offset); + hugetlb_acct_memory(freed - chg); +} diff --git a/mm/memory.c b/mm/memory.c index 0ec7bc64427..247b5c312b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { if (!pte_file(pte)) { - swap_duplicate(pte_to_swp_entry(pte)); + swp_entry_t entry = pte_to_swp_entry(pte); + + swap_duplicate(entry); /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both parent + * and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); |