diff options
Diffstat (limited to 'mm/filemap.c')
| -rw-r--r-- | mm/filemap.c | 2691 |
1 files changed, 1389 insertions, 1302 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 07e9d9258b4..900edfaf6df 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -9,14 +9,14 @@ * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */ -#include <linux/module.h> -#include <linux/slab.h> +#include <linux/export.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/aio.h> #include <linux/capability.h> #include <linux/kernel_stat.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/mman.h> @@ -29,23 +29,23 @@ #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/security.h> -#include <linux/syscalls.h> #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> +#include <linux/cleancache.h> +#include <linux/rmap.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/filemap.h> + /* * FIXME: remove all knowledge of the buffer layer from the core VM */ -#include <linux/buffer_head.h> /* for generic_osync_inode */ +#include <linux/buffer_head.h> /* for try_to_free_buffers */ #include <asm/mman.h> -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs); - /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -61,33 +61,30 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_mutex (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock * * ->i_mutex - * ->i_mmap_lock (truncate->unmap_mapping_range) + * ->i_mmap_mutex (truncate->unmap_mapping_range) * * ->mmap_sem - * ->i_mmap_lock + * ->i_mmap_mutex * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem * ->lock_page (access_process_vm) * - * ->i_mutex (generic_file_buffered_write) + * ->i_mutex (generic_perform_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * - * ->i_mutex - * ->i_alloc_sem (various) - * - * ->inode_lock - * ->sb_lock (fs/fs-writeback.c) + * bdi->wb.list_lock + * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * - * ->i_mmap_lock + * ->i_mmap_mutex * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock @@ -101,28 +98,107 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (zap_pte_range->set_page_dirty) + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) + * ->inode->i_lock (page_remove_rmap->set_page_dirty) + * bdi.wb->list_lock (zap_pte_range->set_page_dirty) + * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * - * ->task->proc_lock - * ->dcache_lock (proc_pid_lookup) + * ->i_mmap_mutex + * ->tasklist_lock (memory_failure, collect_procs_ao) */ +static void page_cache_tree_delete(struct address_space *mapping, + struct page *page, void *shadow) +{ + struct radix_tree_node *node; + unsigned long index; + unsigned int offset; + unsigned int tag; + void **slot; + + VM_BUG_ON(!PageLocked(page)); + + __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + + if (shadow) { + mapping->nrshadows++; + /* + * Make sure the nrshadows update is committed before + * the nrpages update so that final truncate racing + * with reclaim does not see both counters 0 at the + * same time and miss a shadow entry. + */ + smp_wmb(); + } + mapping->nrpages--; + + if (!node) { + /* Clear direct pointer tags in root node */ + mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; + radix_tree_replace_slot(slot, shadow); + return; + } + + /* Clear tree tags for the removed page */ + index = page->index; + offset = index & RADIX_TREE_MAP_MASK; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (test_bit(offset, node->tags[tag])) + radix_tree_tag_clear(&mapping->page_tree, index, tag); + } + + /* Delete page, swap shadow entry */ + radix_tree_replace_slot(slot, shadow); + workingset_node_pages_dec(node); + if (shadow) + workingset_node_shadows_inc(node); + else + if (__radix_tree_delete_node(&mapping->page_tree, node)) + return; + + /* + * Track node that only contains shadow entries. + * + * Avoid acquiring the list_lru lock if already tracked. The + * list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_pages(node) && + list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&workingset_shadow_nodes, &node->private_list); + } +} + /* - * Remove a page from the page cache and free it. Caller has to make + * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock. */ -void __remove_from_page_cache(struct page *page) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; - mem_cgroup_uncharge_page(page); - radix_tree_delete(&mapping->page_tree, page->index); + trace_mm_filemap_delete_from_page_cache(page); + /* + * if we're uptodate, flush out into the cleancache, otherwise + * invalidate any existing cleancache entries. We can't leave + * stale data around in the cleancache once our page is gone + */ + if (PageUptodate(page) && PageMappedToDisk(page)) + cleancache_put_page(page); + else + cleancache_invalidate_page(mapping, page); + + page_cache_tree_delete(mapping, page, shadow); + page->mapping = NULL; - mapping->nrpages--; + /* Leave page->index set: truncation lookup relies upon it */ + __dec_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); /* @@ -138,59 +214,58 @@ void __remove_from_page_cache(struct page *page) } } -void remove_from_page_cache(struct page *page) +/** + * delete_from_page_cache - delete page from page cache + * @page: the page which the kernel is trying to remove from page cache + * + * This must be called only on pages that have been verified to be in the page + * cache and locked. It will never put the page into the free list, the caller + * has a reference on the page. + */ +void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); - write_lock_irq(&mapping->tree_lock); - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + freepage = mapping->a_ops->freepage; + spin_lock_irq(&mapping->tree_lock); + __delete_from_page_cache(page, NULL); + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); + + if (freepage) + freepage(page); + page_cache_release(page); } +EXPORT_SYMBOL(delete_from_page_cache); -static int sync_page(void *word) +static int sleep_on_page(void *word) { - struct address_space *mapping; - struct page *page; - - page = container_of((unsigned long *)word, struct page, flags); - - /* - * page_mapping() is being called without PG_locked held. - * Some knowledge of the state and use of the page is used to - * reduce the requirements down to a memory barrier. - * The danger here is of a stale page_mapping() return value - * indicating a struct address_space different from the one it's - * associated with when it is associated with one. - * After smp_mb(), it's either the correct page_mapping() for - * the page, or an old page_mapping() and the page's own - * page_mapping() has gone NULL. - * The ->sync_page() address_space operation must tolerate - * page_mapping() going NULL. By an amazing coincidence, - * this comes about because none of the users of the page - * in the ->sync_page() methods make essential use of the - * page_mapping(), merely passing the page down to the backing - * device's unplug functions when it's non-NULL, which in turn - * ignore it for all cases but swap, where only page_private(page) is - * of interest. When page_mapping() does go NULL, the entire - * call stack gracefully ignores the page and returns. - * -- wli - */ - smp_mb(); - mapping = page_mapping(page); - if (mapping && mapping->a_ops && mapping->a_ops->sync_page) - mapping->a_ops->sync_page(page); io_schedule(); return 0; } -static int sync_page_killable(void *word) +static int sleep_on_page_killable(void *word) { - sync_page(word); + sleep_on_page(word); return fatal_signal_pending(current) ? -EINTR : 0; } +static int filemap_check_errors(struct address_space *mapping) +{ + int ret = 0; + /* Check for outstanding write errors */ + if (test_bit(AS_ENOSPC, &mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_bit(AS_EIO, &mapping->flags) && + test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + return ret; +} + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -212,7 +287,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, int ret; struct writeback_control wbc = { .sync_mode = sync_mode, - .nr_to_write = mapping->nrpages * 2, + .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; @@ -236,11 +311,12 @@ int filemap_fdatawrite(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_flush - mostly a non-blocking flush @@ -256,27 +332,27 @@ int filemap_flush(struct address_space *mapping) EXPORT_SYMBOL(filemap_flush); /** - * wait_on_page_writeback_range - wait for writeback to complete - * @mapping: target address_space - * @start: beginning page index - * @end: ending page index + * filemap_fdatawait_range - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) * - * Wait for writeback to complete against pages indexed by start->end - * inclusive + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. */ -int wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end) +int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, + loff_t end_byte) { + pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; + pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; - int ret = 0; - pgoff_t index; + int ret2, ret = 0; - if (end < start) - return 0; + if (end_byte < start_byte) + goto out; pagevec_init(&pvec, 0); - index = start; while ((index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_WRITEBACK, @@ -291,85 +367,20 @@ int wait_on_page_writeback_range(struct address_space *mapping, continue; wait_on_page_writeback(page); - if (PageError(page)) + if (TestClearPageError(page)) ret = -EIO; } pagevec_release(&pvec); cond_resched(); } +out: + ret2 = filemap_check_errors(mapping); + if (!ret) + ret = ret2; - /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) - ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) - ret = -EIO; - - return ret; -} - -/** - * sync_page_range - write and wait on all pages in the passed range - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Write and wait upon all the pages in the passed range. This is a "data - * integrity" operation. It waits upon in-flight writeout before starting and - * waiting upon new writeout. If there was an IO error, return it. - * - * We need to re-take i_mutex during the generic_osync_inode list walk because - * it is otherwise livelockable. - */ -int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) { - mutex_lock(&inode->i_mutex); - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - mutex_unlock(&inode->i_mutex); - } - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; -} -EXPORT_SYMBOL(sync_page_range); - -/** - * sync_page_range_nolock - write & wait on all pages in the passed range without locking - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea - * as it forces O_SYNC writers to different parts of the same file - * to be serialised right until io completion. - */ -int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); return ret; } -EXPORT_SYMBOL(sync_page_range_nolock); +EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait - wait for all under-writeback pages to complete @@ -385,8 +396,7 @@ int filemap_fdatawait(struct address_space *mapping) if (i_size == 0) return 0; - return wait_on_page_writeback_range(mapping, 0, - (i_size - 1) >> PAGE_CACHE_SHIFT); + return filemap_fdatawait_range(mapping, 0, i_size - 1); } EXPORT_SYMBOL(filemap_fdatawait); @@ -407,6 +417,8 @@ int filemap_write_and_wait(struct address_space *mapping) if (!err) err = err2; } + } else { + err = filemap_check_errors(mapping); } return err; } @@ -433,87 +445,226 @@ int filemap_write_and_wait_range(struct address_space *mapping, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ if (err != -EIO) { - int err2 = wait_on_page_writeback_range(mapping, - lstart >> PAGE_CACHE_SHIFT, - lend >> PAGE_CACHE_SHIFT); + int err2 = filemap_fdatawait_range(mapping, + lstart, lend); if (!err) err = err2; } + } else { + err = filemap_check_errors(mapping); } return err; } +EXPORT_SYMBOL(filemap_write_and_wait_range); /** - * add_to_page_cache - add newly allocated pagecache pages + * replace_page_cache_page - replace a pagecache page with a new one + * @old: page to be replaced + * @new: page to replace with + * @gfp_mask: allocation mode + * + * This function replaces a page in the pagecache with a new one. On + * success it acquires the pagecache reference for the new page and + * drops it for the old page. Both the old and new pages must be + * locked. This function does not add the new page to the LRU, the + * caller must do that. + * + * The remove + add is atomic. The only way this function can fail is + * memory allocation failure. + */ +int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) +{ + int error; + + VM_BUG_ON_PAGE(!PageLocked(old), old); + VM_BUG_ON_PAGE(!PageLocked(new), new); + VM_BUG_ON_PAGE(new->mapping, new); + + error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + if (!error) { + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *); + + pgoff_t offset = old->index; + freepage = mapping->a_ops->freepage; + + page_cache_get(new); + new->mapping = mapping; + new->index = offset; + + spin_lock_irq(&mapping->tree_lock); + __delete_from_page_cache(old, NULL); + error = radix_tree_insert(&mapping->page_tree, offset, new); + BUG_ON(error); + mapping->nrpages++; + __inc_zone_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(new)) + __inc_zone_page_state(new, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + /* mem_cgroup codes must not be called under tree_lock */ + mem_cgroup_replace_page_cache(old, new); + radix_tree_preload_end(); + if (freepage) + freepage(old); + page_cache_release(old); + } + + return error; +} +EXPORT_SYMBOL_GPL(replace_page_cache_page); + +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page, void **shadowp) +{ + struct radix_tree_node *node; + void **slot; + int error; + + error = __radix_tree_create(&mapping->page_tree, page->index, + &node, &slot); + if (error) + return error; + if (*slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + if (shadowp) + *shadowp = p; + mapping->nrshadows--; + if (node) + workingset_node_shadows_dec(node); + } + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + } + return 0; +} + +static int __add_to_page_cache_locked(struct page *page, + struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, + void **shadowp) +{ + int error; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageSwapBacked(page), page); + + error = mem_cgroup_charge_file(page, current->mm, + gfp_mask & GFP_RECLAIM_MASK); + if (error) + return error; + + error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); + if (error) { + mem_cgroup_uncharge_cache_page(page); + return error; + } + + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); + error = page_cache_tree_insert(mapping, page, shadowp); + radix_tree_preload_end(); + if (unlikely(error)) + goto err_insert; + __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); + trace_mm_filemap_add_to_page_cache(page); + return 0; +err_insert: + page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); + return error; +} + +/** + * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space * @offset: page index * @gfp_mask: page allocation mode * - * This function is used to add newly allocated pagecache pages; - * the page is new, so we can just run SetPageLocked() against it. - * The other page state flags were set by rmqueue(). - * + * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. */ -int add_to_page_cache(struct page *page, struct address_space *mapping, +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error = mem_cgroup_cache_charge(page, current->mm, - gfp_mask & ~__GFP_HIGHMEM); - if (error) - goto out; - - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); - if (error == 0) { - write_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { - page_cache_get(page); - SetPageLocked(page); - page->mapping = mapping; - page->index = offset; - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - } else - mem_cgroup_uncharge_page(page); - - write_unlock_irq(&mapping->tree_lock); - radix_tree_preload_end(); - } else - mem_cgroup_uncharge_page(page); -out: - return error; + return __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, NULL); } -EXPORT_SYMBOL(add_to_page_cache); +EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) + void *shadow = NULL; + int ret; + + __set_page_locked(page); + ret = __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, &shadow); + if (unlikely(ret)) + __clear_page_locked(page); + else { + /* + * The page might have been evicted from cache only + * recently, in which case it should be activated like + * any other repeatedly accessed page. + */ + if (shadow && workingset_refault(shadow)) { + SetPageActive(page); + workingset_activation(page); + } else + ClearPageActive(page); lru_cache_add(page); + } return ret; } +EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { + int n; + struct page *page; + if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + unsigned int cpuset_mems_cookie; + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); + + return page; } return alloc_pages(gfp, 0); } EXPORT_SYMBOL(__page_cache_alloc); #endif -static int __sleep_on_page_lock(void *word) -{ - io_schedule(); - return 0; -} - /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -541,11 +692,40 @@ void wait_on_page_bit(struct page *page, int bit_nr) DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sync_page, + __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); +int wait_on_page_bit_killable(struct page *page, int bit_nr) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + if (!test_bit(bit_nr, &page->flags)) + return 0; + + return __wait_on_bit(page_waitqueue(page), &wait, + sleep_on_page_killable, TASK_KILLABLE); +} + +/** + * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue + * @page: Page defining the wait queue of interest + * @waiter: Waiter to add to the queue + * + * Add an arbitrary @waiter to the wait queue for the nominated @page. + */ +void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +{ + wait_queue_head_t *q = page_waitqueue(page); + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, waiter); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(add_page_wait_queue); + /** * unlock_page - unlock a locked page * @page: the page @@ -555,17 +735,14 @@ EXPORT_SYMBOL(wait_on_page_bit); * mechananism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * - * The first mb is necessary to safely close the critical section opened by the - * TestSetPageLocked(), the second mb is necessary to enforce ordering between - * the clear_bit and the read of the waitqueue (to avoid SMP races with a - * parallel wait_on_page_locked()). + * The mb is necessary to enforce ordering between the clear_bit and the read + * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). */ void unlock_page(struct page *page) { - smp_mb__before_clear_bit(); - if (!TestClearPageLocked(page)) - BUG(); - smp_mb__after_clear_bit(); + VM_BUG_ON_PAGE(!PageLocked(page), page); + clear_bit_unlock(PG_locked, &page->flags); + smp_mb__after_atomic(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -576,29 +753,60 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { - if (!test_clear_page_writeback(page)) - BUG(); + /* + * TestClearPageReclaim could be used here but it is an atomic + * operation and overkill in this particular case. Failing to + * shuffle a page marked for immediate reclaim is too mild to + * justify taking an atomic operation penalty at the end of + * ever page writeback. + */ + if (PageReclaim(page)) { + ClearPageReclaim(page); + rotate_reclaimable_page(page); } - smp_mb__after_clear_bit(); + + if (!test_clear_page_writeback(page)) + BUG(); + + smp_mb__after_atomic(); wake_up_page(page, PG_writeback); } EXPORT_SYMBOL(end_page_writeback); +/* + * After completing I/O on a page, call this routine to update the page + * flags appropriately + */ +void page_endio(struct page *page, int rw, int err) +{ + if (rw == READ) { + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* rw == WRITE */ + if (err) { + SetPageError(page); + if (page->mapping) + mapping_set_error(page->mapping, err); + } + end_page_writeback(page); + } +} +EXPORT_SYMBOL_GPL(page_endio); + /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @page: the page to lock - * - * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some - * random driver's requestfn sets TASK_RUNNING, we could busywait. However - * chances are that on the second loop, the block layer's plug list is empty, - * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, + __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); @@ -608,113 +816,298 @@ int __lock_page_killable(struct page *page) DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); return __wait_on_bit_lock(page_waitqueue(page), &wait, - sync_page_killable, TASK_KILLABLE); + sleep_on_page_killable, TASK_KILLABLE); +} +EXPORT_SYMBOL_GPL(__lock_page_killable); + +int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + /* + * CAUTION! In this case, mmap_sem is not released + * even though return 0. + */ + if (flags & FAULT_FLAG_RETRY_NOWAIT) + return 0; + + up_read(&mm->mmap_sem); + if (flags & FAULT_FLAG_KILLABLE) + wait_on_page_locked_killable(page); + else + wait_on_page_locked(page); + return 0; + } else { + if (flags & FAULT_FLAG_KILLABLE) { + int ret; + + ret = __lock_page_killable(page); + if (ret) { + up_read(&mm->mmap_sem); + return 0; + } + } else + __lock_page(page); + return 1; + } } /** - * __lock_page_nosync - get a lock on the page, without calling sync_page() - * @page: the page to lock - * - * Variant of lock_page that does not require the caller to hold a reference - * on the page's mapping. + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. */ -void __lock_page_nosync(struct page *page) +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, - TASK_UNINTERRUPTIBLE); + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index++; + if (index == 0) + break; + } + + return index; } +EXPORT_SYMBOL(page_cache_next_hole); /** - * find_get_page - find and get a page reference + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. + */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + +/** + * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. + * + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. * - * Is there a pagecache struct page at the given (mapping, offset) tuple? - * If yes, increment its refcount and return it; if no, return NULL. + * Otherwise, %NULL is returned. */ -struct page * find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { + void **pagep; struct page *page; - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); +repeat: + page = NULL; + pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + if (pagep) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page)) + goto out; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto repeat; + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. + */ + goto out; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + goto repeat; + } + } +out: + rcu_read_unlock(); + return page; } -EXPORT_SYMBOL(find_get_page); +EXPORT_SYMBOL(find_get_entry); /** - * find_lock_page - locate, pin and lock a pagecache page + * find_lock_entry - locate, pin and lock a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. * - * Locates the desired pagecache page, locks it, increments its reference - * count and returns its address. + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. * - * Returns zero if the page was not present. find_lock_page() may sleep. + * Otherwise, %NULL is returned. + * + * find_lock_entry() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, - pgoff_t offset) +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } - VM_BUG_ON(page->index != offset); - goto out; + page = find_get_entry(mapping, offset); + if (page && !radix_tree_exception(page)) { + lock_page(page); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; } + VM_BUG_ON_PAGE(page->index != offset, page); } - read_unlock_irq(&mapping->tree_lock); -out: return page; } -EXPORT_SYMBOL(find_lock_page); +EXPORT_SYMBOL(find_lock_entry); /** - * find_or_create_page - locate or add a pagecache page - * @mapping: the page's address_space - * @index: the page's index into the mapping - * @gfp_mask: page allocation mode + * pagecache_get_page - find and get a page reference + * @mapping: the address_space to search + * @offset: the page index + * @fgp_flags: PCG flags + * @cache_gfp_mask: gfp mask to use for the page cache data page allocation + * @radix_gfp_mask: gfp mask to use for radix tree node allocation + * + * Looks up the page cache slot at @mapping & @offset. * - * Locates a page in the pagecache. If the page is not present, a new page - * is allocated using @gfp_mask and is added to the pagecache and to the VM's - * LRU list. The returned page is locked and has its reference count - * incremented. + * PCG flags modify how the page is returned. * - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic - * allocation! + * FGP_ACCESSED: the page will be marked accessed + * FGP_LOCK: Page is return locked + * FGP_CREAT: If page is not present then a new page is allocated using + * @cache_gfp_mask and added to the page cache and the VM's LRU + * list. If radix tree nodes are allocated during page cache + * insertion then @radix_gfp_mask is used. The page is returned + * locked and with an increased refcount. Otherwise, %NULL is + * returned. * - * find_or_create_page() returns the desired page's address, or zero on - * memory exhaustion. + * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even + * if the GFP flags specified for FGP_CREAT are atomic. + * + * If there is a page cache page, it is returned with an increased refcount. */ -struct page *find_or_create_page(struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask) +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, + int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) { struct page *page; - int err; + repeat: - page = find_lock_page(mapping, index); - if (!page) { - page = __page_cache_alloc(gfp_mask); + page = find_get_entry(mapping, offset); + if (radix_tree_exceptional_entry(page)) + page = NULL; + if (!page) + goto no_page; + + if (fgp_flags & FGP_LOCK) { + if (fgp_flags & FGP_NOWAIT) { + if (!trylock_page(page)) { + page_cache_release(page); + return NULL; + } + } else { + lock_page(page); + } + + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + VM_BUG_ON_PAGE(page->index != offset, page); + } + + if (page && (fgp_flags & FGP_ACCESSED)) + mark_page_accessed(page); + +no_page: + if (!page && (fgp_flags & FGP_CREAT)) { + int err; + if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) + cache_gfp_mask |= __GFP_WRITE; + if (fgp_flags & FGP_NOFS) { + cache_gfp_mask &= ~__GFP_FS; + radix_gfp_mask &= ~__GFP_FS; + } + + page = __page_cache_alloc(cache_gfp_mask); if (!page) return NULL; - err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + + if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + fgp_flags |= FGP_LOCK; + + /* Init accessed so avoit atomic mark_page_accessed later */ + if (fgp_flags & FGP_ACCESSED) + init_page_accessed(page); + + err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -722,9 +1115,80 @@ repeat: goto repeat; } } + return page; } -EXPORT_SYMBOL(find_or_create_page); +EXPORT_SYMBOL(pagecache_get_page); + +/** + * find_get_entries - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page cache index + * @nr_entries: The maximum number of entries + * @entries: Where the resulting entries are placed + * @indices: The cache indices corresponding to the entries in @entries + * + * find_get_entries() will search for and return a group of up to + * @nr_entries entries in the mapping. The entries are placed at + * @entries. find_get_entries() takes a reference against any actual + * pages it returns. + * + * The search returns a group of mapping-contiguous page cache entries + * with ascending indexes. There may be holes in the indices due to + * not-present pages. + * + * Any shadow entries of evicted pages, or swap entries from + * shmem/tmpfs, are included in the returned array. + * + * find_get_entries() returns the number of pages and shadow entries + * which were found. + */ +unsigned find_get_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} /** * find_get_pages - gang pagecache lookup @@ -745,15 +1209,55 @@ EXPORT_SYMBOL(find_or_create_page); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); + if (unlikely(!nr_pages)) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + WARN_ON(iter.index); + goto restart; + } + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Skip + * over it. + */ + continue; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + if (++ret == nr_pages) + break; + } + + rcu_read_unlock(); return ret; } @@ -772,21 +1276,65 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; + + if (unlikely(!nr_pages)) + return 0; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) + rcu_read_lock(); +restart: + radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + /* The hole, there no reason to continue */ + if (unlikely(!page)) break; - page_cache_get(pages[i]); - index++; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + /* + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Stop + * looking for contiguous pages. + */ + break; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + /* + * must check mapping and index after taking the ref. + * otherwise we can get both false positives and false + * negatives, which is just confusing to the caller. + */ + if (page->mapping == NULL || page->index != iter.index) { + page_cache_release(page); + break; + } + + pages[ret] = page; + if (++ret == nr_pages) + break; } - read_unlock_irq(&mapping->tree_lock); - return i; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(find_get_pages_contig); @@ -804,54 +1352,69 @@ EXPORT_SYMBOL(find_get_pages_contig); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, *index, tag) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + /* + * A shadow entry of a recently evicted page. + * + * Those entries should never be tagged, but + * this tree walk is lockless and the tags are + * looked up in bulk, one radix tree node at a + * time, so there is a sizable window for page + * reclaim to evict a page we saw tagged. + * + * Skip over it. + */ + continue; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + if (++ret == nr_pages) + break; + } + + rcu_read_unlock(); - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup_tag(&mapping->page_tree, - (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); if (ret) *index = pages[ret - 1]->index + 1; - read_unlock_irq(&mapping->tree_lock); + return ret; } EXPORT_SYMBOL(find_get_pages_tag); -/** - * grab_cache_page_nowait - returns locked page at given index in given cache - * @mapping: target address_space - * @index: the page index - * - * Same as grab_cache_page(), but do not wait if the page is unavailable. - * This is intended for speculative data generators, where the data can - * be regenerated if the page couldn't be grabbed. This routine should - * be safe to call while holding the lock for another page. - * - * Clear __GFP_FS when allocating the page to avoid recursion into the fs - * and deadlock against the caller's locked page. - */ -struct page * -grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) -{ - struct page *page = find_get_page(mapping, index); - - if (page) { - if (!TestSetPageLocked(page)) - return page; - page_cache_release(page); - return NULL; - } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); - if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { - page_cache_release(page); - page = NULL; - } - return page; -} -EXPORT_SYMBOL(grab_cache_page_nowait); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -870,9 +1433,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); static void shrink_readahead_size_eio(struct file *filp, struct file_ra_state *ra) { - if (!ra->ra_pages) - return; - ra->ra_pages /= 4; } @@ -880,8 +1440,8 @@ static void shrink_readahead_size_eio(struct file *filp, * do_generic_file_read - generic file read routine * @filp: the file to read * @ppos: current file position - * @desc: read_descriptor - * @actor: read method + * @iter: data destination + * @written: already copied * * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level stuff. @@ -889,8 +1449,8 @@ static void shrink_readahead_size_eio(struct file *filp, * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. */ -static void do_generic_file_read(struct file *filp, loff_t *ppos, - read_descriptor_t *desc, read_actor_t actor) +static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, + struct iov_iter *iter, ssize_t written) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -900,12 +1460,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos, pgoff_t prev_index; unsigned long offset; /* offset into pagecache page */ unsigned int prev_offset; - int error; + int error = 0; index = *ppos >> PAGE_CACHE_SHIFT; prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); - last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; for (;;) { @@ -930,8 +1490,20 @@ find_page: ra, filp, page, index, last_index - index); } - if (!PageUptodate(page)) - goto page_not_up_to_date; + if (!PageUptodate(page)) { + if (inode->i_blkbits == PAGE_CACHE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + if (!trylock_page(page)) + goto page_not_up_to_date; + /* Did it get truncated before we got the lock? */ + if (!page->mapping) + goto page_not_up_to_date_locked; + if (!mapping->a_ops->is_partially_uptodate(page, + offset, iter->count)) + goto page_not_up_to_date_locked; + unlock_page(page); + } page_ok: /* * i_size must be checked after we know the page is Uptodate. @@ -978,29 +1550,31 @@ page_ok: /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). */ - ret = actor(desc, page, offset, nr); + + ret = copy_page_to_iter(page, offset, nr, iter); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; prev_offset = offset; page_cache_release(page); - if (ret == nr && desc->count) - continue; - goto out; + written += ret; + if (!iov_iter_count(iter)) + goto out; + if (ret < nr) { + error = -EFAULT; + goto out; + } + continue; page_not_up_to_date: /* Get exclusive access to the page ... */ - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; +page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); @@ -1015,24 +1589,32 @@ page_not_up_to_date: } readpage: + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); if (unlikely(error)) { if (error == AOP_TRUNCATED_PAGE) { page_cache_release(page); + error = 0; goto find_page; } goto readpage_error; } if (!PageUptodate(page)) { - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; if (!PageUptodate(page)) { if (page->mapping == NULL) { /* - * invalidate_inode_pages got it + * invalidate_mapping_pages got it */ unlock_page(page); page_cache_release(page); @@ -1040,18 +1622,16 @@ readpage: } unlock_page(page); shrink_readahead_size_eio(filp, ra); - goto readpage_eio; + error = -EIO; + goto readpage_error; } unlock_page(page); } goto page_ok; -readpage_eio: - error = -EIO; readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ - desc->error = error; page_cache_release(page); goto out; @@ -1062,16 +1642,17 @@ no_cached_page: */ page = page_cache_alloc_cold(mapping); if (!page) { - desc->error = -ENOMEM; + error = -ENOMEM; goto out; } error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { page_cache_release(page); - if (error == -EEXIST) + if (error == -EEXIST) { + error = 0; goto find_page; - desc->error = error; + } goto out; } goto readpage; @@ -1083,193 +1664,67 @@ out: ra->prev_pos |= prev_offset; *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; - if (filp) - file_accessed(filp); -} - -int file_read_actor(read_descriptor_t *desc, struct page *page, - unsigned long offset, unsigned long size) -{ - char *kaddr; - unsigned long left, count = desc->count; - - if (size > count) - size = count; - - /* - * Faults on the destination of a read are common, so do it before - * taking the kmap. - */ - if (!fault_in_pages_writeable(desc->arg.buf, size)) { - kaddr = kmap_atomic(page, KM_USER0); - left = __copy_to_user_inatomic(desc->arg.buf, - kaddr + offset, size); - kunmap_atomic(kaddr, KM_USER0); - if (left == 0) - goto success; - } - - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_to_user(desc->arg.buf, kaddr + offset, size); - kunmap(page); - - if (left) { - size -= left; - desc->error = -EFAULT; - } -success: - desc->count = count - size; - desc->written += size; - desc->arg.buf += size; - return size; -} - -/* - * Performs necessary checks before doing a write - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @count: number of bytes to write - * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE - * - * Adjust number of segments and amount of bytes to write (nr_segs should be - * properly initialized first). Returns appropriate error code that caller - * should return or zero in case that write should be allowed. - */ -int generic_segment_checks(const struct iovec *iov, - unsigned long *nr_segs, size_t *count, int access_flags) -{ - unsigned long seg; - size_t cnt = 0; - for (seg = 0; seg < *nr_segs; seg++) { - const struct iovec *iv = &iov[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - cnt += iv->iov_len; - if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(access_flags, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - *nr_segs = seg; - cnt -= iv->iov_len; /* This segment is no good */ - break; - } - *count = cnt; - return 0; + file_accessed(filp); + return written ? written : error; } -EXPORT_SYMBOL(generic_segment_checks); /** - * generic_file_aio_read - generic filesystem read routine + * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block - * @iov: io vector request - * @nr_segs: number of segments in the iovec - * @pos: current file position + * @iter: destination for the data read * - * This is the "read()" routine for all filesystems + * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. */ ssize_t -generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg; - size_t count; + struct file *file = iocb->ki_filp; + ssize_t retval = 0; loff_t *ppos = &iocb->ki_pos; - - count = 0; - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; + loff_t pos = *ppos; /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ - if (filp->f_flags & O_DIRECT) { + if (file->f_flags & O_DIRECT) { + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); loff_t size; - struct address_space *mapping; - struct inode *inode; - mapping = filp->f_mapping; - inode = mapping->host; - retval = 0; if (!count) goto out; /* skip atime */ size = i_size_read(inode); - if (pos < size) { - retval = generic_file_direct_IO(READ, iocb, - iov, pos, nr_segs); - if (retval > 0) - *ppos = pos + retval; + retval = filemap_write_and_wait_range(mapping, pos, + pos + count - 1); + if (!retval) { + struct iov_iter data = *iter; + retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); } - if (likely(retval != 0)) { - file_accessed(filp); - goto out; - } - } - retval = 0; - if (count) { - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; + if (retval > 0) { + *ppos = pos + retval; + iov_iter_advance(iter, retval); + } - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp,ppos,&desc,file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; + /* + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. + */ + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { + file_accessed(file); + goto out; } } + + retval = do_generic_file_read(file, ppos, iter, retval); out: return retval; } -EXPORT_SYMBOL(generic_file_aio_read); - -static ssize_t -do_readahead(struct address_space *mapping, struct file *filp, - pgoff_t index, unsigned long nr) -{ - if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) - return -EINVAL; - - force_page_cache_readahead(mapping, filp, index, - max_sane_readahead(nr)); - return 0; -} - -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) -{ - ssize_t ret; - struct file *file; - - ret = -EBADF; - file = fget(fd); - if (file) { - if (file->f_mode & FMODE_READ) { - struct address_space *mapping = file->f_mapping; - pgoff_t start = offset >> PAGE_CACHE_SHIFT; - pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; - unsigned long len = end - start + 1; - ret = do_readahead(mapping, file, start, len); - } - fput(file); - } - return ret; -} +EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU /** @@ -1306,6 +1761,73 @@ static int page_cache_read(struct file *file, pgoff_t offset) #define MMAP_LOTSAMISS (100) +/* + * Synchronous readahead happens when we don't even find + * a page in the page cache at all. + */ +static void do_sync_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + pgoff_t offset) +{ + unsigned long ra_pages; + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (vma->vm_flags & VM_RAND_READ) + return; + if (!ra->ra_pages) + return; + + if (vma->vm_flags & VM_SEQ_READ) { + page_cache_sync_readahead(mapping, ra, file, offset, + ra->ra_pages); + return; + } + + /* Avoid banging the cache line if not needed */ + if (ra->mmap_miss < MMAP_LOTSAMISS * 10) + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > MMAP_LOTSAMISS) + return; + + /* + * mmap read-around + */ + ra_pages = max_sane_readahead(ra->ra_pages); + ra->start = max_t(long, 0, offset - ra_pages / 2); + ra->size = ra_pages; + ra->async_size = ra_pages / 4; + ra_submit(ra, mapping, file); +} + +/* + * Asynchronous readahead happens when we find the page and PG_readahead, + * so we want to possibly extend the readahead further.. + */ +static void do_async_mmap_readahead(struct vm_area_struct *vma, + struct file_ra_state *ra, + struct file *file, + struct page *page, + pgoff_t offset) +{ + struct address_space *mapping = file->f_mapping; + + /* If we don't want any read-ahead, don't bother */ + if (vma->vm_flags & VM_RAND_READ) + return; + if (ra->mmap_miss > 0) + ra->mmap_miss--; + if (PageReadahead(page)) + page_cache_async_readahead(mapping, ra, file, + page, offset, ra->ra_pages); +} + /** * filemap_fault - read in file data for page fault handling * @vma: vma in which the fault was taken @@ -1325,77 +1847,49 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; + pgoff_t offset = vmf->pgoff; struct page *page; - pgoff_t size; - int did_readaround = 0; + loff_t size; int ret = 0; - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (vmf->pgoff >= size) + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (offset >= size >> PAGE_CACHE_SHIFT) return VM_FAULT_SIGBUS; - /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) - goto no_cached_page; - /* * Do we have something in the page cache already? */ -retry_find: - page = find_lock_page(mapping, vmf->pgoff); - /* - * For sequential accesses, we use the generic readahead logic. - */ - if (VM_SequentialReadHint(vma)) { - if (!page) { - page_cache_sync_readahead(mapping, ra, file, - vmf->pgoff, 1); - page = find_lock_page(mapping, vmf->pgoff); - if (!page) - goto no_cached_page; - } - if (PageReadahead(page)) { - page_cache_async_readahead(mapping, ra, file, page, - vmf->pgoff, 1); - } - } - - if (!page) { - unsigned long ra_pages; - - ra->mmap_miss++; - - /* - * Do we miss much more than hit in this file? If so, - * stop bothering with read-ahead. It will only hurt. - */ - if (ra->mmap_miss > MMAP_LOTSAMISS) - goto no_cached_page; - + page = find_get_page(mapping, offset); + if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* - * To keep the pgmajfault counter straight, we need to - * check did_readaround, as this is an inner loop. + * We found the page, so try async readahead before + * waiting for the lock. */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - did_readaround = 1; - ra_pages = max_sane_readahead(file->f_ra.ra_pages); - if (ra_pages) { - pgoff_t start = 0; - - if (vmf->pgoff > ra_pages / 2) - start = vmf->pgoff - ra_pages / 2; - do_page_cache_readahead(mapping, file, start, ra_pages); - } - page = find_lock_page(mapping, vmf->pgoff); + do_async_mmap_readahead(vma, ra, file, page, offset); + } else if (!page) { + /* No page in the page cache at all */ + do_sync_mmap_readahead(vma, ra, file, offset); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + ret = VM_FAULT_MAJOR; +retry_find: + page = find_get_page(mapping, offset); if (!page) goto no_cached_page; } - if (!did_readaround) - ra->mmap_miss--; + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return ret | VM_FAULT_RETRY; + } + + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto retry_find; + } + VM_BUG_ON_PAGE(page->index != offset, page); /* * We have a locked page in the page cache, now we need to check @@ -1404,19 +1898,17 @@ retry_find: if (unlikely(!PageUptodate(page))) goto page_not_uptodate; - /* Must recheck i_size under page lock */ - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. + */ + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; } - /* - * Found the page and have a reference on it. - */ - mark_page_accessed(page); - ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; vmf->page = page; return ret | VM_FAULT_LOCKED; @@ -1425,7 +1917,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, vmf->pgoff); + error = page_cache_read(file, offset); /* * The page we want has now been added to the page cache. @@ -1445,12 +1937,6 @@ no_cached_page: return VM_FAULT_SIGBUS; page_not_uptodate: - /* IO error path */ - if (!did_readaround) { - ret = VM_FAULT_MAJOR; - count_vm_event(PGMAJFAULT); - } - /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, @@ -1459,6 +1945,11 @@ page_not_uptodate: */ ClearPageError(page); error = mapping->a_ops->readpage(file, page); + if (!error) { + wait_on_page_locked(page); + if (!PageUptodate(page)) + error = -EIO; + } page_cache_release(page); if (!error || error == AOP_TRUNCATED_PAGE) @@ -1470,8 +1961,110 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -struct vm_operations_struct generic_file_vm_ops = { +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct radix_tree_iter iter; + void **slot; + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + loff_t size; + struct page *page; + unsigned long address = (unsigned long) vmf->virtual_address; + unsigned long addr; + pte_t *pte; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { + if (iter.index > vmf->max_pgoff) + break; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + goto next; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + break; + else + goto next; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + if (!PageUptodate(page) || + PageReadahead(page) || + PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + + if (page->mapping != mapping || !PageUptodate(page)) + goto unlock; + + size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE); + if (page->index >= size >> PAGE_CACHE_SHIFT) + goto unlock; + + pte = vmf->pte + page->index - vmf->pgoff; + if (!pte_none(*pte)) + goto unlock; + + if (file->f_ra.mmap_miss > 0) + file->f_ra.mmap_miss--; + addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; + do_set_pte(vma, addr, page, pte, false, false); + unlock_page(page); + goto next; +unlock: + unlock_page(page); +skip: + page_cache_release(page); +next: + if (iter.index == vmf->max_pgoff) + break; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(filemap_map_pages); + +int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + int ret = VM_FAULT_LOCKED; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + lock_page(page); + if (page->mapping != inode->i_mapping) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + /* + * We mark the page dirty already here so that when freeze is in + * progress, we are guaranteed that writeback during freezing will + * see the dirty page and writeprotect it again. + */ + set_page_dirty(page); + wait_for_stable_page(page); +out: + sb_end_pagefault(inode->i_sb); + return ret; +} +EXPORT_SYMBOL(filemap_page_mkwrite); + +const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = filemap_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ @@ -1484,7 +2077,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } @@ -1511,20 +2103,33 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), - void *data) + int (*filler)(void *, struct page *), + void *data, + gfp_t gfp) { struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); - err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + err = add_to_page_cache_lru(page, mapping, index, gfp); if (unlikely(err)) { page_cache_release(page); if (err == -EEXIST) @@ -1536,36 +2141,25 @@ repeat: if (err < 0) { page_cache_release(page); page = ERR_PTR(err); + } else { + page = wait_on_page_read(page); } } return page; } -/** - * read_cache_page_async - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: destination for read data - * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page_async(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), - void *data) + int (*filler)(void *, struct page *), + void *data, + gfp_t gfp) + { struct page *page; int err; retry: - page = __read_cache_page(mapping, index, filler, data); + page = __read_cache_page(mapping, index, filler, data, gfp); if (IS_ERR(page)) return page; if (PageUptodate(page)) @@ -1585,240 +2179,57 @@ retry: if (err < 0) { page_cache_release(page); return ERR_PTR(err); + } else { + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; } out: mark_page_accessed(page); return page; } -EXPORT_SYMBOL(read_cache_page_async); /** * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page then wait for it to become unlocked. + * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { - struct page *page; - - page = read_cache_page_async(mapping, index, filler, data); - if (IS_ERR(page)) - goto out; - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - out: - return page; + return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page); -/* - * The logic we want is +/** + * read_cache_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating * - * if suid or (sgid and xgrp) - * remove privs - */ -int should_remove_suid(struct dentry *dentry) -{ - mode_t mode = dentry->d_inode->i_mode; - int kill = 0; - - /* suid always must be killed */ - if (unlikely(mode & S_ISUID)) - kill = ATTR_KILL_SUID; - - /* - * sgid without any exec bits is just a mandatory locking mark; leave - * it alone. If some exec bits are set, it's a real sgid; kill it. - */ - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - kill |= ATTR_KILL_SGID; - - if (unlikely(kill && !capable(CAP_FSETID))) - return kill; - - return 0; -} -EXPORT_SYMBOL(should_remove_suid); - -int __remove_suid(struct dentry *dentry, int kill) -{ - struct iattr newattrs; - - newattrs.ia_valid = ATTR_FORCE | kill; - return notify_change(dentry, &newattrs); -} - -int remove_suid(struct dentry *dentry) -{ - int killsuid = should_remove_suid(dentry); - int killpriv = security_inode_need_killpriv(dentry); - int error = 0; - - if (killpriv < 0) - return killpriv; - if (killpriv) - error = security_inode_killpriv(dentry); - if (!error && killsuid) - error = __remove_suid(dentry, killsuid); - - return error; -} -EXPORT_SYMBOL(remove_suid); - -static size_t __iovec_copy_from_user_inatomic(char *vaddr, - const struct iovec *iov, size_t base, size_t bytes) -{ - size_t copied = 0, left = 0; - - while (bytes) { - char __user *buf = iov->iov_base + base; - int copy = min(bytes, iov->iov_len - base); - - base = 0; - left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); - copied += copy; - bytes -= copy; - vaddr += copy; - iov++; - - if (unlikely(left)) - break; - } - return copied - left; -} - -/* - * Copy as much as we can into the page and return the number of bytes which - * were sucessfully copied. If a fault is encountered then return the number of - * bytes which were copied. - */ -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - BUG_ON(!in_atomic()); - kaddr = kmap_atomic(page, KM_USER0); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_inatomic_nocache(kaddr + offset, - buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap_atomic(kaddr, KM_USER0); - - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); - -/* - * This has the same sideeffects and return value as - * iov_iter_copy_from_user_atomic(). - * The difference is that it attempts to resolve faults. - * Page must not be locked. - */ -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_nocache(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap(page); - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user); - -void iov_iter_advance(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct iovec *iov = i->iov; - size_t base = i->iov_offset; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). - */ - while (bytes || unlikely(!iov->iov_len && i->count)) { - int copy; - - copy = min(bytes, iov->iov_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - base = 0; - } - } - i->iov = iov; - i->iov_offset = base; - } -} -EXPORT_SYMBOL(iov_iter_advance); - -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). + * This is the same as "read_mapping_page(mapping, index, NULL)", but with + * any new page allocations done using the specified allocation flags. * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). + * If the page does not get brought uptodate, return -EIO. */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +struct page *read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, + gfp_t gfp) { - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); + filler_t *filler = (filler_t *)mapping->a_ops->readpage; -/* - * Return the count of just the current iov_iter segment. - */ -size_t iov_iter_single_seg_count(struct iov_iter *i) -{ - const struct iovec *iov = i->iov; - if (i->nr_segs == 1) - return i->count; - else - return min(i->count, iov->iov_len - i->iov_offset); + return do_read_cache_page(mapping, index, filler, NULL, gfp); } -EXPORT_SYMBOL(iov_iter_single_seg_count); +EXPORT_SYMBOL(read_cache_page_gfp); /* * Performs necessary checks before doing a write @@ -1830,7 +2241,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count); inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) { struct inode *inode = file->f_mapping->host; - unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + unsigned long limit = rlimit(RLIMIT_FSIZE); if (unlikely(*pos < 0)) return -EINVAL; @@ -1908,48 +2319,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping, { const struct address_space_operations *aops = mapping->a_ops; - if (aops->write_begin) { - return aops->write_begin(file, mapping, pos, len, flags, + return aops->write_begin(file, mapping, pos, len, flags, pagep, fsdata); - } else { - int ret; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - struct inode *inode = mapping->host; - struct page *page; -again: - page = __grab_cache_page(mapping, index); - *pagep = page; - if (!page) - return -ENOMEM; - - if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { - /* - * There is no way to resolve a short write situation - * for a !Uptodate page (except by double copying in - * the caller done by generic_perform_write_2copy). - * - * Instead, we have to bring it uptodate here. - */ - ret = aops->readpage(file, page); - page_cache_release(page); - if (ret) { - if (ret == AOP_TRUNCATED_PAGE) - goto again; - return ret; - } - goto again; - } - - ret = aops->prepare_write(file, page, offset, offset+len); - if (ret) { - unlock_page(page); - page_cache_release(page); - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); - } - return ret; - } } EXPORT_SYMBOL(pagecache_write_begin); @@ -1958,70 +2329,75 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { const struct address_space_operations *aops = mapping->a_ops; - int ret; - - if (aops->write_end) { - mark_page_accessed(page); - ret = aops->write_end(file, mapping, pos, len, copied, - page, fsdata); - } else { - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - struct inode *inode = mapping->host; - flush_dcache_page(page); - ret = aops->commit_write(file, page, offset, offset+len); - unlock_page(page); - mark_page_accessed(page); - page_cache_release(page); - - if (ret < 0) { - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); - } else if (ret > 0) - ret = min_t(size_t, copied, ret); - else - ret = copied; - } - - return ret; + return aops->write_end(file, mapping, pos, len, copied, page, fsdata); } EXPORT_SYMBOL(pagecache_write_end); ssize_t -generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long *nr_segs, loff_t pos, loff_t *ppos, - size_t count, size_t ocount) +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written; + size_t write_len; + pgoff_t end; + struct iov_iter data; - if (count != ocount) - *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); + write_len = iov_iter_count(from); + end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; - written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); - if (written > 0) { - loff_t end = pos + written; - if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { - i_size_write(inode, end); - mark_inode_dirty(inode); + written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); + if (written) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (mapping->nrpages) { + written = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. + */ + if (written) { + if (written == -EBUSY) + return 0; + goto out; } - *ppos = end; } + data = *from; + written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos); + /* - * Sync the fs metadata but not the minor inode changes and - * of course not the data as we did direct DMA for the IO. - * i_mutex is held, which protects generic_osync_inode() from - * livelocking. AIO O_DIRECT ops attempt to sync metadata here. + * Finally, try again to invalidate clean pages which might have been + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... */ - if ((written >= 0 || written == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (err < 0) - written = err; + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } + + if (written > 0) { + pos += written; + iov_iter_advance(from, written); + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + iocb->ki_pos = pos; } +out: return written; } EXPORT_SYMBOL(generic_file_direct_write); @@ -2030,198 +2406,26 @@ EXPORT_SYMBOL(generic_file_direct_write); * Find or create a page at the given pagecache position. Return the locked * page. This function is specifically for buffered writes. */ -struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) +struct page *grab_cache_page_write_begin(struct address_space *mapping, + pgoff_t index, unsigned flags) { - int status; struct page *page; -repeat: - page = find_lock_page(mapping, index); - if (likely(page)) - return page; - - page = page_cache_alloc(mapping); - if (!page) - return NULL; - status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); - if (unlikely(status)) { - page_cache_release(page); - if (status == -EEXIST) - goto repeat; - return NULL; - } - return page; -} -EXPORT_SYMBOL(__grab_cache_page); - -static ssize_t generic_perform_write_2copy(struct file *file, - struct iov_iter *i, loff_t pos) -{ - struct address_space *mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - ssize_t written = 0; - - do { - struct page *src_page; - struct page *page; - pgoff_t index; /* Pagecache index for current page */ - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ - size_t copied; /* Bytes copied from user */ - - offset = (pos & (PAGE_CACHE_SIZE - 1)); - index = pos >> PAGE_CACHE_SHIFT; - bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, - iov_iter_count(i)); - - /* - * a non-NULL src_page indicates that we're doing the - * copy via get_user_pages and kmap. - */ - src_page = NULL; - - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { - status = -EFAULT; - break; - } - - page = __grab_cache_page(mapping, index); - if (!page) { - status = -ENOMEM; - break; - } - - /* - * non-uptodate pages cannot cope with short copies, and we - * cannot take a pagefault with the destination page locked. - * So pin the source page to copy it. - */ - if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { - unlock_page(page); - - src_page = alloc_page(GFP_KERNEL); - if (!src_page) { - page_cache_release(page); - status = -ENOMEM; - break; - } - - /* - * Cannot get_user_pages with a page locked for the - * same reason as we can't take a page fault with a - * page locked (as explained below). - */ - copied = iov_iter_copy_from_user(src_page, i, - offset, bytes); - if (unlikely(copied == 0)) { - status = -EFAULT; - page_cache_release(page); - page_cache_release(src_page); - break; - } - bytes = copied; - - lock_page(page); - /* - * Can't handle the page going uptodate here, because - * that means we would use non-atomic usercopies, which - * zero out the tail of the page, which can cause - * zeroes to become transiently visible. We could just - * use a non-zeroing copy, but the APIs aren't too - * consistent. - */ - if (unlikely(!page->mapping || PageUptodate(page))) { - unlock_page(page); - page_cache_release(page); - page_cache_release(src_page); - continue; - } - } - - status = a_ops->prepare_write(file, page, offset, offset+bytes); - if (unlikely(status)) - goto fs_write_aop_error; - - if (!src_page) { - /* - * Must not enter the pagefault handler here, because - * we hold the page lock, so we might recursively - * deadlock on the same lock, or get an ABBA deadlock - * against a different lock, or against the mmap_sem - * (which nests outside the page lock). So increment - * preempt count, and use _atomic usercopies. - * - * The page is uptodate so we are OK to encounter a - * short copy: if unmodified parts of the page are - * marked dirty and written out to disk, it doesn't - * really matter. - */ - pagefault_disable(); - copied = iov_iter_copy_from_user_atomic(page, i, - offset, bytes); - pagefault_enable(); - } else { - void *src, *dst; - src = kmap_atomic(src_page, KM_USER0); - dst = kmap_atomic(page, KM_USER1); - memcpy(dst + offset, src + offset, bytes); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); - copied = bytes; - } - flush_dcache_page(page); - - status = a_ops->commit_write(file, page, offset, offset+bytes); - if (unlikely(status < 0)) - goto fs_write_aop_error; - if (unlikely(status > 0)) /* filesystem did partial write */ - copied = min_t(size_t, copied, status); - - unlock_page(page); - mark_page_accessed(page); - page_cache_release(page); - if (src_page) - page_cache_release(src_page); - - iov_iter_advance(i, copied); - pos += copied; - written += copied; + int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - continue; + if (flags & AOP_FLAG_NOFS) + fgp_flags |= FGP_NOFS; -fs_write_aop_error: - unlock_page(page); - page_cache_release(page); - if (src_page) - page_cache_release(src_page); - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + bytes > inode->i_size) - vmtruncate(inode, inode->i_size); - break; - } while (iov_iter_count(i)); + page = pagecache_get_page(mapping, index, fgp_flags, + mapping_gfp_mask(mapping), + GFP_KERNEL); + if (page) + wait_for_stable_page(page); - return written ? written : status; + return page; } +EXPORT_SYMBOL(grab_cache_page_write_begin); -static ssize_t generic_perform_write(struct file *file, +ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { struct address_space *mapping = file->f_mapping; @@ -2238,19 +2442,16 @@ static ssize_t generic_perform_write(struct file *file, do { struct page *page; - pgoff_t index; /* Pagecache index for current page */ unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ void *fsdata; offset = (pos & (PAGE_CACHE_SIZE - 1)); - index = pos >> PAGE_CACHE_SHIFT; bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i)); again: - /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the @@ -2268,12 +2469,13 @@ again: status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); - if (unlikely(status)) + if (unlikely(status < 0)) break; - pagefault_disable(); + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - pagefault_enable(); flush_dcache_page(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, @@ -2302,84 +2504,46 @@ again: written += copied; balance_dirty_pages_ratelimited(mapping); - - } while (iov_iter_count(i)); - - return written ? written : status; -} - -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - ssize_t status; - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, written); - if (a_ops->write_begin) - status = generic_perform_write(file, &i, pos); - else - status = generic_perform_write_2copy(file, &i, pos); - - if (likely(status >= 0)) { - written += status; - *ppos = pos + status; - - /* - * For now, when the user asks for O_SYNC, we'll actually give - * O_DSYNC - */ - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - if (!a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); + if (fatal_signal_pending(current)) { + status = -EINTR; + break; } - } - - /* - * If we get here for O_DIRECT writes then we must have fallen through - * to buffered writes (block instantiation inside i_size). So we sync - * the file data here, to try to honour O_DIRECT expectations. - */ - if (unlikely(file->f_flags & O_DIRECT) && written) - status = filemap_write_and_wait(mapping); + } while (iov_iter_count(i)); return written ? written : status; } -EXPORT_SYMBOL(generic_file_buffered_write); +EXPORT_SYMBOL(generic_perform_write); -static ssize_t -__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +/** + * __generic_file_write_iter - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @from: iov_iter with data to write + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * It expects i_mutex to be grabbed unless we work on a block device or similar + * object which does not need locking at all. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_mutex. + */ +ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; - size_t ocount; /* original count */ - size_t count; /* after file limit checks */ struct inode *inode = mapping->host; - loff_t pos; - ssize_t written; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; ssize_t err; - - ocount = 0; - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) - return err; - - count = ocount; - pos = *ppos; - - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + ssize_t status; + size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - written = 0; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; @@ -2387,54 +2551,53 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_path.dentry); + iov_iter_truncate(from, count); + + err = file_remove_suid(file); if (err) goto out; - file_update_time(file); + err = file_update_time(file); + if (err) + goto out; /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { loff_t endbyte; - ssize_t written_buffered; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - ppos, count, ocount); + written = generic_file_direct_write(iocb, from, pos); if (written < 0 || written == count) goto out; + /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ pos += written; count -= written; - written_buffered = generic_file_buffered_write(iocb, iov, - nr_segs, pos, ppos, count, - written); + + status = generic_perform_write(file, from, pos); /* - * If generic_file_buffered_write() retuned a synchronous error + * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were * direct-written, or the error code if that was zero. Note * that this differs from normal direct-io semantics, which * will return -EFOO even if some bytes were written. */ - if (written_buffered < 0) { - err = written_buffered; + if (unlikely(status < 0) && !written) { + err = status; goto out; } - + iocb->ki_pos = pos + status; /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. */ - endbyte = pos + written_buffered - written - 1; - err = do_sync_mapping_range(file->f_mapping, pos, endbyte, - SYNC_FILE_RANGE_WAIT_BEFORE| - SYNC_FILE_RANGE_WRITE| - SYNC_FILE_RANGE_WAIT_AFTER); + endbyte = pos + status - 1; + err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); if (err == 0) { - written = written_buffered; + written += status; invalidate_mapping_pages(mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); @@ -2445,123 +2608,45 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, */ } } else { - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); + written = generic_perform_write(file, from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; } out: current->backing_dev_info = NULL; return written ? written : err; } +EXPORT_SYMBOL(__generic_file_write_iter); -ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - BUG_ON(iocb->ki_pos != pos); - - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range_nolock(inode, mapping, pos, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_aio_write_nolock); - -ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +/** + * generic_file_write_iter - write data to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * This is a wrapper around __generic_file_write_iter() to be used by most + * filesystems. It takes care of syncing the file in case of O_SYNC file + * and acquires i_mutex as needed. + */ +ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; ssize_t ret; - BUG_ON(iocb->ki_pos != pos); - mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); + ret = __generic_file_write_iter(iocb, from); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (ret > 0) { ssize_t err; - err = sync_page_range(inode, mapping, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } return ret; } -EXPORT_SYMBOL(generic_file_aio_write); - -/* - * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something - * went wrong during pagecache shootdown. - */ -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - ssize_t retval; - size_t write_len; - pgoff_t end = 0; /* silence gcc */ - - /* - * If it's a write, unmap all mmappings of the file up-front. This - * will cause any pte dirty bits to be propagated into the pageframes - * for the subsequent filemap_write_and_wait(). - */ - if (rw == WRITE) { - write_len = iov_length(iov, nr_segs); - end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; - if (mapping_mapped(mapping)) - unmap_mapping_range(mapping, offset, write_len, 0); - } - - retval = filemap_write_and_wait(mapping); - if (retval) - goto out; - - /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - if (rw == WRITE && mapping->nrpages) { - retval = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (retval) - goto out; - } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - - /* - * Finally, try again to invalidate clean pages which might have been - * cached by non-direct readahead, or faulted in by get_user_pages() - * if the source of the write was an mmap'ed region of the file - * we're writing. Either one is a pretty crazy thing to do, - * so we don't support it 100%. If this invalidation - * fails, tough, the write still worked... - */ - if (rw == WRITE && mapping->nrpages) { - invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); - } -out: - return retval; -} +EXPORT_SYMBOL(generic_file_write_iter); /** * try_to_release_page() - release old fs-specific metadata on a page @@ -2573,10 +2658,12 @@ out: * (presumably at page->private). If the release was successful, return `1'. * Otherwise return zero. * + * This may also be called if PG_fscache is set on a page, indicating that the + * page is known to the local caching routines. + * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). * - * NOTE: @gfp_mask may go away, and this function may become non-blocking. */ int try_to_release_page(struct page *page, gfp_t gfp_mask) { |
