From bde05d1ccd512696b09db9dd2e5f33ad19152605 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 29 May 2012 15:06:38 -0700
Subject: shmem: replace page if mapping excludes its zone

The GMA500 GPU driver uses GEM shmem objects, but with a new twist: the
backing RAM has to be below 4GB. Not a problem while the boards
supported only 4GB: but now Intel's D2700MUD boards support 8GB, and
their GMA3600 is managed by the GMA500 driver.

shmem/tmpfs has never pretended to support hardware restrictions on the
backing memory, but it might have appeared to do so before v3.1, and
even now it works fine until a page is swapped out then back in. When
read_cache_page_gfp() supplied a freshly allocated page for copy, that
compensated for whatever choice might have been made by earlier swapin
readahead; but swapoff was likely to destroy the illusion.

We'd like to continue to support GMA500, so now add a new
shmem_should_replace_page() check on the zone when about to move a page
from swapcache to filecache (in swapin and swapoff cases), with
shmem_replace_page() to allocate and substitute a suitable page (given
gma500/gem.c's mapping_set_gfp_mask GFP_KERNEL | __GFP_DMA32).

This does involve a minor extension to mem_cgroup_replace_page_cache()
(the page may or may not have already been charged); and I've removed a
comment and call to mem_cgroup_uncharge_cache_page(), which in fact is
always a no-op while PageSwapCache.

Also removed optimization of an unlikely path in shmem_getpage_gfp(),
now that we need to check PageSwapCache more carefully (a racing caller
might already have made the copy). And at one point shmem_unuse_inode()
needs to use the hitherto private page_swapcount(), to guard against
racing with inode eviction.

It would make sense to extend shmem_should_replace_page(), to cover
cpuset and NUMA mempolicy restrictions too, but set that aside for now:
needs a cleanup of shmem mempolicy handling, and more testing, and ought
to handle swap faults in do_swap_page() as well as shmem.

Signed-off-by: Hugh Dickins
Cc: Christoph Hellwig
Acked-by: KAMEZAWA Hiroyuki
Cc: Alan Cox
Cc: Stephane Marchesin
Cc: Andi Kleen
Cc: Dave Airlie
Cc: Daniel Vetter
Cc: Rob Clark
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 122 insertions(+), 19 deletions(-)

(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index be5af34a070..db72d8e44ec 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -103,6 +103,9 @@ static unsigned long shmem_default_max_inodes(void)
 }
 #endif

+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+				struct shmem_inode_info *info, pgoff_t index);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);

@@ -604,12 +607,13 @@ static void shmem_evict_inode(struct inode *inode)
  * If swap found in inode, free it and move page from swapcache to filecache.
*/ static int shmem_unuse_inode(struct shmem_inode_info *info, - swp_entry_t swap, struct page *page) + swp_entry_t swap, struct page **pagep) { struct address_space *mapping = info->vfs_inode.i_mapping; void *radswap; pgoff_t index; - int error; + gfp_t gfp; + int error = 0; radswap = swp_to_radix_entry(swap); index = radix_tree_locate_item(&mapping->page_tree, radswap); @@ -625,22 +629,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, if (shmem_swaplist.next != &info->swaplist) list_move_tail(&shmem_swaplist, &info->swaplist); + gfp = mapping_gfp_mask(mapping); + if (shmem_should_replace_page(*pagep, gfp)) { + mutex_unlock(&shmem_swaplist_mutex); + error = shmem_replace_page(pagep, gfp, info, index); + mutex_lock(&shmem_swaplist_mutex); + /* + * We needed to drop mutex to make that restrictive page + * allocation; but the inode might already be freed by now, + * and we cannot refer to inode or mapping or info to check. + * However, we do hold page lock on the PageSwapCache page, + * so can check if that still has our reference remaining. + */ + if (!page_swapcount(*pagep)) + error = -ENOENT; + } + /* * We rely on shmem_swaplist_mutex, not only to protect the swaplist, * but also to hold up shmem_evict_inode(): so inode cannot be freed * beneath us (pagelock doesn't help until the page is in pagecache). */ - error = shmem_add_to_page_cache(page, mapping, index, + if (!error) + error = shmem_add_to_page_cache(*pagep, mapping, index, GFP_NOWAIT, radswap); - /* which does mem_cgroup_uncharge_cache_page on error */ - if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which * only does trylock page: if we raced, best clean up here. */ - delete_from_swap_cache(page); - set_page_dirty(page); + delete_from_swap_cache(*pagep); + set_page_dirty(*pagep); if (!error) { spin_lock(&info->lock); info->swapped--; @@ -660,7 +679,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page) struct list_head *this, *next; struct shmem_inode_info *info; int found = 0; - int error; + int error = 0; + + /* + * There's a faint possibility that swap page was replaced before + * caller locked it: it will come back later with the right page. + */ + if (unlikely(!PageSwapCache(page))) + goto out; /* * Charge page using GFP_KERNEL while we can wait, before taking @@ -676,7 +702,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); if (info->swapped) - found = shmem_unuse_inode(info, swap, page); + found = shmem_unuse_inode(info, swap, &page); else list_del_init(&info->swaplist); cond_resched(); @@ -685,8 +711,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) } mutex_unlock(&shmem_swaplist_mutex); - if (!found) - mem_cgroup_uncharge_cache_page(page); if (found < 0) error = found; out: @@ -855,6 +879,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } #endif +/* + * When a page is moved from swapcache to shmem filecache (either by the + * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of + * shmem_unuse_inode()), it may have been read in earlier from swap, in + * ignorance of the mapping it belongs to. If that mapping has special + * constraints (like the gma500 GEM driver, which requires RAM below 4GB), + * we may need to copy to a suitable page before moving to filecache. 
+ * + * In a future release, this may well be extended to respect cpuset and + * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); + * but for now it is a simple matter of zone. + */ +static bool shmem_should_replace_page(struct page *page, gfp_t gfp) +{ + return page_zonenum(page) > gfp_zone(gfp); +} + +static int shmem_replace_page(struct page **pagep, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct page *oldpage, *newpage; + struct address_space *swap_mapping; + pgoff_t swap_index; + int error; + + oldpage = *pagep; + swap_index = page_private(oldpage); + swap_mapping = page_mapping(oldpage); + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success by further cpuset and node constraints. + */ + gfp &= ~GFP_CONSTRAINT_MASK; + newpage = shmem_alloc_page(gfp, info, index); + if (!newpage) + return -ENOMEM; + VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); + + *pagep = newpage; + page_cache_get(newpage); + copy_highpage(newpage, oldpage); + + VM_BUG_ON(!PageLocked(oldpage)); + __set_page_locked(newpage); + VM_BUG_ON(!PageUptodate(oldpage)); + SetPageUptodate(newpage); + VM_BUG_ON(!PageSwapBacked(oldpage)); + SetPageSwapBacked(newpage); + VM_BUG_ON(!swap_index); + set_page_private(newpage, swap_index); + VM_BUG_ON(!PageSwapCache(oldpage)); + SetPageSwapCache(newpage); + + /* + * Our caller will very soon move newpage out of swapcache, but it's + * a nice clean interface for us to replace oldpage by newpage there. + */ + spin_lock_irq(&swap_mapping->tree_lock); + error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, + newpage); + __inc_zone_page_state(newpage, NR_FILE_PAGES); + __dec_zone_page_state(oldpage, NR_FILE_PAGES); + spin_unlock_irq(&swap_mapping->tree_lock); + BUG_ON(error); + + mem_cgroup_replace_page_cache(oldpage, newpage); + lru_cache_add_anon(newpage); + + ClearPageSwapCache(oldpage); + set_page_private(oldpage, 0); + + unlock_page(oldpage); + page_cache_release(oldpage); + page_cache_release(oldpage); + return 0; +} + /* * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * @@ -923,19 +1025,20 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); + if (!PageSwapCache(page) || page->mapping) { + error = -EEXIST; /* try again */ + goto failed; + } if (!PageUptodate(page)) { error = -EIO; goto failed; } wait_on_page_writeback(page); - /* Someone may have already done it for us */ - if (page->mapping) { - if (page->mapping == mapping && - page->index == index) - goto done; - error = -EEXIST; - goto failed; + if (shmem_should_replace_page(page, gfp)) { + error = shmem_replace_page(&page, gfp, info, index); + if (error) + goto failed; } error = mem_cgroup_cache_charge(page, current->mm, @@ -998,7 +1101,7 @@ repeat: if (sgp == SGP_DIRTY) set_page_dirty(page); } -done: + /* Perhaps the file has been truncated since we checked */ if (sgp != SGP_WRITE && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { -- cgit v1.2.3-18-g5258 From 2f6e38f3cd17a7858112f538c1700c747170db1f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:38 -0700 Subject: tmpfs: enable NOSEC optimization Let tmpfs into the NOSEC optimization (avoiding file_remove_suid() overhead on most common writes): set MS_NOSEC on its superblocks. 
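[Editorial aside, not part of any commit message: the zone check that the
first patch in this series adds (shmem_should_replace_page) exists because
a driver may constrain the gfp mask of its shmem mapping. As that commit
message notes, gma500's gem.c does so with mapping_set_gfp_mask(); a
minimal driver-side sketch, with the function name invented here purely
for illustration, looks like this:

	#include <linux/fs.h>
	#include <linux/gfp.h>
	#include <linux/pagemap.h>

	/* Illustrative only: ask shmem for backing pages below 4GB */
	static void gem_constrain_backing_ram(struct inode *inode)
	{
		mapping_set_gfp_mask(inode->i_mapping,
				     GFP_KERNEL | __GFP_DMA32);
	}

With such a mask in place, shmem_should_replace_page() compares
page_zonenum(page) against gfp_zone() of the mapping's mask whenever a
swapped-in page is about to move into the filecache, and
shmem_replace_page() copies it to a suitable page if its zone is too
high.]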
Signed-off-by: Hugh Dickins
Cc: Christoph Hellwig
Cc: Andi Kleen
Cc: Al Viro
Cc: Cong Wang
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index db72d8e44ec..fe5ae6962ab 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2373,6 +2373,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}
 	sb->s_export_op = &shmem_export_ops;
+	sb->s_flags |= MS_NOSEC;
 #else
 	sb->s_flags |= MS_NOUSER;
 #endif
-- cgit v1.2.3-18-g5258


From ec9516fbc5fa814014991e1ae7f8860127122105 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 29 May 2012 15:06:39 -0700
Subject: tmpfs: optimize clearing when writing

Nick proposed years ago that tmpfs should avoid clearing its pages where
write will overwrite them with new data, as ramfs has long done. But I
messed it up and just got bad data. Tried again recently, it works fine.

Here's time output for writing 4GiB 16 times on this Core i5 laptop:

before:	real 0m21.169s	user 0m0.028s	sys 0m21.057s
	real 0m21.382s	user 0m0.016s	sys 0m21.289s
	real 0m21.311s	user 0m0.020s	sys 0m21.217s
after:	real 0m18.273s	user 0m0.032s	sys 0m18.165s
	real 0m18.354s	user 0m0.020s	sys 0m18.265s
	real 0m18.440s	user 0m0.032s	sys 0m18.337s
ramfs:	real 0m16.860s	user 0m0.028s	sys 0m16.765s
	real 0m17.382s	user 0m0.040s	sys 0m17.273s
	real 0m17.133s	user 0m0.044s	sys 0m17.021s

Yes, I have done perf reports, but they need more explanation than they
deserve: in summary, clear_page vanishes, its cache loading shifts into
copy_user_generic_unrolled; shmem_getpage_gfp goes down, and surprisingly
mark_page_accessed goes way up - I think because they are respectively
where the cache gets to be reloaded after being purged by clear or copy.

Suggested-by: Nick Piggin
Signed-off-by: Hugh Dickins
Cc: Christoph Hellwig
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index fe5ae6962ab..45c26476f0f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1095,9 +1095,14 @@ repeat:
 		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);

-		clear_highpage(page);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
+		/*
+		 * Let SGP_WRITE caller clear ends if write does not fill page
+		 */
+		if (sgp != SGP_WRITE) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+		}
 		if (sgp == SGP_DIRTY)
 			set_page_dirty(page);
 	}
@@ -1307,6 +1312,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 	if (pos + copied > inode->i_size)
 		i_size_write(inode, pos + copied);

+	if (!PageUptodate(page)) {
+		if (copied < PAGE_CACHE_SIZE) {
+			unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+			zero_user_segments(page, 0, from,
+					from + copied, PAGE_CACHE_SIZE);
+		}
+		SetPageUptodate(page);
+	}
 	set_page_dirty(page);
 	unlock_page(page);
 	page_cache_release(page);
@@ -1768,6 +1781,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 		kaddr = kmap_atomic(page);
 		memcpy(kaddr, symname, len);
 		kunmap_atomic(kaddr);
+		SetPageUptodate(page);
 		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
-- cgit v1.2.3-18-g5258


From 83e4fa9c16e4af7122e31be3eca5d57881d236fe Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 29 May 2012 15:06:40 -0700
Subject: tmpfs: support fallocate FALLOC_FL_PUNCH_HOLE

tmpfs has supported hole-punching since 2.6.16, via madvise(,,MADV_REMOVE).
But nowadays fallocate(,FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,,) is the agreed way to punch holes. So add shmem_fallocate() to support that, and tweak shmem_truncate_range() to support partial pages at both the beginning and end of range (never needed for madvise, which demands rounded addr and rounds up length). Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 11 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 45c26476f0f..7e54ff1c63e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include #include #include @@ -432,21 +433,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); - pgoff_t end = (lend >> PAGE_CACHE_SHIFT); + pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; + unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); + unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; long nr_swaps_freed = 0; pgoff_t index; int i; - BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); + if (lend == -1) + end = -1; /* unsigned, so actually very big */ pagevec_init(&pvec, 0); index = start; - while (index <= end) { + while (index < end) { pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) break; @@ -455,7 +458,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct page *page = pvec.pages[i]; index = indices[i]; - if (index > end) + if (index >= end) break; if (radix_tree_exceptional_entry(page)) { @@ -479,22 +482,39 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) index++; } - if (partial) { + if (partial_start) { struct page *page = NULL; shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); if (page) { - zero_user_segment(page, partial, PAGE_CACHE_SIZE); + unsigned int top = PAGE_CACHE_SIZE; + if (start > end) { + top = partial_end; + partial_end = 0; + } + zero_user_segment(page, partial_start, top); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + } + } + if (partial_end) { + struct page *page = NULL; + shmem_getpage(inode, end, &page, SGP_READ, NULL); + if (page) { + zero_user_segment(page, 0, partial_end); set_page_dirty(page); unlock_page(page); page_cache_release(page); } } + if (start >= end) + return; index = start; for ( ; ; ) { cond_resched(); pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) { if (index == start) @@ -502,7 +522,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) index = start; continue; } - if (index == start && indices[0] > end) { + if (index == start && indices[0] >= end) { shmem_deswap_pagevec(&pvec); pagevec_release(&pvec); break; @@ -512,7 +532,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) struct 
page *page = pvec.pages[i]; index = indices[i]; - if (index > end) + if (index >= end) break; if (radix_tree_exceptional_entry(page)) { @@ -1578,6 +1598,31 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } +static long shmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + int error = -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + if (mode & FALLOC_FL_PUNCH_HOLE) { + struct address_space *mapping = file->f_mapping; + loff_t unmap_start = round_up(offset, PAGE_SIZE); + loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + shmem_truncate_range(inode, offset, offset + len - 1); + /* No need to unmap again: hole-punching leaves COWed pages */ + error = 0; + } + + mutex_unlock(&inode->i_mutex); + return error; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -2490,6 +2535,7 @@ static const struct file_operations shmem_file_operations = { .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = generic_file_splice_write, + .fallocate = shmem_fallocate, #endif }; -- cgit v1.2.3-18-g5258 From 17cf28afea2a1112f240a3a2da8af883be024811 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:41 -0700 Subject: mm/fs: remove truncate_range Remove vmtruncate_range(), and remove the truncate_range method from struct inode_operations: only tmpfs ever supported it, and tmpfs has now converted over to using the fallocate method of file_operations. Update Documentation accordingly, adding (setlease and) fallocate lines. And while we're in mm.h, remove duplicate declarations of shmem_lock() and shmem_file_setup(): everyone is now using the ones in shmem_fs.h. Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 7e54ff1c63e..f368d0acb52 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2541,7 +2541,6 @@ static const struct file_operations shmem_file_operations = { static const struct inode_operations shmem_inode_operations = { .setattr = shmem_setattr, - .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, -- cgit v1.2.3-18-g5258 From e2d12e22c59ce714008aa5266d769f8568d74eac Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:41 -0700 Subject: tmpfs: support fallocate preallocation The systemd plumbers expressed a wish that tmpfs support preallocation. Cong Wang wrote a patch, but several kernel guys expressed scepticism: https://lkml.org/lkml/2011/11/18/137 Christoph Hellwig: What for exactly? Please explain why preallocating on tmpfs would make any sense. Kay Sievers: To be able to safely use mmap(), regarding SIGBUS, on files on the /dev/shm filesystem. The glibc fallback loop for -ENOSYS [or -EOPNOTSUPP] on fallocate is just ugly. Hugh Dickins: If tmpfs is going to support fallocate(FALLOC_FL_PUNCH_HOLE), it would seem perverse to permit the deallocation but fail the allocation. Christoph Hellwig: Agreed. Now that we do have shmem_fallocate() for hole-punching, plumb in basic support for preallocation mode too. 
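[Editorial aside, not part of the commit message: for reference, the two
fallocate(2) modes added to tmpfs in this series are driven from userspace
as below. This is a minimal sketch assuming a tmpfs mount at /dev/shm and
an invented file name; on kernels without these patches both calls fail on
tmpfs with EOPNOTSUPP, the case behind the "glibc fallback loop" complaint
quoted above.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT, 0600);

		if (fd < 0)
			return 1;

		/* Preallocate 16MB of backing pages (mode 0 extends i_size) */
		if (fallocate(fd, 0, 0, 16 << 20) != 0)
			perror("fallocate preallocate");

		/* Punch out the first 1MB, leaving i_size untouched */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      0, 1 << 20) != 0)
			perror("fallocate punch hole");

		close(fd);
		return 0;
	}

Both calls take effect immediately: the punched range reads back as
zeroes, and after the later SEEK_DATA/SEEK_HOLE patch it is visible to
llseek as a hole as well.]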
It's fairly straightforward (though quite a few details needed attention), except for when it fails part way through. What a pity that fallocate(2) was not specified to return the length allocated, permitting short fallocations! As it is, when it fails part way through, we ought to free what has just been allocated by this system call; but must be very sure not to free any allocated earlier, or any allocated by racing accesses (not all excluded by i_mutex). But we cannot distinguish them: so in this patch simply leak allocations on partial failure (they will be freed later if the file is removed). An attractive alternative approach would have been for fallocate() not to allocate pages at all, but note reservations by entries in the radix-tree. But that would give less assurance, and, critically, would be hard to fit with mem cgroups (who owns the reservations?): allocating pages lets fallocate() behave in just the same way as write(). Based-on-patch-by: Cong Wang Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index f368d0acb52..9b90d89e54c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1602,7 +1602,9 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file->f_path.dentry->d_inode; - int error = -EOPNOTSUPP; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + pgoff_t start, index, end; + int error; mutex_lock(&inode->i_mutex); @@ -1617,8 +1619,65 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ error = 0; + goto out; } + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + start = offset >> PAGE_CACHE_SHIFT; + end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + /* Try to avoid a swapstorm if len is impossible to satisfy */ + if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { + error = -ENOSPC; + goto out; + } + + for (index = start; index < end; index++) { + struct page *page; + + /* + * Good, the fallocate(2) manpage permits EINTR: we may have + * been interrupted because we are using up too much memory. + */ + if (signal_pending(current)) + error = -EINTR; + else + error = shmem_getpage(inode, index, &page, SGP_WRITE, + NULL); + if (error) { + /* + * We really ought to free what we allocated so far, + * but it would be wrong to free pages allocated + * earlier, or already now in use: i_mutex does not + * exclude all cases. We do not know what to free. + */ + goto ctime; + } + + if (!PageUptodate(page)) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + /* + * set_page_dirty so that memory pressure will swap rather + * than free the pages we are allocating (and SGP_CACHE pages + * might still be clean: we now need to mark those dirty too). 
+ */ + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + cond_resched(); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); +ctime: + inode->i_ctime = CURRENT_TIME; +out: mutex_unlock(&inode->i_mutex); return error; } -- cgit v1.2.3-18-g5258 From 1635f6a74152f1dcd1b888231609d64875f0a81a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:42 -0700 Subject: tmpfs: undo fallocation on failure In the previous episode, we left the already-fallocated pages attached to the file when shmem_fallocate() fails part way through. Now try to do better, by extending the earlier optimization of !Uptodate pages (then always under page lock) to !Uptodate pages (outside of page lock), representing fallocated pages. And don't waste time clearing them at the time of fallocate(), leave that until later if necessary. Adapt shmem_truncate_range() to shmem_undo_range(), so that a failing fallocate can recognize and remove precisely those !Uptodate allocations which it added (and were not independently allocated by racing tasks). But unless we start playing with swapfile.c and memcontrol.c too, once one of our fallocated pages reaches shmem_writepage(), we do then have to instantiate it as an ordinarily allocated page, before swapping out. This is unsatisfactory, but improved in the next episode. Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Cong Wang Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 105 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 33 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 9b90d89e54c..793dcd1bac8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -89,7 +89,8 @@ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ - SGP_WRITE, /* may exceed i_size, may allocate page */ + SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ + SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; #ifdef CONFIG_TMPFS @@ -427,8 +428,10 @@ void shmem_unlock_mapping(struct address_space *mapping) /* * Remove range of pages and swap entries from radix tree, and free them. + * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. 
*/ -void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, + bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -462,6 +465,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) break; if (radix_tree_exceptional_entry(page)) { + if (unfalloc) + continue; nr_swaps_freed += !shmem_free_swap(mapping, index, page); continue; @@ -469,9 +474,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) if (!trylock_page(page)) continue; - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + if (!unfalloc || !PageUptodate(page)) { + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } } unlock_page(page); } @@ -517,12 +524,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) { - if (index == start) + if (index == start || unfalloc) break; index = start; continue; } - if (index == start && indices[0] >= end) { + if ((index == start || unfalloc) && indices[0] >= end) { shmem_deswap_pagevec(&pvec); pagevec_release(&pvec); break; @@ -536,15 +543,19 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) break; if (radix_tree_exceptional_entry(page)) { + if (unfalloc) + continue; nr_swaps_freed += !shmem_free_swap(mapping, index, page); continue; } lock_page(page); - if (page->mapping == mapping) { - VM_BUG_ON(PageWriteback(page)); - truncate_inode_page(mapping, page); + if (!unfalloc || !PageUptodate(page)) { + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } } unlock_page(page); } @@ -558,7 +569,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) info->swapped -= nr_swaps_freed; shmem_recalc_inode(inode); spin_unlock(&info->lock); +} +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -771,6 +786,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ goto redirty; } + + /* + * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC + * value into swapfile.c, the only way we can correctly account for a + * fallocated page arriving here is now to initialize it and write it. + */ + if (!PageUptodate(page)) { + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); + } + swap = get_swap_page(); if (!swap.val) goto redirty; @@ -994,6 +1021,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, swp_entry_t swap; int error; int once = 0; + int alloced = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; @@ -1005,19 +1033,21 @@ repeat: page = NULL; } - if (sgp != SGP_WRITE && + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto failed; } + /* fallocated page? 
*/ + if (page && !PageUptodate(page)) { + if (sgp != SGP_READ) + goto clear; + unlock_page(page); + page_cache_release(page); + page = NULL; + } if (page || (sgp == SGP_READ && !swap.val)) { - /* - * Once we can get the page lock, it must be uptodate: - * if there were an error in reading back from swap, - * the page would not be inserted into the filecache. - */ - BUG_ON(page && !PageUptodate(page)); *pagep = page; return 0; } @@ -1114,9 +1144,18 @@ repeat: inode->i_blocks += BLOCKS_PER_PAGE; shmem_recalc_inode(inode); spin_unlock(&info->lock); + alloced = true; /* - * Let SGP_WRITE caller clear ends if write does not fill page + * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; +clear: + /* + * Let SGP_WRITE caller clear ends if write does not fill page; + * but SGP_FALLOC on a page fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE) { clear_highpage(page); @@ -1128,10 +1167,13 @@ repeat: } /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && + if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; - goto trunc; + if (alloced) + goto trunc; + else + goto failed; } *pagep = page; return 0; @@ -1140,6 +1182,7 @@ repeat: * Error recovery. */ trunc: + info = SHMEM_I(inode); ClearPageDirty(page); delete_from_page_cache(page); spin_lock(&info->lock); @@ -1147,6 +1190,7 @@ trunc: inode->i_blocks -= BLOCKS_PER_PAGE; spin_unlock(&info->lock); decused: + sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) percpu_counter_add(&sbinfo->used_blocks, -1); unacct: @@ -1645,25 +1689,20 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (signal_pending(current)) error = -EINTR; else - error = shmem_getpage(inode, index, &page, SGP_WRITE, + error = shmem_getpage(inode, index, &page, SGP_FALLOC, NULL); if (error) { - /* - * We really ought to free what we allocated so far, - * but it would be wrong to free pages allocated - * earlier, or already now in use: i_mutex does not - * exclude all cases. We do not know what to free. - */ + /* Remove the !PageUptodate pages we added */ + shmem_undo_range(inode, + (loff_t)start << PAGE_CACHE_SHIFT, + (loff_t)index << PAGE_CACHE_SHIFT, true); goto ctime; } - if (!PageUptodate(page)) { - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); - } /* - * set_page_dirty so that memory pressure will swap rather + * If !PageUptodate, leave it that way so that freeable pages + * can be recognized if we need to rollback on error later. + * But set_page_dirty so that memory pressure will swap rather * than free the pages we are allocating (and SGP_CACHE pages * might still be clean: we now need to mark those dirty too). */ -- cgit v1.2.3-18-g5258 From 1aac1400319d30786f32b9290e9cc923937b3d57 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:42 -0700 Subject: tmpfs: quit when fallocate fills memory As it stands, a large fallocate() on tmpfs is liable to fill memory with pages, freed on failure except when they run into swap, at which point they become fixed into the file despite the failure. That feels quite wrong, to be consuming resources precisely when they're in short supply. 
Go the other way instead: have shmem_fallocate() indicate the range it has
fallocated to shmem_writepage(), keeping count of pages it's allocating;
have shmem_writepage() reactivate instead of swapping out pages fallocated
by this syscall (but happily swap out those from earlier occasions),
keeping count; and have shmem_fallocate() compare counts and give up once
the reactivated pages have started to come back to writepage
(approximately: some zones would in fact recycle faster than others).

This is a little unusual, but works well: although we could consider the
failure to swap as a bug, and fix it later with SWAP_MAP_FALLOC handling
added in swapfile.c and memcontrol.c, I doubt that we shall ever want to.

(If there's no swap, an over-large fallocate() on tmpfs is limited in the
same way as writing: stopped by rlimit, or by tmpfs mount size if that was
set sensibly, or by __vm_enough_memory() heuristics if OVERCOMMIT_GUESS or
OVERCOMMIT_NEVER. If OVERCOMMIT_ALWAYS, then it is liable to OOM-kill
others as writing would, but stops and frees if interrupted.)

Now that everything is freed on failure, we can skip updating ctime.

Signed-off-by: Hugh Dickins
Cc: Christoph Hellwig
Cc: Cong Wang
Cc: Kay Sievers
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/shmem.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 56 insertions(+), 2 deletions(-)

(limited to 'mm/shmem.c')

diff --git a/mm/shmem.c b/mm/shmem.c
index 793dcd1bac8..191663a8def 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -84,6 +84,18 @@ struct shmem_xattr {
 	char value[0];
 };

+/*
+ * shmem_fallocate and shmem_writepage communicate via inode->i_private
+ * (with i_mutex making sure that it has only one user at a time):
+ * we would prefer not to enlarge the shmem inode just for that.
+ */
+struct shmem_falloc {
+	pgoff_t start;		/* start of range currently being fallocated */
+	pgoff_t next;		/* the next page offset to be fallocated */
+	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
+	pgoff_t nr_unswapped;	/* how often writepage refused to swap out */
+};
+
 /* Flag allocation requirements to shmem_getpage */
 enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
@@ -791,8 +803,28 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
 	 * value into swapfile.c, the only way we can correctly account for a
 	 * fallocated page arriving here is now to initialize it and write it.
+	 *
+	 * That's okay for a page already fallocated earlier, but if we have
+	 * not yet completed the fallocation, then (a) we want to keep track
+	 * of this page in case we have to undo it, and (b) it may not be a
+	 * good idea to continue anyway, once we're pushing into swap. So
+	 * reactivate the page, and let shmem_fallocate() quit when too many.
*/ if (!PageUptodate(page)) { + if (inode->i_private) { + struct shmem_falloc *shmem_falloc; + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + index >= shmem_falloc->start && + index < shmem_falloc->next) + shmem_falloc->nr_unswapped++; + else + shmem_falloc = NULL; + spin_unlock(&inode->i_lock); + if (shmem_falloc) + goto redirty; + } clear_highpage(page); flush_dcache_page(page); SetPageUptodate(page); @@ -1647,6 +1679,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, { struct inode *inode = file->f_path.dentry->d_inode; struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_falloc shmem_falloc; pgoff_t start, index, end; int error; @@ -1679,6 +1712,14 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto out; } + shmem_falloc.start = start; + shmem_falloc.next = start; + shmem_falloc.nr_falloced = 0; + shmem_falloc.nr_unswapped = 0; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + for (index = start; index < end; index++) { struct page *page; @@ -1688,6 +1729,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, */ if (signal_pending(current)) error = -EINTR; + else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) + error = -ENOMEM; else error = shmem_getpage(inode, index, &page, SGP_FALLOC, NULL); @@ -1696,9 +1739,17 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, shmem_undo_range(inode, (loff_t)start << PAGE_CACHE_SHIFT, (loff_t)index << PAGE_CACHE_SHIFT, true); - goto ctime; + goto undone; } + /* + * Inform shmem_writepage() how far we have reached. + * No need for lock or barrier: we have the page lock. + */ + shmem_falloc.next++; + if (!PageUptodate(page)) + shmem_falloc.nr_falloced++; + /* * If !PageUptodate, leave it that way so that freeable pages * can be recognized if we need to rollback on error later. @@ -1714,8 +1765,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); -ctime: inode->i_ctime = CURRENT_TIME; +undone: + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); out: mutex_unlock(&inode->i_mutex); return error; -- cgit v1.2.3-18-g5258 From 4fb5ef089b288942c6fc3f85c4ecb4016c1aa4c3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 29 May 2012 15:06:43 -0700 Subject: tmpfs: support SEEK_DATA and SEEK_HOLE It's quite easy for tmpfs to scan the radix_tree to support llseek's new SEEK_DATA and SEEK_HOLE options: so add them while the minutiae are still on my mind (in particular, the !PageUptodate-ness of pages fallocated but still unwritten). But I don't know who actually uses SEEK_DATA or SEEK_HOLE, and whether it would be of any use to them on tmpfs. This code adds 92 lines and 752 bytes on x86_64 - is that bloat or worthwhile? 
[akpm@linux-foundation.org: fix warning with CONFIG_TMPFS=n] Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Josef Bacik Cc: Andi Kleen Cc: Andreas Dilger Cc: Dave Chinner Cc: Marco Stornelli Cc: Jeff liu Cc: Chris Mason Cc: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 191663a8def..d576b84d913 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1674,6 +1674,98 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } +/* + * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. + */ +static pgoff_t shmem_seek_hole_data(struct address_space *mapping, + pgoff_t index, pgoff_t end, int origin) +{ + struct page *page; + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + bool done = false; + int i; + + pagevec_init(&pvec, 0); + pvec.nr = 1; /* start small: we may be there already */ + while (!done) { + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + pvec.nr, pvec.pages, indices); + if (!pvec.nr) { + if (origin == SEEK_DATA) + index = end; + break; + } + for (i = 0; i < pvec.nr; i++, index++) { + if (index < indices[i]) { + if (origin == SEEK_HOLE) { + done = true; + break; + } + index = indices[i]; + } + page = pvec.pages[i]; + if (page && !radix_tree_exceptional_entry(page)) { + if (!PageUptodate(page)) + page = NULL; + } + if (index >= end || + (page && origin == SEEK_DATA) || + (!page && origin == SEEK_HOLE)) { + done = true; + break; + } + } + shmem_deswap_pagevec(&pvec); + pagevec_release(&pvec); + pvec.nr = PAGEVEC_SIZE; + cond_resched(); + } + return index; +} + +static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) +{ + struct address_space *mapping; + struct inode *inode; + pgoff_t start, end; + loff_t new_offset; + + if (origin != SEEK_DATA && origin != SEEK_HOLE) + return generic_file_llseek_size(file, offset, origin, + MAX_LFS_FILESIZE); + mapping = file->f_mapping; + inode = mapping->host; + mutex_lock(&inode->i_mutex); + /* We're holding i_mutex so we can access i_size directly */ + + if (offset < 0) + offset = -EINVAL; + else if (offset >= inode->i_size) + offset = -ENXIO; + else { + start = offset >> PAGE_CACHE_SHIFT; + end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + new_offset = shmem_seek_hole_data(mapping, start, end, origin); + new_offset <<= PAGE_CACHE_SHIFT; + if (new_offset > offset) { + if (new_offset < inode->i_size) + offset = new_offset; + else if (origin == SEEK_DATA) + offset = -ENXIO; + else + offset = inode->i_size; + } + } + + if (offset >= 0 && offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + mutex_unlock(&inode->i_mutex); + return offset; +} + static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -2679,7 +2771,7 @@ static const struct address_space_operations shmem_aops = { static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS - .llseek = generic_file_llseek, + .llseek = shmem_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = shmem_file_aio_read, -- cgit v1.2.3-18-g5258
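[Editorial aside, not part of the commit message: a minimal userspace
sketch of the SEEK_DATA/SEEK_HOLE support added above, walking the
allocated extents of a sparse tmpfs file (the file name is invented for
illustration):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/shm/sparse-demo", O_RDONLY);
		off_t end, data, hole = 0;

		if (fd < 0)
			return 1;
		end = lseek(fd, 0, SEEK_END);

		while (hole < end) {
			/* Find the next offset that actually has data... */
			data = lseek(fd, hole, SEEK_DATA);
			if (data < 0)
				break;	/* ENXIO: nothing but holes remain */
			/* ...and where that data extent ends */
			hole = lseek(fd, data, SEEK_HOLE);
			printf("data: %lld..%lld\n",
			       (long long)data, (long long)hole);
		}
		close(fd);
		return 0;
	}

As the patch implements it, pages fallocated but never written stay
!Uptodate and are reported as holes, so SEEK_DATA skips them just as it
skips never-allocated ranges.]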