Diffstat (limited to 'fs/buffer.c')
-rw-r--r--	fs/buffer.c	341
1 files changed, 250 insertions, 91 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index b5f044283ed..eba6e4f621c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,19 +41,26 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
-inline void
-init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 {
 	bh->b_end_io = handler;
 	bh->b_private = private;
 }
 EXPORT_SYMBOL(init_buffer);
 
+inline void touch_buffer(struct buffer_head *bh)
+{
+	trace_block_touch_buffer(bh);
+	mark_page_accessed(bh->b_page);
+}
+EXPORT_SYMBOL(touch_buffer);
+
 static int sleep_on_buffer(void *word)
 {
 	io_schedule();
@@ -70,12 +77,46 @@ EXPORT_SYMBOL(__lock_buffer);
 void unlock_buffer(struct buffer_head *bh)
 {
 	clear_bit_unlock(BH_Lock, &bh->b_state);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&bh->b_state, BH_Lock);
 }
 EXPORT_SYMBOL(unlock_buffer);
 
 /*
+ * Returns if the page has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the PageDirty information is stale. If
+ * any of the pages are locked, it is assumed they are locked for IO.
+ */
+void buffer_check_dirty_writeback(struct page *page,
+				     bool *dirty, bool *writeback)
+{
+	struct buffer_head *head, *bh;
+	*dirty = false;
+	*writeback = false;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		return;
+
+	if (PageWriteback(page))
+		*writeback = true;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (buffer_locked(bh))
+			*writeback = true;
+
+		if (buffer_dirty(bh))
+			*dirty = true;
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(buffer_check_dirty_writeback);
+
+/*
  * Block until a buffer comes unlocked. This doesn't stop it
  * from becoming locked again - you have to lock it yourself
  * if you want to preserve its state.
@@ -186,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	int all_mapped = 1;
 
 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
-	page = find_get_page(bd_mapping, index);
+	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
 	if (!page)
 		goto out;
@@ -555,7 +596,7 @@ void emergency_thaw_all(void)
  */
 int sync_mapping_buffers(struct address_space *mapping)
 {
-	struct address_space *buffer_mapping = mapping->assoc_mapping;
+	struct address_space *buffer_mapping = mapping->private_data;
 
 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 		return 0;
@@ -588,10 +629,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 	struct address_space *buffer_mapping = bh->b_page->mapping;
 
 	mark_buffer_dirty(bh);
-	if (!mapping->assoc_mapping) {
-		mapping->assoc_mapping = buffer_mapping;
+	if (!mapping->private_data) {
+		mapping->private_data = buffer_mapping;
 	} else {
-		BUG_ON(mapping->assoc_mapping != buffer_mapping);
+		BUG_ON(mapping->private_data != buffer_mapping);
 	}
 	if (!bh->b_assoc_map) {
 		spin_lock(&buffer_mapping->private_lock);
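
The buffer_check_dirty_writeback() helper added above samples a page's aggregate buffer state in one pass. A minimal kernel-context sketch of a caller; page_is_quiescent() is a hypothetical name, not part of this diff:

	#include <linux/buffer_head.h>

	/* Hypothetical helper, illustrative only: the caller must hold
	 * the page lock, or buffer_check_dirty_writeback() will BUG. */
	static bool page_is_quiescent(struct page *page)
	{
		bool dirty, writeback;

		buffer_check_dirty_writeback(page, &dirty, &writeback);
		return !dirty && !writeback;
	}

In the tree this information is consumed by reclaim, which wants to know whether a page is stalled on buffer IO even when the page-level flags are stale.
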
@@ -613,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
 static void __set_page_dirty(struct page *page,
 			struct address_space *mapping, int warn)
 {
-	spin_lock_irq(&mapping->tree_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 		account_page_dirtied(page, mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 }
@@ -788,7 +831,7 @@ void invalidate_inode_buffers(struct inode *inode)
 	if (inode_has_buffers(inode)) {
 		struct address_space *mapping = &inode->i_data;
 		struct list_head *list = &mapping->private_list;
-		struct address_space *buffer_mapping = mapping->assoc_mapping;
+		struct address_space *buffer_mapping = mapping->private_data;
 
 		spin_lock(&buffer_mapping->private_lock);
 		while (!list_empty(list))
@@ -811,7 +854,7 @@ int remove_inode_buffers(struct inode *inode)
 	if (inode_has_buffers(inode)) {
 		struct address_space *mapping = &inode->i_data;
 		struct list_head *list = &mapping->private_list;
-		struct address_space *buffer_mapping = mapping->assoc_mapping;
+		struct address_space *buffer_mapping = mapping->private_data;
 
 		spin_lock(&buffer_mapping->private_lock);
 		while (!list_empty(list)) {
@@ -850,19 +893,14 @@ try_again:
 		if (!bh)
 			goto no_grow;
 
-		bh->b_bdev = NULL;
 		bh->b_this_page = head;
 		bh->b_blocknr = -1;
 		head = bh;
 
-		bh->b_state = 0;
-		atomic_set(&bh->b_count, 0);
 		bh->b_size = size;
 
 		/* Link the buffer to its page */
 		set_bh_page(bh, page, offset);
-
-		init_buffer(bh, NULL, NULL);
 	}
 	return head;
 /*
@@ -911,6 +949,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 	attach_page_buffers(page, head);
 }
 
+static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
+{
+	sector_t retval = ~((sector_t)0);
+	loff_t sz = i_size_read(bdev->bd_inode);
+
+	if (sz) {
+		unsigned int sizebits = blksize_bits(size);
+		retval = (sz >> sizebits);
+	}
+	return retval;
+}
+
 /*
  * Initialise the state of a blockdev page's buffers.
  */
@@ -921,7 +971,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
 	struct buffer_head *head = page_buffers(page);
 	struct buffer_head *bh = head;
 	int uptodate = PageUptodate(page);
-	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode));
+	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 
 	do {
 		if (!buffer_mapped(bh)) {
@@ -957,9 +1007,19 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	struct buffer_head *bh;
 	sector_t end_block;
 	int ret = 0;		/* Will call free_more_memory() */
+	gfp_t gfp_mask;
+
+	gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
+	gfp_mask |= __GFP_MOVABLE;
+	/*
+	 * XXX: __getblk_slow() can not really deal with failure and
+	 * will endlessly loop on improvised global reclaim. Prefer
+	 * looping in the allocator rather than here, at least that
+	 * code knows what it's doing.
+	 */
+	gfp_mask |= __GFP_NOFAIL;
 
-	page = find_or_create_page(inode->i_mapping, index,
-		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
+	page = find_or_create_page(inode->i_mapping, index, gfp_mask);
 	if (!page)
 		return ret;
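
blkdev_max_block() now takes the block size explicitly, so the end-of-device block is just the device size shifted down by blksize_bits(size). A self-contained userspace sketch of that arithmetic, with illustrative values:

	#include <stdio.h>
	#include <stdint.h>

	/* Mirrors blkdev_max_block(); blksize_bits(4096) == 12 in the kernel. */
	static uint64_t max_block(uint64_t dev_bytes, unsigned int blkbits)
	{
		return dev_bytes ? dev_bytes >> blkbits : ~(uint64_t)0;
	}

	int main(void)
	{
		/* 4 MiB device, 4096-byte blocks: valid block numbers are 0..1023 */
		printf("%llu\n", (unsigned long long)max_block(4u << 20, 12));
		return 0;
	}
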
@@ -1105,6 +1165,8 @@ void mark_buffer_dirty(struct buffer_head *bh)
 {
 	WARN_ON_ONCE(!buffer_uptodate(bh));
 
+	trace_block_dirty_buffer(bh);
+
 	/*
 	 * Very *carefully* optimize the it-is-already-dirty case.
 	 *
@@ -1252,7 +1314,7 @@ static void bh_lru_install(struct buffer_head *bh)
 		}
 		while (out < BH_LRU_SIZE)
 			bhs[out++] = NULL;
-		memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
+		memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
 	}
 	bh_lru_unlock();
@@ -1304,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
 
 	if (bh == NULL) {
+		/* __find_get_block_slow will mark the page accessed */
 		bh = __find_get_block_slow(bdev, block);
 		if (bh)
 			bh_lru_install(bh);
-	}
-	if (bh)
+	} else
 		touch_buffer(bh);
+
 	return bh;
 }
 EXPORT_SYMBOL(__find_get_block);
@@ -1421,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page);
 /*
  * Called when truncating a buffer on a page completely.
  */
+
+/* Bits that are cleared during an invalidate */
+#define BUFFER_FLAGS_DISCARD \
+	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
+	 1 << BH_Delay | 1 << BH_Unwritten)
+
 static void discard_buffer(struct buffer_head * bh)
 {
+	unsigned long b_state, b_state_old;
+
 	lock_buffer(bh);
 	clear_buffer_dirty(bh);
 	bh->b_bdev = NULL;
-	clear_buffer_mapped(bh);
-	clear_buffer_req(bh);
-	clear_buffer_new(bh);
-	clear_buffer_delay(bh);
-	clear_buffer_unwritten(bh);
+	b_state = bh->b_state;
+	for (;;) {
+		b_state_old = cmpxchg(&bh->b_state, b_state,
+				      (b_state & ~BUFFER_FLAGS_DISCARD));
+		if (b_state_old == b_state)
+			break;
+		b_state = b_state_old;
+	}
 	unlock_buffer(bh);
 }
@@ -1438,7 +1512,8 @@ static void discard_buffer(struct buffer_head * bh)
 * block_invalidatepage - invalidate part or all of a buffer-backed page
 *
 * @page: the page which is affected
- * @offset: the index of the truncation point
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
 *
 * block_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
@@ -1449,15 +1524,22 @@ static void discard_buffer(struct buffer_head * bh)
 * point. Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
-void block_invalidatepage(struct page *page, unsigned long offset)
+void block_invalidatepage(struct page *page, unsigned int offset,
+			  unsigned int length)
 {
 	struct buffer_head *head, *bh, *next;
 	unsigned int curr_off = 0;
+	unsigned int stop = length + offset;
 
 	BUG_ON(!PageLocked(page));
 	if (!page_has_buffers(page))
 		goto out;
 
+	/*
+	 * Check for overflow
+	 */
+	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
 	head = page_buffers(page);
 	bh = head;
 	do {
@@ -1465,6 +1547,12 @@ void block_invalidatepage(struct page *page, unsigned long offset)
 		next = bh->b_this_page;
 
 		/*
+		 * Are we still fully in range ?
+		 */
+		if (next_off > stop)
+			goto out;
+
+		/*
 		 * is this block fully invalidated?
 		 */
 		if (offset <= curr_off)
@@ -1485,6 +1573,7 @@ out:
 }
 EXPORT_SYMBOL(block_invalidatepage);
 
+
 /*
 * We attach and possibly dirty the buffers atomically wrt
 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
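
discard_buffer() now clears all the discard bits in one atomic read-modify-write instead of five separate clear_buffer_*() calls, so a concurrent update to an unrelated flag is never lost. A userspace model of the same cmpxchg retry loop, using C11 atomics and a stand-in flag mask (the real BUFFER_FLAGS_DISCARD value depends on the BH_* bit numbers):

	#include <stdatomic.h>
	#include <stdio.h>

	#define FLAGS_DISCARD 0x3eUL	/* stand-in for BUFFER_FLAGS_DISCARD */

	static void clear_discard_bits(_Atomic unsigned long *state)
	{
		unsigned long old = atomic_load(state);

		/* on failure the CAS refreshes 'old'; retry until we win */
		while (!atomic_compare_exchange_weak(state, &old,
						     old & ~FLAGS_DISCARD))
			;
	}

	int main(void)
	{
		_Atomic unsigned long st = 0xffUL;

		clear_discard_bits(&st);
		printf("%#lx\n", atomic_load(&st));	/* prints 0xc1 */
		return 0;
	}
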
@@ -1553,6 +1642,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
 EXPORT_SYMBOL(unmap_underlying_metadata);
 
 /*
+ * Size is a power-of-two in the range 512..PAGE_SIZE,
+ * and the case we care about most is PAGE_SIZE.
+ *
+ * So this *could* possibly be written with those
+ * constraints in mind (relevant mostly if some
+ * architecture has a slow bit-scan instruction)
+ */
+static inline int block_size_bits(unsigned int blocksize)
+{
+	return ilog2(blocksize);
+}
+
+static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+	return page_buffers(page);
+}
+
+/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
@@ -1589,19 +1700,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	sector_t block;
 	sector_t last_block;
 	struct buffer_head *bh, *head;
-	const unsigned blocksize = 1 << inode->i_blkbits;
+	unsigned int blocksize, bbits;
 	int nr_underway = 0;
 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 
-	BUG_ON(!PageLocked(page));
-
-	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, blocksize,
+	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
 
 	/*
 	 * Be very careful. We have no exclusion from __set_page_dirty_buffers
@@ -1613,9 +1718,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	 * handle that here by just cleaning them.
 	 */
 
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	head = page_buffers(page);
 	bh = head;
+	blocksize = bh->b_size;
+	bbits = block_size_bits(blocksize);
+
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	last_block = (i_size_read(inode) - 1) >> bbits;
 
 	/*
 	 * Get all the dirty buffers mapped to disk addresses and
@@ -1806,12 +1914,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	BUG_ON(from > to);
 
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
 
-	bbits = inode->i_blkbits;
 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
 
 	for(bh = head, block_start = 0; bh != head || !block_start;
@@ -1881,11 +1987,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 	unsigned blocksize;
 	struct buffer_head *bh, *head;
 
-	blocksize = 1 << inode->i_blkbits;
+	bh = head = page_buffers(page);
+	blocksize = bh->b_size;
 
-	for(bh = head = page_buffers(page), block_start = 0;
-	    bh != head || !block_start;
-	    block_start=block_end, bh = bh->b_this_page) {
+	block_start = 0;
+	do {
 		block_end = block_start + blocksize;
 		if (block_end <= from || block_start >= to) {
 			if (!buffer_uptodate(bh))
@@ -1895,7 +2001,10 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 			mark_buffer_dirty(bh);
 		}
 		clear_buffer_new(bh);
-	}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
 
 	/*
 	 * If this is a partial write which happened to make all buffers
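
Since block sizes are powers of two between 512 and PAGE_SIZE, the new block_size_bits() is just ilog2() of the buffer size pulled from the first buffer head. A quick userspace equivalent showing the mapping from block size to shift count (__builtin_ctz is the GCC/Clang analogue used here for illustration):

	#include <stdio.h>

	/* Stand-in for block_size_bits()/ilog2() on a power-of-two
	 * block size: the index of the single set bit. */
	static int block_size_bits(unsigned int blocksize)
	{
		return __builtin_ctz(blocksize);
	}

	int main(void)
	{
		printf("%d %d\n", block_size_bits(512),		/* 9 */
				  block_size_bits(4096));	/* 12 */
		return 0;
	}
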
@@ -2017,10 +2126,9 @@ EXPORT_SYMBOL(generic_write_end);
 * Returns true if all buffers which correspond to a file portion
 * we want to read are uptodate.
 */
-int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
-					unsigned long from)
+int block_is_partially_uptodate(struct page *page, unsigned long from,
+					unsigned long count)
 {
-	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end, blocksize;
 	unsigned to;
 	struct buffer_head *bh, *head;
@@ -2029,13 +2137,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 	if (!page_has_buffers(page))
 		return 0;
 
-	blocksize = 1 << inode->i_blkbits;
-	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
+	head = page_buffers(page);
+	blocksize = head->b_size;
+	to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
 	to = from + to;
 	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
 		return 0;
 
-	head = page_buffers(page);
 	bh = head;
 	block_start = 0;
 	do {
@@ -2068,18 +2176,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 	struct inode *inode = page->mapping->host;
 	sector_t iblock, lblock;
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
-	unsigned int blocksize;
+	unsigned int blocksize, bbits;
 	int nr, i;
 	int fully_mapped = 1;
 
-	BUG_ON(!PageLocked(page));
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
 
-	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
 	bh = head;
 	nr = 0;
 	i = 0;
@@ -2307,7 +2413,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 			 get_block_t get_block)
 {
 	struct page *page = vmf->page;
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct inode *inode = file_inode(vma->vm_file);
 	unsigned long end;
 	loff_t size;
 	int ret;
@@ -2334,7 +2440,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (unlikely(ret < 0))
 		goto out_unlock;
 	set_page_dirty(page);
-	wait_on_page_writeback(page);
+	wait_for_stable_page(page);
 	return 0;
 out_unlock:
 	unlock_page(page);
@@ -2346,7 +2452,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 		   get_block_t get_block)
 {
 	int ret;
-	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
 
 	sb_start_pagefault(sb);
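
block_is_partially_uptodate() now takes the byte range directly instead of a read_descriptor_t, and derives the block size from the first buffer head rather than the inode. A userspace sketch of just its range clamping (the values are illustrative; the real function then walks the buffer ring):

	#include <stdio.h>

	#define PAGE_CACHE_SIZE 4096u

	/* Mirrors: to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
	 *          to = from + to;  -- the checked range never leaves the page */
	static unsigned int clamp_to(unsigned int from, unsigned long count)
	{
		unsigned long left = PAGE_CACHE_SIZE - from;

		return from + (count < left ? count : left);
	}

	int main(void)
	{
		/* a 100000-byte request starting at 1024 is clipped to page end */
		printf("%u\n", clamp_to(1024, 100000));	/* prints 4096 */
		return 0;
	}
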
@@ -2785,10 +2891,9 @@ EXPORT_SYMBOL(block_truncate_page);
 
 /*
 * The generic ->writepage function for buffer-backed address_spaces
- * this form passes in the end_io handler used to finish the IO.
 */
-int block_write_full_page_endio(struct page *page, get_block_t *get_block,
-			struct writeback_control *wbc, bh_end_io_t *handler)
+int block_write_full_page(struct page *page, get_block_t *get_block,
+			struct writeback_control *wbc)
 {
 	struct inode * const inode = page->mapping->host;
 	loff_t i_size = i_size_read(inode);
@@ -2798,7 +2903,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 	/* Is the page fully inside i_size? */
 	if (page->index < end_index)
 		return __block_write_full_page(inode, page, get_block, wbc,
-					       handler);
+					       end_buffer_async_write);
 
 	/* Is the page fully outside i_size? (truncate in progress) */
 	offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2808,7 +2913,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 		 * they may have been added in ext3_writepage(). Make them
 		 * freeable here, so the page does not leak.
 		 */
-		do_invalidatepage(page, 0);
+		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
 		unlock_page(page);
 		return 0; /* don't care */
 	}
@@ -2821,18 +2926,8 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
 	 * writes to that region are not written out to the file."
 	 */
 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
-	return __block_write_full_page(inode, page, get_block, wbc, handler);
-}
-EXPORT_SYMBOL(block_write_full_page_endio);
-
-/*
- * The generic ->writepage function for buffer-backed address_spaces
- */
-int block_write_full_page(struct page *page, get_block_t *get_block,
-			struct writeback_control *wbc)
-{
-	return block_write_full_page_endio(page, get_block, wbc,
-					   end_buffer_async_write);
+	return __block_write_full_page(inode, page, get_block, wbc,
+							end_buffer_async_write);
 }
 EXPORT_SYMBOL(block_write_full_page);
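
With the _endio variant folded back in, block_write_full_page() always completes through end_buffer_async_write(). Its partial-page handling is plain mask arithmetic on i_size; a worked userspace example with an illustrative file size:

	#include <stdio.h>

	#define PAGE_CACHE_SHIFT 12
	#define PAGE_CACHE_SIZE (1u << PAGE_CACHE_SHIFT)

	int main(void)
	{
		long long i_size = 10000;	/* illustrative file size */
		unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
		unsigned int offset = i_size & (PAGE_CACHE_SIZE - 1);

		/* page 2 is the last page; bytes 1808..4095 of it are
		 * outside i_size and get zeroed before writeback */
		printf("end_index=%lu offset=%u\n", end_index, offset);
		return 0;
	}
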
@@ -2864,7 +2959,57 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-int submit_bh(int rw, struct buffer_head * bh)
+/*
+ * This allows us to do IO even on the odd last sectors
+ * of a device, even if the bh block size is some multiple
+ * of the physical sector size.
+ *
+ * We'll just truncate the bio to the size of the device,
+ * and clear the end of the buffer head manually.
+ *
+ * Truly out-of-range accesses will turn into actual IO
+ * errors, this only handles the "we need to be able to
+ * do IO at the final sector" case.
+ */
+static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+{
+	sector_t maxsector;
+	unsigned bytes;
+
+	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+	if (!maxsector)
+		return;
+
+	/*
+	 * If the *whole* IO is past the end of the device,
+	 * let it through, and the IO layer will turn it into
+	 * an EIO.
+	 */
+	if (unlikely(bio->bi_iter.bi_sector >= maxsector))
+		return;
+
+	maxsector -= bio->bi_iter.bi_sector;
+	bytes = bio->bi_iter.bi_size;
+	if (likely((bytes >> 9) <= maxsector))
+		return;
+
+	/* Uhhuh. We've got a bh that straddles the device size! */
+	bytes = maxsector << 9;
+
+	/* Truncate the bio.. */
+	bio->bi_iter.bi_size = bytes;
+	bio->bi_io_vec[0].bv_len = bytes;
+
+	/* ..and clear the end of the buffer for reads */
+	if ((rw & RW_MASK) == READ) {
+		void *kaddr = kmap_atomic(bh->b_page);
+		memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
+		kunmap_atomic(kaddr);
+		flush_dcache_page(bh->b_page);
+	}
+}
+
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 {
 	struct bio *bio;
 	int ret = 0;
@@ -2887,18 +3032,26 @@ int submit_bh(int rw, struct buffer_head * bh)
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
-	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio->bi_bdev = bh->b_bdev;
 	bio->bi_io_vec[0].bv_page = bh->b_page;
 	bio->bi_io_vec[0].bv_len = bh->b_size;
 	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
 
 	bio->bi_vcnt = 1;
-	bio->bi_idx = 0;
-	bio->bi_size = bh->b_size;
+	bio->bi_iter.bi_size = bh->b_size;
 
 	bio->bi_end_io = end_bio_bh_io_sync;
 	bio->bi_private = bh;
+	bio->bi_flags |= bio_flags;
+
+	/* Take care of bh's that straddle the end of the device */
+	guard_bh_eod(rw, bio, bh);
+
+	if (buffer_meta(bh))
+		rw |= REQ_META;
+	if (buffer_prio(bh))
+		rw |= REQ_PRIO;
 
 	bio_get(bio);
 	submit_bio(rw, bio);
@@ -2909,6 +3062,12 @@ int submit_bh(int rw, struct buffer_head * bh)
 	bio_put(bio);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(_submit_bh);
+
+int submit_bh(int rw, struct buffer_head *bh)
+{
+	return _submit_bh(rw, bh, 0);
+}
 EXPORT_SYMBOL(submit_bh);
 
 /**
@@ -2930,7 +3089,7 @@ EXPORT_SYMBOL(submit_bh);
 * until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to simple completion handler that marks
- * the buffer up-to-date (if approriate), unlocks the buffer and wakes
+ * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.
 *
 * All of the buffers must be for the same device, and must also be a
@@ -3149,7 +3308,7 @@ static struct kmem_cache *bh_cachep __read_mostly;
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
-static int max_buffer_heads;
+static unsigned long max_buffer_heads;
 
 int buffer_heads_over_limit;
@@ -3265,7 +3424,7 @@ EXPORT_SYMBOL(bh_submit_read);
 
 void __init buffer_init(void)
 {
-	int nrpages;
+	unsigned long nrpages;
 
 	bh_cachep = kmem_cache_create("buffer_head",
 			sizeof(struct buffer_head), 0,
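
guard_bh_eod() above truncates a bio that would run past the end of the device and, for reads, zeroes the tail of the buffer that no longer maps to real sectors. A userspace model of the truncation math, using an odd-sized device as in the comment (all values are illustrative):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t maxsector = 4093;	/* device size in 512-byte sectors */
		uint64_t bi_sector = 4088;	/* where this bh's IO starts */
		unsigned int bytes = 4096;	/* one 8-sector buffer head */

		/* a whole-IO-past-EOD case is left alone (it becomes -EIO);
		 * a straddling IO is clipped to the sectors that remain */
		if (bi_sector < maxsector && (bytes >> 9) > maxsector - bi_sector)
			bytes = (unsigned int)((maxsector - bi_sector) << 9);

		printf("%u\n", bytes);	/* prints 2560: 5 sectors remain */
		return 0;
	}
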
