diff options
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 1591 |
1 files changed, 1358 insertions, 233 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d970774641..8ca2763df09 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -32,12 +32,23 @@ #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "ext4_extents.h" + +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, + new_size); +} + +static void ext4_invalidatepage(struct page *page, unsigned long offset); /* * Test whether an inode is a fast symlink. @@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode) { handle_t *handle; + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); if (is_bad_inode(inode)) @@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, * direct blocks */ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) + ext4_lblk_t iblock, ext4_fsblk_t goal, + int indirect_blks, int blks, + ext4_fsblk_t new_blocks[4], int *err) { int target, i; - unsigned long count = 0; + unsigned long count = 0, blk_allocated = 0; int index = 0; ext4_fsblk_t current_block = 0; int ret = 0; @@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, * the first direct block of this branch. That's the * minimum number of blocks need to allocate(required) */ - target = blks + indirect_blks; - - while (1) { + /* first we try to allocate the indirect blocks */ + target = indirect_blks; + while (target > 0) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_blocks(handle,inode,goal,&count,err); + current_block = ext4_new_meta_blocks(handle, inode, + goal, &count, err); if (*err) goto failed_out; @@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, new_blocks[index++] = current_block++; count--; } - - if (count > 0) + if (count > 0) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + printk(KERN_INFO "%s returned more blocks than " + "requested\n", __func__); + WARN_ON(1); break; + } } - /* save the new block number for the first direct block */ - new_blocks[index] = current_block; - + target = blks - count ; + blk_allocated = count; + if (!target) + goto allocated; + /* Now allocate data blocks */ + count = target; + /* allocating blocks for data blocks */ + current_block = ext4_new_blocks(handle, inode, iblock, + goal, &count, err); + if (*err && (target == blks)) { + /* + * if the allocation failed and we didn't allocate + * any blocks before + */ + goto failed_out; + } + if (!*err) { + if (target == blks) { + /* + * save the new block number + * for the first direct block + */ + new_blocks[index] = current_block; + } + blk_allocated += count; + } +allocated: /* total number of blocks allocated for direct blocks */ - ret = count; + ret = blk_allocated; *err = 0; return ret; failed_out: @@ -584,8 +631,9 @@ failed_out: * as described above and return 0. */ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - int indirect_blks, int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) { int blocksize = inode->i_sb->s_blocksize; int i, n = 0; @@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, ext4_fsblk_t new_blocks[4]; ext4_fsblk_t current_block; - num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, *blks, new_blocks, &err); if (err) return err; @@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, struct ext4_inode_info *ei = EXT4_I(inode); int count = 0; ext4_fsblk_t first_block = 0; + loff_t disksize; J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); @@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, - offsets + (partial - chain), partial); + err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); /* * The ext4_splice_branch call will free and forget any buffers @@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; + if (!err && extend_disksize) { + disksize = ((loff_t) iblock + count) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > ei->i_disksize) + ei->i_disksize = disksize; + } if (err) goto cleanup; @@ -934,7 +989,7 @@ out: */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, unsigned long max_blocks, struct buffer_head *bh, - int create, int extend_disksize) + int create, int extend_disksize, int flag) { int retval; @@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, * with create == 1 flag. */ down_write((&EXT4_I(inode)->i_data_sem)); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flag) + EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, ~EXT4_EXT_MIGRATE; } } + + if (flag) { + EXT4_I(inode)->i_delalloc_reserved_flag = 0; + /* + * Update reserved blocks/metadata blocks + * after successful block allocation + * which were deferred till now + */ + if ((retval > 0) && buffer_delay(bh)) + ext4_da_release_space(inode, retval, 0); + } + up_write((&EXT4_I(inode)->i_data_sem)); return retval; } @@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create, 1, 0); /* * ext4_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. @@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, to = from + len; retry: - page = __grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; - handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { - unlock_page(page); - page_cache_release(page); ret = PTR_ERR(handle); goto out; } + page = __grab_cache_page(mapping, index); + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } + *pagep = page; + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, ext4_get_block); @@ -1225,8 +1302,8 @@ retry: } if (ret) { - ext4_journal_stop(handle); unlock_page(page); + ext4_journal_stop(handle); page_cache_release(page); } @@ -1236,15 +1313,6 @@ out: return ret; } -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - int err = jbd2_journal_dirty_data(handle, bh); - if (err) - ext4_journal_abort_handle(__func__, __func__, - bh, handle, err); - return err; -} - /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { @@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) } /* - * Generic write_end handler for ordered and writeback ext4 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run - * after block_write_end. - */ -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - return copied; -} - -/* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * @@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; unsigned from, to; int ret = 0, ret2; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext4_journal_dirty_data); + ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { /* @@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file, struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; int ret = 0, ret2; loff_t new_i_size; @@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file, if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; if (ret2 < 0) @@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file, ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); return ret ? ret : copied; } @@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } + unlock_page(page); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); page_cache_release(page); return ret ? ret : copied; } +/* + * Calculate the number of metadata blocks need to reserve + * to allocate @blocks for non extent file based file + */ +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +{ + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ind_blks, dind_blks, tind_blks; + + /* number of new indirect blocks needed */ + ind_blks = (blocks + icap - 1) / icap; + + dind_blks = (ind_blks + icap - 1) / icap; + + tind_blks = 1; + + return ind_blks + dind_blks + tind_blks; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate given number of blocks + */ +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +{ + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_calc_metadata_amount(inode, blocks); + + return ext4_indirect_calc_metadata_amount(inode, blocks); +} + +static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned long md_needed, mdblocks, total = 0; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; + mdblocks = ext4_calc_metadata_amount(inode, total); + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); + + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; + total = md_needed + nrblocks; + + if (ext4_has_free_blocks(sbi, total) < total) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return -ENOSPC; + } + + /* reduce fs free blocks counter */ + percpu_counter_sub(&sbi->s_freeblocks_counter, total); + + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return 0; /* success */ +} + +void ext4_da_release_space(struct inode *inode, int used, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free, release; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + + /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + + release = to_free + mdb_free; + + /* update fs free blocks counter for truncate case */ + percpu_counter_add(&sbi->s_freeblocks_counter, release); + + /* update per-inode reservations */ + BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); + + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + EXT4_I(inode)->i_allocated_meta_blocks = 0; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned long offset) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + ext4_da_release_space(page->mapping->host, 0, to_release); +} + +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + struct inode *inode; + struct buffer_head lbh; /* extent of blocks */ + unsigned long first_page, next_page; /* extent of pages */ + get_block_t *get_block; + struct writeback_control *wbc; +}; + +/* + * mpage_da_submit_io - walks through extent of pages and try to write + * them with __mpage_writepage() + * + * @mpd->inode: inode + * @mpd->first_page: first page of the extent + * @mpd->next_page: page after the last page of the extent + * @mpd->get_block: the filesystem's block mapper function + * + * By the time mpage_da_submit_io() is called we expect all blocks + * to be allocated. this may be wrong if allocation failed. + * + * As pages are already locked by write_cache_pages(), we can't use it + */ +static int mpage_da_submit_io(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct mpage_data mpd_pp = { + .bio = NULL, + .last_block_in_bio = 0, + .get_block = mpd->get_block, + .use_writepage = 1, + }; + int ret = 0, err, nr_pages, i; + unsigned long index, end; + struct pagevec pvec; + + BUG_ON(mpd->next_page <= mpd->first_page); + + pagevec_init(&pvec, 0); + index = mpd->first_page; + end = mpd->next_page - 1; + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + err = __mpage_writepage(page, mpd->wbc, &mpd_pp); + + /* + * In error case, we have to continue because + * remaining pages are still locked + * XXX: unlock and re-dirty them? + */ + if (ret == 0) + ret = err; + } + pagevec_release(&pvec); + } + if (mpd_pp.bio) + mpage_bio_submit(WRITE, mpd_pp.bio); + + return ret; +} + +/* + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers + * + * @mpd->inode - inode to walk through + * @exbh->b_blocknr - first block on a disk + * @exbh->b_size - amount of space in bytes + * @logical - first logical block to start assignment with + * + * the function goes through all passed space and put actual disk + * block numbers into buffer heads, dropping BH_Delay + */ +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, + struct buffer_head *exbh) +{ + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + int blocks = exbh->b_size >> inode->i_blkbits; + sector_t pblock = exbh->b_blocknr, cur_logical; + struct buffer_head *head, *bh; + unsigned long index, end; + struct pagevec pvec; + int nr_pages, i; + + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + pagevec_init(&pvec, 0); + + while (index <= end) { + /* XXX: optimize tail */ + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + BUG_ON(!page_has_buffers(page)); + + bh = page_buffers(page); + head = bh; + + /* skip blocks out of the range */ + do { + if (cur_logical >= logical) + break; + cur_logical++; + } while ((bh = bh->b_this_page) != head); + + do { + if (cur_logical >= logical + blocks) + break; + if (buffer_delay(bh)) { + bh->b_blocknr = pblock; + clear_buffer_delay(bh); + } else if (buffer_mapped(bh)) + BUG_ON(bh->b_blocknr != pblock); + + cur_logical++; + pblock++; + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + } +} + + +/* + * __unmap_underlying_blocks - just a helper function to unmap + * set of blocks described by @bh + */ +static inline void __unmap_underlying_blocks(struct inode *inode, + struct buffer_head *bh) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + int blocks, i; + + blocks = bh->b_size >> inode->i_blkbits; + for (i = 0; i < blocks; i++) + unmap_underlying_metadata(bdev, bh->b_blocknr + i); +} + +/* + * mpage_da_map_blocks - go through given space + * + * @mpd->lbh - bh describing space + * @mpd->get_block - the filesystem's block mapper function + * + * The function skips space we know is already mapped to disk blocks. + * + * The function ignores errors ->get_block() returns, thus real + * error handling is postponed to __mpage_writepage() + */ +static void mpage_da_map_blocks(struct mpage_da_data *mpd) +{ + struct buffer_head *lbh = &mpd->lbh; + int err = 0, remain = lbh->b_size; + sector_t next = lbh->b_blocknr; + struct buffer_head new; + + /* + * We consider only non-mapped and non-allocated blocks + */ + if (buffer_mapped(lbh) && !buffer_delay(lbh)) + return; + + while (remain) { + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = remain; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) { + /* + * Rather than implement own error handling + * here, we just leave remaining blocks + * unallocated and try again with ->writepage() + */ + break; + } + BUG_ON(new.b_size == 0); + + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); + + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + /* go for the remaining blocks */ + next += new.b_size >> mpd->inode->i_blkbits; + remain -= new.b_size; + } +} + +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) + +/* + * mpage_add_bh_to_extent - try to add one more block to extent of blocks + * + * @mpd->lbh - extent of blocks + * @logical - logical number of the block in the file + * @bh - bh of the block (used to access block's state) + * + * the function is used to collect contig. blocks in same state + */ +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, + sector_t logical, struct buffer_head *bh) +{ + struct buffer_head *lbh = &mpd->lbh; + sector_t next; + + next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); + + /* + * First block in the extent + */ + if (lbh->b_size == 0) { + lbh->b_blocknr = logical; + lbh->b_size = bh->b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + return; + } + + /* + * Can we merge the block to our big extent? + */ + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { + lbh->b_size += bh->b_size; + return; + } + + /* + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ + mpage_da_map_blocks(mpd); + + /* + * Now start a new extent + */ + lbh->b_size = bh->b_size; + lbh->b_state = bh->b_state & BH_FLAGS; + lbh->b_blocknr = logical; +} + +/* + * __mpage_da_writepage - finds extent of pages and blocks + * + * @page: page to consider + * @wbc: not used, we just follow rules + * @data: context + * + * The function finds extents of pages and scan them for all blocks. + */ +static int __mpage_da_writepage(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct mpage_da_data *mpd = data; + struct inode *inode = mpd->inode; + struct buffer_head *bh, *head, fake; + sector_t logical; + + /* + * Can we merge this page to current extent? + */ + if (mpd->next_page != page->index) { + /* + * Nope, we can't. So, we map non-allocated blocks + * and start IO on them using __mpage_writepage() + */ + if (mpd->next_page != mpd->first_page) { + mpage_da_map_blocks(mpd); + mpage_da_submit_io(mpd); + } + + /* + * Start next extent of pages ... + */ + mpd->first_page = page->index; + + /* + * ... and blocks + */ + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; + } + + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + /* + * There is no attached buffer heads yet (mmap?) + * we treat the page asfull of dirty blocks + */ + bh = &fake; + bh->b_size = PAGE_CACHE_SIZE; + bh->b_state = 0; + set_buffer_dirty(bh); + set_buffer_uptodate(bh); + mpage_add_bh_to_extent(mpd, logical, bh); + } else { + /* + * Page with regular buffer heads, just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + if (buffer_dirty(bh)) + mpage_add_bh_to_extent(mpd, logical, bh); + logical++; + } while ((bh = bh->b_this_page) != head); + } + + return 0; +} + +/* + * mpage_da_writepages - walk the list of dirty pages of the given + * address space, allocates non-allocated blocks, maps newly-allocated + * blocks to existing bhs and issue IO them + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @get_block: the filesystem's block mapper function. + * + * This is a library function, which implements the writepages() + * address_space_operation. + * + * In order to avoid duplication of logic that deals with partial pages, + * multiple bio per page, etc, we find non-allocated blocks, allocate + * them with minimal calls to ->get_block() and re-use __mpage_writepage() + * + * It's important that we call __mpage_writepage() only once for each + * involved page, otherwise we'd have to implement more complicated logic + * to deal with pages w/o PG_lock or w/ PG_writeback and so on. + * + * See comments to mpage_writepages() + */ +static int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, + get_block_t get_block) +{ + struct mpage_da_data mpd; + int ret; + + if (!get_block) + return generic_writepages(mapping, wbc); + + mpd.wbc = wbc; + mpd.inode = mapping->host; + mpd.lbh.b_size = 0; + mpd.lbh.b_state = 0; + mpd.lbh.b_blocknr = 0; + mpd.first_page = 0; + mpd.next_page = 0; + mpd.get_block = get_block; + + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); + + /* + * Handle last extent of pages + */ + if (mpd.next_page != mpd.first_page) { + mpage_da_map_blocks(&mpd); + mpage_da_submit_io(&mpd); + } + + return ret; +} + +/* + * this is a special callback for ->write_begin() only + * it's intention is to return mapped block or reserve space + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + + /* + * first, we need to know whether the block is allocated already + * preallocated blocks are unmapped but should treated + * the same as allocated blocks. + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + if ((ret == 0) && !buffer_delay(bh_result)) { + /* the block isn't (pre)allocated yet, let's reserve space */ + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + ret = ext4_da_reserve_space(inode, 1); + if (ret) + /* not enough space to reserve */ + return ret; + + map_bh(bh_result, inode->i_sb, 0); + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + } else if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + + return ret; +} +#define EXT4_DELALLOC_RSVED 1 +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + handle = ext4_journal_current_handle(); + if (!handle) { + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + BUG_ON(!ret); + } else { + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); + } + + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + up_write(&EXT4_I(inode)->i_data_sem); + + if (EXT4_I(inode)->i_disksize == disksize) { + ret = ext4_mark_inode_dirty(handle, inode); + return ret; + } + } + ret = 0; + } + return ret; +} + +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + /* + * unmapped buffer is possible for holes. + * delay buffer is possible with delayed allocation + */ + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); +} + +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + + /* + * we don't want to do block allocation in writepage + * so call get_block_wrap with create = 0 + */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, + bh_result, 0, 0, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + ret = 0; + } + return ret; +} + +/* + * get called vi ext4_da_writepages after taking page lock (have journal handle) + * get called via journal_submit_inode_data_buffers (no journal handle) + * get called via shrink_page_list via pdflush (no journal handle) + * or grab_page_cache when doing write_begin (have journal handle) + */ +static int ext4_da_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret = 0; + loff_t size; + unsigned long len; + struct buffer_head *page_bufs; + struct inode *inode = page->mapping->host; + + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + if (page_has_buffers(page)) { + page_bufs = page_buffers(page); + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + /* + * We don't want to do block allocation + * So redirty the page and return + * We may reach here when we do a journal commit + * via journal_submit_inode_data_buffers. + * If we don't have mapping block we just ignore + * them. We can also reach here via shrink_page_list + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * The test for page_has_buffers() is subtle: + * We know the page is dirty but it lost buffers. That means + * that at some moment in time after write_begin()/write_end() + * has been called all buffers have been clean and thus they + * must have been written at least once. So they are all + * mapped and we can happily proceed with mapping them + * and writing the page. + * + * Try to initialize the buffer_heads and check whether + * all are mapped and non delay. We don't want to + * do block allocation here. + */ + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ext4_normal_get_block_write); + if (!ret) { + page_bufs = page_buffers(page); + /* check whether all are mapped and non delay */ + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_unmapped_or_delay)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } else { + /* + * We can't do block allocation here + * so just redity the page and unlock + * and return + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } + + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); + else + ret = block_write_full_page(page, + ext4_normal_get_block_write, + wbc); + + return ret; +} + +/* + * For now just follow the DIO way to estimate the max credits + * needed to write out EXT4_MAX_WRITEBACK_PAGES. + * todo: need to calculate the max credits need for + * extent based files, currently the DIO credits is based on + * indirect-blocks mapping way. + * + * Probably should have a generic way to calculate credits + * for DIO, writepages, and truncate + */ +#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS +#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + handle_t *handle = NULL; + int needed_blocks; + int ret = 0; + long to_write; + loff_t range_start = 0; + + /* + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount + */ + if (!mapping->nrpages) + return 0; + + /* + * Estimate the worse case needed credits to write out + * EXT4_MAX_BUF_BLOCKS pages + */ + needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; + + to_write = wbc->nr_to_write; + if (!wbc->range_cyclic) { + /* + * If range_cyclic is not set force range_cont + * and save the old writeback_index + */ + wbc->range_cont = 1; + range_start = wbc->range_start; + } + + while (!ret && to_write) { + /* start a new transaction*/ + handle = ext4_journal_start(inode, |