diff options
Diffstat (limited to 'fs/ext4/inode.c')
| -rw-r--r-- | fs/ext4/inode.c | 2279 |
1 files changed, 1098 insertions, 1181 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d6382b89ecb..8a064734e6e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -38,6 +38,7 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/aio.h> +#include <linux/bitops.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -132,20 +133,23 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static void ext4_invalidatepage(struct page *page, unsigned long offset); +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags); +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); /* * Test whether an inode is a fast symlink. */ static int ext4_inode_is_fast_symlink(struct inode *inode) { - int ea_blocks = EXT4_I(inode)->i_file_acl ? - (inode->i_sb->s_blocksize >> 9) : 0; + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) + return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } @@ -214,8 +218,9 @@ void ext4_evict_inode(struct inode *inode) jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } - truncate_inode_pages(&inode->i_data, 0); - ext4_ioend_shutdown(inode); + truncate_inode_pages_final(&inode->i_data); + + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); goto no_delete; } @@ -224,9 +229,9 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); - truncate_inode_pages(&inode->i_data, 0); - ext4_ioend_shutdown(inode); + truncate_inode_pages_final(&inode->i_data); + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); if (is_bad_inode(inode)) goto no_delete; @@ -423,66 +428,6 @@ static int __check_block_validity(struct inode *inode, const char *func, #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) -/* - * Return the number of contiguous dirty pages in a given inode - * starting at page frame idx. - */ -static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, - unsigned int max_pages) -{ - struct address_space *mapping = inode->i_mapping; - pgoff_t index; - struct pagevec pvec; - pgoff_t num = 0; - int i, nr_pages, done = 0; - - if (max_pages == 0) - return 0; - pagevec_init(&pvec, 0); - while (!done) { - index = idx; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - (pgoff_t)PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - lock_page(page); - if (unlikely(page->mapping != mapping) || - !PageDirty(page) || - PageWriteback(page) || - page->index != idx) { - done = 1; - unlock_page(page); - break; - } - if (page_has_buffers(page)) { - bh = head = page_buffers(page); - do { - if (!buffer_delay(bh) && - !buffer_unwritten(bh)) - done = 1; - bh = bh->b_this_page; - } while (!done && (bh != head)); - } - unlock_page(page); - if (done) - break; - idx++; - num++; - if (num >= max_pages) { - done = 1; - break; - } - } - pagevec_release(&pvec); - } - return num; -} - #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, @@ -501,7 +446,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * could be converted. */ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -524,7 +469,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { - printk("ES cache assertation failed for inode: %lu " + printk("ES cache assertion failed for inode: %lu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, @@ -547,8 +492,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocate. - * if create==0 and the blocks are pre-allocated and uninitialized block, + * On success, it returns the number of blocks being mapped or allocated. + * if create==0 and the blocks are pre-allocated and unwritten block, * the result buffer head is unmapped. If the create ==1, it will make sure * the buffer head is mapped. * @@ -562,6 +507,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; + int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -573,8 +519,19 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + /* + * ext4_map_blocks returns an int, and m_len is an unsigned int + */ + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + ext4_es_lru_add(inode); if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -601,7 +558,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -610,17 +567,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, EXT4_GET_BLOCKS_KEEP_SIZE); } if (retval > 0) { - int ret; - unsigned long long status; + unsigned int status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertation failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (lookup)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -638,7 +593,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -655,7 +610,13 @@ found: * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - return retval; + /* + * If we need to convert extent to unwritten + * we continue and do the actual work in + * ext4_ext_map_blocks() + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) + return retval; /* * Here we clear m_flags because after allocating an new extent, @@ -664,12 +625,12 @@ found: map->m_flags &= ~EXT4_MAP_FLAGS; /* - * New blocks allocate and/or writing to uninitialized extent + * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_blocks() * with create == 1 flag. */ - down_write((&EXT4_I(inode)->i_data_sem)); + down_write(&EXT4_I(inode)->i_data_sem); /* * if the caller is from delayed allocation writeout path @@ -711,17 +672,15 @@ found: ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); if (retval > 0) { - int ret; - unsigned long long status; + unsigned int status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertation failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (allocation)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif /* * If the extent has been zeroed out, we don't need to update @@ -747,7 +706,7 @@ found: has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -787,8 +746,12 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, ret = ext4_map_blocks(handle, inode, &map, flags); if (ret > 0) { + ext4_io_end_t *io_end = ext4_inode_aio(inode); + map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) + set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } @@ -962,6 +925,7 @@ int do_journal_get_write_access(handle_t *handle, */ if (dirty) clear_buffer_dirty(bh); + BUFFER_TRACE(bh, "get write access"); ret = ext4_journal_get_write_access(handle, bh); if (!ret && dirty) ret = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -1029,7 +993,8 @@ retry_journal: ext4_journal_stop(handle); goto retry_grab; } - wait_on_page_writeback(page); + /* In case writeback began while the page was unlocked */ + wait_for_stable_page(page); if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); @@ -1118,10 +1083,13 @@ static int ext4_write_end(struct file *file, } } - if (ext4_has_inline_data(inode)) - copied = ext4_write_inline_data_end(inode, pos, len, - copied, page); - else + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); + if (ret < 0) + goto errout; + copied = ret; + } else copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -1157,8 +1125,6 @@ static int ext4_write_end(struct file *file, if (i_size_changed) ext4_mark_inode_dirty(handle, inode); - if (copied < 0) - ret = copied; if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside @@ -1260,7 +1226,6 @@ static int ext4_journalled_write_end(struct file *file, */ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1272,7 +1237,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1292,10 +1256,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } return -ENOSPC; } ei->i_reserved_meta_blocks += md_needed; @@ -1309,7 +1269,6 @@ repeat: */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1331,7 +1290,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1351,10 +1309,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } @@ -1415,21 +1369,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free) } static void ext4_da_page_release_reservation(struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { int to_release = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int stop = offset + length; int num_clusters; ext4_fsblk_t lblk; + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + head = page_buffers(page); bh = head; do { unsigned int next_off = curr_off + bh->b_size; + if (next_off > stop) + break; + if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); @@ -1460,140 +1421,43 @@ static void ext4_da_page_release_reservation(struct page *page, * Delayed allocation stuff */ -/* - * mpage_da_submit_io - walks through extent of pages and try to write - * them with writepage() call back - * - * @mpd->inode: inode - * @mpd->first_page: first page of the extent - * @mpd->next_page: page after the last page of the extent - * - * By the time mpage_da_submit_io() is called we expect all blocks - * to be allocated. this may be wrong if allocation failed. - * - * As pages are already locked by write_cache_pages(), we can't use it - */ -static int mpage_da_submit_io(struct mpage_da_data *mpd, - struct ext4_map_blocks *map) -{ - struct pagevec pvec; - unsigned long index, end; - int ret = 0, err, nr_pages, i; - struct inode *inode = mpd->inode; - struct address_space *mapping = inode->i_mapping; - loff_t size = i_size_read(inode); - unsigned int len, block_start; - struct buffer_head *bh, *page_bufs = NULL; - sector_t pblock = 0, cur_logical = 0; - struct ext4_io_submit io_submit; +struct mpage_da_data { + struct inode *inode; + struct writeback_control *wbc; - BUG_ON(mpd->next_page <= mpd->first_page); - memset(&io_submit, 0, sizeof(io_submit)); + pgoff_t first_page; /* The first page to write */ + pgoff_t next_page; /* Current page to examine */ + pgoff_t last_page; /* Last page to examine */ /* - * We need to start from the first_page to the next_page - 1 - * to make sure we also write the mapped dirty buffer_heads. - * If we look at mpd->b_blocknr we would only be looking - * at the currently mapped buffer_heads. + * Extent to map - this can be after first_page because that can be + * fully mapped. We somewhat abuse m_flags to store whether the extent + * is delalloc or unwritten. */ - index = mpd->first_page; - end = mpd->next_page - 1; - - pagevec_init(&pvec, 0); - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - int skip_page = 0; - struct page *page = pvec.pages[i]; - - index = page->index; - if (index > end) - break; - - if (index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - if (map) { - cur_logical = index << (PAGE_CACHE_SHIFT - - inode->i_blkbits); - pblock = map->m_pblk + (cur_logical - - map->m_lblk); - } - index++; - - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); - - bh = page_bufs = page_buffers(page); - block_start = 0; - do { - if (map && (cur_logical >= map->m_lblk) && - (cur_logical <= (map->m_lblk + - (map->m_len - 1)))) { - if (buffer_delay(bh)) { - clear_buffer_delay(bh); - bh->b_blocknr = pblock; - } - if (buffer_unwritten(bh) || - buffer_mapped(bh)) - BUG_ON(bh->b_blocknr != pblock); - if (map->m_flags & EXT4_MAP_UNINIT) - set_buffer_uninit(bh); - clear_buffer_unwritten(bh); - } - - /* - * skip page if block allocation undone and - * block is dirty - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) - skip_page = 1; - bh = bh->b_this_page; - block_start += bh->b_size; - cur_logical++; - pblock++; - } while (bh != page_bufs); - - if (skip_page) { - unlock_page(page); - continue; - } - - clear_page_dirty_for_io(page); - err = ext4_bio_write_page(&io_submit, page, len, - mpd->wbc); - if (!err) - mpd->pages_written++; - /* - * In error case, we have to continue because - * remaining pages are still locked - */ - if (ret == 0) - ret = err; - } - pagevec_release(&pvec); - } - ext4_io_submit(&io_submit); - return ret; -} + struct ext4_map_blocks map; + struct ext4_io_submit io_submit; /* IO submission data */ +}; -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) +static void mpage_release_unused_pages(struct mpage_da_data *mpd, + bool invalidate) { int nr_pages, i; pgoff_t index, end; struct pagevec pvec; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t start, last; + + /* This is necessary when next_page == 0. */ + if (mpd->first_page >= mpd->next_page) + return; index = mpd->first_page; end = mpd->next_page - 1; - - start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); - ext4_es_remove_extent(inode, start, last - start + 1); + if (invalidate) { + ext4_lblk_t start, last; + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, start, last - start + 1); + } pagevec_init(&pvec, 0); while (index <= end) { @@ -1606,14 +1470,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) break; BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - block_invalidatepage(page, 0); - ClearPageUptodate(page); + if (invalidate) { + block_invalidatepage(page, 0, PAGE_CACHE_SIZE); + ClearPageUptodate(page); + } unlock_page(page); } index = pvec.pages[nr_pages - 1]->index + 1; pagevec_release(&pvec); } - return; } static void ext4_print_free_blocks(struct inode *inode) @@ -1642,215 +1507,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -/* - * mpage_da_map_and_submit - go through given space, map them - * if necessary, and then submit them for I/O - * - * @mpd - bh describing space - * - * The function skips space we know is already mapped to disk blocks. - * - */ -static void mpage_da_map_and_submit(struct mpage_da_data *mpd) -{ - int err, blks, get_blocks_flags; - struct ext4_map_blocks map, *mapp = NULL; - sector_t next = mpd->b_blocknr; - unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; - loff_t disksize = EXT4_I(mpd->inode)->i_disksize; - handle_t *handle = NULL; - - /* - * If the blocks are mapped already, or we couldn't accumulate - * any blocks, then proceed immediately to the submission stage. - */ - if ((mpd->b_size == 0) || - ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay)) && - !(mpd->b_state & (1 << BH_Unwritten)))) - goto submit_io; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - - /* - * Call ext4_map_blocks() to allocate any delayed allocation - * blocks, or to convert an uninitialized extent to be - * initialized (in the case where we have written into - * one or more preallocated blocks). - * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to - * indicate that we are on the delayed allocation path. This - * affects functions in many different parts of the allocation - * call path. This flag exists primarily because we don't - * want to change *many* call functions, so ext4_map_blocks() - * will set the EXT4_STATE_DELALLOC_RESERVED flag once the - * inode's allocation semaphore is taken. - * - * If the blocks in questions were delalloc blocks, set - * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting - * variables are updated after the blocks have been allocated. - */ - map.m_lblk = next; - map.m_len = max_blocks; - /* - * We're in delalloc path and it is possible that we're going to - * need more metadata blocks than previously reserved. However - * we must not fail because we're in writeback and there is - * nothing we can do about it so it might result in data loss. - * So use reserved blocks to allocate metadata if possible. - */ - get_blocks_flags = EXT4_GET_BLOCKS_CREATE | - EXT4_GET_BLOCKS_METADATA_NOFAIL; - if (ext4_should_dioread_nolock(mpd->inode)) - get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (mpd->b_state & (1 << BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; - - - blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); - if (blks < 0) { - struct super_block *sb = mpd->inode->i_sb; - - err = blks; - /* - * If get block returns EAGAIN or ENOSPC and there - * appears to be free blocks we will just let - * mpage_da_submit_io() unlock all of the pages. - */ - if (err == -EAGAIN) - goto submit_io; - - if (err == -ENOSPC && ext4_count_free_clusters(sb)) { - mpd->retval = err; - goto submit_io; - } - - /* - * get block failure will cause us to loop in - * writepages, because a_ops->writepage won't be able - * to make progress. The page will be redirtied by - * writepage and writepages will again try to write - * the same. - */ - if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { - ext4_msg(sb, KERN_CRIT, - "delayed block allocation failed for inode %lu " - "at logical offset %llu with max blocks %zd " - "with error %d", mpd->inode->i_ino, - (unsigned long long) next, - mpd->b_size >> mpd->inode->i_blkbits, err); - ext4_msg(sb, KERN_CRIT, - "This should not happen!! Data will be lost"); - if (err == -ENOSPC) - ext4_print_free_blocks(mpd->inode); - } - /* invalidate all the pages */ - ext4_da_block_invalidatepages(mpd); - - /* Mark this page range as having been completed */ - mpd->io_done = 1; - return; - } - BUG_ON(blks == 0); - - mapp = ↦ - if (map.m_flags & EXT4_MAP_NEW) { - struct block_device *bdev = mpd->inode->i_sb->s_bdev; - int i; - - for (i = 0; i < map.m_len; i++) - unmap_underlying_metadata(bdev, map.m_pblk + i); - } - - /* - * Update on-disk size along with block allocation. - */ - disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; - if (disksize > i_size_read(mpd->inode)) - disksize = i_size_read(mpd->inode); - if (disksize > EXT4_I(mpd->inode)->i_disksize) { - ext4_update_i_disksize(mpd->inode, disksize); - err = ext4_mark_inode_dirty(handle, mpd->inode); - if (err) - ext4_error(mpd->inode->i_sb, - "Failed to mark inode %lu dirty", - mpd->inode->i_ino); - } - -submit_io: - mpage_da_submit_io(mpd, mapp); - mpd->io_done = 1; -} - -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ - (1 << BH_Delay) | (1 << BH_Unwritten)) - -/* - * mpage_add_bh_to_extent - try to add one more block to extent of blocks - * - * @mpd->lbh - extent of blocks - * @logical - logical number of the block in the file - * @b_state - b_state of the buffer head added - * - * the function is used to collect contig. blocks in same state - */ -static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, - unsigned long b_state) -{ - sector_t next; - int blkbits = mpd->inode->i_blkbits; - int nrblocks = mpd->b_size >> blkbits; - - /* - * XXX Don't go larger than mballoc is willing to allocate - * This is a stopgap solution. We eventually need to fold - * mpage_da_submit_io() into this function and then call - * ext4_map_blocks() multiple times in a loop - */ - if (nrblocks >= (8*1024*1024 >> blkbits)) - goto flush_it; - - /* check if the reserved journal credits might overflow */ - if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) { - if (nrblocks >= EXT4_MAX_TRANS_DATA) { - /* - * With non-extent format we are limited by the journal - * credit available. Total credit needed to insert - * nrblocks contiguous blocks is dependent on the - * nrblocks. So limit nrblocks. - */ - goto flush_it; - } - } - /* - * First block in the extent - */ - if (mpd->b_size == 0) { - mpd->b_blocknr = logical; - mpd->b_size = 1 << blkbits; - mpd->b_state = b_state & BH_FLAGS; - return; - } - - next = mpd->b_blocknr + nrblocks; - /* - * Can we merge the block to our big extent? - */ - if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) { - mpd->b_size += 1 << blkbits; - return; - } - -flush_it: - /* - * We couldn't merge the block to our extent, so we - * need to flush current extent and start new one - */ - mpage_da_map_and_submit(mpd); - return; -} - static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) { return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); @@ -1885,10 +1541,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, &es)) { - + ext4_es_lru_add(inode); if (ext4_es_is_hole(&es)) { retval = 0; - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); goto add_delayed; } @@ -1925,7 +1581,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * Try to see if we can get the block without requesting a new * file system block. */ - down_read((&EXT4_I(inode)->i_data_sem)); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) { /* * We will soon create blocks for this page, and let @@ -1990,16 +1646,15 @@ add_delayed: set_buffer_delay(bh); } else if (retval > 0) { int ret; - unsigned long long status; + unsigned int status; -#ifdef ES_AGGRESSIVE_TEST - if (retval != map->m_len) { - printk("ES len assertation failed for inode: %lu " - "retval %d != map->m_len %d " - "in %s (lookup)\n", inode->i_ino, retval, - map->m_len, __func__); + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); } -#endif status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -2118,6 +1773,7 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); if (inline_data) { + BUFFER_TRACE(inode_bh, "get write access"); ret = ext4_journal_get_write_access(handle, inode_bh); err = ext4_handle_dirty_metadata(handle, inode, inode_bh); @@ -2137,7 +1793,7 @@ static int __ext4_journalled_writepage(struct page *page, ret = err; if (!ext4_has_inline_data(inode)) - ext4_walk_page_buffers(handle, page_bufs, 0, len, + ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: @@ -2156,7 +1812,7 @@ out: * lock so we have to do some magic. * * This function can get called via... - * - ext4_da_writepages after taking page lock (have journal handle) + * - ext4_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) * - shrink_page_list via the kswapd/direct reclaim (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) @@ -2195,6 +1851,7 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; struct ext4_io_submit io_submit; + bool keep_towrite = false; trace_ext4_writepage(page); size = i_size_read(inode); @@ -2225,6 +1882,7 @@ static int ext4_writepage(struct page *page, unlock_page(page); return 0; } + keep_towrite = true; } if (PageChecked(page) && ext4_should_journal_data(inode)) @@ -2234,76 +1892,456 @@ static int ext4_writepage(struct page *page, */ return __ext4_journalled_writepage(page, len); - memset(&io_submit, 0, sizeof(io_submit)); - ret = ext4_bio_write_page(&io_submit, page, len, wbc); + ext4_io_submit_init(&io_submit, wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return -ENOMEM; + } + ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); return ret; } +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +{ + int len; + loff_t size = i_size_read(mpd->inode); + int err; + + BUG_ON(page->index != mpd->first_page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + clear_page_dirty_for_io(page); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); + if (!err) + mpd->wbc->nr_to_write--; + mpd->first_page++; + + return err; +} + +#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) + /* - * This is called via ext4_da_writepages() to - * calculate the total number of credits to reserve to fit - * a single extent allocation into a single transaction, - * ext4_da_writpeages() will loop calling this before - * the block allocation. + * mballoc gives us at most this number of blocks... + * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). + * The rest of mballoc seems to handle chunks up to full group size. */ +#define MAX_WRITEPAGES_EXTENT_LEN 2048 -static int ext4_da_writepages_trans_blocks(struct inode *inode) +/* + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map + * + * @mpd - extent of blocks + * @lblk - logical number of the block in the file + * @bh - buffer head we want to add to the extent + * + * The function is used to collect contig. blocks in the same state. If the + * buffer doesn't require mapping for writeback and we haven't started the + * extent of buffers to map yet, the function returns 'true' immediately - the + * caller can write the buffer right away. Otherwise the function returns true + * if the block has been added to the extent, false if the block couldn't be + * added. + */ +static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, + struct buffer_head *bh) +{ + struct ext4_map_blocks *map = &mpd->map; + + /* Buffer that doesn't need mapping for writeback? */ + if (!buffer_dirty(bh) || !buffer_mapped(bh) || + (!buffer_delay(bh) && !buffer_unwritten(bh))) { + /* So far no extent to map => we write the buffer right away */ + if (map->m_len == 0) + return true; + return false; + } + + /* First block in the extent? */ + if (map->m_len == 0) { + map->m_lblk = lblk; + map->m_len = 1; + map->m_flags = bh->b_state & BH_FLAGS; + return true; + } + + /* Don't go larger than mballoc is willing to allocate */ + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) + return false; + + /* Can we merge the block to our big extent? */ + if (lblk == map->m_lblk + map->m_len && + (bh->b_state & BH_FLAGS) == map->m_flags) { + map->m_len++; + return true; + } + return false; +} + +/* + * mpage_process_page_bufs - submit page buffers for IO or add them to extent + * + * @mpd - extent of blocks for mapping + * @head - the first buffer in the page + * @bh - buffer we should start processing from + * @lblk - logical number of the block in the file corresponding to @bh + * + * Walk through page buffers from @bh upto @head (exclusive) and either submit + * the page for IO if all buffers in this page were mapped and there's no + * accumulated extent of buffers to map or add buffers in the page to the + * extent of buffers to map. The function returns 1 if the caller can continue + * by processing the next page, 0 if it should stop adding buffers to the + * extent to map because we cannot extend it anymore. It can also return value + * < 0 in case of error during IO submission. + */ +static int mpage_process_page_bufs(struct mpage_da_data *mpd, + struct buffer_head *head, + struct buffer_head *bh, + ext4_lblk_t lblk) +{ + struct inode *inode = mpd->inode; + int err; + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + >> inode->i_blkbits; + + do { + BUG_ON(buffer_locked(bh)); + + if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { + /* Found extent to map? */ + if (mpd->map.m_len) + return 0; + /* Everything mapped so far and we hit EOF */ + break; + } + } while (lblk++, (bh = bh->b_this_page) != head); + /* So far everything mapped? Submit the page for IO. */ + if (mpd->map.m_len == 0) { + err = mpage_submit_page(mpd, head->b_page); + if (err < 0) + return err; + } + return lblk < blocks; +} + +/* + * mpage_map_buffers - update buffers corresponding to changed extent and + * submit fully mapped pages for IO + * + * @mpd - description of extent to map, on return next extent to map + * + * Scan buffers corresponding to changed extent (we expect corresponding pages + * to be already locked) and update buffer state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits, + * and mark buffers as uninit when we perform writes to unwritten extents + * and do extent conversion after IO is finished. If the last page is not fully + * mapped, we update @map to the next extent in the last page that needs + * mapping. Otherwise we submit the page for IO. + */ +static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { - int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + struct pagevec pvec; + int nr_pages, i; + struct inode *inode = mpd->inode; + struct buffer_head *head, *bh; + int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; + pgoff_t start, end; + ext4_lblk_t lblk; + sector_t pblock; + int err; + + start = mpd->map.m_lblk >> bpp_bits; + end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; + lblk = start << bpp_bits; + pblock = mpd->map.m_pblk; + + pagevec_init(&pvec, 0); + while (start <= end) { + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, + PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + if (page->index > end) + break; + /* Up to 'end' pages must be contiguous */ + BUG_ON(page->index != start); + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. + */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + /* + * FIXME: If dioread_nolock supports + * blocksize < pagesize, we need to make + * sure we add size mapped so far to + * io_end->size as the following call + * can submit the page for IO. + */ + err = mpage_process_page_bufs(mpd, head, + bh, lblk); + pagevec_release(&pvec); + if (err > 0) + err = 0; + return err; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + } while (lblk++, (bh = bh->b_this_page) != head); + + /* + * FIXME: This is going to break if dioread_nolock + * supports blocksize < pagesize as we will try to + * convert potentially unmapped parts of inode. + */ + mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; + /* Page fully mapped - let IO run! */ + err = mpage_submit_page(mpd, page); + if (err < 0) { + pagevec_release(&pvec); + return err; + } + start++; + } + pagevec_release(&pvec); + } + /* Extent fully mapped and matches with page boundary. We are done. */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + return 0; +} + +static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) +{ + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int get_blocks_flags; + int err, dioread_nolock; + + trace_ext4_da_write_pages_extent(inode, map); /* - * With non-extent format the journal credit needed to - * insert nrblocks contiguous block is dependent on - * number of contiguous block. So we will limit - * number of contiguous block to a sane value + * Call ext4_map_blocks() to allocate any delayed allocation blocks, or + * to convert an unwritten extent to be initialized (in the case + * where we have written into one or more preallocated blocks). It is + * possible that we're going to need more metadata blocks than + * previously reserved. However we must not fail because we're in + * writeback and there is nothing we can do about it so it might result + * in data loss. So use reserved blocks to allocate metadata if + * possible. + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks + * in question are delalloc blocks. This affects functions in many + * different parts of the allocation call path. This flag exists + * primarily because we don't want to change *many* call functions, so + * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag + * once the inode's allocation semaphore is taken. */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && - (max_blocks > EXT4_MAX_TRANS_DATA)) - max_blocks = EXT4_MAX_TRANS_DATA; + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL; + dioread_nolock = ext4_should_dioread_nolock(inode); + if (dioread_nolock) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + if (map->m_flags & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; - return ext4_chunk_trans_blocks(inode, max_blocks); + err = ext4_map_blocks(handle, inode, map, get_blocks_flags); + if (err < 0) + return err; + if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { + if (!mpd->io_submit.io_end->handle && + ext4_handle_valid(handle)) { + mpd->io_submit.io_end->handle = handle->h_rsv_handle; + handle->h_rsv_handle = NULL; + } + ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + } + + BUG_ON(map->m_len == 0); + if (map->m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = inode->i_sb->s_bdev; + int i; + + for (i = 0; i < map->m_len; i++) + unmap_underlying_metadata(bdev, map->m_pblk + i); + } + return 0; } /* - * write_cache_pages_da - walk the list of dirty pages of the given - * address space and accumulate pages that need writing, and call - * mpage_da_map_and_submit to map a single contiguous memory region - * and then write them. + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length + * mpd->len and submit pages underlying it for IO + * + * @handle - handle for journal operations + * @mpd - extent to map + * @give_up_on_write - we set this to true iff there is a fatal error and there + * is no hope of writing the data. The caller should discard + * dirty pages to avoid infinite loops. + * + * The function maps extent starting at mpd->lblk of length mpd->len. If it is + * delayed, blocks are allocated, if it is unwritten, we may need to convert + * them to initialized or split the described range from larger unwritten + * extent. Note that we need not map all the described range since allocation + * can return less blocks or the range is covered by more unwritten extents. We + * cannot map more because we are limited by reserved transaction credits. On + * the other hand we always make sure that the last touched page is fully + * mapped so that it can be written out (and thus forward progress is + * guaranteed). After mapping we submit all mapped pages for IO. */ -static int write_cache_pages_da(handle_t *handle, - struct address_space *mapping, - struct writeback_control *wbc, - struct mpage_da_data *mpd, - pgoff_t *done_index) +static int mpage_map_and_submit_extent(handle_t *handle, + struct mpage_da_data *mpd, + bool *give_up_on_write) { - struct buffer_head *bh, *head; - struct inode *inode = mapping->host; - struct pagevec pvec; - unsigned int nr_pages; - sector_t logical; - pgoff_t index, end; - long nr_to_write = wbc->nr_to_write; - int i, tag, ret = 0; - - memset(mpd, 0, sizeof(struct mpage_da_data)); - mpd->wbc = wbc; - mpd->inode = inode; - pagevec_init(&pvec, 0); - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int err; + loff_t disksize; - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + mpd->io_submit.io_end->offset = + ((loff_t)map->m_lblk) << inode->i_blkbits; + do { + err = mpage_map_one_extent(handle, mpd); + if (err < 0) { + struct super_block *sb = inode->i_sb; + + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + goto invalidate_dirty_pages; + /* + * Let the uper layers retry transient errors. + * In the case of ENOSPC, if ext4_count_free_blocks() + * is non-zero, a commit should free up blocks. + */ + if ((err == -ENOMEM) || + (err == -ENOSPC && ext4_count_free_clusters(sb))) + return err; + ext4_msg(sb, KERN_CRIT, + "Delayed block allocation failed for " + "inode %lu at logical offset %llu with" + " max blocks %u with error %d", + inode->i_ino, + (unsigned long long)map->m_lblk, + (unsigned)map->m_len, -err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! Data will " + "be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(inode); + invalidate_dirty_pages: + *give_up_on_write = true; + return err; + } + /* + * Update buffer state, submit mapped pages, and get us new + * extent to map + */ + err = mpage_map_and_submit_buffers(mpd); + if (err < 0) + return err; + } while (map->m_len); + + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ + disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; + if (disksize > EXT4_I(inode)->i_disksize) { + int err2; + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); + if (err2) + ext4_error(inode->i_sb, + "Failed to mark inode %lu dirty", + inode->i_ino); + if (!err) + err = err2; + } + return err; +} + +/* + * Calculate the total number of credits to reserve for one writepages + * iteration. This is called from ext4_writepages(). We map an extent of + * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping + * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + + * bpp - 1 blocks in bpp different extents. + */ +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + + return ext4_meta_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); +} + +/* + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages + * and underlying extent to map + * + * @mpd - where to look for pages + * + * Walk dirty pages in the mapping. If they are fully mapped, submit them for + * IO immediately. When we find a page which isn't mapped we start accumulating + * extent of buffers underlying these pages that needs mapping (formed by + * either delayed or unwritten buffers). We also lock the pages containing + * these buffers. The extent found is returned in @mpd structure (starting at + * mpd->lblk with length mpd->len blocks). + * + * Note that this function can attach bios to one io_end structure which are + * neither logically nor physically contiguous. Although it may seem as an + * unnecessary complication, it is actually inevitable in blocksize < pagesize + * case as we need to track IO to all buffers underlying a page in one io_end. + */ +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) +{ + struct address_space *mapping = mpd->inode->i_mapping; + struct pagevec pvec; + unsigned int nr_pages; + long left = mpd->wbc->nr_to_write; + pgoff_t index = mpd->first_page; + pgoff_t end = mpd->last_page; + int tag; + int i, err = 0; + int blkbits = mpd->inode->i_blkbits; + ext4_lblk_t lblk; + struct buffer_head *head; + + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; - *done_index = index; + pagevec_init(&pvec, 0); + mpd->map.m_len = 0; + mpd->next_page = index; while (index <= end) { nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) - return 0; + goto out; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -2318,31 +2356,32 @@ static int write_cache_pages_da(handle_t *handle, if (page->index > end) goto out; - *done_index = page->index + 1; - /* - * If we can't merge this page, and we have - * accumulated an contiguous region, write it + * Accumulated enough dirty pages? This doesn't apply + * to WB_SYNC_ALL mode. For integrity sync we have to + * keep going because someone may be concurrently + * dirtying pages, and we might have synced a lot of + * newly appeared dirty pages, but have not synced all + * of the old dirty pages. */ - if ((mpd->next_page != page->index) && - (mpd->next_page != mpd->first_page)) { - mpage_da_map_and_submit(mpd); - goto ret_extent_tail; - } + if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) + goto out; - lock_page(page); + /* If we can't merge this page, we are done. */ + if (mpd->map.m_len > 0 && mpd->next_page != page->index) + goto out; + lock_page(page); /* - * If the page is no longer dirty, or its - * mapping no longer corresponds to inode we - * are writing (which means it has been - * truncated or invalidated), or the page is - * already under writeback and we are not - * doing a data integrity writeback, skip the page + * If the page is no longer dirty, or its mapping no + * longer corresponds to inode we are writing (which + * means it has been truncated or invalidated), or the + * page is already under writeback and we are not doing + * a data integrity writeback, skip the page */ if (!PageDirty(page) || (PageWriteback(page) && - (wbc->sync_mode == WB_SYNC_NONE)) || + (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(page->mapping != mapping)) { unlock_page(page); continue; @@ -2351,106 +2390,54 @@ static int write_cache_pages_da(handle_t *handle, wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); - /* - * If we have inline data and arrive here, it means that - * we will soon create the block for the 1st page, so - * we'd better clear the inline data here. - */ - if (ext4_has_inline_data(inode)) { - BUG_ON(ext4_test_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA)); - ext4_destroy_inline_data(handle, inode); - } - - if (mpd->next_page != page->index) + if (mpd->map.m_len == 0) mpd->first_page = page->index; mpd->next_page = page->index + 1; - logical = (sector_t) page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); - /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << + (PAGE_CACHE_SHIFT - blkbits); head = page_buffers(page); - bh = head; - do { - BUG_ON(buffer_locked(bh)); - /* - * We need to try to allocate unmapped blocks - * in the same page. Otherwise we won't make - * progress with the page in ext4_writepage - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_state); - if (mpd->io_done) - goto ret_extent_tail; - } else if (buffer_dirty(bh) && - buffer_mapped(bh)) { - /* - * mapped dirty buffer. We need to - * update the b_state because we look - * at b_state in mpage_da_map_blocks. - * We don't update b_size because if we - * find an unmapped buffer_head later - * we need to use the b_state flag of - * that buffer_head. - */ - if (mpd->b_size == 0) - mpd->b_state = - bh->b_state & BH_FLAGS; - } - logical++; - } while ((bh = bh->b_this_page) != head); - - if (nr_to_write > 0) { - nr_to_write--; - if (nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) - /* - * We stop writing back only if we are - * not doing integrity sync. In case of - * integrity sync we have to keep going - * because someone may be concurrently - * dirtying pages, and we might have - * synced a lot of newly appeared dirty - * pages, but have not synced all of the - * old dirty pages. - */ - goto out; - } + err = mpage_process_page_bufs(mpd, head, head, lblk); + if (err <= 0) + goto out; + err = 0; + left--; } pagevec_release(&pvec); cond_resched(); } return 0; -ret_extent_tail: - ret = MPAGE_DA_EXTENT_TAIL; out: pagevec_release(&pvec); - cond_resched(); - return ret; + return err; } +static int __writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = ext4_writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} -static int ext4_da_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - pgoff_t index; + pgoff_t writeback_index = 0; + long nr_to_write = wbc->nr_to_write; int range_whole = 0; + int cycled = 1; handle_t *handle = NULL; struct mpage_da_data mpd; struct inode *inode = mapping->host; - int pages_written = 0; - unsigned int max_pages; - int range_cyclic, cycled = 1, io_done = 0; - int needed_blocks, ret = 0; - long desired_nr_to_write, nr_to_writebump = 0; - loff_t range_start = wbc->range_start; + int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - pgoff_t done_index = 0; - pgoff_t end; + bool done; struct blk_plug plug; + bool give_up_on_write = false; - trace_ext4_da_writepages(inode, wbc); + trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting @@ -2458,7 +2445,16 @@ static int ext4_da_writepages(struct address_space *mapping, * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - return 0; + goto out_writepages; + + if (ext4_should_journal_data(inode)) { + struct blk_plug plug; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + goto out_writepages; + } /* * If the filesystem has aborted, it is read-only, so return @@ -2466,158 +2462,151 @@ static int ext4_da_writepages(struct address_space *mapping, * will obscure the real source of the problem. We test * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because * the latter could be true if the filesystem is mounted - * read-only, and in that case, ext4_da_writepages should + * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) - return -EROFS; + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ret = -EROFS; + goto out_writepages; + } + + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert up to one extent per block in + * the page and we may dirty the inode. + */ + rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); + } + + /* + * If we have inline data and arrive here, it means that + * we will soon create the block for the 1st page, so + * we'd better clear the inline data here. + */ + if (ext4_has_inline_data(inode)) { + /* Just inode will be modified... */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + BUG_ON(ext4_test_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA)); + ext4_destroy_inline_data(handle, inode); + ext4_journal_stop(handle); + } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - range_cyclic = wbc->range_cyclic; if (wbc->range_cyclic) { - index = mapping->writeback_index; - if (index) + writeback_index = mapping->writeback_index; + if (writeback_index) cycled = 0; - wbc->range_start = index << PAGE_CACHE_SHIFT; - wbc->range_end = LLONG_MAX; - wbc->range_cyclic = 0; - end = -1; + mpd.first_page = writeback_index; + mpd.last_page = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - } - - /* - * This works around two forms of stupidity. The first is in - * the writeback code, which caps the maximum number of pages - * written to be 1024 pages. This is wrong on multiple - * levels; different architectues have a different page size, - * which changes the maximum amount of data which gets - * written. Secondly, 4 megabytes is way too small. XFS - * forces this value to be 16 megabytes by multiplying - * nr_to_write parameter by four, and then relies on its - * allocator to allocate larger extents to make them - * contiguous. Unfortunately this brings us to the second - * stupidity, which is that ext4's mballoc code only allocates - * at most 2048 blocks. So we force contiguous writes up to - * the number of dirty blocks in the inode, or - * sbi->max_writeback_mb_bump whichever is smaller. - */ - max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); - if (!range_cyclic && range_whole) { - if (wbc->nr_to_write == LONG_MAX) - desired_nr_to_write = wbc->nr_to_write; - else - desired_nr_to_write = wbc->nr_to_write * 8; - } else - desired_nr_to_write = ext4_num_dirty_pages(inode, index, - max_pages); - if (desired_nr_to_write > max_pages) - desired_nr_to_write = max_pages; - - if (wbc->nr_to_write < desired_nr_to_write) { - nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; - wbc->nr_to_write = desired_nr_to_write; + mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; + mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; } + mpd.inode = inode; + mpd.wbc = wbc; + ext4_io_submit_init(&mpd.io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); - + tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); + done = false; blk_start_plug(&plug); - while (!ret && wbc->nr_to_write > 0) { + while (!done && mpd.first_page <= mpd.last_page) { + /* For each extent of pages we use new io_end */ + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd.io_submit.io_end) { + ret = -ENOMEM; + break; + } /* - * we insert one extent at a time. So we need - * credit needed for single extent allocation. - * journalled mode is currently not supported - * by delalloc + * We have two constraints: We find one extent to map and we + * must always write out whole page (makes a difference when + * blocksize < pagesize) so that we don't block on IO when we + * try to write out the rest of the page. Journalled mode is + * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); - /* start a new transaction*/ - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - needed_blocks); + /* start a new transaction */ + handle = ext4_journal_start_with_reserve(inode, + EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); - blk_finish_plug(&plug); - goto out_writepages; + /* Release allocated io_end */ + ext4_put_io_end(mpd.io_submit.io_end); + break; } - /* - * Now call write_cache_pages_da() to find the next - * contiguous region of logical blocks that need - * blocks to be allocated by ext4 and submit them. - */ - ret = write_cache_pages_da(handle, mapping, - wbc, &mpd, &done_index); - /* - * If we have a contiguous extent of pages and we - * haven't done the I/O yet, map the blocks and submit - * them for I/O. - */ - if (!mpd.io_done && mpd.next_page != mpd.first_page) { - mpage_da_map_and_submit(&mpd); - ret = MPAGE_DA_EXTENT_TAIL; + trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); + ret = mpage_prepare_extent_to_map(&mpd); + if (!ret) { + if (mpd.map.m_len) + ret = mpage_map_and_submit_extent(handle, &mpd, + &give_up_on_write); + else { + /* + * We scanned the whole range (or exhausted + * nr_to_write), submitted what was mapped and + * didn't find anything needing mapping. We are + * done. + */ + done = true; + } } - trace_ext4_da_write_pages(inode, &mpd); - wbc->nr_to_write -= mpd.pages_written; - ext4_journal_stop(handle); - - if ((mpd.retval == -ENOSPC) && sbi->s_journal) { - /* commit the transaction which would + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, give_up_on_write); + /* Drop our io_end reference we got from init */ + ext4_put_io_end(mpd.io_submit.io_end); + + if (ret == -ENOSPC && sbi->s_journal) { + /* + * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; - } else if (ret == MPAGE_DA_EXTENT_TAIL) { - /* - * Got one extent now try with rest of the pages. - * If mpd.retval is set -EIO, journal is aborted. - * So we don't need to write any more. - */ - pages_written += mpd.pages_written; - ret = mpd.retval; - io_done = 1; - } else if (wbc->nr_to_write) - /* - * There is no more writeout needed - * or we requested for a noblocking writeout - * and we found the device congested - */ + continue; + } + /* Fatal error - ENOMEM, EIO... */ + if (ret) break; } blk_finish_plug(&plug); - if (!io_done && !cycled) { + if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; - index = 0; - wbc->range_start = index << PAGE_CACHE_SHIFT; - wbc->range_end = mapping->writeback_index - 1; + mpd.last_page = writeback_index - 1; + mpd.first_page = 0; goto retry; } /* Update index */ - wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* - * set the writeback_index so that range_cyclic + * Set the writeback_index so that range_cyclic * mode will write it back later */ - mapping->writeback_index = done_index; + mapping->writeback_index = mpd.first_page; out_writepages: - wbc->nr_to_write -= nr_to_writebump; - wbc->range_start = range_start; - trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); return ret; } @@ -2720,7 +2709,7 @@ retry_journal: goto retry_grab; } /* In case writeback began while the page was unlocked */ - wait_on_page_writeback(page); + wait_for_stable_page(page); ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); if (ret < 0) { @@ -2829,7 +2818,8 @@ static int ext4_da_write_end(struct file *file, return ret ? ret : copied; } -static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +static void ext4_da_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { /* * Drop reserved blocks @@ -2838,10 +2828,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset) if (!page_has_buffers(page)) goto out; - ext4_da_page_release_reservation(page, offset); + ext4_da_page_release_reservation(page, offset, length); out: - ext4_invalidatepage(page, offset); + ext4_invalidatepage(page, offset, length); return; } @@ -2864,7 +2854,7 @@ int ext4_alloc_da_blocks(struct inode *inode) * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * - * ext4_da_writepages() -> + * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() @@ -2989,37 +2979,40 @@ ext4_readpages(struct file *file, struct address_space *mapping, return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } -static void ext4_invalidatepage(struct page *page, unsigned long offset) +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - trace_ext4_invalidatepage(page, offset); + trace_ext4_invalidatepage(page, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); - block_invalidatepage(page, offset); + block_invalidatepage(page, offset, length); } static int __ext4_journalled_invalidatepage(struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); - trace_ext4_journalled_invalidatepage(page, offset); + trace_ext4_journalled_invalidatepage(page, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ - if (offset == 0) + if (offset == 0 && length == PAGE_CACHE_SIZE) ClearPageChecked(page); - return jbd2_journal_invalidatepage(journal, page, offset); + return jbd2_journal_invalidatepage(journal, page, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidatepage(struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { - WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); + WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -3061,15 +3054,13 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private, int ret, - bool is_async) + ssize_t size, void *private) { - struct inode *inode = file_inode(iocb->ki_filp); ext4_io_end_t *io_end = iocb->private; - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - goto out; + /* if not async direct IO just return */ + if (!io_end) + return; ext_debug("ext4_end_io_dio(): io_end 0x%p " "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", @@ -3077,25 +3068,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, size); iocb->private = NULL; - - /* if not aio dio with unwritten extents, just free io and return */ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { - ext4_free_io_end(io_end); -out: - inode_dio_done(inode); - if (is_async) - aio_complete(iocb, ret, 0); - return; - } - io_end->offset = offset; io_end->size = size; - if (is_async) { - io_end->iocb = iocb; - io_end->result = ret; - } - - ext4_add_complete_io(io_end); + ext4_put_io_end(io_end); } /* @@ -3103,9 +3078,9 @@ out: * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * - * For holes, we fallocate those blocks, mark them as uninitialized + * For holes, we fallocate those blocks, mark them as unwritten * If those blocks were preallocated, we mark sure they are split, but - * still keep the range to write as uninitialized. + * still keep the range to write as unwritten. * * The unwritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we @@ -3118,29 +3093,36 @@ out: * */ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; - size_t count = iov_length(iov, nr_segs); + size_t count = iov_iter_count(iter); int overwrite = 0; get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; + ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ if (rw != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + return ext4_ind_direct_IO(rw, iocb, iter, offset); BUG_ON(iocb->private == NULL); + /* + * Make all waiters for direct IO properly wait also for extent + * conversion. This also disallows race between truncate() and + * overwrite DIO as i_dio_count needs to be incremented under i_mutex. + */ + if (rw == WRITE) + atomic_inc(&inode->i_dio_count); + /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); if (overwrite) { - atomic_inc(&inode->i_dio_count); down_read(&EXT4_I(inode)->i_data_sem); mutex_unlock(&inode->i_mutex); } @@ -3149,12 +3131,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * We could direct write to holes and fallocate. * * Allocated blocks to fill the hole are marked as - * uninitialized to prevent parallel buffered read to expose + * unwritten to prevent parallel buffered read to expose * the stale data before DIO complete the data IO. * * As to previously fallocated extents, ext4 get_block will * just simply mark the buffer mapped but still keep the - * extents uninitialized. + * extents unwritten. * * For non AIO case, we will convert those unwritten extents * to written after return back from blockdev_direct_IO. @@ -3167,13 +3149,15 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, iocb->private = NULL; ext4_inode_aio_set(inode, NULL); if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); + io_end = ext4_init_io_end(inode, GFP_NOFS); if (!io_end) { ret = -ENOMEM; goto retake_lock; } - io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; + /* + * Grab reference for DIO. Will be dropped in ext4_end_io_dio() + */ + iocb->private = ext4_get_io_end(io_end); /* * we save the io structure for current async direct * IO, so that later ext4_map_blocks() could flag the @@ -3190,40 +3174,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, dio_flags = DIO_LOCKING; } ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, + inode->i_sb->s_bdev, iter, + offset, get_block_func, ext4_end_io_dio, NULL, dio_flags); - if (iocb->private) - ext4_inode_aio_set(inode, NULL); /* - * The io_end structure takes a reference to the inode, that - * structure needs to be destroyed and the reference to the - * inode need to be dropped, when IO is complete, even with 0 - * byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will - * be destroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since VFS - * direct IO won't invoke the end_io call back function, we - * need to free the end_io structure here. + * Put our reference to io_end. This can free the io_end structure e.g. + * in sync IO case or in case of error. It can even perform extent + * conversion if all bios we submitted finished before we got here. + * Note that in that case iocb->private can be already set to NULL + * here. */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); - iocb->private = NULL; - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + if (io_end) { + ext4_inode_aio_set(inode, NULL); + ext4_put_io_end(io_end); + /* + * When no IO was submitted ext4_end_io_dio() was not + * called so we have to put iocb's reference. + */ + if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { + WARN_ON(iocb->private != io_end); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + ext4_put_io_end(io_end); + iocb->private = NULL; + } + } + if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { int err; /* * for non AIO case, since the IO is already * completed, we could do the conversion right here */ - err = ext4_convert_unwritten_extents(inode, + err = ext4_convert_unwritten_extents(NULL, inode, offset, ret); if (err < 0) ret = err; @@ -3231,9 +3217,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, } retake_lock: + if (rw == WRITE) + inode_dio_done(inode); /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) { - inode_dio_done(inode); up_read(&EXT4_I(inode)->i_data_sem); mutex_lock(&inode->i_mutex); } @@ -3242,11 +3229,11 @@ retake_lock: } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; /* @@ -3259,13 +3246,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_has_inline_data(inode)) return 0; - trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); + trace_ext4_direct_IO_enter(inode, offset, count, rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + ret = ext4_ext_direct_IO(rw, iocb, iter, offset); else - ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); - trace_ext4_direct_IO_exit(inode, offset, - iov_length(iov, nr_segs), rw, ret); + ret = ext4_ind_direct_IO(rw, iocb, iter, offset); + trace_ext4_direct_IO_exit(inode, offset, count, rw, ret); return ret; } @@ -3292,6 +3278,7 @@ static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, + .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .bmap = ext4_bmap, @@ -3307,6 +3294,7 @@ static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, + .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .set_page_dirty = ext4_journalled_set_page_dirty, @@ -3322,7 +3310,7 @@ static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, .writepage = ext4_writepage, - .writepages = ext4_da_writepages, + .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .bmap = ext4_bmap, @@ -3355,20 +3343,22 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } - /* - * ext4_discard_partial_page_buffers() - * Wrapper function for ext4_discard_partial_page_buffers_no_lock. - * This function finds and locks the page containing the offset - * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. - * Calling functions that already have the page locked should call - * ext4_discard_partial_page_buffers_no_lock directly. + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' */ -int ext4_discard_partial_page_buffers(handle_t *handle, - struct address_space *mapping, loff_t from, - loff_t length, int flags) +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) { + ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, max, pos; + ext4_lblk_t iblock; struct inode *inode = mapping->host; + struct buffer_head *bh; struct page *page; int err = 0; @@ -3377,67 +3367,12 @@ int ext4_discard_partial_page_buffers(handle_t *handle, if (!page) return -ENOMEM; - err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, - from, length, flags); - - unlock_page(page); - page_cache_release(page); - return err; -} - -/* - * ext4_discard_partial_page_buffers_no_lock() - * Zeros a page range of length 'length' starting from offset 'from'. - * Buffer heads that correspond to the block aligned regions of the - * zeroed range will be unmapped. Unblock aligned regions - * will have the corresponding buffer head mapped if needed so that - * that region of the page can be updated with the partial zero out. - * - * This function assumes that the page has already been locked. The - * The range to be discarded must be contained with in the given page. - * If the specified range exceeds the end of the page it will be shortened - * to the end of the page that corresponds to 'from'. This function is - * appropriate for updating a page and it buffer heads to be unmapped and - * zeroed for blocks that have been either released, or are going to be - * released. - * - * handle: The journal handle - * inode: The files inode - * page: A locked page that contains the offset "from" - * from: The starting byte offset (from the beginning of the file) - * to begin discarding - * len: The length of bytes to discard - * flags: Optional flags that may be used: - * - * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED - * Only zero the regions of the page whose buffer heads - * have already been unmapped. This flag is appropriate - * for updating the contents of a page whose blocks may - * have already been released, and we only want to zero - * out the regions that correspond to those released blocks. - * - * Returns zero on success or negative on failure. - */ -static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, - struct inode *inode, struct page *page, loff_t from, - loff_t length, int flags) -{ - ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; - unsigned int offset = from & (PAGE_CACHE_SIZE-1); - unsigned int blocksize, max, pos; - ext4_lblk_t iblock; - struct buffer_head *bh; - int err = 0; - blocksize = inode->i_sb->s_blocksize; - max = PAGE_CACHE_SIZE - offset; - - if (index != page->index) - return -EINVAL; + max = blocksize - (offset & (blocksize - 1)); /* * correct length if it does not fall between - * 'from' and the end of the page + * 'from' and the end of the block */ if (length > max || length < 0) length = max; @@ -3455,106 +3390,111 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, iblock++; pos += blocksize; } + if (buffer_freed(bh)) { + BUFFER_TRACE(bh, "freed: skip"); + goto unlock; + } + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "unmapped"); + ext4_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) { + BUFFER_TRACE(bh, "still unmapped"); + goto unlock; + } + } - pos = offset; - while (pos < offset + length) { - unsigned int end_of_block, range_to_discard; - - err = 0; - - /* The length of space left to zero and unmap */ - range_to_discard = offset + length - pos; - - /* The length of space until the end of the block */ - end_of_block = blocksize - (pos & (blocksize-1)); + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); - /* - * Do not unmap or zero past end of block - * for this buffer head - */ - if (range_to_discard > end_of_block) - range_to_discard = end_of_block; + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + if (ext4_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + zero_user(page, offset, length); + BUFFER_TRACE(bh, "zeroed end of block"); + if (ext4_should_journal_data(inode)) { + err = ext4_handle_dirty_metadata(handle, inode, bh); + } else { + err = 0; + mark_buffer_dirty(bh); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) + err = ext4_jbd2_file_inode(handle, inode); + } - /* - * Skip this buffer head if we are only zeroing unampped - * regions of the page - */ - if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && - buffer_mapped(bh)) - goto next; - - /* If the range is block aligned, unmap */ - if (range_to_discard == blocksize) { - clear_buffer_dirty(bh); - bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); - clear_buffer_uptodate(bh); - zero_user(page, pos, range_to_discard); - BUFFER_TRACE(bh, "Buffer discarded"); - goto next; - } +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} - /* - * If this block is not completely contained in the range - * to be discarded, then it is not going to be released. Because - * we need to keep this block, we need to make sure this part - * of the page is uptodate before we modify it by writeing - * partial zeros on it. - */ - if (!buffer_mapped(bh)) { - /* - * Buffer head must be mapped before we can read - * from the block - */ - BUFFER_TRACE(bh, "unmapped"); - ext4_get_block(inode, iblock, bh, 0); - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(bh)) { - BUFFER_TRACE(bh, "still unmapped"); - goto next; - } - } +/* + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ +static int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; - /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) - set_buffer_uptodate(bh); + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); - if (!buffer_uptodate(bh)) { - err = -EIO; - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - /* Uhhuh. Read error. Complain and punt.*/ - if (!buffer_uptodate(bh)) - goto next; - } + return ext4_block_zero_page_range(handle, mapping, from, length); +} - if (ext4_should_journal_data(inode)) { - BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); - if (err) - goto next; - } +int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t length) +{ + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + unsigned partial_start, partial_end; + ext4_fsblk_t start, end; + loff_t byte_end = (lstart + length - 1); + int err = 0; - zero_user(page, pos, range_to_discard); + partial_start = lstart & (sb->s_blocksize - 1); + partial_end = byte_end & (sb->s_blocksize - 1); - err = 0; - if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); - } else - mark_buffer_dirty(bh); + start = lstart >> sb->s_blocksize_bits; + end = byte_end >> sb->s_blocksize_bits; - BUFFER_TRACE(bh, "Partial buffer zeroed"); -next: - bh = bh->b_this_page; - iblock++; - pos += range_to_discard; + /* Handle partial zero within the single block */ + if (start == end && + (partial_start || (partial_end != sb->s_blocksize - 1))) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, length); + return err; } - + /* Handle partial zero out on the start of the range */ + if (partial_start) { + err = ext4_block_zero_page_range(handle, mapping, + lstart, sb->s_blocksize); + if (err) + return err; + } + /* Handle partial zero out on the end of the range */ + if (partial_end != sb->s_blocksize - 1) + err = ext4_block_zero_page_range(handle, mapping, + byte_end - partial_end, + partial_end + 1); return err; } @@ -3580,14 +3520,12 @@ int ext4_can_truncate(struct inode *inode) * Returns: 0 on success or negative on failure */ -int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) +int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) { - struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; - loff_t first_page, last_page, page_len; - loff_t first_page_offset, last_page_offset; + loff_t first_block_offset, last_block_offset; handle_t *handle; unsigned int credits; int ret = 0; @@ -3595,12 +3533,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (EXT4_SB(sb)->s_cluster_ratio > 1) { - /* TODO: Add support for bigalloc file systems */ - return -EOPNOTSUPP; - } - - trace_ext4_punch_hole(inode, offset, length); + trace_ext4_punch_hole(inode, offset, length, 0); /* * Write out all dirty pages to avoid race conditions @@ -3614,15 +3547,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) } mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3638,23 +3562,28 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) offset; } - first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - last_page = (offset + length) >> PAGE_CACHE_SHIFT; - - first_page_offset = first_page << PAGE_CACHE_SHIFT; - last_page_offset = last_page << PAGE_CACHE_SHIFT; + if (offset & (sb->s_blocksize - 1) || + (offset + length) & (sb->s_blocksize - 1)) { + /* + * Attach jinode to inode for jbd2 if we do any zeroing of + * partial block + */ + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + goto out_mutex; - /* Now release the pages */ - if (last_page_offset > first_page_offset) { - truncate_pagecache_range(inode, first_page_offset, - last_page_offset - 1); } + first_block_offset = round_up(offset, sb->s_blocksize); + last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + + /* Now release the pages and zero block aligned part of pages*/ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + /* Wait all existing dio workers, newcomers will block on i_mutex */ ext4_inode_block_unlocked_dio(inode); - ret = ext4_flush_unwritten_io(inode); - if (ret) - goto out_dio; inode_dio_wait(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -3668,66 +3597,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) goto out_dio; } - /* - * Now we need to zero out the non-page-aligned data in the - * pages at the start and tail of the hole, and unmap the - * buffer heads for the block aligned regions of the page that - * were completely zeroed. - */ - if (first_page > last_page) { - /* - * If the file space being truncated is contained - * within a page just zero out and unmap the middle of - * that page - */ - ret = ext4_discard_partial_page_buffers(handle, - mapping, offset, length, 0); - - if (ret) - goto out_stop; - } else { - /* - * zero out and unmap the partial page that contains - * the start of the hole - */ - page_len = first_page_offset - offset; - if (page_len > 0) { - ret = ext4_discard_partial_page_buffers(handle, mapping, - offset, page_len, 0); - if (ret) - goto out_stop; - } - - /* - * zero out and unmap the partial page that contains - * the end of the hole - */ - page_len = offset + length - last_page_offset; - if (page_len > 0) { - ret = ext4_discard_partial_page_buffers(handle, mapping, - last_page_offset, page_len, 0); - if (ret) - goto out_stop; - } - } - - /* - * If i_size is contained in the last page, we need to - * unmap and zero the partial page after i_size - */ - if (inode->i_size >> PAGE_CACHE_SHIFT == last_page && - inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - if (page_len > 0) { - ret = ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0); - - if (ret) - goto out_stop; - } - } + ret = ext4_zero_partial_blocks(handle, inode, offset, + length); + if (ret) + goto out_stop; first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); @@ -3754,10 +3627,15 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) ret = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* Now release the pages again to reduce race window */ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -3769,6 +3647,31 @@ out_mutex: return ret; } +int ext4_inode_attach_jinode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct jbd2_inode *jinode; + + if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) + return 0; + + jinode = jbd2_alloc_inode(GFP_KERNEL); + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; + } + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; + } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + return 0; +} + /* * ext4_truncate() * @@ -3803,11 +3706,10 @@ void ext4_truncate(struct inode *inode) unsigned int credits; handle_t *handle; struct address_space *mapping = inode->i_mapping; - loff_t page_len; /* * There is a possibility that we're either freeing the inode - * or it completely new indode. In those cases we might not + * or it's a completely new inode. In those cases we might not * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) @@ -3830,11 +3732,11 @@ void ext4_truncate(struct inode *inode) return; } - /* - * finish any pending end_io work so we won't run the risk of - * converting any truncated blocks to initialized later - */ - ext4_flush_unwritten_io(inode); + /* If we zero-out tail of the page, we have to create jinode for jbd2 */ + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { + if (ext4_inode_attach_jinode(inode) < 0) + return; + } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -3847,14 +3749,8 @@ void ext4_truncate(struct inode *inode) return; } - if (inode->i_size % PAGE_CACHE_SIZE != 0) { - page_len = PAGE_CACHE_SIZE - - (inode->i_size & (PAGE_CACHE_SIZE - 1)); - - if (ext4_discard_partial_page_buffers(handle, - mapping, inode->i_size, page_len, 0)) - goto out_stop; - } + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); /* * We add the inode to the orphan list, so that if this @@ -4053,18 +3949,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) void ext4_set_inode_flags(struct inode *inode) { unsigned int flags = EXT4_I(inode)->i_flags; + unsigned int new_fl = 0; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); if (flags & EXT4_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4283,11 +4181,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - inode->i_version |= - (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } } ret = 0; @@ -4410,12 +4310,15 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; + struct super_block *sb = inode->i_sb; int err = 0, rc, block; - int need_datasync = 0; + int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; - /* For fields not not tracking in the in-memory inode, + spin_lock(&ei->i_raw_lock); + + /* For fields not tracked in the in-memory inode, * initialise them to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); @@ -4453,12 +4356,13 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - if (ext4_inode_blocks_set(handle, raw_inode, ei)) + if (ext4_inode_blocks_set(handle, raw_inode, ei)) { + spin_unlock(&ei->i_raw_lock); goto out_brelse; + } raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); @@ -4467,24 +4371,11 @@ static int ext4_do_update_inode(handle_t *handle, need_datasync = 1; } if (ei->i_disksize > 0x7fffffffULL) { - struct super_block *sb = inode->i_sb; if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || EXT4_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT4_GOOD_OLD_REV)) { - /* If this is the first large file - * created, add a flag to the superblock. - */ - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - goto out_brelse; - ext4_update_dynamic_rev(sb); - EXT4_SET_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE); - ext4_handle_sync(handle); - err = ext4_handle_dirty_super(handle, sb); - } + cpu_to_le32(EXT4_GOOD_OLD_REV)) + set_large_file = 1; } raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { @@ -4503,22 +4394,37 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_block[block] = ei->i_data[block]; } - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(inode->i_version >> 32); - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } } ext4_inode_csum_set(inode, raw_inode, ei); + spin_unlock(&ei->i_raw_lock); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) err = rc; ext4_clear_inode_state(inode, EXT4_STATE_NEW); - + if (set_large_file) { + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + ext4_handle_sync(handle); + err = ext4_handle_dirty_super(handle, sb); + } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_brelse: brelse(bh); @@ -4531,21 +4437,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4557,15 +4462,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -4575,7 +4480,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_force_commit(inode->i_sb); @@ -4585,7 +4495,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; - if (wbc->sync_mode == WB_SYNC_ALL) + /* + * sync(2) will flush the whole buffer cache. No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, @@ -4623,7 +4537,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) inode->i_size >> PAGE_CACHE_SHIFT); if (!page) return; - ret = __ext4_journalled_invalidatepage(page, offset); + ret = __ext4_journalled_invalidatepage(page, offset, + PAGE_CACHE_SIZE - offset); unlock_page(page); page_cache_release(page); if (ret != -EBUSY) @@ -4703,7 +4618,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_journal_stop(handle); } - if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + handle_t *handle; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4711,73 +4627,73 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } - } - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && - (attr->ia_size < inode->i_size)) { - handle_t *handle; + if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) + inode_inc_iversion(inode); - handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } - if (ext4_handle_valid(handle)) { - error = ext4_orphan_add(handle, inode); - orphan = 1; - } - EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; - ext4_journal_stop(handle); - - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, + if (S_ISREG(inode->i_mode) && + (attr->ia_size < inode->i_size)) { + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, attr->ia_size); - if (error) { - /* Do as much error cleanup as possible */ - handle = ext4_journal_start(inode, - EXT4_HT_INODE, 3); - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); + if (error) goto err_out; - } - ext4_orphan_del(handle, inode); - orphan = 0; - ext4_journal_stop(handle); + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); goto err_out; } - } - } - - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != inode->i_size) { - loff_t oldsize = inode->i_size; - - i_size_write(inode, attr->ia_size); - /* - * Blocks are going to be removed from the inode. Wait - * for dio in flight. Temporarily disable - * dioread_nolock to prevent livelock. - */ - if (orphan) { - if (!ext4_should_journal_data(inode)) { - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ext4_inode_resume_unlocked_dio(inode); - } else - ext4_wait_for_tail_page_commit(inode); + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; } + down_write(&EXT4_I(inode)->i_data_sem); + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; /* - * Truncate pagecache after we've waited for commit - * in data=journal mode to make pages freeable. + * We have to update i_size under i_data_sem together + * with i_disksize to avoid races with writeback code + * running ext4_wb_update_i_disksize(). */ - truncate_pagecache(inode, oldsize, inode->i_size); + if (!error) + i_size_write(inode, attr->ia_size); + up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); + if (error) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + } else + i_size_write(inode, attr->ia_size); + + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. Temporarily disable + * dioread_nolock to prevent livelock. + */ + if (orphan) { + if (!ext4_should_journal_data(inode)) { + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } else + ext4_wait_for_tail_page_commit(inode); } - ext4_truncate(inode); + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. + */ + truncate_pagecache(inode, inode->i_size); } + /* + * We want to call ext4_truncate() even if attr->ia_size == + * inode->i_size for cases like truncation of fallocated space + */ + if (attr->ia_valid & ATTR_SIZE) + ext4_truncate(inode); if (!rc) { setattr_copy(inode, attr); @@ -4792,7 +4708,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_orphan_del(NULL, inode); if (!rc && (ia_valid & ATTR_MODE)) - rc = ext4_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext4_std_error(inode->i_sb, error); @@ -4805,12 +4721,21 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode; - unsigned long delalloc_blocks; + unsigned long long delalloc_blocks; inode = dentry->d_inode; generic_fillattr(inode, stat); /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others doen't incorrectly think the file is completely sparse. + */ + if (unlikely(ext4_has_inline_data(inode))) + stat->blocks += (stat->size + 511) >> 9; + + /* * We can't update i_blocks if the block allocation is delayed * otherwise in the case of system crash before the real block * allocation is done, we will have i_blocks inconsistent with @@ -4821,17 +4746,17 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, * blocks for this file. */ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), - EXT4_I(inode)->i_reserved_data_blocks); - - stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; + EXT4_I(inode)->i_reserved_data_blocks); + stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); return 0; } -static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_index_trans_blocks(struct inode *inode, int lblocks, + int pextents) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return ext4_ind_trans_blocks(inode, nrblocks, chunk); - return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); + return ext4_ind_trans_blocks(inode, lblocks); + return ext4_ext_index_trans_blocks(inode, pextents); } /* @@ -4845,7 +4770,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -4853,14 +4779,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) int ret = 0; /* - * How many index blocks need to touch to modify nrblocks? - * The "Chunk" flag indicating whether the nrblocks is - * physically contiguous on disk - * - * For Direct IO and fallocate, they calls get_block to allocate - * one single extent at a time, so they could set the "Chunk" flag + * How many index blocks need to touch to map @lblocks logical blocks + * to @pextents physical extents? */ - idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); ret = idxblocks; @@ -4868,12 +4790,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks; - if (chunk) - groups += 1; - else - groups += nrblocks; - + groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -4904,7 +4821,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) int bpp = ext4_journal_blocks_per_page(inode); int ret; - ret = ext4_meta_trans_blocks(inode, bpp, 0); + ret = ext4_meta_trans_blocks(inode, bpp, bpp); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) |
