diff options
Diffstat (limited to 'fs/ocfs2/aops.c')
| -rw-r--r-- | fs/ocfs2/aops.c | 645 |
1 files changed, 386 insertions, 259 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 90383ed6100..4a231a166cf 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -27,8 +27,8 @@ #include <linux/swap.h> #include <linux/pipe_fs_i.h> #include <linux/mpage.h> +#include <linux/quotaops.h> -#define MLOG_MASK_PREFIX ML_FILE_IO #include <cluster/masklog.h> #include "ocfs2.h" @@ -43,6 +43,8 @@ #include "suballoc.h" #include "super.h" #include "symlink.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -57,8 +59,9 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); void *kaddr; - mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, - (unsigned long long)iblock, bh_result, create); + trace_ocfs2_symlink_get_block( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock, bh_result, create); BUG_ON(ocfs2_inode_is_fast_symlink(inode)); @@ -68,24 +71,16 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, goto bail; } - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - &bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { mlog_errno(status); goto bail; } fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)le64_to_cpu(fe->i_blkno), 7, - fe->i_signature); - goto bail; - } - if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, le32_to_cpu(fe->i_clusters))) { + err = -ENOMEM; mlog(ML_ERROR, "block offset is outside the allocated size: " "%llu\n", (unsigned long long)iblock); goto bail; @@ -98,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, iblock; buffer_cache_bh = sb_getblk(osb->sb, blkno); if (!buffer_cache_bh) { + err = -ENOMEM; mlog(ML_ERROR, "couldn't getblock for symlink!\n"); goto bail; } @@ -108,7 +104,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, * copy, the data is still good. */ if (buffer_jbd(buffer_cache_bh) && ocfs2_inode_is_new(inode)) { - kaddr = kmap_atomic(bh_result->b_page, KM_USER0); + kaddr = kmap_atomic(bh_result->b_page); if (!kaddr) { mlog(ML_ERROR, "couldn't kmap!\n"); goto bail; @@ -116,7 +112,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, memcpy(kaddr + (bh_result->b_size * iblock), buffer_cache_bh->b_data, bh_result->b_size); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); set_buffer_uptodate(bh_result); } brelse(buffer_cache_bh); @@ -128,15 +124,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, err = 0; bail: - if (bh) - brelse(bh); + brelse(bh); - mlog_exit(err); return err; } -static int ocfs2_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { int err = 0; unsigned int ext_flags; @@ -144,8 +138,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, u64 p_blkno, count, past_eof; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, - (unsigned long long)iblock, bh_result, create); + trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock, bh_result, create); if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", @@ -173,11 +167,18 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, * ocfs2 never allocates in this function - the only time we * need to use BH_New is when we're extending i_size on a file * system which doesn't support holes, in which case BH_New - * allows block_prepare_write() to zero. + * allows __block_write_begin() to zero. + * + * If we see this on a sparse file system, then a truncate has + * raced us and removed the cluster. In this case, we clear + * the buffers dirty and uptodate bits and let the buffer code + * ignore it as a hole. */ - mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), - "ino %lu, iblock %llu\n", inode->i_ino, - (unsigned long long)iblock); + if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) { + clear_buffer_dirty(bh_result); + clear_buffer_uptodate(bh_result); + goto bail; + } /* Treat the unwritten extent as a hole for zeroing purposes. */ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) @@ -195,21 +196,21 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); dump_stack(); + goto bail; } + } - past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, - (unsigned long long)past_eof); + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - if (create && (iblock >= past_eof)) - set_buffer_new(bh_result); - } + trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)past_eof); + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); bail: if (err < 0) err = -EIO; - mlog_exit(err); return err; } @@ -229,7 +230,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); if (size > PAGE_CACHE_SIZE || - size > ocfs2_max_inline_data(inode->i_sb)) { + size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu", (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -237,13 +238,13 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, return -EROFS; } - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); if (size) memcpy(kaddr, di->id2.i_data.id_data, size); /* Clear the remaining part of the page */ memset(kaddr + size, 0, PAGE_CACHE_SIZE - size); flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); SetPageUptodate(page); @@ -254,13 +255,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) { int ret; struct buffer_head *di_bh = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); BUG_ON(!PageLocked(page)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); - ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -281,7 +280,8 @@ static int ocfs2_readpage(struct file *file, struct page *page) loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; int ret, unlock = 1; - mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); + trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, + (page ? page->index : 0)); ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); if (ret != 0) { @@ -292,7 +292,15 @@ static int ocfs2_readpage(struct file *file, struct page *page) } if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + /* + * Unlock the page and cycle ip_alloc_sem so that we don't + * busyloop waiting for ip_alloc_sem to unlock + */ ret = AOP_TRUNCATED_PAGE; + unlock_page(page); + unlock = 0; + down_read(&oi->ip_alloc_sem); + up_read(&oi->ip_alloc_sem); goto out_inode_unlock; } @@ -326,7 +334,6 @@ out_inode_unlock: out: if (unlock) unlock_page(page); - mlog_exit(ret); return ret; } @@ -399,30 +406,11 @@ out_unlock: */ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) { - int ret; - - mlog_entry("(0x%p)\n", page); - - ret = block_write_full_page(page, ocfs2_get_block, wbc); - - mlog_exit(ret); - - return ret; -} - -/* - * This is called from ocfs2_write_zero_page() which has handled it's - * own cluster locking and has ensured allocation exists for those - * blocks to be written. - */ -int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, - unsigned from, unsigned to) -{ - int ret; - - ret = block_prepare_write(page, from, to, ocfs2_get_block); + trace_ocfs2_writepage( + (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno, + page->index); - return ret; + return block_write_full_page(page, ocfs2_get_block, wbc); } /* Taken from ext3. We don't necessarily need the full blown @@ -461,39 +449,6 @@ int walk_page_buffers( handle_t *handle, return ret; } -handle_t *ocfs2_start_walk_page_trans(struct inode *inode, - struct page *page, - unsigned from, - unsigned to) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - handle_t *handle = NULL; - int ret = 0; - - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (!handle) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - - if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, NULL, - ocfs2_journal_dirty_data); - if (ret < 0) - mlog_errno(ret); - } -out: - if (ret) { - if (handle) - ocfs2_commit_trans(osb, handle); - handle = ERR_PTR(ret); - } - return handle; -} - static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) { sector_t status; @@ -501,7 +456,8 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) int err = 0; struct inode *inode = mapping->host; - mlog_entry("(block = %llu)\n", (unsigned long long)block); + trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)block); /* We don't need to lock journal system files, since they aren't * accessed concurrently from multiple nodes. @@ -535,8 +491,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) bail: status = err ? 0 : p_blkno; - mlog_exit((int)status); - return status; } @@ -552,6 +506,9 @@ bail: * * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); + * + * Note that we never bother to allocate blocks here, and thus ignore the + * create argument. */ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -568,14 +525,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - /* - * Any write past EOF is not allowed because we'd be extending. - */ - if (create && (iblock + max_blocks) > inode_blocks) { - ret = -EIO; - goto bail; - } - /* This figures out the size of the next contiguous block, and * our logical offset */ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, @@ -587,14 +536,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { - ocfs2_error(inode->i_sb, - "Inode %llu has a hole at block %llu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)iblock); - ret = -EROFS; - goto bail; - } + /* We should already CoW the refcounted extent in case of create. */ + BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); /* * get_more_blocks() expects us to describe a hole by clearing @@ -604,20 +547,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, */ if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) map_bh(bh_result, inode->i_sb, p_blkno); - else { - /* - * ocfs2_prepare_inode_for_write() should have caught - * the case where we'd be filling a hole and triggered - * a buffered write instead. - */ - if (create) { - ret = -EIO; - mlog_errno(ret); - goto bail; - } - + else clear_buffer_mapped(bh_result); - } /* make sure we don't map more than max_blocks blocks here as that's all the kernel will handle at this point. */ @@ -628,63 +559,51 @@ bail: return ret; } -/* +/* * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're - * particularly interested in the aio/dio case. Like the core uses - * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from - * truncation on another. + * particularly interested in the aio/dio case. We use the rw_lock DLM lock + * to protect io on one node from truncation on another. */ static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private) { - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + struct inode *inode = file_inode(iocb->ki_filp); int level; /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + if (ocfs2_iocb_is_sem_locked(iocb)) + ocfs2_iocb_clear_sem_locked(iocb); + + if (ocfs2_iocb_is_unaligned_aio(iocb)) { + ocfs2_iocb_clear_unaligned_aio(iocb); + + mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); + } + ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); - if (!level) - up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); } -/* - * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen - * from ext3. PageChecked() bits have been removed as OCFS2 does not - * do journalled data. - */ -static void ocfs2_invalidatepage(struct page *page, unsigned long offset) -{ - journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - - journal_invalidatepage(journal, page, offset); -} - static int ocfs2_releasepage(struct page *page, gfp_t wait) { - journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - if (!page_has_buffers(page)) return 0; - return journal_try_to_free_buffers(journal, page, wait); + return try_to_free_buffers(page); } static ssize_t ocfs2_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, - unsigned long nr_segs) + struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; - int ret; - - mlog_entry_void(); + struct inode *inode = file_inode(file)->i_mapping->host; /* * Fallback to buffered I/O if we see an inode without @@ -693,14 +612,14 @@ static ssize_t ocfs2_direct_IO(int rw, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; - ret = blockdev_direct_IO_no_locking(rw, iocb, inode, - inode->i_sb->s_bdev, iov, offset, - nr_segs, - ocfs2_direct_IO_get_blocks, - ocfs2_dio_end_io); + /* Fallback to buffered I/O if we are appending. */ + if (i_size_read(inode) <= offset) + return 0; - mlog_exit(ret); - return ret; + return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + iter, offset, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); } static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, @@ -747,7 +666,7 @@ static void ocfs2_clear_page_regions(struct page *page, ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); if (from || to) { if (from > cluster_start) @@ -758,7 +677,7 @@ static void ocfs2_clear_page_regions(struct page *page, memset(kaddr + cluster_start, 0, cluster_end - cluster_start); } - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); } /* @@ -783,7 +702,7 @@ static int ocfs2_should_read_blk(struct inode *inode, struct page *page, } /* - * Some of this taken from block_prepare_write(). We already have our + * Some of this taken from __block_write_begin(). We already have our * mapping by now though, and the entire write will be allocating or * it won't, so not much need to use BH_New. * @@ -901,18 +820,17 @@ struct ocfs2_write_cluster_desc { */ unsigned c_new; unsigned c_unwritten; + unsigned c_needs_zero; }; -static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) -{ - return d->c_new || d->c_unwritten; -} - struct ocfs2_write_ctxt { /* Logical cluster position / len of write */ u32 w_cpos; u32 w_clen; + /* First cluster allocated in a nonsparse extend */ + u32 w_first_new_cpos; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; /* @@ -935,11 +853,17 @@ struct ocfs2_write_ctxt { * out in so that future reads from that region will get * zero's. */ - struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; unsigned int w_num_pages; + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; struct page *w_target_page; /* + * w_target_locked is used for page_mkwrite path indicating no unlocking + * against w_target_page in ocfs2_write_end_nolock. + */ + unsigned int w_target_locked:1; + + /* * ocfs2_write_end() uses this to know what the real range to * write in the target should be. */ @@ -972,6 +896,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) { + int i; + + /* + * w_target_locked is only set to true in the page_mkwrite() case. + * The intent is to allow us to lock the target page from write_begin() + * to write_end(). The caller must hold a ref on w_target_page. + */ + if (wc->w_target_locked) { + BUG_ON(!wc->w_target_page); + for (i = 0; i < wc->w_num_pages; i++) { + if (wc->w_target_page == wc->w_pages[i]) { + wc->w_pages[i] = NULL; + break; + } + } + mark_page_accessed(wc->w_target_page); + page_cache_release(wc->w_target_page); + } ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); brelse(wc->w_di_bh); @@ -990,6 +932,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, return -ENOMEM; wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_first_new_cpos = UINT_MAX; cend = (pos + len - 1) >> osb->s_clustersize_bits; wc->w_clen = cend - wc->w_cpos + 1; get_bh(di_bh); @@ -1066,12 +1009,12 @@ static void ocfs2_write_failure(struct inode *inode, for(i = 0; i < wc->w_num_pages; i++) { tmppage = wc->w_pages[i]; - if (ocfs2_should_order_data(inode)) - walk_page_buffers(wc->w_handle, page_buffers(tmppage), - from, to, NULL, - ocfs2_journal_dirty_data); + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(wc->w_handle, inode); - block_commit_write(tmppage, from, to); + block_commit_write(tmppage, from, to); + } } } @@ -1089,6 +1032,12 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, &cluster_start, &cluster_end); + /* treat the write as new if the a hole/lseek spanned across + * the page boundary. + */ + new = new | ((i_size_read(inode) <= page_offset(page)) && + (page_offset(page) <= user_pos)); + if (page == wc->w_target_page) { map_from = user_pos & (PAGE_CACHE_SIZE - 1); map_to = map_from + user_len; @@ -1155,23 +1104,37 @@ out: */ static int ocfs2_grab_pages_for_write(struct address_space *mapping, struct ocfs2_write_ctxt *wc, - u32 cpos, loff_t user_pos, int new, + u32 cpos, loff_t user_pos, + unsigned user_len, int new, struct page *mmap_page) { int ret = 0, i; - unsigned long start, target_index, index; + unsigned long start, target_index, end_index, index; struct inode *inode = mapping->host; + loff_t last_byte; target_index = user_pos >> PAGE_CACHE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For * non allocating write, we just change the one - * page. Otherwise, we'll need a whole clusters worth. + * page. Otherwise, we'll need a whole clusters worth. If we're + * writing past i_size, we only need enough pages to cover the + * last page of the write. */ if (new) { wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); + /* + * We need the index *past* the last page we could possibly + * touch. This is the page past the end of the write or + * i_size, whichever is greater. + */ + last_byte = max(user_pos + user_len, i_size_read(inode)); + BUG_ON(last_byte < 1); + end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + if ((start + wc->w_num_pages) > end_index) + wc->w_num_pages = end_index - start; } else { wc->w_num_pages = 1; start = target_index; @@ -1188,20 +1151,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, */ lock_page(mmap_page); + /* Exit and let the caller retry */ if (mmap_page->mapping != mapping) { + WARN_ON(mmap_page->mapping); unlock_page(mmap_page); - /* - * Sanity check - the locking in - * ocfs2_pagemkwrite() should ensure - * that this code doesn't trigger. - */ - ret = -EINVAL; - mlog_errno(ret); + ret = -EAGAIN; goto out; } page_cache_get(mmap_page); wc->w_pages[i] = mmap_page; + wc->w_target_locked = true; } else { wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); @@ -1211,11 +1171,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, goto out; } } + wait_for_stable_page(wc->w_pages[i]); if (index == target_index) wc->w_target_page = wc->w_pages[i]; } out: + if (ret) + wc->w_target_locked = false; return ret; } @@ -1224,19 +1187,18 @@ out: */ static int ocfs2_write_cluster(struct address_space *mapping, u32 phys, unsigned int unwritten, + unsigned int should_zero, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len) { - int ret, i, new, should_zero = 0; + int ret, i, new; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; + struct ocfs2_extent_tree et; new = phys == 0 ? 1 : 0; - if (new || unwritten) - should_zero = 1; - if (new) { u32 tmp_pos; @@ -1245,10 +1207,10 @@ static int ocfs2_write_cluster(struct address_space *mapping, * any additional semaphores or cluster locks. */ tmp_pos = cpos; - ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, 0, wc->w_di_bh, - wc->w_handle, data_ac, - meta_ac, NULL); + ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, 0, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1266,7 +1228,9 @@ static int ocfs2_write_cluster(struct address_space *mapping, goto out; } } else if (unwritten) { - ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); + ret = ocfs2_mark_extent_written(inode, &et, wc->w_handle, cpos, 1, phys, meta_ac, &wc->w_dealloc); if (ret < 0) { @@ -1306,7 +1270,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, if (tmpret) { mlog_errno(tmpret); if (ret == 0) - tmpret = ret; + ret = tmpret; } } @@ -1346,7 +1310,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, local_len = osb->s_clustersize - cluster_off; ret = ocfs2_write_cluster(mapping, desc->c_phys, - desc->c_unwritten, data_ac, meta_ac, + desc->c_unwritten, + desc->c_needs_zero, + data_ac, meta_ac, wc, desc->c_cpos, pos, local_len); if (ret) { mlog_errno(ret); @@ -1396,14 +1362,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, * newly allocated cluster. */ desc = &wc->w_desc[0]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, &wc->w_target_from, NULL); desc = &wc->w_desc[wc->w_clen - 1]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, NULL, @@ -1452,6 +1418,9 @@ static int ocfs2_populate_write_desc(struct inode *inode, goto out; } + /* We should already CoW the refcountd extent. */ + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + /* * Assume worst case - that we're writing in * the middle of the extent. @@ -1471,13 +1440,28 @@ static int ocfs2_populate_write_desc(struct inode *inode, phys++; } + /* + * If w_first_new_cpos is < UINT_MAX, we have a non-sparse + * file that got extended. w_first_new_cpos tells us + * where the newly allocated clusters are so we can + * zero them. + */ + if (desc->c_cpos >= wc->w_first_new_cpos) { + BUG_ON(phys == 0); + desc->c_needs_zero = 1; + } + desc->c_phys = phys; if (phys == 0) { desc->c_new = 1; + desc->c_needs_zero = 1; *clusters_to_alloc = *clusters_to_alloc + 1; } - if (ext_flags & OCFS2_EXT_UNWRITTEN) + + if (ext_flags & OCFS2_EXT_UNWRITTEN) { desc->c_unwritten = 1; + desc->c_needs_zero = 1; + } num_clusters--; } @@ -1517,8 +1501,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, goto out; } - ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { ocfs2_commit_trans(osb, handle); @@ -1560,10 +1544,11 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, int ret, written = 0; loff_t end = pos + len; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = NULL; - mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", - (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, - oi->ip_dyn_features); + trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno, + len, (unsigned long long)pos, + oi->ip_dyn_features); /* * Handle inodes which already have inline data 1st. @@ -1592,7 +1577,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, /* * Check whether the write can fit. */ - if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb)) + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + if (mmap_page || + end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) return 0; do_inline_write: @@ -1620,34 +1607,86 @@ out: * write path can treat it as an non-allocating write, which has no * special case code for sparse/nonsparse files. */ -static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, - unsigned len, +static int ocfs2_expand_nonsparse_inode(struct inode *inode, + struct buffer_head *di_bh, + loff_t pos, unsigned len, struct ocfs2_write_ctxt *wc) { int ret; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); loff_t newsize = pos + len; - if (ocfs2_sparse_alloc(osb)) - return 0; + BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); if (newsize <= i_size_read(inode)) return 0; - ret = ocfs2_extend_no_holes(inode, newsize, newsize - len); + ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos); if (ret) mlog_errno(ret); + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + + return ret; +} + +static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, + loff_t pos) +{ + int ret = 0; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + if (pos > i_size_read(inode)) + ret = ocfs2_zero_extend(inode, di_bh, pos); + return ret; } -int ocfs2_write_begin_nolock(struct address_space *mapping, +/* + * Try to flush truncate logs if we can free enough clusters from it. + * As for return value, "< 0" means error, "0" no space and "1" means + * we have freed enough spaces and let the caller try to allocate again. + */ +static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed) +{ + tid_t target; + int ret = 0; + unsigned int truncated_clusters; + + mutex_lock(&osb->osb_tl_inode->i_mutex); + truncated_clusters = osb->truncated_clusters; + mutex_unlock(&osb->osb_tl_inode->i_mutex); + + /* + * Check whether we can succeed in allocating if we free + * the truncate log. + */ + if (truncated_clusters < needed) + goto out; + + ret = ocfs2_flush_truncate_log(osb); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { + jbd2_log_wait_commit(osb->journal->j_journal, target); + ret = 1; + } +out: + return ret; +} + +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { - int ret, credits = OCFS2_INODE_UPDATE_CREDITS; - unsigned int clusters_to_alloc, extents_to_split; + int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; + unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0; struct ocfs2_write_ctxt *wc; struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -1655,7 +1694,10 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; + struct ocfs2_extent_tree et; + int try_free = 1, ret1; +try_again: ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { mlog_errno(ret); @@ -1675,21 +1717,47 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } } - ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, + wc); if (ret) { mlog_errno(ret); goto out; } + ret = ocfs2_check_range_for_refcount(inode, pos, len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } else if (ret == 1) { + clusters_need = wc->w_clen; + ret = ocfs2_refcount_cow(inode, di_bh, + wc->w_cpos, wc->w_clen, UINT_MAX); + if (ret) { + mlog_errno(ret); + goto out; + } + } + ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, &extents_to_split); if (ret) { mlog_errno(ret); goto out; } + clusters_need += clusters_to_alloc; di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + trace_ocfs2_write_begin_nolock( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), + le32_to_cpu(di->i_clusters), + pos, len, flags, mmap_page, + clusters_to_alloc, extents_to_split); + /* * We set w_target_from, w_target_to here so that * ocfs2_write_end() knows which range in the target page to @@ -1702,20 +1770,37 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * ocfs2_lock_allocators(). It greatly over-estimates * the work to be done. */ - ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, - extents_to_split, &data_ac, &meta_ac); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); + ret = ocfs2_lock_allocators(inode, &et, + clusters_to_alloc, extents_to_split, + &data_ac, &meta_ac); if (ret) { mlog_errno(ret); goto out; } - credits = ocfs2_calc_extend_credits(inode->i_sb, di, - clusters_to_alloc); + if (data_ac) + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + + credits = ocfs2_calc_extend_credits(inode->i_sb, + &di->id2.i_list); } - ocfs2_set_target_boundaries(osb, wc, pos, len, - clusters_to_alloc + extents_to_split); + /* + * We have to zero sparse allocated clusters, unwritten extent clusters, + * and non-sparse clusters we just extended. For non-sparse writes, + * we know zeros will only be needed in the first and/or last cluster. + */ + if (clusters_to_alloc || extents_to_split || + (wc->w_clen && (wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero))) + cluster_of_pages = 1; + else + cluster_of_pages = 0; + + ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -1726,15 +1811,21 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, wc->w_handle = handle; + if (clusters_to_alloc) { + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); + if (ret) + goto out_commit; + } /* * We don't want this to fail in ocfs2_write_end(), so do it * here. */ - ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out_quota; } /* @@ -1742,19 +1833,30 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * that we can zero and flush if we error after adding the * extent. */ - ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, - clusters_to_alloc + extents_to_split, - mmap_page); - if (ret) { + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, + cluster_of_pages, mmap_page); + if (ret && ret != -EAGAIN) { mlog_errno(ret); - goto out_commit; + goto out_quota; + } + + /* + * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock + * the target page. In this case, we exit with no error and no target + * page. This will trigger the caller, page_mkwrite(), to re-try + * the operation. + */ + if (ret == -EAGAIN) { + BUG_ON(wc->w_target_page); + ret = 0; + goto out_quota; } ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, len); if (ret) { mlog_errno(ret); - goto out_commit; + goto out_quota; } if (data_ac) @@ -1766,16 +1868,40 @@ success: *pagep = wc->w_target_page; *fsdata = wc; return 0; +out_quota: + if (clusters_to_alloc) + dquot_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); out_commit: ocfs2_commit_trans(osb, handle); out: ocfs2_free_write_ctxt(wc); - if (data_ac) + if (data_ac) { ocfs2_free_alloc_context(data_ac); - if (meta_ac) + data_ac = NULL; + } + if (meta_ac) { ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + + if (ret == -ENOSPC && try_free) { + /* + * Try to free some truncate log so that we can have enough + * clusters to allocate. + */ + try_free = 0; + + ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need); + if (ret1 == 1) + goto try_again; + + if (ret1 < 0) + mlog_errno(ret1); + } + return ret; } @@ -1802,7 +1928,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); @@ -1836,12 +1962,12 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, } } - kaddr = kmap_atomic(wc->w_target_page, KM_USER0); + kaddr = kmap_atomic(wc->w_target_page); memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); - mlog(0, "Data written to inode at offset %llu. " - "id_count = %u, copied = %u, i_dyn_features = 0x%x\n", + trace_ocfs2_write_end_inline( + (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)pos, *copied, le16_to_cpu(di->id2.i_data.id_count), le16_to_cpu(di->i_dyn_features)); @@ -1894,17 +2020,16 @@ int ocfs2_write_end_nolock(struct address_space *mapping, to = PAGE_CACHE_SIZE; } - if (ocfs2_should_order_data(inode)) - walk_page_buffers(wc->w_handle, page_buffers(tmppage), - from, to, NULL, - ocfs2_journal_dirty_data); - - block_commit_write(tmppage, from, to); + if (page_has_buffers(tmppage)) { + if (ocfs2_should_order_data(inode)) + ocfs2_jbd2_file_inode(wc->w_handle, inode); + block_commit_write(tmppage, from, to); + } } out_write_size: pos += copied; - if (pos > inode->i_size) { + if (pos > i_size_read(inode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } @@ -1913,6 +2038,7 @@ out_write_size: inode->i_mtime = inode->i_ctime = CURRENT_TIME; di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); ocfs2_commit_trans(osb, handle); @@ -1940,15 +2066,16 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, } const struct address_space_operations ocfs2_aops = { - .readpage = ocfs2_readpage, - .readpages = ocfs2_readpages, - .writepage = ocfs2_writepage, - .write_begin = ocfs2_write_begin, - .write_end = ocfs2_write_end, - .bmap = ocfs2_bmap, - .sync_page = block_sync_page, - .direct_IO = ocfs2_direct_IO, - .invalidatepage = ocfs2_invalidatepage, - .releasepage = ocfs2_releasepage, - .migratepage = buffer_migrate_page, + .readpage = ocfs2_readpage, + .readpages = ocfs2_readpages, + .writepage = ocfs2_writepage, + .write_begin = ocfs2_write_begin, + .write_end = ocfs2_write_end, + .bmap = ocfs2_bmap, + .direct_IO = ocfs2_direct_IO, + .invalidatepage = block_invalidatepage, + .releasepage = ocfs2_releasepage, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; |
