diff options
Diffstat (limited to 'fs/ocfs2/aops.c')
| -rw-r--r-- | fs/ocfs2/aops.c | 286 | 
1 files changed, 187 insertions, 99 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f1e962cb3b7..4a231a166cf 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -29,7 +29,6 @@  #include <linux/mpage.h>  #include <linux/quotaops.h> -#define MLOG_MASK_PREFIX ML_FILE_IO  #include <cluster/masklog.h>  #include "ocfs2.h" @@ -45,6 +44,7 @@  #include "super.h"  #include "symlink.h"  #include "refcounttree.h" +#include "ocfs2_trace.h"  #include "buffer_head_io.h" @@ -59,8 +59,9 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);  	void *kaddr; -	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, -		   (unsigned long long)iblock, bh_result, create); +	trace_ocfs2_symlink_get_block( +			(unsigned long long)OCFS2_I(inode)->ip_blkno, +			(unsigned long long)iblock, bh_result, create);  	BUG_ON(ocfs2_inode_is_fast_symlink(inode)); @@ -79,6 +80,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,  	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,  						    le32_to_cpu(fe->i_clusters))) { +		err = -ENOMEM;  		mlog(ML_ERROR, "block offset is outside the allocated size: "  		     "%llu\n", (unsigned long long)iblock);  		goto bail; @@ -91,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,  			    iblock;  		buffer_cache_bh = sb_getblk(osb->sb, blkno);  		if (!buffer_cache_bh) { +			err = -ENOMEM;  			mlog(ML_ERROR, "couldn't getblock for symlink!\n");  			goto bail;  		} @@ -101,7 +104,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,  		 * copy, the data is still good. */  		if (buffer_jbd(buffer_cache_bh)  		    && ocfs2_inode_is_new(inode)) { -			kaddr = kmap_atomic(bh_result->b_page, KM_USER0); +			kaddr = kmap_atomic(bh_result->b_page);  			if (!kaddr) {  				mlog(ML_ERROR, "couldn't kmap!\n");  				goto bail; @@ -109,7 +112,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,  			memcpy(kaddr + (bh_result->b_size * iblock),  			       buffer_cache_bh->b_data,  			       bh_result->b_size); -			kunmap_atomic(kaddr, KM_USER0); +			kunmap_atomic(kaddr);  			set_buffer_uptodate(bh_result);  		}  		brelse(buffer_cache_bh); @@ -123,7 +126,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,  bail:  	brelse(bh); -	mlog_exit(err);  	return err;  } @@ -136,8 +138,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,  	u64 p_blkno, count, past_eof;  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, -		   (unsigned long long)iblock, bh_result, create); +	trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno, +			      (unsigned long long)iblock, bh_result, create);  	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)  		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", @@ -199,8 +201,9 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,  	}  	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); -	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, -	     (unsigned long long)past_eof); + +	trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno, +				  (unsigned long long)past_eof);  	if (create && (iblock >= past_eof))  		set_buffer_new(bh_result); @@ -208,7 +211,6 @@ bail:  	if (err < 0)  		err = -EIO; -	mlog_exit(err);  	return err;  } @@ -236,13 +238,13 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,  		return -EROFS;  	} -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	if (size)  		memcpy(kaddr, di->id2.i_data.id_data, size);  	/* Clear the remaining part of the page */  	memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);  	flush_dcache_page(page); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  	SetPageUptodate(page); @@ -278,7 +280,8 @@ static int ocfs2_readpage(struct file *file, struct page *page)  	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;  	int ret, unlock = 1; -	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); +	trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, +			     (page ? page->index : 0));  	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);  	if (ret != 0) { @@ -289,7 +292,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)  	}  	if (down_read_trylock(&oi->ip_alloc_sem) == 0) { +		/* +		 * Unlock the page and cycle ip_alloc_sem so that we don't +		 * busyloop waiting for ip_alloc_sem to unlock +		 */  		ret = AOP_TRUNCATED_PAGE; +		unlock_page(page); +		unlock = 0; +		down_read(&oi->ip_alloc_sem); +		up_read(&oi->ip_alloc_sem);  		goto out_inode_unlock;  	} @@ -323,7 +334,6 @@ out_inode_unlock:  out:  	if (unlock)  		unlock_page(page); -	mlog_exit(ret);  	return ret;  } @@ -396,15 +406,11 @@ out_unlock:   */  static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)  { -	int ret; +	trace_ocfs2_writepage( +		(unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno, +		page->index); -	mlog_entry("(0x%p)\n", page); - -	ret = block_write_full_page(page, ocfs2_get_block, wbc); - -	mlog_exit(ret); - -	return ret; +	return block_write_full_page(page, ocfs2_get_block, wbc);  }  /* Taken from ext3. We don't necessarily need the full blown @@ -450,7 +456,8 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)  	int err = 0;  	struct inode *inode = mapping->host; -	mlog_entry("(block = %llu)\n", (unsigned long long)block); +	trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, +			 (unsigned long long)block);  	/* We don't need to lock journal system files, since they aren't  	 * accessed concurrently from multiple nodes. @@ -484,8 +491,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)  bail:  	status = err ? 0 : p_blkno; -	mlog_exit((int)status); -  	return status;  } @@ -556,66 +561,49 @@ bail:  /*   * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're - * particularly interested in the aio/dio case.  Like the core uses - * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from - * truncation on another. + * particularly interested in the aio/dio case.  We use the rw_lock DLM lock + * to protect io on one node from truncation on another.   */  static void ocfs2_dio_end_io(struct kiocb *iocb,  			     loff_t offset,  			     ssize_t bytes, -			     void *private, -			     int ret, -			     bool is_async) +			     void *private)  { -	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; +	struct inode *inode = file_inode(iocb->ki_filp);  	int level;  	/* this io's submitter should not have unlocked this before we could */  	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); -	ocfs2_iocb_clear_rw_locked(iocb); +	if (ocfs2_iocb_is_sem_locked(iocb)) +		ocfs2_iocb_clear_sem_locked(iocb); -	level = ocfs2_iocb_rw_locked_level(iocb); -	if (!level) -		up_read(&inode->i_alloc_sem); -	ocfs2_rw_unlock(inode, level); +	if (ocfs2_iocb_is_unaligned_aio(iocb)) { +		ocfs2_iocb_clear_unaligned_aio(iocb); -	if (is_async) -		aio_complete(iocb, ret, 0); -} +		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); +	} -/* - * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen - * from ext3.  PageChecked() bits have been removed as OCFS2 does not - * do journalled data. - */ -static void ocfs2_invalidatepage(struct page *page, unsigned long offset) -{ -	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; +	ocfs2_iocb_clear_rw_locked(iocb); -	jbd2_journal_invalidatepage(journal, page, offset); +	level = ocfs2_iocb_rw_locked_level(iocb); +	ocfs2_rw_unlock(inode, level);  }  static int ocfs2_releasepage(struct page *page, gfp_t wait)  { -	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; -  	if (!page_has_buffers(page))  		return 0; -	return jbd2_journal_try_to_free_buffers(journal, page, wait); +	return try_to_free_buffers(page);  }  static ssize_t ocfs2_direct_IO(int rw,  			       struct kiocb *iocb, -			       const struct iovec *iov, -			       loff_t offset, -			       unsigned long nr_segs) +			       struct iov_iter *iter, +			       loff_t offset)  {  	struct file *file = iocb->ki_filp; -	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; -	int ret; - -	mlog_entry_void(); +	struct inode *inode = file_inode(file)->i_mapping->host;  	/*  	 * Fallback to buffered I/O if we see an inode without @@ -628,13 +616,10 @@ static ssize_t ocfs2_direct_IO(int rw,  	if (i_size_read(inode) <= offset)  		return 0; -	ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, -				   iov, offset, nr_segs, -				   ocfs2_direct_IO_get_blocks, -				   ocfs2_dio_end_io, NULL, 0); - -	mlog_exit(ret); -	return ret; +	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, +				    iter, offset, +				    ocfs2_direct_IO_get_blocks, +				    ocfs2_dio_end_io, NULL, 0);  }  static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, @@ -681,7 +666,7 @@ static void ocfs2_clear_page_regions(struct page *page,  	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); -	kaddr = kmap_atomic(page, KM_USER0); +	kaddr = kmap_atomic(page);  	if (from || to) {  		if (from > cluster_start) @@ -692,7 +677,7 @@ static void ocfs2_clear_page_regions(struct page *page,  		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);  	} -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr);  }  /* @@ -873,6 +858,12 @@ struct ocfs2_write_ctxt {  	struct page			*w_target_page;  	/* +	 * w_target_locked is used for page_mkwrite path indicating no unlocking +	 * against w_target_page in ocfs2_write_end_nolock. +	 */ +	unsigned int			w_target_locked:1; + +	/*  	 * ocfs2_write_end() uses this to know what the real range to  	 * write in the target should be.  	 */ @@ -905,6 +896,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)  static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)  { +	int i; + +	/* +	 * w_target_locked is only set to true in the page_mkwrite() case. +	 * The intent is to allow us to lock the target page from write_begin() +	 * to write_end(). The caller must hold a ref on w_target_page. +	 */ +	if (wc->w_target_locked) { +		BUG_ON(!wc->w_target_page); +		for (i = 0; i < wc->w_num_pages; i++) { +			if (wc->w_target_page == wc->w_pages[i]) { +				wc->w_pages[i] = NULL; +				break; +			} +		} +		mark_page_accessed(wc->w_target_page); +		page_cache_release(wc->w_target_page); +	}  	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);  	brelse(wc->w_di_bh); @@ -1023,6 +1032,12 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,  	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,  					&cluster_start, &cluster_end); +	/* treat the write as new if the a hole/lseek spanned across +	 * the page boundary. +	 */ +	new = new | ((i_size_read(inode) <= page_offset(page)) && +			(page_offset(page) <= user_pos)); +  	if (page == wc->w_target_page) {  		map_from = user_pos & (PAGE_CACHE_SIZE - 1);  		map_to = map_from + user_len; @@ -1136,20 +1151,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,  			 */  			lock_page(mmap_page); +			/* Exit and let the caller retry */  			if (mmap_page->mapping != mapping) { +				WARN_ON(mmap_page->mapping);  				unlock_page(mmap_page); -				/* -				 * Sanity check - the locking in -				 * ocfs2_pagemkwrite() should ensure -				 * that this code doesn't trigger. -				 */ -				ret = -EINVAL; -				mlog_errno(ret); +				ret = -EAGAIN;  				goto out;  			}  			page_cache_get(mmap_page);  			wc->w_pages[i] = mmap_page; +			wc->w_target_locked = true;  		} else {  			wc->w_pages[i] = find_or_create_page(mapping, index,  							     GFP_NOFS); @@ -1159,11 +1171,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,  				goto out;  			}  		} +		wait_for_stable_page(wc->w_pages[i]);  		if (index == target_index)  			wc->w_target_page = wc->w_pages[i];  	}  out: +	if (ret) +		wc->w_target_locked = false;  	return ret;  } @@ -1531,9 +1546,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,  	struct ocfs2_inode_info *oi = OCFS2_I(inode);  	struct ocfs2_dinode *di = NULL; -	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", -	     (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, -	     oi->ip_dyn_features); +	trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno, +					     len, (unsigned long long)pos, +					     oi->ip_dyn_features);  	/*  	 * Handle inodes which already have inline data 1st. @@ -1627,6 +1642,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,  	return ret;  } +/* + * Try to flush truncate logs if we can free enough clusters from it. + * As for return value, "< 0" means error, "0" no space and "1" means + * we have freed enough spaces and let the caller try to allocate again. + */ +static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, +					  unsigned int needed) +{ +	tid_t target; +	int ret = 0; +	unsigned int truncated_clusters; + +	mutex_lock(&osb->osb_tl_inode->i_mutex); +	truncated_clusters = osb->truncated_clusters; +	mutex_unlock(&osb->osb_tl_inode->i_mutex); + +	/* +	 * Check whether we can succeed in allocating if we free +	 * the truncate log. +	 */ +	if (truncated_clusters < needed) +		goto out; + +	ret = ocfs2_flush_truncate_log(osb); +	if (ret) { +		mlog_errno(ret); +		goto out; +	} + +	if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { +		jbd2_log_wait_commit(osb->journal->j_journal, target); +		ret = 1; +	} +out: +	return ret; +} +  int ocfs2_write_begin_nolock(struct file *filp,  			     struct address_space *mapping,  			     loff_t pos, unsigned len, unsigned flags, @@ -1634,7 +1686,7 @@ int ocfs2_write_begin_nolock(struct file *filp,  			     struct buffer_head *di_bh, struct page *mmap_page)  {  	int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; -	unsigned int clusters_to_alloc, extents_to_split; +	unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;  	struct ocfs2_write_ctxt *wc;  	struct inode *inode = mapping->host;  	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -1643,7 +1695,9 @@ int ocfs2_write_begin_nolock(struct file *filp,  	struct ocfs2_alloc_context *meta_ac = NULL;  	handle_t *handle;  	struct ocfs2_extent_tree et; +	int try_free = 1, ret1; +try_again:  	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);  	if (ret) {  		mlog_errno(ret); @@ -1678,7 +1732,8 @@ int ocfs2_write_begin_nolock(struct file *filp,  		mlog_errno(ret);  		goto out;  	} else if (ret == 1) { -		ret = ocfs2_refcount_cow(inode, filp, di_bh, +		clusters_need = wc->w_clen; +		ret = ocfs2_refcount_cow(inode, di_bh,  					 wc->w_cpos, wc->w_clen, UINT_MAX);  		if (ret) {  			mlog_errno(ret); @@ -1692,9 +1747,17 @@ int ocfs2_write_begin_nolock(struct file *filp,  		mlog_errno(ret);  		goto out;  	} +	clusters_need += clusters_to_alloc;  	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; +	trace_ocfs2_write_begin_nolock( +			(unsigned long long)OCFS2_I(inode)->ip_blkno, +			(long long)i_size_read(inode), +			le32_to_cpu(di->i_clusters), +			pos, len, flags, mmap_page, +			clusters_to_alloc, extents_to_split); +  	/*  	 * We set w_target_from, w_target_to here so that  	 * ocfs2_write_end() knows which range in the target page to @@ -1707,12 +1770,6 @@ int ocfs2_write_begin_nolock(struct file *filp,  		 * ocfs2_lock_allocators(). It greatly over-estimates  		 * the work to be done.  		 */ -		mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u," -		     " clusters_to_add = %u, extents_to_split = %u\n", -		     (unsigned long long)OCFS2_I(inode)->ip_blkno, -		     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), -		     clusters_to_alloc, extents_to_split); -  		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),  					      wc->w_di_bh);  		ret = ocfs2_lock_allocators(inode, &et, @@ -1727,8 +1784,7 @@ int ocfs2_write_begin_nolock(struct file *filp,  			data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;  		credits = ocfs2_calc_extend_credits(inode->i_sb, -						    &di->id2.i_list, -						    clusters_to_alloc); +						    &di->id2.i_list);  	} @@ -1779,11 +1835,23 @@ int ocfs2_write_begin_nolock(struct file *filp,  	 */  	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,  					 cluster_of_pages, mmap_page); -	if (ret) { +	if (ret && ret != -EAGAIN) {  		mlog_errno(ret);  		goto out_quota;  	} +	/* +	 * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock +	 * the target page. In this case, we exit with no error and no target +	 * page. This will trigger the caller, page_mkwrite(), to re-try +	 * the operation. +	 */ +	if (ret == -EAGAIN) { +		BUG_ON(wc->w_target_page); +		ret = 0; +		goto out_quota; +	} +  	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,  					  len);  	if (ret) { @@ -1810,10 +1878,30 @@ out_commit:  out:  	ocfs2_free_write_ctxt(wc); -	if (data_ac) +	if (data_ac) {  		ocfs2_free_alloc_context(data_ac); -	if (meta_ac) +		data_ac = NULL; +	} +	if (meta_ac) {  		ocfs2_free_alloc_context(meta_ac); +		meta_ac = NULL; +	} + +	if (ret == -ENOSPC && try_free) { +		/* +		 * Try to free some truncate log so that we can have enough +		 * clusters to allocate. +		 */ +		try_free = 0; + +		ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need); +		if (ret1 == 1) +			goto try_again; + +		if (ret1 < 0) +			mlog_errno(ret1); +	} +  	return ret;  } @@ -1874,12 +1962,12 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,  		}  	} -	kaddr = kmap_atomic(wc->w_target_page, KM_USER0); +	kaddr = kmap_atomic(wc->w_target_page);  	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); -	kunmap_atomic(kaddr, KM_USER0); +	kunmap_atomic(kaddr); -	mlog(0, "Data written to inode at offset %llu. " -	     "id_count = %u, copied = %u, i_dyn_features = 0x%x\n", +	trace_ocfs2_write_end_inline( +	     (unsigned long long)OCFS2_I(inode)->ip_blkno,  	     (unsigned long long)pos, *copied,  	     le16_to_cpu(di->id2.i_data.id_count),  	     le16_to_cpu(di->i_dyn_features)); @@ -1941,7 +2029,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,  out_write_size:  	pos += copied; -	if (pos > inode->i_size) { +	if (pos > i_size_read(inode)) {  		i_size_write(inode, pos);  		mark_inode_dirty(inode);  	} @@ -1950,6 +2038,7 @@ out_write_size:  	inode->i_mtime = inode->i_ctime = CURRENT_TIME;  	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);  	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); +	ocfs2_update_inode_fsync_trans(handle, inode, 1);  	ocfs2_journal_dirty(handle, wc->w_di_bh);  	ocfs2_commit_trans(osb, handle); @@ -1983,9 +2072,8 @@ const struct address_space_operations ocfs2_aops = {  	.write_begin		= ocfs2_write_begin,  	.write_end		= ocfs2_write_end,  	.bmap			= ocfs2_bmap, -	.sync_page		= block_sync_page,  	.direct_IO		= ocfs2_direct_IO, -	.invalidatepage		= ocfs2_invalidatepage, +	.invalidatepage		= block_invalidatepage,  	.releasepage		= ocfs2_releasepage,  	.migratepage		= buffer_migrate_page,  	.is_partially_uptodate	= block_is_partially_uptodate,  | 
