aboutsummaryrefslogtreecommitdiff
path: root/fs/ocfs2/aops.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/aops.c')
-rw-r--r--fs/ocfs2/aops.c286
1 files changed, 187 insertions, 99 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f1e962cb3b7..4a231a166cf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,7 +29,6 @@
#include <linux/mpage.h>
#include <linux/quotaops.h>
-#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
#include "ocfs2.h"
@@ -45,6 +44,7 @@
#include "super.h"
#include "symlink.h"
#include "refcounttree.h"
+#include "ocfs2_trace.h"
#include "buffer_head_io.h"
@@ -59,8 +59,9 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
void *kaddr;
- mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
- (unsigned long long)iblock, bh_result, create);
+ trace_ocfs2_symlink_get_block(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)iblock, bh_result, create);
BUG_ON(ocfs2_inode_is_fast_symlink(inode));
@@ -79,6 +80,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(fe->i_clusters))) {
+ err = -ENOMEM;
mlog(ML_ERROR, "block offset is outside the allocated size: "
"%llu\n", (unsigned long long)iblock);
goto bail;
@@ -91,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
iblock;
buffer_cache_bh = sb_getblk(osb->sb, blkno);
if (!buffer_cache_bh) {
+ err = -ENOMEM;
mlog(ML_ERROR, "couldn't getblock for symlink!\n");
goto bail;
}
@@ -101,7 +104,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
* copy, the data is still good. */
if (buffer_jbd(buffer_cache_bh)
&& ocfs2_inode_is_new(inode)) {
- kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
+ kaddr = kmap_atomic(bh_result->b_page);
if (!kaddr) {
mlog(ML_ERROR, "couldn't kmap!\n");
goto bail;
@@ -109,7 +112,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
memcpy(kaddr + (bh_result->b_size * iblock),
buffer_cache_bh->b_data,
bh_result->b_size);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
set_buffer_uptodate(bh_result);
}
brelse(buffer_cache_bh);
@@ -123,7 +126,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
bail:
brelse(bh);
- mlog_exit(err);
return err;
}
@@ -136,8 +138,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
u64 p_blkno, count, past_eof;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
- (unsigned long long)iblock, bh_result, create);
+ trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)iblock, bh_result, create);
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
@@ -199,8 +201,9 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
}
past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
- mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
- (unsigned long long)past_eof);
+
+ trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)past_eof);
if (create && (iblock >= past_eof))
set_buffer_new(bh_result);
@@ -208,7 +211,6 @@ bail:
if (err < 0)
err = -EIO;
- mlog_exit(err);
return err;
}
@@ -236,13 +238,13 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
return -EROFS;
}
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (size)
memcpy(kaddr, di->id2.i_data.id_data, size);
/* Clear the remaining part of the page */
memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
SetPageUptodate(page);
@@ -278,7 +280,8 @@ static int ocfs2_readpage(struct file *file, struct page *page)
loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
int ret, unlock = 1;
- mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+ trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
+ (page ? page->index : 0));
ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
if (ret != 0) {
@@ -289,7 +292,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)
}
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+ /*
+ * Unlock the page and cycle ip_alloc_sem so that we don't
+ * busyloop waiting for ip_alloc_sem to unlock
+ */
ret = AOP_TRUNCATED_PAGE;
+ unlock_page(page);
+ unlock = 0;
+ down_read(&oi->ip_alloc_sem);
+ up_read(&oi->ip_alloc_sem);
goto out_inode_unlock;
}
@@ -323,7 +334,6 @@ out_inode_unlock:
out:
if (unlock)
unlock_page(page);
- mlog_exit(ret);
return ret;
}
@@ -396,15 +406,11 @@ out_unlock:
*/
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
{
- int ret;
+ trace_ocfs2_writepage(
+ (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
+ page->index);
- mlog_entry("(0x%p)\n", page);
-
- ret = block_write_full_page(page, ocfs2_get_block, wbc);
-
- mlog_exit(ret);
-
- return ret;
+ return block_write_full_page(page, ocfs2_get_block, wbc);
}
/* Taken from ext3. We don't necessarily need the full blown
@@ -450,7 +456,8 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
int err = 0;
struct inode *inode = mapping->host;
- mlog_entry("(block = %llu)\n", (unsigned long long)block);
+ trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)block);
/* We don't need to lock journal system files, since they aren't
* accessed concurrently from multiple nodes.
@@ -484,8 +491,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
bail:
status = err ? 0 : p_blkno;
- mlog_exit((int)status);
-
return status;
}
@@ -556,66 +561,49 @@ bail:
/*
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
- * particularly interested in the aio/dio case. Like the core uses
- * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
- * truncation on another.
+ * particularly interested in the aio/dio case. We use the rw_lock DLM lock
+ * to protect io on one node from truncation on another.
*/
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
- void *private,
- int ret,
- bool is_async)
+ void *private)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ struct inode *inode = file_inode(iocb->ki_filp);
int level;
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
- ocfs2_iocb_clear_rw_locked(iocb);
+ if (ocfs2_iocb_is_sem_locked(iocb))
+ ocfs2_iocb_clear_sem_locked(iocb);
- level = ocfs2_iocb_rw_locked_level(iocb);
- if (!level)
- up_read(&inode->i_alloc_sem);
- ocfs2_rw_unlock(inode, level);
+ if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+ ocfs2_iocb_clear_unaligned_aio(iocb);
- if (is_async)
- aio_complete(iocb, ret, 0);
-}
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
+ }
-/*
- * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
- * from ext3. PageChecked() bits have been removed as OCFS2 does not
- * do journalled data.
- */
-static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
-{
- journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
+ ocfs2_iocb_clear_rw_locked(iocb);
- jbd2_journal_invalidatepage(journal, page, offset);
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
- journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
if (!page_has_buffers(page))
return 0;
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ return try_to_free_buffers(page);
}
static ssize_t ocfs2_direct_IO(int rw,
struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
- int ret;
-
- mlog_entry_void();
+ struct inode *inode = file_inode(file)->i_mapping->host;
/*
* Fallback to buffered I/O if we see an inode without
@@ -628,13 +616,10 @@ static ssize_t ocfs2_direct_IO(int rw,
if (i_size_read(inode) <= offset)
return 0;
- ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
- iov, offset, nr_segs,
- ocfs2_direct_IO_get_blocks,
- ocfs2_dio_end_io, NULL, 0);
-
- mlog_exit(ret);
- return ret;
+ return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+ iter, offset,
+ ocfs2_direct_IO_get_blocks,
+ ocfs2_dio_end_io, NULL, 0);
}
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
@@ -681,7 +666,7 @@ static void ocfs2_clear_page_regions(struct page *page,
ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
- kaddr = kmap_atomic(page, KM_USER0);
+ kaddr = kmap_atomic(page);
if (from || to) {
if (from > cluster_start)
@@ -692,7 +677,7 @@ static void ocfs2_clear_page_regions(struct page *page,
memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
}
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
}
/*
@@ -873,6 +858,12 @@ struct ocfs2_write_ctxt {
struct page *w_target_page;
/*
+ * w_target_locked is used on the page_mkwrite path to indicate that
+ * w_target_page must not be unlocked in ocfs2_write_end_nolock.
+ */
+ unsigned int w_target_locked:1;
+
+ /*
* ocfs2_write_end() uses this to know what the real range to
* write in the target should be.
*/
@@ -905,6 +896,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
{
+ int i;
+
+ /*
+ * w_target_locked is only set to true in the page_mkwrite() case.
+ * The intent is to allow us to lock the target page from write_begin()
+ * to write_end(). The caller must hold a ref on w_target_page.
+ */
+ if (wc->w_target_locked) {
+ BUG_ON(!wc->w_target_page);
+ for (i = 0; i < wc->w_num_pages; i++) {
+ if (wc->w_target_page == wc->w_pages[i]) {
+ wc->w_pages[i] = NULL;
+ break;
+ }
+ }
+ mark_page_accessed(wc->w_target_page);
+ page_cache_release(wc->w_target_page);
+ }
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
brelse(wc->w_di_bh);
@@ -1023,6 +1032,12 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
&cluster_start, &cluster_end);
+ /* treat the write as new if a hole/lseek spanned across
+ * the page boundary.
+ */
+ new = new | ((i_size_read(inode) <= page_offset(page)) &&
+ (page_offset(page) <= user_pos));
+
if (page == wc->w_target_page) {
map_from = user_pos & (PAGE_CACHE_SIZE - 1);
map_to = map_from + user_len;
@@ -1136,20 +1151,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
*/
lock_page(mmap_page);
+ /* Exit and let the caller retry */
if (mmap_page->mapping != mapping) {
+ WARN_ON(mmap_page->mapping);
unlock_page(mmap_page);
- /*
- * Sanity check - the locking in
- * ocfs2_pagemkwrite() should ensure
- * that this code doesn't trigger.
- */
- ret = -EINVAL;
- mlog_errno(ret);
+ ret = -EAGAIN;
goto out;
}
page_cache_get(mmap_page);
wc->w_pages[i] = mmap_page;
+ wc->w_target_locked = true;
} else {
wc->w_pages[i] = find_or_create_page(mapping, index,
GFP_NOFS);
@@ -1159,11 +1171,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
goto out;
}
}
+ wait_for_stable_page(wc->w_pages[i]);
if (index == target_index)
wc->w_target_page = wc->w_pages[i];
}
out:
+ if (ret)
+ wc->w_target_locked = false;
return ret;
}
@@ -1531,9 +1546,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di = NULL;
- mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
- (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
- oi->ip_dyn_features);
+ trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
+ len, (unsigned long long)pos,
+ oi->ip_dyn_features);
/*
* Handle inodes which already have inline data 1st.
@@ -1627,6 +1642,43 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
return ret;
}
+/*
+ * Try to flush the truncate log if doing so can free enough clusters.
+ *
+ * @osb:    super block info whose truncate log and journal are used
+ * @needed: number of clusters the caller would like to allocate
+ *
+ * osb->truncated_clusters is sampled while holding the truncate log
+ * inode's i_mutex.  If that count is smaller than @needed, nothing is
+ * flushed and 0 is returned.  Otherwise ocfs2_flush_truncate_log() is
+ * called; a non-zero result from it is logged with mlog_errno() and
+ * returned unchanged.  After a successful flush, if
+ * jbd2_journal_start_commit() returns non-zero we wait for that commit
+ * with jbd2_log_wait_commit() and return 1; if it returns zero we
+ * return 0.
+ *
+ * As for the return value, "< 0" means error, "0" means no space was
+ * made available, and "1" means we have freed enough space and the
+ * caller should try to allocate again.
+ */
+static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed)
+{
+ tid_t target;
+ int ret = 0;
+ unsigned int truncated_clusters;
+
+ mutex_lock(&osb->osb_tl_inode->i_mutex);
+ truncated_clusters = osb->truncated_clusters;
+ mutex_unlock(&osb->osb_tl_inode->i_mutex);
+
+ /*
+ * Check whether we can succeed in allocating if we free
+ * the truncate log.
+ */
+ if (truncated_clusters < needed)
+ goto out; /* ret is still 0 here: "no space" */
+
+ ret = ocfs2_flush_truncate_log(osb);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+ jbd2_log_wait_commit(osb->journal->j_journal, target);
+ ret = 1; /* tell the caller to retry the allocation */
+ }
+out:
+ return ret;
+}
+
int ocfs2_write_begin_nolock(struct file *filp,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -1634,7 +1686,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
struct buffer_head *di_bh, struct page *mmap_page)
{
int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
- unsigned int clusters_to_alloc, extents_to_split;
+ unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
struct ocfs2_write_ctxt *wc;
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1643,7 +1695,9 @@ int ocfs2_write_begin_nolock(struct file *filp,
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle;
struct ocfs2_extent_tree et;
+ int try_free = 1, ret1;
+try_again:
ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
if (ret) {
mlog_errno(ret);
@@ -1678,7 +1732,8 @@ int ocfs2_write_begin_nolock(struct file *filp,
mlog_errno(ret);
goto out;
} else if (ret == 1) {
- ret = ocfs2_refcount_cow(inode, filp, di_bh,
+ clusters_need = wc->w_clen;
+ ret = ocfs2_refcount_cow(inode, di_bh,
wc->w_cpos, wc->w_clen, UINT_MAX);
if (ret) {
mlog_errno(ret);
@@ -1692,9 +1747,17 @@ int ocfs2_write_begin_nolock(struct file *filp,
mlog_errno(ret);
goto out;
}
+ clusters_need += clusters_to_alloc;
di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+ trace_ocfs2_write_begin_nolock(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (long long)i_size_read(inode),
+ le32_to_cpu(di->i_clusters),
+ pos, len, flags, mmap_page,
+ clusters_to_alloc, extents_to_split);
+
/*
* We set w_target_from, w_target_to here so that
* ocfs2_write_end() knows which range in the target page to
@@ -1707,12 +1770,6 @@ int ocfs2_write_begin_nolock(struct file *filp,
* ocfs2_lock_allocators(). It greatly over-estimates
* the work to be done.
*/
- mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
- " clusters_to_add = %u, extents_to_split = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
- clusters_to_alloc, extents_to_split);
-
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
wc->w_di_bh);
ret = ocfs2_lock_allocators(inode, &et,
@@ -1727,8 +1784,7 @@ int ocfs2_write_begin_nolock(struct file *filp,
data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
credits = ocfs2_calc_extend_credits(inode->i_sb,
- &di->id2.i_list,
- clusters_to_alloc);
+ &di->id2.i_list);
}
@@ -1779,11 +1835,23 @@ int ocfs2_write_begin_nolock(struct file *filp,
*/
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
cluster_of_pages, mmap_page);
- if (ret) {
+ if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out_quota;
}
+ /*
+ * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+ * the target page. In this case, we exit with no error and no target
+ * page. This will trigger the caller, page_mkwrite(), to re-try
+ * the operation.
+ */
+ if (ret == -EAGAIN) {
+ BUG_ON(wc->w_target_page);
+ ret = 0;
+ goto out_quota;
+ }
+
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
len);
if (ret) {
@@ -1810,10 +1878,30 @@ out_commit:
out:
ocfs2_free_write_ctxt(wc);
- if (data_ac)
+ if (data_ac) {
ocfs2_free_alloc_context(data_ac);
- if (meta_ac)
+ data_ac = NULL;
+ }
+ if (meta_ac) {
ocfs2_free_alloc_context(meta_ac);
+ meta_ac = NULL;
+ }
+
+ if (ret == -ENOSPC && try_free) {
+ /*
+ * Try to free some truncate log so that we can have enough
+ * clusters to allocate.
+ */
+ try_free = 0;
+
+ ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
+ if (ret1 == 1)
+ goto try_again;
+
+ if (ret1 < 0)
+ mlog_errno(ret1);
+ }
+
return ret;
}
@@ -1874,12 +1962,12 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
}
}
- kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
+ kaddr = kmap_atomic(wc->w_target_page);
memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
- kunmap_atomic(kaddr, KM_USER0);
+ kunmap_atomic(kaddr);
- mlog(0, "Data written to inode at offset %llu. "
- "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
+ trace_ocfs2_write_end_inline(
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)pos, *copied,
le16_to_cpu(di->id2.i_data.id_count),
le16_to_cpu(di->i_dyn_features));
@@ -1941,7 +2029,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
out_write_size:
pos += copied;
- if (pos > inode->i_size) {
+ if (pos > i_size_read(inode)) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
@@ -1950,6 +2038,7 @@ out_write_size:
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
ocfs2_commit_trans(osb, handle);
@@ -1983,9 +2072,8 @@ const struct address_space_operations ocfs2_aops = {
.write_begin = ocfs2_write_begin,
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
- .sync_page = block_sync_page,
.direct_IO = ocfs2_direct_IO,
- .invalidatepage = ocfs2_invalidatepage,
+ .invalidatepage = block_invalidatepage,
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,