diff options
Diffstat (limited to 'fs/ocfs2/alloc.c')
| -rw-r--r-- | fs/ocfs2/alloc.c | 4003 |
1 files changed, 2525 insertions, 1478 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 447206eb5c2..9d8fcf2f3b9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -28,14 +28,16 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/swap.h> +#include <linux/quotaops.h> +#include <linux/blkdev.h> -#define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" #include "aops.h" +#include "blockcheck.h" #include "dlmglue.h" #include "extent_map.h" #include "inode.h" @@ -46,44 +48,536 @@ #include "file.h" #include "super.h" #include "uptodate.h" +#include "xattr.h" +#include "refcounttree.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); -static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, - struct ocfs2_extent_block *eb); +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT, + CONTIG_LEFTRIGHT, +}; +static enum ocfs2_contig_type + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); /* - * Structures which describe a path through a btree, and functions to - * manipulate them. + * Operations for a specific extent tree type. * - * The idea here is to be as generic as possible with the tree - * manipulation code. + * To implement an on-disk btree (extent tree) type in ocfs2, add + * an ocfs2_extent_tree_operations structure and the matching + * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it + * for the allocation portion of the extent tree. + */ +struct ocfs2_extent_tree_operations { + /* + * last_eb_blk is the block number of the right most leaf extent + * block. Most on-disk structures containing an extent tree store + * this value for fast access. The ->eo_set_last_eb_blk() and + * ->eo_get_last_eb_blk() operations access this value. They are + * both required. + */ + void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, + u64 blkno); + u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); + + /* + * The on-disk structure usually keeps track of how many total + * clusters are stored in this extent tree. This function updates + * that value. new_clusters is the delta, and must be + * added to the total. Required. + */ + void (*eo_update_clusters)(struct ocfs2_extent_tree *et, + u32 new_clusters); + + /* + * If this extent tree is supported by an extent map, insert + * a record into the map. + */ + void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + + /* + * If this extent tree is supported by an extent map, truncate the + * map to clusters, + */ + void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et, + u32 clusters); + + /* + * If ->eo_insert_check() exists, it is called before rec is + * inserted into the extent tree. It is optional. + */ + int (*eo_insert_check)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + int (*eo_sanity_check)(struct ocfs2_extent_tree *et); + + /* + * -------------------------------------------------------------- + * The remaining are internal to ocfs2_extent_tree and don't have + * accessor functions + */ + + /* + * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. + * It is required. + */ + void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); + + /* + * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if + * it exists. If it does not, et->et_max_leaf_clusters is set + * to 0 (unlimited). Optional. + */ + void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et); + + /* + * ->eo_extent_contig test whether the 2 ocfs2_extent_rec + * are contiguous or not. Optional. Don't need to set it if use + * ocfs2_extent_rec as the tree leaf. + */ + enum ocfs2_contig_type + (*eo_extent_contig)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); +}; + + +/* + * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check + * in the methods. */ -struct ocfs2_path_item { - struct buffer_head *bh; - struct ocfs2_extent_list *el; +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno); +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters); +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters); +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); +static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { + .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, + .eo_update_clusters = ocfs2_dinode_update_clusters, + .eo_extent_map_insert = ocfs2_dinode_extent_map_insert, + .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate, + .eo_insert_check = ocfs2_dinode_insert_check, + .eo_sanity_check = ocfs2_dinode_sanity_check, + .eo_fill_root_el = ocfs2_dinode_fill_root_el, +}; + +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + di->i_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + return le64_to_cpu(di->i_last_eb_blk); +} + +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); + struct ocfs2_dinode *di = et->et_object; + + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&oi->ip_lock); + oi->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&oi->ip_lock); +} + +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_insert_rec(inode, rec); +} + +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_trunc(inode, clusters); +} + +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); + struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb); + + BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL); + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (oi->ip_clusters != le32_to_cpu(rec->e_cpos)), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)oi->ip_blkno, + rec->e_cpos, oi->ip_clusters); + + return 0; +} + +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + return 0; +} + +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + et->et_root_el = &di->id2.i_list; +} + + +static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + et->et_root_el = &vb->vb_xv->xr_list; +} + +static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); +} + +static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_value_buf *vb = et->et_object; + + le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_value_update_clusters, + .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, }; -#define OCFS2_MAX_PATH_DEPTH 5 +static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + et->et_root_el = &xb->xb_attrs.xb_root.xt_list; +} + +static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et) +{ + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + et->et_max_leaf_clusters = + ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE); +} + +static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + xt->xt_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + return le64_to_cpu(xt->xt_last_eb_blk); +} + +static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); +} -struct ocfs2_path { - int p_tree_depth; - struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_tree_update_clusters, + .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, + .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, }; -#define path_root_bh(_path) ((_path)->p_node[0].bh) -#define path_root_el(_path) ((_path)->p_node[0].el) -#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) -#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) -#define path_num_items(_path) ((_path)->p_tree_depth + 1) +static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + dx_root->dr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + return le64_to_cpu(dx_root->dr_last_eb_blk); +} + +static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + le32_add_cpu(&dx_root->dr_clusters, clusters); +} + +static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root)); + + return 0; +} +static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dx_root_block *dx_root = et->et_object; + + et->et_root_el = &dx_root->dr_list; +} + +static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { + .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, + .eo_update_clusters = ocfs2_dx_root_update_clusters, + .eo_sanity_check = ocfs2_dx_root_sanity_check, + .eo_fill_root_el = ocfs2_dx_root_fill_root_el, +}; + +static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + et->et_root_el = &rb->rf_list; +} + +static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + rb->rf_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + return le64_to_cpu(rb->rf_last_eb_blk); +} + +static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + le32_add_cpu(&rb->rf_clusters, clusters); +} + +static enum ocfs2_contig_type +ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) +{ + return CONTIG_NONE; +} + +static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_refcount_tree_update_clusters, + .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el, + .eo_extent_contig = ocfs2_refcount_tree_extent_contig, +}; + +static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh, + ocfs2_journal_access_func access, + void *obj, + struct ocfs2_extent_tree_operations *ops) +{ + et->et_ops = ops; + et->et_root_bh = bh; + et->et_ci = ci; + et->et_root_journal_access = access; + if (!obj) + obj = (void *)bh->b_data; + et->et_object = obj; + + et->et_ops->eo_fill_root_el(et); + if (!et->et_ops->eo_fill_max_leaf_clusters) + et->et_max_leaf_clusters = 0; + else + et->et_ops->eo_fill_max_leaf_clusters(et); +} + +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di, + NULL, &ocfs2_dinode_et_ops); +} + +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb, + NULL, &ocfs2_xattr_tree_et_ops); +} + +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct ocfs2_xattr_value_buf *vb) +{ + __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb, + &ocfs2_xattr_value_et_ops); +} + +void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr, + NULL, &ocfs2_dx_root_et_ops); +} + +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb, + NULL, &ocfs2_refcount_tree_et_ops); +} + +static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 new_last_eb_blk) +{ + et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); +} + +static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + return et->et_ops->eo_get_last_eb_blk(et); +} + +static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + et->et_ops->eo_update_clusters(et, clusters); +} + +static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + if (et->et_ops->eo_extent_map_insert) + et->et_ops->eo_extent_map_insert(et, rec); +} + +static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + if (et->et_ops->eo_extent_map_truncate) + et->et_ops->eo_extent_map_truncate(et, clusters); +} + +static inline int ocfs2_et_root_journal_access(handle_t *handle, + struct ocfs2_extent_tree *et, + int type) +{ + return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh, + type); +} + +static inline enum ocfs2_contig_type + ocfs2_et_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec, + struct ocfs2_extent_rec *insert_rec) +{ + if (et->et_ops->eo_extent_contig) + return et->et_ops->eo_extent_contig(et, rec, insert_rec); + + return ocfs2_extent_rec_contig( + ocfs2_metadata_cache_get_super(et->et_ci), + rec, insert_rec); +} + +static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + int ret = 0; + + if (et->et_ops->eo_insert_check) + ret = et->et_ops->eo_insert_check(et, rec); + return ret; +} + +static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) +{ + int ret = 0; + + if (et->et_ops->eo_sanity_check) + ret = et->et_ops->eo_sanity_check(et); + return ret; +} + +static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, + struct ocfs2_extent_block *eb); +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_rec *insert_rec); /* * Reset the actual path elements so that we can re-use the structure * to build another path. Generally, this involves freeing the buffer * heads. */ -static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) { int i, start = 0, depth = 0; struct ocfs2_path_item *node; @@ -106,11 +600,13 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) */ if (keep_root) depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + else + path_root_access(path) = NULL; path->p_tree_depth = depth; } -static void ocfs2_free_path(struct ocfs2_path *path) +void ocfs2_free_path(struct ocfs2_path *path) { if (path) { ocfs2_reinit_path(path, 0); @@ -131,6 +627,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) BUG_ON(path_root_bh(dest) != path_root_bh(src)); BUG_ON(path_root_el(dest) != path_root_el(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); ocfs2_reinit_path(dest, 1); @@ -152,6 +649,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) int i; BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { brelse(dest->p_node[i].bh); @@ -187,7 +685,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, } static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, - struct ocfs2_extent_list *root_el) + struct ocfs2_extent_list *root_el, + ocfs2_journal_access_func access) { struct ocfs2_path *path; @@ -199,27 +698,54 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, get_bh(root_bh); path_root_bh(path) = root_bh; path_root_el(path) = root_el; + path_root_access(path) = access; } return path; } +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) +{ + return ocfs2_new_path(path_root_bh(path), path_root_el(path), + path_root_access(path)); +} + +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) +{ + return ocfs2_new_path(et->et_root_bh, et->et_root_el, + et->et_root_journal_access); +} + /* - * Allocate and initialize a new path based on a disk inode tree. + * Journal the buffer at depth idx. All idx>0 are extent_blocks, + * otherwise it's the root_access function. + * + * I don't like the way this function's name looks next to + * ocfs2_journal_access_path(), but I don't have a better one. */ -static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx) { - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - struct ocfs2_extent_list *el = &di->id2.i_list; + ocfs2_journal_access_func access = path_root_access(path); + + if (!access) + access = ocfs2_journal_access; - return ocfs2_new_path(di_bh, el); + if (idx) + access = ocfs2_journal_access_eb; + + return access(handle, ci, path->p_node[idx].bh, + OCFS2_JOURNAL_ACCESS_WRITE); } /* * Convenience function to journal all components in a path. */ -static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, - struct ocfs2_path *path) +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path) { int i, ret = 0; @@ -227,8 +753,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, goto out; for(i = 0; i < path_num_items(path); i++) { - ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, ci, path, i); if (ret < 0) { mlog_errno(ret); goto out; @@ -269,17 +794,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) return ret; } -enum ocfs2_contig_type { - CONTIG_NONE = 0, - CONTIG_LEFT, - CONTIG_RIGHT, - CONTIG_LEFTRIGHT, -}; - - /* * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and - * ocfs2_extent_contig only work properly against leaf nodes! + * ocfs2_extent_rec_contig only work properly against leaf nodes! */ static int ocfs2_block_extent_contig(struct super_block *sb, struct ocfs2_extent_rec *ext, @@ -305,9 +822,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, } static enum ocfs2_contig_type - ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - struct ocfs2_extent_rec *insert_rec) + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) { u64 blkno = le64_to_cpu(insert_rec->e_blkno); @@ -320,12 +837,12 @@ static enum ocfs2_contig_type return CONTIG_NONE; if (ocfs2_extents_adjacent(ext, insert_rec) && - ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) + ocfs2_block_extent_contig(sb, ext, blkno)) return CONTIG_RIGHT; blkno = le64_to_cpu(ext->e_blkno); if (ocfs2_extents_adjacent(insert_rec, ext) && - ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) + ocfs2_block_extent_contig(sb, insert_rec, blkno)) return CONTIG_LEFT; return CONTIG_NONE; @@ -363,46 +880,112 @@ struct ocfs2_merge_ctxt { int c_split_covers_rec; }; +static int ocfs2_validate_extent_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_extent_block *eb = + (struct ocfs2_extent_block *)bh->b_data; + + trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for extent block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + /* + * Errors after here are fatal. + */ + + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + return -EINVAL; + } + + if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extent block #%llu has an invalid " + "h_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + return -EINVAL; + } + + return 0; +} + +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, eb_blkno, &tmp, + ocfs2_validate_extent_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + + /* * How many free extents have we got before we need more meta data? */ int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_dinode *fe) + struct ocfs2_extent_tree *et) { int retval; - struct ocfs2_extent_list *el; + struct ocfs2_extent_list *el = NULL; struct ocfs2_extent_block *eb; struct buffer_head *eb_bh = NULL; + u64 last_eb_blk = 0; - mlog_entry_void(); - - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - retval = -EIO; - goto bail; - } + el = et->et_root_el; + last_eb_blk = ocfs2_et_get_last_eb_blk(et); - if (fe->i_last_eb_blk) { - retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &eb_bh, OCFS2_BH_CACHED, inode); + if (last_eb_blk) { + retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk, + &eb_bh); if (retval < 0) { mlog_errno(retval); goto bail; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; - } else - el = &fe->id2.i_list; + } BUG_ON(el->l_tree_depth != 0); retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); bail: - if (eb_bh) - brelse(eb_bh); + brelse(eb_bh); - mlog_exit(retval); + trace_ocfs2_num_free_extents(retval); return retval; } @@ -411,9 +994,8 @@ bail: * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and * l_count for you */ -static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +static int ocfs2_create_new_meta_bhs(handle_t *handle, + struct ocfs2_extent_tree *et, int wanted, struct ocfs2_alloc_context *meta_ac, struct buffer_head *bhs[]) @@ -421,17 +1003,17 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, int count, status, i; u16 suballoc_bit_start; u32 num_got; - u64 first_blkno; + u64 suballoc_loc, first_blkno; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); struct ocfs2_extent_block *eb; - mlog_entry_void(); - count = 0; while (count < wanted) { - status = ocfs2_claim_metadata(osb, - handle, + status = ocfs2_claim_metadata(handle, meta_ac, wanted - count, + &suballoc_loc, &suballoc_bit_start, &num_got, &first_blkno); @@ -443,14 +1025,15 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, for(i = count; i < (num_got + count); i++) { bhs[i] = sb_getblk(osb->sb, first_blkno); if (bhs[i] == NULL) { - status = -EIO; + status = -ENOMEM; mlog_errno(status); goto bail; } - ocfs2_set_new_buffer_uptodate(inode, bhs[i]); + ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]); - status = ocfs2_journal_access(handle, inode, bhs[i], - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_eb(handle, et->et_ci, + bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -462,7 +1045,9 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); eb->h_blkno = cpu_to_le64(first_blkno); eb->h_fs_generation = cpu_to_le32(osb->fs_generation); - eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); + eb->h_suballoc_slot = + cpu_to_le16(meta_ac->ac_alloc_slot); + eb->h_suballoc_loc = cpu_to_le64(suballoc_loc); eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); eb->h_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); @@ -472,11 +1057,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, /* We'll also be dirtied by the caller, so * this isn't absolutely necessary. */ - status = ocfs2_journal_dirty(handle, bhs[i]); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, bhs[i]); } count += num_got; @@ -486,12 +1067,11 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, bail: if (status < 0) { for(i = 0; i < wanted; i++) { - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); bhs[i] = NULL; } + mlog_errno(status); } - mlog_exit(status); return status; } @@ -518,8 +1098,54 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) } /* + * Change range of the branches in the right most path according to the leaf + * extent block's rightmost record. + */ +static int ocfs2_adjust_rightmost_branch(handle_t *handle, + struct ocfs2_extent_tree *et) +{ + int status; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + path = ocfs2_new_path_from_et(et); + if (!path) { + status = -ENOMEM; + return status; + } + + status = ocfs2_find_path(et->et_ci, path, UINT_MAX); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_extend_trans(handle, path_num_items(path)); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_path(et->et_ci, handle, path); + if (status < 0) { + mlog_errno(status); + goto out; + } + + el = path_leaf_el(path); + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1]; + + ocfs2_adjust_rightmost_records(handle, et, path, rec); + +out: + ocfs2_free_path(path); + return status; +} + +/* * Add an entire tree branch to our inode. eb_bh is the extent block - * to start at, if we don't want to start the branch at the dinode + * to start at, if we don't want to start the branch at the root * structure. * * last_eb_bh is required as we have to update it's next_leaf pointer @@ -528,10 +1154,8 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) * the new branch will be 'empty' in the sense that every block will * contain a single record with cluster count == 0. */ -static int ocfs2_add_branch(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, +static int ocfs2_add_branch(handle_t *handle, + struct ocfs2_extent_tree *et, struct buffer_head *eb_bh, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) @@ -540,29 +1164,48 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, u64 next_blkno, new_last_eb_blk; struct buffer_head *bh; struct buffer_head **new_eb_bhs = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; - u32 new_cpos; - - mlog_entry_void(); + u32 new_cpos, root_end; BUG_ON(!last_eb_bh || !*last_eb_bh); - fe = (struct ocfs2_dinode *) fe_bh->b_data; - if (eb_bh) { eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; } else - el = &fe->id2.i_list; + el = et->et_root_el; /* we never add a branch to a leaf. */ BUG_ON(!el->l_tree_depth); new_blocks = le16_to_cpu(el->l_tree_depth); + eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + root_end = ocfs2_sum_rightmost_rec(et->et_root_el); + + /* + * If there is a gap before the root end and the real end + * of the righmost leaf block, we need to remove the gap + * between new_cpos and root_end first so that the tree + * is consistent after we add a new branch(it will start + * from new_cpos). + */ + if (root_end > new_cpos) { + trace_ocfs2_adjust_rightmost_branch( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + root_end, new_cpos); + + status = ocfs2_adjust_rightmost_branch(handle, et); + if (status) { + mlog_errno(status); + goto bail; + } + } + /* allocate the number of new eb blocks we need */ new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), GFP_KERNEL); @@ -572,16 +1215,13 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } - status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, + status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, meta_ac, new_eb_bhs); if (status < 0) { mlog_errno(status); goto bail; } - eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; - new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); - /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. * conversly, new_eb_bhs[0] is the new bottommost leaf. @@ -593,15 +1233,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, for(i = 0; i < new_blocks; i++) { bh = new_eb_bhs[i]; eb = (struct ocfs2_extent_block *) bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); eb_el = &eb->h_list; - status = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_eb(handle, et->et_ci, bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -625,12 +1262,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, if (!eb_el->l_tree_depth) new_last_eb_blk = le64_to_cpu(eb->h_blkno); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - + ocfs2_journal_dirty(handle, bh); next_blkno = le64_to_cpu(eb->h_blkno); } @@ -640,21 +1272,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * journal_dirty erroring as it won't unless we've aborted the * handle (in which case we would never be here) so reserving * the write with journal_access is all we need to do. */ - status = ocfs2_journal_access(handle, inode, *last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } if (eb_bh) { - status = ocfs2_journal_access(handle, inode, eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -662,7 +1294,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, } /* Link the new branch into the rest of the tree (el will - * either be on the fe, or the extent block passed in. */ + * either be on the root_bh, or the extent block passed in. */ i = le16_to_cpu(el->l_next_free_rec); el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); @@ -671,22 +1303,15 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, /* fe needs a new last extent block pointer, as does the * next_leaf on the previously last-extent-block. */ - fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); + ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); - status = ocfs2_journal_dirty(handle, *last_eb_bh); - if (status < 0) - mlog_errno(status); - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) - mlog_errno(status); - if (eb_bh) { - status = ocfs2_journal_dirty(handle, eb_bh); - if (status < 0) - mlog_errno(status); - } + ocfs2_journal_dirty(handle, *last_eb_bh); + ocfs2_journal_dirty(handle, et->et_root_bh); + if (eb_bh) + ocfs2_journal_dirty(handle, eb_bh); /* * Some callers want to track the rightmost leaf so pass it @@ -700,12 +1325,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, bail: if (new_eb_bhs) { for (i = 0; i < new_blocks; i++) - if (new_eb_bhs[i]) - brelse(new_eb_bhs[i]); + brelse(new_eb_bhs[i]); kfree(new_eb_bhs); } - mlog_exit(status); return status; } @@ -714,24 +1337,19 @@ bail: * returns back the new extent block so you can add a branch to it * after this call. */ -static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, +static int ocfs2_shift_tree_depth(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) { int status, i; u32 new_clusters; struct buffer_head *new_eb_bh = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *fe_el; + struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; - mlog_entry_void(); - - status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, &new_eb_bh); if (status < 0) { mlog_errno(status); @@ -739,37 +1357,29 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, } eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); eb_el = &eb->h_list; - fe = (struct ocfs2_dinode *) fe_bh->b_data; - fe_el = &fe->id2.i_list; + root_el = et->et_root_el; - status = ocfs2_journal_access(handle, inode, new_eb_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; } - /* copy the fe data into the new extent block */ - eb_el->l_tree_depth = fe_el->l_tree_depth; - eb_el->l_next_free_rec = fe_el->l_next_free_rec; - for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) - eb_el->l_recs[i] = fe_el->l_recs[i]; + /* copy the root extent list data into the new extent block */ + eb_el->l_tree_depth = root_el->l_tree_depth; + eb_el->l_next_free_rec = root_el->l_next_free_rec; + for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) + eb_el->l_recs[i] = root_el->l_recs[i]; - status = ocfs2_journal_dirty(handle, new_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, new_eb_bh); - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -777,34 +1387,28 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, new_clusters = ocfs2_sum_rightmost_rec(eb_el); - /* update fe now */ - le16_add_cpu(&fe_el->l_tree_depth, 1); - fe_el->l_recs[0].e_cpos = 0; - fe_el->l_recs[0].e_blkno = eb->h_blkno; - fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); - for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) - memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); - fe_el->l_next_free_rec = cpu_to_le16(1); + /* update root_bh now */ + le16_add_cpu(&root_el->l_tree_depth, 1); + root_el->l_recs[0].e_cpos = 0; + root_el->l_recs[0].e_blkno = eb->h_blkno; + root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + root_el->l_next_free_rec = cpu_to_le16(1); /* If this is our 1st tree depth shift, then last_eb_blk * becomes the allocated extent block */ - if (fe_el->l_tree_depth == cpu_to_le16(1)) - fe->i_last_eb_blk = eb->h_blkno; + if (root_el->l_tree_depth == cpu_to_le16(1)) + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, et->et_root_bh); *ret_new_eb_bh = new_eb_bh; new_eb_bh = NULL; status = 0; bail: - if (new_eb_bh) - brelse(new_eb_bh); + brelse(new_eb_bh); - mlog_exit(status); return status; } @@ -817,77 +1421,64 @@ bail: * 1) a lowest extent block is found, then we pass it back in * *lowest_eb_bh and return '0' * - * 2) the search fails to find anything, but the dinode has room. We + * 2) the search fails to find anything, but the root_el has room. We * pass NULL back in *lowest_eb_bh, but still return '0' * - * 3) the search fails to find anything AND the dinode is full, in + * 3) the search fails to find anything AND the root_el is full, in * which case we return > 0 * * return status < 0 indicates an error. */ -static int ocfs2_find_branch_target(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, +static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, struct buffer_head **target_bh) { int status = 0, i; u64 blkno; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct buffer_head *bh = NULL; struct buffer_head *lowest_bh = NULL; - mlog_entry_void(); - *target_bh = NULL; - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; + el = et->et_root_el; while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { - ocfs2_error(inode->i_sb, "Dinode %llu has empty " + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty " "extent list (next_free_rec == 0)", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); status = -EIO; goto bail; } i = le16_to_cpu(el->l_next_free_rec) - 1; blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { - ocfs2_error(inode->i_sb, "Dinode %llu has extent " + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has extent " "list where extent # %d has no physical " "block start", - (unsigned long long)OCFS2_I(inode)->ip_blkno, i); + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); status = -EIO; goto bail; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; - status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, - inode); + status = ocfs2_read_extent_block(et->et_ci, blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; } eb = (struct ocfs2_extent_block *) bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } el = &eb->h_list; if (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) { - if (lowest_bh) - brelse(lowest_bh); + brelse(lowest_bh); lowest_bh = bh; get_bh(lowest_bh); } @@ -895,16 +1486,14 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, /* If we didn't find one and the fe doesn't have any room, * then return '1' */ - if (!lowest_bh - && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) + el = et->et_root_el; + if (!lowest_bh && (el->l_next_free_rec == el->l_count)) status = 1; *target_bh = lowest_bh; bail: - if (bh) - brelse(bh); + brelse(bh); - mlog_exit(status); return status; } @@ -918,20 +1507,18 @@ bail: * * *last_eb_bh will be updated by ocfs2_add_branch(). */ -static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, - struct buffer_head *di_bh, int *final_depth, - struct buffer_head **last_eb_bh, +static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, + int *final_depth, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int ret, shift; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - int depth = le16_to_cpu(di->id2.i_list.l_tree_depth); - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_list *el = et->et_root_el; + int depth = le16_to_cpu(el->l_tree_depth); struct buffer_head *bh = NULL; BUG_ON(meta_ac == NULL); - shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh); + shift = ocfs2_find_branch_target(et, &bh); if (shift < 0) { ret = shift; mlog_errno(ret); @@ -943,13 +1530,15 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, * another tree level */ if (shift) { BUG_ON(bh); - mlog(0, "need to shift tree depth (current = %d)\n", depth); + trace_ocfs2_grow_tree( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + depth); /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to * ocfs2_add_branch). */ - ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh, - meta_ac, &bh); + ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -974,8 +1563,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ - mlog(0, "add branch. bh = %p\n", bh); - ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh, + ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, meta_ac); if (ret < 0) { mlog_errno(ret); @@ -990,15 +1578,6 @@ out: } /* - * This is only valid for leaf nodes, which are the only ones that can - * have empty extents anyway. - */ -static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) -{ - return !rec->e_leaf_clusters; -} - -/* * This function will discard the rightmost extent record. */ static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) @@ -1029,8 +1608,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, BUG_ON(!next_free); /* The tree code before us didn't allow enough room in the leaf. */ - if (el->l_next_free_rec == el->l_count && !has_empty) - BUG(); + BUG_ON(el->l_next_free_rec == el->l_count && !has_empty); /* * The easiest way to approach this is to just remove the @@ -1059,8 +1637,9 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, } insert_index = i; - mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", - insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); + trace_ocfs2_rotate_leaf(insert_cpos, insert_index, + has_empty, next_free, + le16_to_cpu(el->l_count)); BUG_ON(insert_index < 0); BUG_ON(insert_index >= le16_to_cpu(el->l_count)); @@ -1155,9 +1734,9 @@ set_and_inc: * * The array index of the subtree root is passed back. */ -static int ocfs2_find_subtree_root(struct inode *inode, - struct ocfs2_path *left, - struct ocfs2_path *right) +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right) { int i = 0; @@ -1173,10 +1752,10 @@ static int ocfs2_find_subtree_root(struct inode *inode, * The caller didn't pass two adjacent paths. */ mlog_bug_on_msg(i > left->p_tree_depth, - "Inode %lu, left depth %u, right depth %u\n" + "Owner %llu, left depth %u, right depth %u\n" "left leaf blk %llu, right leaf blk %llu\n", - inode->i_ino, left->p_tree_depth, - right->p_tree_depth, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + left->p_tree_depth, right->p_tree_depth, (unsigned long long)path_leaf_bh(left)->b_blocknr, (unsigned long long)path_leaf_bh(right)->b_blocknr); } while (left->p_node[i].bh->b_blocknr == @@ -1193,7 +1772,7 @@ typedef void (path_insert_t)(void *, struct buffer_head *); * This code can be called with a cpos larger than the tree, in which * case it will return the rightmost path. */ -static int __ocfs2_find_path(struct inode *inode, +static int __ocfs2_find_path(struct ocfs2_caching_info *ci, struct ocfs2_extent_list *root_el, u32 cpos, path_insert_t *func, void *data) { @@ -1204,15 +1783,14 @@ static int __ocfs2_find_path(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; - struct ocfs2_inode_info *oi = OCFS2_I(inode); el = root_el; while (el->l_tree_depth) { if (le16_to_cpu(el->l_next_free_rec) == 0) { - ocfs2_error(inode->i_sb, - "Inode %llu has empty extent list at " + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has empty extent list at " "depth %u\n", - (unsigned long long)oi->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth)); ret = -EROFS; goto out; @@ -1235,10 +1813,10 @@ static int __ocfs2_find_path(struct inode *inode, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (blkno == 0) { - ocfs2_error(inode->i_sb, - "Inode %llu has bad blkno in extent list " + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad blkno in extent list " "at depth %u (index %d)\n", - (unsigned long long)oi->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth), i); ret = -EROFS; goto out; @@ -1246,8 +1824,7 @@ static int __ocfs2_find_path(struct inode *inode, brelse(bh); bh = NULL; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, - &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_extent_block(ci, blkno, &bh); if (ret) { mlog_errno(ret); goto out; @@ -1255,18 +1832,13 @@ static int __ocfs2_find_path(struct inode *inode, eb = (struct ocfs2_extent_block *) bh->b_data; el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EIO; - goto out; - } if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { - ocfs2_error(inode->i_sb, - "Inode %llu has bad count in extent list " + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad count in extent list " "at block %llu (next free=%u, count=%u)\n", - (unsigned long long)oi->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr, le16_to_cpu(el->l_next_free_rec), le16_to_cpu(el->l_count)); @@ -1310,14 +1882,14 @@ static void find_path_ins(void *data, struct buffer_head *bh) ocfs2_path_insert_eb(fp->path, fp->index, bh); fp->index++; } -static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, - u32 cpos) +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, u32 cpos) { struct find_path_data data; data.index = 1; data.path = path; - return __ocfs2_find_path(inode, path_root_el(path), cpos, + return __ocfs2_find_path(ci, path_root_el(path), cpos, find_path_ins, &data); } @@ -1342,13 +1914,14 @@ static void find_leaf_ins(void *data, struct buffer_head *bh) * * This function doesn't handle non btree extent lists. */ -int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, - u32 cpos, struct buffer_head **leaf_bh) +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh) { int ret; struct buffer_head *bh = NULL; - ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); + ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh); if (ret) { mlog_errno(ret); goto out; @@ -1388,7 +1961,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, * immediately to their right. */ left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); - if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { + if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { + BUG_ON(right_child_el->l_tree_depth); BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); } @@ -1450,13 +2024,15 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, * - When our insert into the right path leaf is at the leftmost edge * and requires an update of the path immediately to it's left. This * can occur at the end of some types of rotation and appending inserts. + * - When we've adjusted the last extent record in the left path leaf and the + * 1st extent record in the right path leaf during cross extent block merge. */ -static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, +static void ocfs2_complete_edge_insert(handle_t *handle, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index) { - int ret, i, idx; + int i, idx; struct ocfs2_extent_list *el, *left_el, *right_el; struct ocfs2_extent_rec *left_rec, *right_rec; struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; @@ -1476,7 +2052,7 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, left_el = path_leaf_el(left_path); right_el = path_leaf_el(right_path); for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { - mlog(0, "Adjust records at index %u\n", i); + trace_ocfs2_complete_edge_insert(i); /* * One nice property of knowing that all of these @@ -1494,13 +2070,8 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, right_el); - ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); - if (ret) - mlog_errno(ret); - - ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, left_path->p_node[i].bh); + ocfs2_journal_dirty(handle, right_path->p_node[i].bh); /* * Setup our list pointers now so that the current @@ -1524,13 +2095,11 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, root_bh = left_path->p_node[subtree_index].bh; - ret = ocfs2_journal_dirty(handle, root_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, root_bh); } -static int ocfs2_rotate_subtree_right(struct inode *inode, - handle_t *handle, +static int ocfs2_rotate_subtree_right(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index) @@ -1546,10 +2115,10 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, left_el = path_leaf_el(left_path); if (left_el->l_next_free_rec != left_el->l_count) { - ocfs2_error(inode->i_sb, + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Inode %llu has non-full interior leaf node %llu" "(next free = %u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)left_leaf_bh->b_blocknr, le16_to_cpu(left_el->l_next_free_rec)); return -EROFS; @@ -1565,25 +2134,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); if (ret) { mlog_errno(ret); goto out; } for(i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_journal_access(handle, inode, - right_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, - left_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); if (ret) { mlog_errno(ret); goto out; @@ -1596,16 +2163,12 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, /* This is a code error, not a disk corruption. */ mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " "because rightmost leaf block %llu is empty\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)right_leaf_bh->b_blocknr); ocfs2_create_empty_extent(right_el); - ret = ocfs2_journal_dirty(handle, right_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, right_leaf_bh); /* Do the copy now. */ i = le16_to_cpu(left_el->l_next_free_rec) - 1; @@ -1624,14 +2187,10 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); le16_add_cpu(&left_el->l_next_free_rec, 1); - ret = ocfs2_journal_dirty(handle, left_leaf_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + ocfs2_journal_dirty(handle, left_leaf_bh); - ocfs2_complete_edge_insert(inode, handle, left_path, right_path, - subtree_index); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); out: return ret; @@ -1643,8 +2202,8 @@ out: * * Will return zero if the path passed in is already the leftmost path. */ -static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, - struct ocfs2_path *path, u32 *cpos) +int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) { int i, j, ret = 0; u64 blkno; @@ -1721,12 +2280,14 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, int op_credits, struct ocfs2_path *path) { + int ret = 0; int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; if (handle->h_buffer_credits < credits) - return ocfs2_extend_trans(handle, credits); + ret = ocfs2_extend_trans(handle, + credits - handle->h_buffer_credits); - return 0; + return ret; } /* @@ -1785,7 +2346,7 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) * * The array is assumed to be large enough to hold an entire path (tree depth). * - * Upon succesful return from this function: + * Upon successful return from this function: * * - The 'right_path' array will contain a path to the leaf block * whose range contains e_cpos. @@ -1794,8 +2355,8 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) * *ret_left_path will contain a valid path which can be passed to * ocfs2_insert_path(). */ -static int ocfs2_rotate_tree_right(struct inode *inode, - handle_t *handle, +static int ocfs2_rotate_tree_right(handle_t *handle, + struct ocfs2_extent_tree *et, enum ocfs2_split_type split, u32 insert_cpos, struct ocfs2_path *right_path, @@ -1804,24 +2365,26 @@ static int ocfs2_rotate_tree_right(struct inode *inode, int ret, start, orig_credits = handle->h_buffer_credits; u32 cpos; struct ocfs2_path *left_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); *ret_left_path = NULL; - left_path = ocfs2_new_path(path_root_bh(right_path), - path_root_el(right_path)); + left_path = ocfs2_new_path_from_path(right_path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); goto out; } - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); if (ret) { mlog_errno(ret); goto out; } - mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); + trace_ocfs2_rotate_tree_right( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos); /* * What we want to do here is: @@ -1850,10 +2413,12 @@ static int ocfs2_rotate_tree_right(struct inode *inode, * rotating subtrees. */ while (cpos && insert_cpos <= cpos) { - mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", - insert_cpos, cpos); + trace_ocfs2_rotate_tree_right( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos); - ret = ocfs2_find_path(inode, left_path, cpos); + ret = ocfs2_find_path(et->et_ci, left_path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -1861,10 +2426,11 @@ static int ocfs2_rotate_tree_right(struct inode *inode, mlog_bug_on_msg(path_leaf_bh(left_path) == path_leaf_bh(right_path), - "Inode %lu: error during insert of %u " + "Owner %llu: error during insert of %u " "(left path cpos %u) results in two identical " "paths ending at %llu\n", - inode->i_ino, insert_cpos, cpos, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos, (unsigned long long) path_leaf_bh(left_path)->b_blocknr); @@ -1890,12 +2456,12 @@ static int ocfs2_rotate_tree_right(struct inode *inode, goto out_ret_path; } - start = ocfs2_find_subtree_root(inode, left_path, right_path); + start = ocfs2_find_subtree_root(et, left_path, right_path); - mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", - start, - (unsigned long long) right_path->p_node[start].bh->b_blocknr, - right_path->p_tree_depth); + trace_ocfs2_rotate_subtree(start, + (unsigned long long) + right_path->p_node[start].bh->b_blocknr, + right_path->p_tree_depth); ret = ocfs2_extend_rotate_transaction(handle, start, orig_credits, right_path); @@ -1904,7 +2470,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, goto out; } - ret = ocfs2_rotate_subtree_right(inode, handle, left_path, + ret = ocfs2_rotate_subtree_right(handle, et, left_path, right_path, start); if (ret) { mlog_errno(ret); @@ -1936,8 +2502,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, */ ocfs2_mv_path(right_path, left_path); - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, - &cpos); + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); if (ret) { mlog_errno(ret); goto out; @@ -1951,15 +2516,37 @@ out_ret_path: return ret; } -static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, - struct ocfs2_path *path) +static int ocfs2_update_edge_lengths(handle_t *handle, + struct ocfs2_extent_tree *et, + int subtree_index, struct ocfs2_path *path) { - int i, idx; + int i, idx, ret; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; u32 range; + /* + * In normal tree rotation process, we will never touch the + * tree branch above subtree_index and ocfs2_extend_rotate_transaction + * doesn't reserve the credits for them either. + * + * But we do have a special case here which will update the rightmost + * records for all the bh in the path. + * So we have to allocate extra credits and access them. + */ + ret = ocfs2_extend_trans(handle, subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(et->et_ci, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* Path should always be rightmost. */ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; BUG_ON(eb->h_next_leaf_blk != 0ULL); @@ -1980,9 +2567,12 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, path->p_node[i].bh); } +out: + return ret; } -static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, +static void ocfs2_unlink_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_cached_dealloc_ctxt *dealloc, struct ocfs2_path *path, int unlink_start) { @@ -2004,12 +2594,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, mlog(ML_ERROR, "Inode %llu, attempted to remove extent block " "%llu with %u records\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(el->l_next_free_rec)); ocfs2_journal_dirty(handle, bh); - ocfs2_remove_from_cache(inode, bh); + ocfs2_remove_from_cache(et->et_ci, bh); continue; } @@ -2022,11 +2612,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, if (ret) mlog_errno(ret); - ocfs2_remove_from_cache(inode, bh); + ocfs2_remove_from_cache(et->et_ci, bh); } } -static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, +static void ocfs2_unlink_subtree(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index, @@ -2057,11 +2648,12 @@ static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, root_bh); ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); - ocfs2_unlink_path(inode, handle, dealloc, right_path, + ocfs2_unlink_path(handle, et, dealloc, right_path, subtree_index + 1); } -static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, +static int ocfs2_rotate_subtree_left(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index, @@ -2069,8 +2661,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, int *deleted) { int ret, i, del_right_subtree = 0, right_has_empty = 0; - struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; struct ocfs2_extent_block *eb; @@ -2103,9 +2694,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, return -EAGAIN; if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { - ret = ocfs2_journal_access(handle, inode, - path_leaf_bh(right_path), - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_eb(handle, et->et_ci, + path_leaf_bh(right_path), + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -2122,8 +2713,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, * We have to update i_last_eb_blk during the meta * data delete. */ - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -2138,25 +2729,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, */ BUG_ON(right_has_empty && !del_right_subtree); - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); if (ret) { mlog_errno(ret); goto out; } for(i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_journal_access(handle, inode, - right_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, - left_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); if (ret) { mlog_errno(ret); goto out; @@ -2184,20 +2773,21 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, ocfs2_remove_empty_extent(right_leaf_el); } - ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); - if (ret) - mlog_errno(ret); - ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); if (del_right_subtree) { - ocfs2_unlink_subtree(inode, handle, left_path, right_path, + ocfs2_unlink_subtree(handle, et, left_path, right_path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; - di->i_last_eb_blk = eb->h_blkno; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); /* * Removal of the extent in the left leaf was skipped @@ -2207,13 +2797,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, if (right_has_empty) ocfs2_remove_empty_extent(left_leaf_el); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, et_root_bh); *deleted = 1; } else - ocfs2_complete_edge_insert(inode, handle, left_path, right_path, + ocfs2_complete_edge_insert(handle, left_path, right_path, subtree_index); out: @@ -2229,8 +2817,8 @@ out: * This looks similar, but is subtly different to * ocfs2_find_cpos_for_left_leaf(). */ -static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, - struct ocfs2_path *path, u32 *cpos) +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) { int i, j, ret = 0; u64 blkno; @@ -2299,35 +2887,34 @@ out: return ret; } -static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, - handle_t *handle, - struct buffer_head *bh, - struct ocfs2_extent_list *el) +static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path) { int ret; + struct buffer_head *bh = path_leaf_bh(path); + struct ocfs2_extent_list *el = path_leaf_el(path); if (!ocfs2_is_empty_extent(&el->l_recs[0])) return 0; - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, + path_num_items(path) - 1); if (ret) { mlog_errno(ret); goto out; } ocfs2_remove_empty_extent(el); - - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, bh); out: return ret; } -static int __ocfs2_rotate_tree_left(struct inode *inode, - handle_t *handle, int orig_credits, +static int __ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + int orig_credits, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc, struct ocfs2_path **empty_extent_path) @@ -2336,20 +2923,19 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, u32 right_cpos; struct ocfs2_path *left_path = NULL; struct ocfs2_path *right_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); *empty_extent_path = NULL; - ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, - &right_cpos); + ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); if (ret) { mlog_errno(ret); goto out; } - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -2358,8 +2944,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, ocfs2_cp_path(left_path, path); - right_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + right_path = ocfs2_new_path_from_path(path); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -2367,17 +2952,16 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, } while (right_cpos) { - ret = ocfs2_find_path(inode, right_path, right_cpos); + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (ret) { mlog_errno(ret); goto out; } - subtree_root = ocfs2_find_subtree_root(inode, left_path, + subtree_root = ocfs2_find_subtree_root(et, left_path, right_path); - mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", - subtree_root, + trace_ocfs2_rotate_subtree(subtree_root, (unsigned long long) right_path->p_node[subtree_root].bh->b_blocknr, right_path->p_tree_depth); @@ -2393,15 +2977,14 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, * Caller might still want to make changes to the * tree root, so re-add it to the journal here. */ - ret = ocfs2_journal_access(handle, inode, - path_root_bh(left_path), - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, 0); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_rotate_subtree_left(inode, handle, left_path, + ret = ocfs2_rotate_subtree_left(handle, et, left_path, right_path, subtree_root, dealloc, &deleted); if (ret == -EAGAIN) { @@ -2430,7 +3013,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, ocfs2_mv_path(left_path, right_path); - ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &right_cpos); if (ret) { mlog_errno(ret); @@ -2445,30 +3028,21 @@ out: return ret; } -static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, - struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc) +static int ocfs2_remove_rightmost_path(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret, subtree_index; u32 cpos; struct ocfs2_path *left_path = NULL; - struct ocfs2_dinode *di; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - /* - * XXX: This code assumes that the root is an inode, which is - * true for now but may change as tree code gets generic. - */ - di = (struct ocfs2_dinode *)path_root_bh(path)->b_data; - if (!OCFS2_IS_VALID_DINODE(di)) { - ret = -EIO; - ocfs2_error(inode->i_sb, - "Inode %llu has invalid path root", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - goto out; - } + ret = ocfs2_et_sanity_check(et); + if (ret) + goto out; /* * There's two ways we handle this depending on * whether path is the only existing one. @@ -2481,13 +3055,14 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_journal_access_path(inode, handle, path); + ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + path, &cpos); if (ret) { mlog_errno(ret); goto out; @@ -2498,50 +3073,54 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, * We have a path to the left of this one - it needs * an update too. */ - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); goto out; } - ret = ocfs2_find_path(inode, left_path, cpos); + ret = ocfs2_find_path(et->et_ci, left_path, cpos); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access_path(inode, handle, left_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); if (ret) { mlog_errno(ret); goto out; } - subtree_index = ocfs2_find_subtree_root(inode, left_path, path); + subtree_index = ocfs2_find_subtree_root(et, left_path, path); - ocfs2_unlink_subtree(inode, handle, left_path, path, + ocfs2_unlink_subtree(handle, et, left_path, path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; - di->i_last_eb_blk = eb->h_blkno; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); } else { /* * 'path' is also the leftmost path which * means it must be the only one. This gets * handled differently because we want to - * revert the inode back to having extents + * revert the root back to having extents * in-line. */ - ocfs2_unlink_path(inode, handle, dealloc, path, 1); + ocfs2_unlink_path(handle, et, dealloc, path, 1); - el = &di->id2.i_list; + el = et->et_root_el; el->l_tree_depth = 0; el->l_next_free_rec = 0; memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); - di->i_last_eb_blk = 0; + ocfs2_et_set_last_eb_blk(et, 0); } ocfs2_journal_dirty(handle, path_root_bh(path)); @@ -2567,7 +3146,8 @@ out: * the rightmost tree leaf record is removed so the caller is * responsible for detecting and correcting that. */ -static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, +static int ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc) { @@ -2583,12 +3163,10 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, if (path->p_tree_depth == 0) { rightmost_no_delete: /* - * In-inode extents. This is trivially handled, so do + * Inline extents. This is trivially handled, so do * it up front. */ - ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, - path_leaf_bh(path), - path_leaf_el(path)); + ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path); if (ret) mlog_errno(ret); goto out; @@ -2604,7 +3182,7 @@ rightmost_no_delete: * * 1) is handled via ocfs2_rotate_rightmost_leaf_left() * 2a) we need the left branch so that we can update it with the unlink - * 2b) we need to bring the inode back to inline extents. + * 2b) we need to bring the root back to inline extents. */ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; @@ -2620,9 +3198,9 @@ rightmost_no_delete: if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = -EIO; - ocfs2_error(inode->i_sb, - "Inode %llu has empty extent block at %llu", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent block at %llu", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; } @@ -2636,7 +3214,7 @@ rightmost_no_delete: * nonempty list. */ - ret = ocfs2_remove_rightmost_path(inode, handle, path, + ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc); if (ret) mlog_errno(ret); @@ -2648,7 +3226,7 @@ rightmost_no_delete: * and restarting from there. */ try_rotate: - ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path, dealloc, &restart_path); if (ret && ret != -EAGAIN) { mlog_errno(ret); @@ -2659,7 +3237,7 @@ try_rotate: tmp_path = restart_path; restart_path = NULL; - ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, tmp_path, dealloc, &restart_path); if (ret && ret != -EAGAIN) { @@ -2712,27 +3290,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, } } +static int ocfs2_get_right_path(struct ocfs2_extent_tree *et, + struct ocfs2_path *left_path, + struct ocfs2_path **ret_right_path) +{ + int ret; + u32 right_cpos; + struct ocfs2_path *right_path = NULL; + struct ocfs2_extent_list *left_el; + + *ret_right_path = NULL; + + /* This function shouldn't be called for non-trees. */ + BUG_ON(left_path->p_tree_depth == 0); + + left_el = path_leaf_el(left_path); + BUG_ON(left_el->l_next_free_rec != left_el->l_count); + + ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + left_path, &right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This function shouldn't be called for the rightmost leaf. */ + BUG_ON(right_cpos == 0); + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_right_path = right_path; +out: + if (ret) + ocfs2_free_path(right_path); + return ret; +} + /* * Remove split_rec clusters from the record at index and merge them - * onto the beginning of the record at index + 1. + * onto the beginning of the record "next" to it. + * For index < l_count - 1, the next means the extent rec at index + 1. + * For index == l_count - 1, the "next" means the 1st extent rec of the + * next extent block. */ -static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, - handle_t *handle, - struct ocfs2_extent_rec *split_rec, - struct ocfs2_extent_list *el, int index) +static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, + handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *split_rec, + int index) { - int ret; + int ret, next_free, i; unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); struct ocfs2_extent_rec *left_rec; struct ocfs2_extent_rec *right_rec; + struct ocfs2_extent_list *right_el; + struct ocfs2_path *right_path = NULL; + int subtree_index = 0; + struct ocfs2_extent_list *el = path_leaf_el(left_path); + struct buffer_head *bh = path_leaf_bh(left_path); + struct buffer_head *root_bh = NULL; BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); - left_rec = &el->l_recs[index]; - right_rec = &el->l_recs[index + 1]; - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + if (index == le16_to_cpu(el->l_next_free_rec) - 1 && + le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { + /* we meet with a cross extent block merge. */ + ret = ocfs2_get_right_path(et, left_path, &right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_el = path_leaf_el(right_path); + next_free = le16_to_cpu(right_el->l_next_free_rec); + BUG_ON(next_free <= 0); + right_rec = &right_el->l_recs[0]; + if (ocfs2_is_empty_extent(right_rec)) { + BUG_ON(next_free <= 1); + right_rec = &right_el->l_recs[1]; + } + + BUG_ON(le32_to_cpu(left_rec->e_cpos) + + le16_to_cpu(left_rec->e_leaf_clusters) != + le32_to_cpu(right_rec->e_cpos)); + + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_index, + handle->h_buffer_credits, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = subtree_index + 1; + i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + } else { + BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); + right_rec = &el->l_recs[index + 1]; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, + path_num_items(left_path) - 1); if (ret) { mlog_errno(ret); goto out; @@ -2742,42 +3440,160 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, le32_add_cpu(&right_rec->e_cpos, -split_clusters); le64_add_cpu(&right_rec->e_blkno, - -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); + -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); ocfs2_cleanup_merge(el, index); - ret = ocfs2_journal_dirty(handle, bh); - if (ret) + ocfs2_journal_dirty(handle, bh); + if (right_path) { + ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); + } +out: + if (right_path) + ocfs2_free_path(right_path); + return ret; +} + +static int ocfs2_get_left_path(struct ocfs2_extent_tree *et, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret; + u32 left_cpos; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* This function shouldn't be called for non-trees. */ + BUG_ON(right_path->p_tree_depth == 0); + + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + right_path, &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* This function shouldn't be called for the leftmost leaf. */ + BUG_ON(left_cpos == 0); + + left_path = ocfs2_new_path_from_path(right_path); + if (!left_path) { + ret = -ENOMEM; mlog_errno(ret); + goto out; + } + ret = ocfs2_find_path(et->et_ci, left_path, left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + *ret_left_path = left_path; out: + if (ret) + ocfs2_free_path(left_path); return ret; } /* * Remove split_rec clusters from the record at index and merge them - * onto the tail of the record at index - 1. + * onto the tail of the record "before" it. + * For index > 0, the "before" means the extent rec at index - 1. + * + * For index == 0, the "before" means the last record of the previous + * extent block. And there is also a situation that we may need to + * remove the rightmost leaf extent block in the right_path and change + * the right path to indicate the new rightmost path. */ -static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, +static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *split_rec, - struct ocfs2_extent_list *el, int index) + struct ocfs2_cached_dealloc_ctxt *dealloc, + int index) { - int ret, has_empty_extent = 0; + int ret, i, subtree_index = 0, has_empty_extent = 0; unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); struct ocfs2_extent_rec *left_rec; struct ocfs2_extent_rec *right_rec; + struct ocfs2_extent_list *el = path_leaf_el(right_path); + struct buffer_head *bh = path_leaf_bh(right_path); + struct buffer_head *root_bh = NULL; + struct ocfs2_path *left_path = NULL; + struct ocfs2_extent_list *left_el; - BUG_ON(index <= 0); + BUG_ON(index < 0); - left_rec = &el->l_recs[index - 1]; right_rec = &el->l_recs[index]; - if (ocfs2_is_empty_extent(&el->l_recs[0])) - has_empty_extent = 1; + if (index == 0) { + /* we meet with a cross extent block merge. */ + ret = ocfs2_get_left_path(et, right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + left_el = path_leaf_el(left_path); + BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != + le16_to_cpu(left_el->l_count)); + + left_rec = &left_el->l_recs[ + le16_to_cpu(left_el->l_next_free_rec) - 1]; + BUG_ON(le32_to_cpu(left_rec->e_cpos) + + le16_to_cpu(left_rec->e_leaf_clusters) != + le32_to_cpu(split_rec->e_cpos)); + + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); + + ret = ocfs2_extend_rotate_transaction(handle, subtree_index, + handle->h_buffer_credits, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = subtree_index + 1; + i < path_num_items(right_path); i++) { + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + right_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, + left_path, i); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } else { + left_rec = &el->l_recs[index - 1]; + if (ocfs2_is_empty_extent(&el->l_recs[0])) + has_empty_extent = 1; + } + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, + path_num_items(right_path) - 1); if (ret) { mlog_errno(ret); goto out; @@ -2790,36 +3606,62 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, *left_rec = *split_rec; has_empty_extent = 0; - } else { + } else le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); - } le32_add_cpu(&right_rec->e_cpos, split_clusters); le64_add_cpu(&right_rec->e_blkno, - ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); + ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); ocfs2_cleanup_merge(el, index); - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, bh); + if (left_path) { + ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); + /* + * In the situation that the right_rec is empty and the extent + * block is empty also, ocfs2_complete_edge_insert can't handle + * it and we need to delete the right extent block. + */ + if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && + le16_to_cpu(el->l_next_free_rec) == 1) { + + ret = ocfs2_remove_rightmost_path(handle, et, + right_path, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Now the rightmost extent block has been deleted. + * So we use the new rightmost path. + */ + ocfs2_mv_path(right_path, left_path); + left_path = NULL; + } else + ocfs2_complete_edge_insert(handle, left_path, + right_path, subtree_index); + } out: + if (left_path) + ocfs2_free_path(left_path); return ret; } -static int ocfs2_try_to_merge_extent(struct inode *inode, - handle_t *handle, - struct ocfs2_path *left_path, +static int ocfs2_try_to_merge_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, int split_index, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, struct ocfs2_merge_ctxt *ctxt) - { int ret = 0; - struct ocfs2_extent_list *el = path_leaf_el(left_path); + struct ocfs2_extent_list *el = path_leaf_el(path); struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; BUG_ON(ctxt->c_contig_type == CONTIG_NONE); @@ -2832,8 +3674,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * extents - having more than one in a leaf is * illegal. */ - ret = ocfs2_rotate_tree_left(inode, handle, left_path, - dealloc); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -2847,7 +3688,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * Left-right contig implies this. */ BUG_ON(!ctxt->c_split_covers_rec); - BUG_ON(split_index == 0); /* * Since the leftright insert always covers the entire @@ -2858,9 +3698,13 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * Since the adding of an empty extent shifts * everything back to the right, there's no need to * update split_index here. + * + * When the split_index is zero, we need to merge it to the + * prevoius extent block. It is more efficient and easier + * if we do merge_right first and merge_left later. */ - ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path), - handle, split_rec, el, split_index); + ret = ocfs2_merge_rec_right(path, handle, et, split_rec, + split_index); if (ret) { mlog_errno(ret); goto out; @@ -2871,33 +3715,28 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, */ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); - /* - * The left merge left us with an empty extent, remove - * it. - */ - ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc); + /* The merge left us with an empty extent, remove it. */ + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; } - split_index--; + rec = &el->l_recs[split_index]; /* * Note that we don't pass split_rec here on purpose - - * we've merged it into the left side. + * we've merged it into the rec already. */ - ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path), - handle, rec, el, split_index); + ret = ocfs2_merge_rec_left(path, handle, et, rec, + dealloc, split_index); + if (ret) { mlog_errno(ret); goto out; } - BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); - - ret = ocfs2_rotate_tree_left(inode, handle, left_path, - dealloc); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); /* * Error from this last rotate is not critical, so * print but don't bubble it up. @@ -2914,18 +3753,16 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * the record on the left (hence the left merge). */ if (ctxt->c_contig_type == CONTIG_RIGHT) { - ret = ocfs2_merge_rec_left(inode, - path_leaf_bh(left_path), - handle, split_rec, el, + ret = ocfs2_merge_rec_left(path, handle, et, + split_rec, dealloc, split_index); if (ret) { mlog_errno(ret); goto out; } } else { - ret = ocfs2_merge_rec_right(inode, - path_leaf_bh(left_path), - handle, split_rec, el, + ret = ocfs2_merge_rec_right(path, handle, + et, split_rec, split_index); if (ret) { mlog_errno(ret); @@ -2938,7 +3775,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * The merge may have left an empty extent in * our leaf. Try to rotate it away. */ - ret = ocfs2_rotate_tree_left(inode, handle, left_path, + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) mlog_errno(ret); @@ -2985,10 +3822,10 @@ static void ocfs2_subtract_from_rec(struct super_block *sb, * list. If this leaf is part of an allocation tree, it is assumed * that the tree above has been prepared. */ -static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, +static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *insert_rec, struct ocfs2_extent_list *el, - struct ocfs2_insert_type *insert, - struct inode *inode) + struct ocfs2_insert_type *insert) { int i = insert->ins_contig_index; unsigned int range; @@ -3000,7 +3837,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); BUG_ON(i == -1); rec = &el->l_recs[i]; - ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec, + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + insert->ins_split, rec, insert_rec); goto rotate; } @@ -3042,10 +3880,10 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= le16_to_cpu(el->l_count), - "inode %lu, depth %u, count %u, next free %u, " + "owner %llu, depth %u, count %u, next free %u, " "rec.cpos %u, rec.clusters %u, " "insert.cpos %u, insert.clusters %u\n", - inode->i_ino, + ocfs2_metadata_cache_owner(et->et_ci), le16_to_cpu(el->l_tree_depth), le16_to_cpu(el->l_count), le16_to_cpu(el->l_next_free_rec), @@ -3073,18 +3911,8 @@ rotate: ocfs2_rotate_leaf(el, insert_rec); } -static inline void ocfs2_update_dinode_clusters(struct inode *inode, - struct ocfs2_dinode *di, - u32 clusters) -{ - le32_add_cpu(&di->i_clusters, clusters); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); -} - -static void ocfs2_adjust_rightmost_records(struct inode *inode, - handle_t *handle, +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec) { @@ -3102,9 +3930,9 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode, next_free = le16_to_cpu(el->l_next_free_rec); if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has a bad extent list", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); ret = -EIO; return; } @@ -3117,14 +3945,12 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode, le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); - ret = ocfs2_journal_dirty(handle, bh); - if (ret) - mlog_errno(ret); - + ocfs2_journal_dirty(handle, bh); } } -static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, +static int ocfs2_append_rec_to_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *insert_rec, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) @@ -3152,31 +3978,33 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { u32 left_cpos; - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, - &left_cpos); + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + right_path, &left_cpos); if (ret) { mlog_errno(ret); goto out; } - mlog(0, "Append may need a left path update. cpos: %u, " - "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), - left_cpos); + trace_ocfs2_append_rec_to_path( + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), + le32_to_cpu(insert_rec->e_cpos), + left_cpos); /* * No need to worry if the append is already in the * leftmost leaf. */ if (left_cpos) { - left_path = ocfs2_new_path(path_root_bh(right_path), - path_root_el(right_path)); + left_path = ocfs2_new_path_from_path(right_path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); goto out; } - ret = ocfs2_find_path(inode, left_path, left_cpos); + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -3189,13 +4017,13 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, } } - ret = ocfs2_journal_access_path(inode, handle, right_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); if (ret) { mlog_errno(ret); goto out; } - ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec); + ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec); *ret_left_path = left_path; ret = 0; @@ -3206,7 +4034,7 @@ out: return ret; } -static void ocfs2_split_record(struct inode *inode, +static void ocfs2_split_record(struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, struct ocfs2_extent_rec *split_rec, @@ -3217,7 +4045,7 @@ static void ocfs2_split_record(struct inode *inode, struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; struct ocfs2_extent_rec *rec, *tmprec; - right_el = path_leaf_el(right_path);; + right_el = path_leaf_el(right_path); if (left_path) left_el = path_leaf_el(left_path); @@ -3279,20 +4107,21 @@ static void ocfs2_split_record(struct inode *inode, } rec = &el->l_recs[index]; - ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec); + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + split, rec, split_rec); ocfs2_rotate_leaf(insert_el, split_rec); } /* - * This function only does inserts on an allocation b-tree. For dinode - * lists, ocfs2_insert_at_leaf() is called directly. + * This function only does inserts on an allocation b-tree. For tree + * depth = 0, ocfs2_insert_at_leaf() is called directly. * * right_path is the path we want to do the actual insert * in. left_path should only be passed in if we need to update that * portion of the tree after an edge insert. */ -static int ocfs2_insert_path(struct inode *inode, - handle_t *handle, +static int ocfs2_insert_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, struct ocfs2_extent_rec *insert_rec, @@ -3302,23 +4131,19 @@ static int ocfs2_insert_path(struct inode *inode, struct buffer_head *leaf_bh = path_leaf_bh(right_path); if (left_path) { - int credits = handle->h_buffer_credits; - /* * There's a chance that left_path got passed back to * us without being accounted for in the * journal. Extend our transaction here to be sure we * can change those blocks. */ - credits += left_path->p_tree_depth; - - ret = ocfs2_extend_trans(handle, credits); + ret = ocfs2_extend_trans(handle, left_path->p_tree_depth); if (ret < 0) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access_path(inode, handle, left_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); if (ret < 0) { mlog_errno(ret); goto out; @@ -3329,7 +4154,7 @@ static int ocfs2_insert_path(struct inode *inode, * Pass both paths to the journal. The majority of inserts * will be touching all components anyway. */ - ret = ocfs2_journal_access_path(inode, handle, right_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); if (ret < 0) { mlog_errno(ret); goto out; @@ -3341,7 +4166,7 @@ static int ocfs2_insert_path(struct inode *inode, * of splits, but it's easier to just let one separate * function sort it all out. */ - ocfs2_split_record(inode, left_path, right_path, + ocfs2_split_record(et, left_path, right_path, insert_rec, insert->ins_split); /* @@ -3350,17 +4175,13 @@ static int ocfs2_insert_path(struct inode *inode, * dirty this for us. */ if (left_path) - ret = ocfs2_journal_dirty(handle, - path_leaf_bh(left_path)); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, + path_leaf_bh(left_path)); } else - ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), - insert, inode); + ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), + insert); - ret = ocfs2_journal_dirty(handle, leaf_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, leaf_bh); if (left_path) { /* @@ -3369,10 +4190,10 @@ static int ocfs2_insert_path(struct inode *inode, * * XXX: Should we extend the transaction here? */ - subtree_index = ocfs2_find_subtree_root(inode, left_path, + subtree_index = ocfs2_find_subtree_root(et, left_path, right_path); - ocfs2_complete_edge_insert(inode, handle, left_path, - right_path, subtree_index); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); } ret = 0; @@ -3380,9 +4201,8 @@ out: return ret; } -static int ocfs2_do_insert_extent(struct inode *inode, - handle_t *handle, - struct buffer_head *di_bh, +static int ocfs2_do_insert_extent(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *insert_rec, struct ocfs2_insert_type *type) { @@ -3390,25 +4210,23 @@ static int ocfs2_do_insert_extent(struct inode *inode, u32 cpos; struct ocfs2_path *right_path = NULL; struct ocfs2_path *left_path = NULL; - struct ocfs2_dinode *di; struct ocfs2_extent_list *el; - di = (struct ocfs2_dinode *) di_bh->b_data; - el = &di->id2.i_list; + el = et->et_root_el; - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } if (le16_to_cpu(el->l_tree_depth) == 0) { - ocfs2_insert_at_leaf(insert_rec, el, type, inode); + ocfs2_insert_at_leaf(et, insert_rec, el, type); goto out_update_clusters; } - right_path = ocfs2_new_inode_path(di_bh); + right_path = ocfs2_new_path_from_et(et); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3427,7 +4245,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, cpos = UINT_MAX; } - ret = ocfs2_find_path(inode, right_path, cpos); + ret = ocfs2_find_path(et->et_ci, right_path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -3446,7 +4264,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, * can wind up skipping both of these two special cases... */ if (rotate) { - ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split, + ret = ocfs2_rotate_tree_right(handle, et, type->ins_split, le32_to_cpu(insert_rec->e_cpos), right_path, &left_path); if (ret) { @@ -3458,15 +4276,15 @@ static int ocfs2_do_insert_extent(struct inode *inode, * ocfs2_rotate_tree_right() might have extended the * transaction without re-journaling our tree root. */ - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } } else if (type->ins_appending == APPEND_TAIL && type->ins_contig != CONTIG_LEFT) { - ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, + ret = ocfs2_append_rec_to_path(handle, et, insert_rec, right_path, &left_path); if (ret) { mlog_errno(ret); @@ -3474,7 +4292,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, } } - ret = ocfs2_insert_path(inode, handle, left_path, right_path, + ret = ocfs2_insert_path(handle, et, left_path, right_path, insert_rec, type); if (ret) { mlog_errno(ret); @@ -3483,12 +4301,10 @@ static int ocfs2_do_insert_extent(struct inode *inode, out_update_clusters: if (type->ins_split == SPLIT_NONE) - ocfs2_update_dinode_clusters(inode, di, - le16_to_cpu(insert_rec->e_leaf_clusters)); + ocfs2_et_update_clusters(et, + le16_to_cpu(insert_rec->e_leaf_clusters)); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, et->et_root_bh); out: ocfs2_free_path(left_path); @@ -3498,33 +4314,115 @@ out: } static enum ocfs2_contig_type -ocfs2_figure_merge_contig_type(struct inode *inode, +ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, + struct ocfs2_path *path, struct ocfs2_extent_list *el, int index, struct ocfs2_extent_rec *split_rec) { - struct ocfs2_extent_rec *rec; + int status; enum ocfs2_contig_type ret = CONTIG_NONE; + u32 left_cpos, right_cpos; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_list *new_el; + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct buffer_head *bh; + struct ocfs2_extent_block *eb; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + + if (index > 0) { + rec = &el->l_recs[index - 1]; + } else if (path->p_tree_depth > 0) { + status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); + if (status) + goto out; + + if (left_cpos != 0) { + left_path = ocfs2_new_path_from_path(path); + if (!left_path) + goto out; + + status = ocfs2_find_path(et->et_ci, left_path, + left_cpos); + if (status) + goto out; + + new_el = path_leaf_el(left_path); + + if (le16_to_cpu(new_el->l_next_free_rec) != + le16_to_cpu(new_el->l_count)) { + bh = path_leaf_bh(left_path); + eb = (struct ocfs2_extent_block *)bh->b_data; + ocfs2_error(sb, + "Extent block #%llu has an " + "invalid l_next_free_rec of " + "%d. It should have " + "matched the l_count of %d", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec), + le16_to_cpu(new_el->l_count)); + status = -EINVAL; + goto out; + } + rec = &new_el->l_recs[ + le16_to_cpu(new_el->l_next_free_rec) - 1]; + } + } /* * We're careful to check for an empty extent record here - * the merge code will know what to do if it sees one. */ - - if (index > 0) { - rec = &el->l_recs[index - 1]; + if (rec) { if (index == 1 && ocfs2_is_empty_extent(rec)) { if (split_rec->e_cpos == el->l_recs[index].e_cpos) ret = CONTIG_RIGHT; } else { - ret = ocfs2_extent_contig(inode, rec, split_rec); + ret = ocfs2_et_extent_contig(et, rec, split_rec); } } - if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) { + rec = NULL; + if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) + rec = &el->l_recs[index + 1]; + else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && + path->p_tree_depth > 0) { + status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); + if (status) + goto out; + + if (right_cpos == 0) + goto out; + + right_path = ocfs2_new_path_from_path(path); + if (!right_path) + goto out; + + status = ocfs2_find_path(et->et_ci, right_path, right_cpos); + if (status) + goto out; + + new_el = path_leaf_el(right_path); + rec = &new_el->l_recs[0]; + if (ocfs2_is_empty_extent(rec)) { + if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { + bh = path_leaf_bh(right_path); + eb = (struct ocfs2_extent_block *)bh->b_data; + ocfs2_error(sb, + "Extent block #%llu has an " + "invalid l_next_free_rec of %d", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec)); + status = -EINVAL; + goto out; + } + rec = &new_el->l_recs[1]; + } + } + + if (rec) { enum ocfs2_contig_type contig_type; - rec = &el->l_recs[index + 1]; - contig_type = ocfs2_extent_contig(inode, rec, split_rec); + contig_type = ocfs2_et_extent_contig(et, rec, split_rec); if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) ret = CONTIG_LEFTRIGHT; @@ -3532,10 +4430,16 @@ ocfs2_figure_merge_contig_type(struct inode *inode, ret = contig_type; } +out: + if (left_path) + ocfs2_free_path(left_path); + if (right_path) + ocfs2_free_path(right_path); + return ret; } -static void ocfs2_figure_contig_type(struct inode *inode, +static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, struct ocfs2_insert_type *insert, struct ocfs2_extent_list *el, struct ocfs2_extent_rec *insert_rec) @@ -3546,14 +4450,29 @@ static void ocfs2_figure_contig_type(struct inode *inode, BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], - insert_rec); + contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i], + insert_rec); if (contig_type != CONTIG_NONE) { insert->ins_contig_index = i; break; } } insert->ins_contig = contig_type; + + if (insert->ins_contig != CONTIG_NONE) { + struct ocfs2_extent_rec *rec = + &el->l_recs[insert->ins_contig_index]; + unsigned int len = le16_to_cpu(rec->e_leaf_clusters) + + le16_to_cpu(insert_rec->e_leaf_clusters); + + /* + * Caller might want us to limit the size of extents, don't + * calculate contiguousness if we might exceed that limit. + */ + if (et->et_max_leaf_clusters && + (len > et->et_max_leaf_clusters)) + insert->ins_contig = CONTIG_NONE; + } } /* @@ -3562,8 +4481,8 @@ static void ocfs2_figure_contig_type(struct inode *inode, * ocfs2_figure_appending_type() will figure out whether we'll have to * insert at the tail of the rightmost leaf. * - * This should also work against the dinode list for tree's with 0 - * depth. If we consider the dinode list to be the rightmost leaf node + * This should also work against the root extent list for tree's with 0 + * depth. If we consider the root extent list to be the rightmost leaf node * then the logic here makes sense. */ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, @@ -3601,7 +4520,7 @@ set_tail_append: } /* - * Helper function called at the begining of an insert. + * Helper function called at the beginning of an insert. * * This computes a few things that are commonly used in the process of * inserting into the btree: @@ -3613,15 +4532,13 @@ set_tail_append: * All of the information is stored on the ocfs2_insert_type * structure. */ -static int ocfs2_figure_insert_type(struct inode *inode, - struct buffer_head *di_bh, +static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, struct ocfs2_extent_rec *insert_rec, int *free_records, struct ocfs2_insert_type *insert) { int ret; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct ocfs2_path *path = NULL; @@ -3629,7 +4546,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, insert->ins_split = SPLIT_NONE; - el = &di->id2.i_list; + el = et->et_root_el; insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); if (el->l_tree_depth) { @@ -3639,11 +4556,11 @@ static int ocfs2_figure_insert_type(struct inode *inode, * ocfs2_figure_insert_type() and ocfs2_add_branch() * may want it later. */ - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), &bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &bh); if (ret) { - mlog_exit(ret); + mlog_errno(ret); goto out; } eb = (struct ocfs2_extent_block *) bh->b_data; @@ -3662,12 +4579,12 @@ static int ocfs2_figure_insert_type(struct inode *inode, le16_to_cpu(el->l_next_free_rec); if (!insert->ins_tree_depth) { - ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_contig_type(et, insert, el, insert_rec); ocfs2_figure_appending_type(insert, el, insert_rec); return 0; } - path = ocfs2_new_inode_path(di_bh); + path = ocfs2_new_path_from_et(et); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -3680,7 +4597,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * us the rightmost tree path. This is accounted for below in * the appending code. */ - ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); + ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos)); if (ret) { mlog_errno(ret); goto out; @@ -3696,7 +4613,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * into two types of appends: simple record append, or a * rotate inside the tail leaf. */ - ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_contig_type(et, insert, el, insert_rec); /* * The insert code isn't quite ready to deal with all cases of @@ -3717,7 +4634,8 @@ static int ocfs2_figure_insert_type(struct inode *inode, * the case that we're doing a tail append, so maybe we can * take advantage of that information somehow. */ - if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { + if (ocfs2_et_get_last_eb_blk(et) == + path_leaf_bh(path)->b_blocknr) { /* * Ok, ocfs2_find_path() returned us the rightmost * tree path. This might be an appending insert. There are @@ -3740,14 +4658,12 @@ out: } /* - * Insert an extent into an inode btree. + * Insert an extent into a btree. * - * The caller needs to update fe->i_clusters + * The caller needs to update the owning btree's cluster count. */ -int ocfs2_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, +int ocfs2_insert_extent(handle_t *handle, + struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, u32 new_clusters, @@ -3760,40 +4676,34 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; - BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); - - mlog(0, "add %u clusters at position %u to inode %llu\n", - new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); - - mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != cpos), - "Device %s, asking for sparse allocation: inode %llu, " - "cpos %u, clusters %u\n", - osb->dev_str, - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, - OCFS2_I(inode)->ip_clusters); + trace_ocfs2_insert_extent_start( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, new_clusters); memset(&rec, 0, sizeof(rec)); rec.e_cpos = cpu_to_le32(cpos); rec.e_blkno = cpu_to_le64(start_blk); rec.e_leaf_clusters = cpu_to_le16(new_clusters); rec.e_flags = flags; + status = ocfs2_et_insert_check(et, &rec); + if (status) { + mlog_errno(status); + goto bail; + } - status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, + status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec, &free_records, &insert); if (status < 0) { mlog_errno(status); goto bail; } - mlog(0, "Insert.appending: %u, Insert.Contig: %u, " - "Insert.contig_index: %d, Insert.free_records: %d, " - "Insert.tree_depth: %d\n", - insert.ins_appending, insert.ins_contig, insert.ins_contig_index, - free_records, insert.ins_tree_depth); + trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig, + insert.ins_contig_index, free_records, + insert.ins_tree_depth); if (insert.ins_contig == CONTIG_NONE && free_records == 0) { - status = ocfs2_grow_tree(inode, handle, fe_bh, + status = ocfs2_grow_tree(handle, et, &insert.ins_tree_depth, &last_eb_bh, meta_ac); if (status) { @@ -3803,17 +4713,134 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, } /* Finally, we can add clusters. This might rotate the tree for us. */ - status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); + status = ocfs2_do_insert_extent(handle, et, &rec, &insert); if (status < 0) mlog_errno(status); else - ocfs2_extent_map_insert_rec(inode, &rec); + ocfs2_et_extent_map_insert(et, &rec); bail: - if (last_eb_bh) - brelse(last_eb_bh); + brelse(last_eb_bh); - mlog_exit(status); + return status; +} + +/* + * Allcate and add clusters into the extent b-tree. + * The new clusters(clusters_to_add) will be inserted at logical_offset. + * The extent b-tree's root is specified by et, and + * it is not limited to the file storage. Any extent tree can use this + * function if it implements the proper ocfs2_extent_tree. + */ +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0, err = 0; + int need_free = 0; + int free_extents; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + u8 flags = 0; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + + BUG_ON(!clusters_to_add); + + if (mark_unwritten) + flags = OCFS2_EXT_UNWRITTEN; + + free_extents = ocfs2_num_free_extents(osb, et); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. */ + if (!free_extents && !meta_ac) { + err = -1; + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(et->et_root_el))) { + err = -2; + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = __ocfs2_claim_clusters(handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + /* reserve our write early -- insert_extent may update the tree root */ + status = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + need_free = 1; + goto bail; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_add_clusters_in_btree( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + bit_off, num_bits); + status = ocfs2_insert_extent(handle, et, *logical_offset, block, + num_bits, flags, meta_ac); + if (status < 0) { + mlog_errno(status); + need_free = 1; + goto bail; + } + + ocfs2_journal_dirty(handle, et->et_root_bh); + + clusters_to_add -= num_bits; + *logical_offset += num_bits; + + if (clusters_to_add) { + err = clusters_to_add; + status = -EAGAIN; + reason = RESTART_TRANS; + } + +bail: + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num_bits); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num_bits); + } + +leave: + if (reason_ret) + *reason_ret = reason; + trace_ocfs2_add_clusters_in_btree_ret(status, reason, err); return status; } @@ -3837,10 +4864,9 @@ static void ocfs2_make_right_split_rec(struct super_block *sb, split_rec->e_flags = rec->e_flags; } -static int ocfs2_split_and_insert(struct inode *inode, - handle_t *handle, +static int ocfs2_split_and_insert(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, - struct buffer_head *di_bh, struct buffer_head **last_eb_bh, int split_index, struct ocfs2_extent_rec *orig_split_rec, @@ -3854,7 +4880,6 @@ static int ocfs2_split_and_insert(struct inode *inode, struct ocfs2_extent_rec split_rec = *orig_split_rec; struct ocfs2_insert_type insert; struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di; leftright: /* @@ -3863,8 +4888,7 @@ leftright: */ rec = path_leaf_el(path)->l_recs[split_index]; - di = (struct ocfs2_dinode *)di_bh->b_data; - rightmost_el = &di->id2.i_list; + rightmost_el = et->et_root_el; depth = le16_to_cpu(rightmost_el->l_tree_depth); if (depth) { @@ -3875,8 +4899,8 @@ leftright: if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, - meta_ac); + ret = ocfs2_grow_tree(handle, et, + &depth, last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; @@ -3904,8 +4928,8 @@ leftright: */ insert.ins_split = SPLIT_RIGHT; - ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, - &rec); + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &tmprec, insert_range, &rec); split_rec = tmprec; @@ -3913,8 +4937,7 @@ leftright: do_leftright = 1; } - ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, - &insert); + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); if (ret) { mlog_errno(ret); goto out; @@ -3930,7 +4953,7 @@ leftright: ocfs2_reinit_path(path, 1); cpos = le32_to_cpu(split_rec.e_cpos); - ret = ocfs2_find_path(inode, path, cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -3945,10 +4968,32 @@ out: return ret; } +static int ocfs2_replace_extent_rec(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_extent_list *el, + int split_index, + struct ocfs2_extent_rec *split_rec) +{ + int ret; + + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + el->l_recs[split_index] = *split_rec; + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); +out: + return ret; +} + /* - * Mark part or all of the extent record at split_index in the leaf - * pointed to by path as written. This removes the unwritten - * extent flag. + * Split part or all of the extent record at split_index in the leaf + * pointed to by path. Merge with the contiguous extent record if needed. * * Care is taken to handle contiguousness so as to not grow the tree. * @@ -3956,22 +5001,22 @@ out: * of the tree is required. All other cases will degrade into a less * optimal tree layout. * - * last_eb_bh should be the rightmost leaf block for any inode with a - * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call. + * last_eb_bh should be the rightmost leaf block for any extent + * btree. Since a split may grow the tree or a merge might shrink it, + * the caller cannot trust the contents of that buffer after this call. * * This code is optimized for readability - several passes might be * made over certain portions of the tree. All of those blocks will * have been brought into cache (and pinned via the journal), so the * extra overhead is not expressed in terms of disk reads. */ -static int __ocfs2_mark_extent_written(struct inode *inode, - struct buffer_head *di_bh, - handle_t *handle, - struct ocfs2_path *path, - int split_index, - struct ocfs2_extent_rec *split_rec, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret = 0; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -3980,12 +5025,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode, struct ocfs2_merge_ctxt ctxt; struct ocfs2_extent_list *rightmost_el; - if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) { - ret = -EIO; - mlog_errno(ret); - goto out; - } - if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { @@ -3994,34 +5033,27 @@ static int __ocfs2_mark_extent_written(struct inode *inode, goto out; } - ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, + ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, split_index, split_rec); /* * The core merge / split code wants to know how much room is - * left in this inodes allocation tree, so we pass the + * left in this allocation tree, so we pass the * rightmost extent list. */ if (path->p_tree_depth) { struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret) { - mlog_exit(ret); + mlog_errno(ret); goto out; } eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EROFS; - goto out; - } - rightmost_el = &eb->h_list; } else rightmost_el = path_root_el(path); @@ -4034,21 +5066,22 @@ static int __ocfs2_mark_extent_written(struct inode *inode, ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); - mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n", - split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent, - ctxt.c_split_covers_rec); + trace_ocfs2_split_extent(split_index, ctxt.c_contig_type, + ctxt.c_has_empty_extent, + ctxt.c_split_covers_rec); if (ctxt.c_contig_type == CONTIG_NONE) { if (ctxt.c_split_covers_rec) - el->l_recs[split_index] = *split_rec; + ret = ocfs2_replace_extent_rec(handle, et, path, el, + split_index, split_rec); else - ret = ocfs2_split_and_insert(inode, handle, path, di_bh, + ret = ocfs2_split_and_insert(handle, et, path, &last_eb_bh, split_index, split_rec, meta_ac); if (ret) mlog_errno(ret); } else { - ret = ocfs2_try_to_merge_extent(inode, handle, path, + ret = ocfs2_try_to_merge_extent(handle, et, path, split_index, split_rec, dealloc, &ctxt); if (ret) @@ -4061,50 +5094,40 @@ out: } /* - * Mark the already-existing extent at cpos as written for len clusters. + * Change the flags of the already-existing extent at cpos for len clusters. + * + * new_flags: the flags we want to set. + * clear_flags: the flags we want to clear. + * phys: the new physical offset we want this new extent starts from. * * If the existing extent is larger than the request, initiate a * split. An attempt will be made at merging with adjacent extents. * * The caller is responsible for passing down meta_ac if we'll need it. */ -int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, - handle_t *handle, u32 cpos, u32 len, u32 phys, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags) { int ret, index; - u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys); struct ocfs2_extent_rec split_rec; struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; - mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n", - inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno); - - if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - ret = -EROFS; - goto out; - } - - /* - * XXX: This should be fixed up so that we just re-insert the - * next extent records. - */ - ocfs2_extent_map_trunc(inode, 0); - - left_path = ocfs2_new_inode_path(di_bh); + left_path = ocfs2_new_path_from_et(et); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); goto out; } - ret = ocfs2_find_path(inode, left_path, cpos); + ret = ocfs2_find_path(et->et_ci, left_path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -4113,38 +5136,107 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, index = ocfs2_search_extent_list(el, cpos); if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " + ocfs2_error(sb, + "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; goto out; } + ret = -EIO; + rec = &el->l_recs[index]; + if (new_flags && (rec->e_flags & new_flags)) { + mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " + "extent that already had them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + new_flags); + goto out; + } + + if (clear_flags && !(rec->e_flags & clear_flags)) { + mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " + "extent that didn't have them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + clear_flags); + goto out; + } + memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); split_rec.e_cpos = cpu_to_le32(cpos); split_rec.e_leaf_clusters = cpu_to_le16(len); split_rec.e_blkno = cpu_to_le64(start_blkno); - split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; - split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; - - ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path, - index, &split_rec, meta_ac, dealloc); + split_rec.e_flags = rec->e_flags; + if (new_flags) + split_rec.e_flags |= new_flags; + if (clear_flags) + split_rec.e_flags &= ~clear_flags; + + ret = ocfs2_split_extent(handle, et, left_path, + index, &split_rec, meta_ac, + dealloc); if (ret) mlog_errno(ret); out: ocfs2_free_path(left_path); return ret; + } -static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, - handle_t *handle, struct ocfs2_path *path, +/* + * Mark the already-existing extent at cpos as written for len clusters. + * This removes the unwritten extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + trace_ocfs2_mark_extent_written( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + cpos, len, phys); + + if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " + "that are being written to, but the feature bit " + "is not set in the super block.", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EROFS; + goto out; + } + + /* + * XXX: This should be fixed up so that we just re-insert the + * next extent records. + */ + ocfs2_et_extent_map_truncate(et, 0); + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + 0, OCFS2_EXT_UNWRITTEN); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, + struct ocfs2_path *path, int index, u32 new_range, struct ocfs2_alloc_context *meta_ac) { - int ret, depth, credits = handle->h_buffer_credits; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + int ret, depth, credits; struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *rightmost_el, *el; @@ -4157,13 +5249,14 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, */ el = path_leaf_el(path); rec = &el->l_recs[index]; - ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &split_rec, new_range, rec); depth = path->p_tree_depth; if (depth > 0) { - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_extent_block(et->et_ci, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -4174,7 +5267,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, } else rightmost_el = path_leaf_el(path); - credits += path->p_tree_depth + ocfs2_extend_meta_needed(di); + credits = path->p_tree_depth + + ocfs2_extend_meta_needed(et->et_root_el); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -4183,7 +5277,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, + ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); @@ -4197,7 +5291,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, insert.ins_split = SPLIT_RIGHT; insert.ins_tree_depth = depth; - ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); if (ret) mlog_errno(ret); @@ -4206,7 +5300,8 @@ out: return ret; } -static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, +static int ocfs2_truncate_rec(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, int index, struct ocfs2_cached_dealloc_ctxt *dealloc, u32 cpos, u32 len) @@ -4214,14 +5309,14 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, int ret; u32 left_cpos, rec_range, trunc_range; int wants_rotate = 0, is_rightmost_tree_rec = 0; - struct super_block *sb = inode->i_sb; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el = path_leaf_el(path); struct ocfs2_extent_rec *rec; struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -4253,29 +5348,28 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, * by this leaf and the one to it's left. * * There are two cases we can skip: - * 1) Path is the leftmost one in our inode tree. + * 1) Path is the leftmost one in our btree. * 2) The leaf is rightmost and will be empty after * we remove the extent record - the rotate code * knows how to update the newly formed edge. */ - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, - &left_cpos); + ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); if (ret) { mlog_errno(ret); goto out; } if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); goto out; } - ret = ocfs2_find_path(inode, left_path, left_cpos); + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -4291,13 +5385,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_journal_access_path(inode, handle, path); + ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access_path(inode, handle, left_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); if (ret) { mlog_errno(ret); goto out; @@ -4320,7 +5414,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, * be deleted by the rotate code. */ rec = &el->l_recs[next_free - 1]; - ocfs2_adjust_rightmost_records(inode, handle, path, + ocfs2_adjust_rightmost_records(handle, et, path, rec); } } else if (le32_to_cpu(rec->e_cpos) == cpos) { @@ -4332,11 +5426,12 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, /* Remove rightmost portion of the record */ le16_add_cpu(&rec->e_leaf_clusters, -len); if (is_rightmost_tree_rec) - ocfs2_adjust_rightmost_records(inode, handle, path, rec); + ocfs2_adjust_rightmost_records(handle, et, path, rec); } else { /* Caller should have trapped this. */ - mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " - "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, + mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) " + "(%u, %u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), le32_to_cpu(rec->e_cpos), le16_to_cpu(rec->e_leaf_clusters), cpos, len); BUG(); @@ -4345,14 +5440,14 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, if (left_path) { int subtree_index; - subtree_index = ocfs2_find_subtree_root(inode, left_path, path); - ocfs2_complete_edge_insert(inode, handle, left_path, path, + subtree_index = ocfs2_find_subtree_root(et, left_path, path); + ocfs2_complete_edge_insert(handle, left_path, path, subtree_index); } ocfs2_journal_dirty(handle, path_leaf_bh(path)); - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -4363,8 +5458,9 @@ out: return ret; } -int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, - u32 cpos, u32 len, handle_t *handle, +int ocfs2_remove_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { @@ -4372,18 +5468,22 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, u32 rec_range, trunc_range; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; - struct ocfs2_path *path; + struct ocfs2_path *path = NULL; - ocfs2_extent_map_trunc(inode, 0); + /* + * XXX: Why are we truncating to 0 instead of wherever this + * affects us? + */ + ocfs2_et_extent_map_truncate(et, 0); - path = ocfs2_new_inode_path(di_bh); + path = ocfs2_new_path_from_et(et); if (!path) { ret = -ENOMEM; mlog_errno(ret); goto out; } - ret = ocfs2_find_path(inode, path, cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -4392,10 +5492,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -4422,20 +5523,20 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); - mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " - "(cpos %u, len %u)\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, - le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); + trace_ocfs2_remove_extent( + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, len, index, le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { - ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, + ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, cpos, len); if (ret) { mlog_errno(ret); goto out; } } else { - ret = ocfs2_split_tree(inode, di_bh, handle, path, index, + ret = ocfs2_split_tree(handle, et, path, index, trunc_range, meta_ac); if (ret) { mlog_errno(ret); @@ -4448,7 +5549,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, */ ocfs2_reinit_path(path, 1); - ret = ocfs2_find_path(inode, path, cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -4457,9 +5558,9 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu: split at cpos %u lost record.", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu: split at cpos %u lost record.", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; goto out; @@ -4473,17 +5574,17 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); if (rec_range != trunc_range) { - ocfs2_error(inode->i_sb, - "Inode %llu: error after split at cpos %u" + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu: error after split at cpos %u" "trunc len %u, existing record is (%u,%u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos, len, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; goto out; } - ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, + ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, cpos, len); if (ret) { mlog_errno(ret); @@ -4496,6 +5597,170 @@ out: return ret; } +/* + * ocfs2_reserve_blocks_for_rec_trunc() would look basically the + * same as ocfs2_lock_alloctors(), except for it accepts a blocks + * number to reserve some extra blocks, and it only handles meta + * data allocations. + * + * Currently, only ocfs2_remove_btree_range() uses it for truncating + * and punching holes. + */ +static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 extents_to_split, + struct ocfs2_alloc_context **ac, + int extra_blocks) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *ac = NULL; + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) + extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); + + if (extra_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + +out: + if (ret) { + if (*ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + } + + return ret; +} + +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, int flags, + struct ocfs2_cached_dealloc_ctxt *dealloc, + u64 refcount_loc) +{ + int ret, credits = 0, extra_blocks = 0; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_refcount_tree *ref_tree = NULL; + + if ((flags & OCFS2_EXT_REFCOUNTED) && len) { + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_prepare_refcount_change_for_del(inode, + refcount_loc, + phys_blkno, + len, + &credits, + &extra_blocks); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + } + + ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac, + extra_blocks); + if (ret) { + mlog_errno(ret); + goto bail; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, + ocfs2_remove_extent_credits(osb->sb) + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_et_root_journal_access(handle, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(inode->i_sb, len)); + + ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_et_update_clusters(et, -len); + ocfs2_update_inode_fsync_trans(handle, inode, 1); + + ocfs2_journal_dirty(handle, et->et_root_bh); + + if (phys_blkno) { + if (flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + phys_blkno), + len, meta_ac, + dealloc, 1); + else + ret = ocfs2_truncate_log_append(osb, handle, + phys_blkno, len); + if (ret) + mlog_errno(ret); + + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); +bail: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + + return ret; +} + int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) { struct buffer_head *tl_bh = osb->osb_tl_bh; @@ -4541,21 +5806,18 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - mlog_entry("start_blk = %llu, num_clusters = %u\n", - (unsigned long long)start_blk, num_clusters); - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); di = (struct ocfs2_dinode *) tl_bh->b_data; - tl = &di->id2.i_dealloc; - if (!OCFS2_IS_VALID_DINODE(di)) { - OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); - status = -EIO; - goto bail; - } + /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; tl_count = le16_to_cpu(tl->tl_count); mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || tl_count == 0, @@ -4573,17 +5835,16 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access(handle, tl_inode, tl_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } - mlog(0, "Log truncate of %u clusters starting at cluster %u to " - "%llu (index = %d)\n", num_clusters, start_cluster, - (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); - + trace_ocfs2_truncate_log_append( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index, + start_cluster, num_clusters); if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { /* * Move index back to the record we are coalescing with. @@ -4592,23 +5853,20 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, index--; num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); - mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", - index, le32_to_cpu(tl->tl_recs[index].t_start), - num_clusters); + trace_ocfs2_truncate_log_append( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + index, le32_to_cpu(tl->tl_recs[index].t_start), + num_clusters); } else { tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); tl->tl_used = cpu_to_le16(index + 1); } tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); - status = ocfs2_journal_dirty(handle, tl_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, tl_bh); + osb->truncated_clusters += num_clusters; bail: - mlog_exit(status); return status; } @@ -4627,16 +5885,14 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, struct inode *tl_inode = osb->osb_tl_inode; struct buffer_head *tl_bh = osb->osb_tl_bh; - mlog_entry_void(); - di = (struct ocfs2_dinode *) tl_bh->b_data; tl = &di->id2.i_dealloc; i = le16_to_cpu(tl->tl_used) - 1; while (i >= 0) { /* Caller has given us at least enough credits to * update the truncate log dinode */ - status = ocfs2_journal_access(handle, tl_inode, tl_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -4644,11 +5900,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, tl->tl_used = cpu_to_le16(i); - status = ocfs2_journal_dirty(handle, tl_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, tl_bh); /* TODO: Perhaps we can calculate the bulk of the * credits up front rather than extending like @@ -4668,8 +5920,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, /* if start_blk is not set, we ignore the record as * invalid. */ if (start_blk) { - mlog(0, "free record %d, start = %u, clusters = %u\n", - i, le32_to_cpu(rec.t_start), num_clusters); + trace_ocfs2_replay_truncate_records( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + i, le32_to_cpu(rec.t_start), num_clusters); status = ocfs2_free_clusters(handle, data_alloc_inode, data_alloc_bh, start_blk, @@ -4682,8 +5935,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, i--; } + osb->truncated_clusters = 0; + bail: - mlog_exit(status); return status; } @@ -4700,21 +5954,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - mlog_entry_void(); - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); di = (struct ocfs2_dinode *) tl_bh->b_data; - tl = &di->id2.i_dealloc; - if (!OCFS2_IS_VALID_DINODE(di)) { - OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); - status = -EIO; - goto out; - } + /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; num_to_flush = le16_to_cpu(tl->tl_used); - mlog(0, "Flush %u records from truncate log #%llu\n", - num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); + trace_ocfs2_flush_truncate_log( + (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, + num_to_flush); if (!num_to_flush) { status = 0; goto out; @@ -4760,7 +6013,6 @@ out_mutex: iput(data_alloc_inode); out: - mlog_exit(status); return status; } @@ -4783,20 +6035,19 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) container_of(work, struct ocfs2_super, osb_truncate_log_wq.work); - mlog_entry_void(); - status = ocfs2_flush_truncate_log(osb); if (status < 0) mlog_errno(status); - - mlog_exit(status); + else + ocfs2_init_steal_slots(osb); } #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, int cancel) { - if (osb->osb_tl_inode) { + if (osb->osb_tl_inode && + atomic_read(&osb->osb_tl_disable) == 0) { /* We want to push off log flushes while truncates are * still running. */ if (cancel) @@ -4825,8 +6076,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { iput(inode); mlog_errno(status); @@ -4836,7 +6086,6 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, *tl_inode = inode; *tl_bh = bh; bail: - mlog_exit(status); return status; } @@ -4856,7 +6105,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, *tl_copy = NULL; - mlog(0, "recover truncate log from slot %d\n", slot_num); + trace_ocfs2_begin_truncate_log_recovery(slot_num); status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); if (status < 0) { @@ -4865,16 +6114,15 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, } di = (struct ocfs2_dinode *) tl_bh->b_data; - tl = &di->id2.i_dealloc; - if (!OCFS2_IS_VALID_DINODE(di)) { - OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); - status = -EIO; - goto bail; - } + /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's + * validated by the underlying call to ocfs2_read_inode_block(), + * so any corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; if (le16_to_cpu(tl->tl_used)) { - mlog(0, "We'll have %u logs to recover\n", - le16_to_cpu(tl->tl_used)); + trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used)); *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); if (!(*tl_copy)) { @@ -4891,7 +6139,8 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, * tl_used. */ tl->tl_used = 0; - status = ocfs2_write_block(osb, tl_bh, tl_inode); + ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); + status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode)); if (status < 0) { mlog_errno(status); goto bail; @@ -4901,15 +6150,14 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, bail: if (tl_inode) iput(tl_inode); - if (tl_bh) - brelse(tl_bh); + brelse(tl_bh); if (status < 0 && (*tl_copy)) { kfree(*tl_copy); *tl_copy = NULL; + mlog_errno(status); } - mlog_exit(status); return status; } @@ -4924,8 +6172,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_truncate_log *tl; - mlog_entry_void(); - if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); return -EINVAL; @@ -4933,8 +6179,9 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, tl = &tl_copy->id2.i_dealloc; num_recs = le16_to_cpu(tl->tl_used); - mlog(0, "cleanup %u records from %llu\n", num_recs, - (unsigned long long)le64_to_cpu(tl_copy->i_blkno)); + trace_ocfs2_complete_truncate_log_recovery( + (unsigned long long)le64_to_cpu(tl_copy->i_blkno), + num_recs); mutex_lock(&tl_inode->i_mutex); for(i = 0; i < num_recs; i++) { @@ -4969,7 +6216,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, bail_up: mutex_unlock(&tl_inode->i_mutex); - mlog_exit(status); return status; } @@ -4978,7 +6224,7 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - mlog_entry_void(); + atomic_set(&osb->osb_tl_disable, 1); if (tl_inode) { cancel_delayed_work(&osb->osb_truncate_log_wq); @@ -4991,8 +6237,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) brelse(osb->osb_tl_bh); iput(osb->osb_tl_inode); } - - mlog_exit_void(); } int ocfs2_truncate_log_init(struct ocfs2_super *osb) @@ -5001,8 +6245,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) struct inode *tl_inode = NULL; struct buffer_head *tl_bh = NULL; - mlog_entry_void(); - status = ocfs2_get_truncate_log_info(osb, osb->slot_num, &tl_inode, @@ -5015,10 +6257,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) * until we're sure all is well. */ INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker); + atomic_set(&osb->osb_tl_disable, 0); osb->osb_tl_bh = tl_bh; osb->osb_tl_inode = tl_inode; - mlog_exit(status); return status; } @@ -5041,10 +6283,14 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) */ /* - * Describes a single block free from a suballocator + * Describe a single bit freed from a suballocator. For the block + * suballocators, it represents one block. For the global cluster + * allocator, it represents some clusters and free_bit indicates + * clusters number. */ struct ocfs2_cached_block_free { struct ocfs2_cached_block_free *free_next; + u64 free_bg; u64 free_blk; unsigned int free_bit; }; @@ -5056,10 +6302,10 @@ struct ocfs2_per_slot_free_list { struct ocfs2_cached_block_free *f_first; }; -static int ocfs2_free_cached_items(struct ocfs2_super *osb, - int sysfile_type, - int slot, - struct ocfs2_cached_block_free *head) +static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, + int sysfile_type, + int slot, + struct ocfs2_cached_block_free *head) { int ret; u64 bg_blkno; @@ -5091,10 +6337,13 @@ static int ocfs2_free_cached_items(struct ocfs2_super *osb, } while (head) { - bg_blkno = ocfs2_which_suballoc_group(head->free_blk, - head->free_bit); - mlog(0, "Free bit: (bit %u, blkno %llu)\n", - head->free_bit, (unsigned long long)head->free_blk); + if (head->free_bg) + bg_blkno = head->free_bg; + else + bg_blkno = ocfs2_which_suballoc_group(head->free_blk, + head->free_bit); + trace_ocfs2_free_cached_blocks( + (unsigned long long)head->free_blk, head->free_bit); ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, head->free_bit, bg_blkno, 1); @@ -5134,6 +6383,81 @@ out: return ret; } +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit) +{ + int ret = 0; + struct ocfs2_cached_block_free *item; + + item = kzalloc(sizeof(*item), GFP_NOFS); + if (item == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit); + + item->free_blk = blkno; + item->free_bit = bit; + item->free_next = ctxt->c_global_allocator; + + ctxt->c_global_allocator = item; + return ret; +} + +static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, + struct ocfs2_cached_block_free *head) +{ + struct ocfs2_cached_block_free *tmp; + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + int ret = 0; + + mutex_lock(&tl_inode->i_mutex); + + while (head) { + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_truncate_log_append(osb, handle, head->free_blk, + head->free_bit); + + ocfs2_commit_trans(osb, handle); + tmp = head; + head = head->free_next; + kfree(tmp); + + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + mutex_unlock(&tl_inode->i_mutex); + + while (head) { + /* Premature exit may have left some dangling items. */ + tmp = head; + head = head->free_next; + kfree(tmp); + } + + return ret; +} + int ocfs2_run_deallocs(struct ocfs2_super *osb, struct ocfs2_cached_dealloc_ctxt *ctxt) { @@ -5147,10 +6471,12 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb, fl = ctxt->c_first_suballocator; if (fl->f_first) { - mlog(0, "Free items: (type %u, slot %d)\n", - fl->f_inode_type, fl->f_slot); - ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, - fl->f_slot, fl->f_first); + trace_ocfs2_run_deallocs(fl->f_inode_type, + fl->f_slot); + ret2 = ocfs2_free_cached_blocks(osb, + fl->f_inode_type, + fl->f_slot, + fl->f_first); if (ret2) mlog_errno(ret2); if (!ret) @@ -5161,6 +6487,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb, kfree(fl); } + if (ctxt->c_global_allocator) { + ret2 = ocfs2_free_cached_clusters(osb, + ctxt->c_global_allocator); + if (ret2) + mlog_errno(ret2); + if (!ret) + ret = ret2; + + ctxt->c_global_allocator = NULL; + } + return ret; } @@ -5190,9 +6527,9 @@ ocfs2_find_per_slot_free_list(int type, return fl; } -static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, - int type, int slot, u64 blkno, - unsigned int bit) +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 suballoc, + u64 blkno, unsigned int bit) { int ret; struct ocfs2_per_slot_free_list *fl; @@ -5205,16 +6542,18 @@ static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, goto out; } - item = kmalloc(sizeof(*item), GFP_NOFS); + item = kzalloc(sizeof(*item), GFP_NOFS); if (item == NULL) { ret = -ENOMEM; mlog_errno(ret); goto out; } - mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", - type, slot, bit, (unsigned long long)blkno); + trace_ocfs2_cache_block_dealloc(type, slot, + (unsigned long long)suballoc, + (unsigned long long)blkno, bit); + item->free_bg = suballoc; item->free_blk = blkno; item->free_bit = bit; item->free_next = fl->f_first; @@ -5231,437 +6570,21 @@ static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, { return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, le16_to_cpu(eb->h_suballoc_slot), + le64_to_cpu(eb->h_suballoc_loc), le64_to_cpu(eb->h_blkno), le16_to_cpu(eb->h_suballoc_bit)); } -/* This function will figure out whether the currently last extent - * block will be deleted, and if it will, what the new last extent - * block will be so we can update his h_next_leaf_blk field, as well - * as the dinodes i_last_eb_blk */ -static int ocfs2_find_new_last_ext_blk(struct inode *inode, - unsigned int clusters_to_del, - struct ocfs2_path *path, - struct buffer_head **new_last_eb) -{ - int next_free, ret = 0; - u32 cpos; - struct ocfs2_extent_rec *rec; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; - struct buffer_head *bh = NULL; - - *new_last_eb = NULL; - - /* we have no tree, so of course, no last_eb. */ - if (!path->p_tree_depth) - goto out; - - /* trunc to zero special case - this makes tree_depth = 0 - * regardless of what it is. */ - if (OCFS2_I(inode)->ip_clusters == clusters_to_del) - goto out; - - el = path_leaf_el(path); - BUG_ON(!el->l_next_free_rec); - - /* - * Make sure that this extent list will actually be empty - * after we clear away the data. We can shortcut out if - * there's more than one non-empty extent in the - * list. Otherwise, a check of the remaining extent is - * necessary. - */ - next_free = le16_to_cpu(el->l_next_free_rec); - rec = NULL; - if (ocfs2_is_empty_extent(&el->l_recs[0])) { - if (next_free > 2) - goto out; - - /* We may have a valid extent in index 1, check it. */ - if (next_free == 2) - rec = &el->l_recs[1]; - - /* - * Fall through - no more nonempty extents, so we want - * to delete this leaf. - */ - } else { - if (next_free > 1) - goto out; - - rec = &el->l_recs[0]; - } - - if (rec) { - /* - * Check it we'll only be trimming off the end of this - * cluster. - */ - if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) - goto out; - } - - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); - if (ret) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); - if (ret) { - mlog_errno(ret); - goto out; - } - - eb = (struct ocfs2_extent_block *) bh->b_data; - el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EROFS; - goto out; - } - - *new_last_eb = bh; - get_bh(*new_last_eb); - mlog(0, "returning block %llu, (cpos: %u)\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); -out: - brelse(bh); - - return ret; -} - -/* - * Trim some clusters off the rightmost edge of a tree. Only called - * during truncate. - * - * The caller needs to: - * - start journaling of each path component. - * - compute and fully set up any new last ext block - */ -static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, - handle_t *handle, struct ocfs2_truncate_context *tc, - u32 clusters_to_del, u64 *delete_start) -{ - int ret, i, index = path->p_tree_depth; - u32 new_edge = 0; - u64 deleted_eb = 0; - struct buffer_head *bh; - struct ocfs2_extent_list *el; - struct ocfs2_extent_rec *rec; - - *delete_start = 0; - - while (index >= 0) { - bh = path->p_node[index].bh; - el = path->p_node[index].el; - - mlog(0, "traveling tree (index = %d, block = %llu)\n", - index, (unsigned long long)bh->b_blocknr); - - BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); - - if (index != - (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { - ocfs2_error(inode->i_sb, - "Inode %lu has invalid ext. block %llu", - inode->i_ino, - (unsigned long long)bh->b_blocknr); - ret = -EROFS; - goto out; - } - -find_tail_record: - i = le16_to_cpu(el->l_next_free_rec) - 1; - rec = &el->l_recs[i]; - - mlog(0, "Extent list before: record %d: (%u, %u, %llu), " - "next = %u\n", i, le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec), - (unsigned long long)le64_to_cpu(rec->e_blkno), - le16_to_cpu(el->l_next_free_rec)); - - BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); - - if (le16_to_cpu(el->l_tree_depth) == 0) { - /* - * If the leaf block contains a single empty - * extent and no records, we can just remove - * the block. - */ - if (i == 0 && ocfs2_is_empty_extent(rec)) { - memset(rec, 0, - sizeof(struct ocfs2_extent_rec)); - el->l_next_free_rec = cpu_to_le16(0); - - goto delete; - } - - /* - * Remove any empty extents by shifting things - * left. That should make life much easier on - * the code below. This condition is rare - * enough that we shouldn't see a performance - * hit. - */ - if (ocfs2_is_empty_extent(&el->l_recs[0])) { - le16_add_cpu(&el->l_next_free_rec, -1); - - for(i = 0; - i < le16_to_cpu(el->l_next_free_rec); i++) - el->l_recs[i] = el->l_recs[i + 1]; - - memset(&el->l_recs[i], 0, - sizeof(struct ocfs2_extent_rec)); - - /* - * We've modified our extent list. The - * simplest way to handle this change - * is to being the search from the - * start again. - */ - goto find_tail_record; - } - - le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); - - /* - * We'll use "new_edge" on our way back up the - * tree to know what our rightmost cpos is. - */ - new_edge = le16_to_cpu(rec->e_leaf_clusters); - new_edge += le32_to_cpu(rec->e_cpos); - - /* - * The caller will use this to delete data blocks. - */ - *delete_start = le64_to_cpu(rec->e_blkno) - + ocfs2_clusters_to_blocks(inode->i_sb, - le16_to_cpu(rec->e_leaf_clusters)); - - /* - * If it's now empty, remove this record. - */ - if (le16_to_cpu(rec->e_leaf_clusters) == 0) { - memset(rec, 0, - sizeof(struct ocfs2_extent_rec)); - le16_add_cpu(&el->l_next_free_rec, -1); - } - } else { - if (le64_to_cpu(rec->e_blkno) == deleted_eb) { - memset(rec, 0, - sizeof(struct ocfs2_extent_rec)); - le16_add_cpu(&el->l_next_free_rec, -1); - - goto delete; - } - - /* Can this actually happen? */ - if (le16_to_cpu(el->l_next_free_rec) == 0) - goto delete; - - /* - * We never actually deleted any clusters - * because our leaf was empty. There's no - * reason to adjust the rightmost edge then. - */ - if (new_edge == 0) - goto delete; - - rec->e_int_clusters = cpu_to_le32(new_edge); - le32_add_cpu(&rec->e_int_clusters, - -le32_to_cpu(rec->e_cpos)); - - /* - * A deleted child record should have been - * caught above. - */ - BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); - } - -delete: - ret = ocfs2_journal_dirty(handle, bh); - if (ret) { - mlog_errno(ret); - goto out; - } - - mlog(0, "extent list container %llu, after: record %d: " - "(%u, %u, %llu), next = %u.\n", - (unsigned long long)bh->b_blocknr, i, - le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), - (unsigned long long)le64_to_cpu(rec->e_blkno), - le16_to_cpu(el->l_next_free_rec)); - - /* - * We must be careful to only attempt delete of an - * extent block (and not the root inode block). - */ - if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { - struct ocfs2_extent_block *eb = - (struct ocfs2_extent_block *)bh->b_data; - - /* - * Save this for use when processing the - * parent block. - */ - deleted_eb = le64_to_cpu(eb->h_blkno); - - mlog(0, "deleting this extent block.\n"); - - ocfs2_remove_from_cache(inode, bh); - - BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); - BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); - BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); - - ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb); - /* An error here is not fatal. */ - if (ret < 0) - mlog_errno(ret); - } else { - deleted_eb = 0; - } - - index--; - } - - ret = 0; -out: - return ret; -} - -static int ocfs2_do_truncate(struct ocfs2_super *osb, - unsigned int clusters_to_del, - struct inode *inode, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_truncate_context *tc, - struct ocfs2_path *path) -{ - int status; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *last_eb = NULL; - struct ocfs2_extent_list *el; - struct buffer_head *last_eb_bh = NULL; - u64 delete_blk = 0; - - fe = (struct ocfs2_dinode *) fe_bh->b_data; - - status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, - path, &last_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - /* - * Each component will be touched, so we might as well journal - * here to avoid having to handle errors later. - */ - status = ocfs2_journal_access_path(inode, handle, path); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - if (last_eb_bh) { - status = ocfs2_journal_access(handle, inode, last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - } - - el = &(fe->id2.i_list); - - /* - * Lower levels depend on this never happening, but it's best - * to check it up here before changing the tree. - */ - if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { - ocfs2_error(inode->i_sb, - "Inode %lu has an empty extent record, depth %u\n", - inode->i_ino, le16_to_cpu(el->l_tree_depth)); - status = -EROFS; - goto bail; - } - - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - - clusters_to_del; - spin_unlock(&OCFS2_I(inode)->ip_lock); - le32_add_cpu(&fe->i_clusters, -clusters_to_del); - inode->i_blocks = ocfs2_inode_sector_count(inode); - - status = ocfs2_trim_tree(inode, path, handle, tc, - clusters_to_del, &delete_blk); - if (status) { - mlog_errno(status); - goto bail; - } - - if (le32_to_cpu(fe->i_clusters) == 0) { - /* trunc to zero is a special case. */ - el->l_tree_depth = 0; - fe->i_last_eb_blk = 0; - } else if (last_eb) - fe->i_last_eb_blk = last_eb->h_blkno; - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - if (last_eb) { - /* If there will be a new last extent block, then by - * definition, there cannot be any leaves to the right of - * him. */ - last_eb->h_next_leaf_blk = 0; - status = ocfs2_journal_dirty(handle, last_eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - - if (delete_blk) { - status = ocfs2_truncate_log_append(osb, handle, delete_blk, - clusters_to_del); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - status = 0; -bail: - - mlog_exit(status); - return status; -} - -static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) +static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) { set_buffer_uptodate(bh); mark_buffer_dirty(bh); return 0; } -static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) -{ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - return ocfs2_journal_dirty_data(handle, bh); -} - -static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys) +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys) { int ret, partial = 0; @@ -5677,17 +6600,13 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, * here if they aren't - ocfs2_map_page_blocks() * might've skipped some */ - if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, &partial, - ocfs2_ordered_zero_func); - if (ret < 0) - mlog_errno(ret); - } else { - ret = walk_page_buffers(handle, page_buffers(page), - from, to, &partial, - ocfs2_writeback_zero_func); + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_zero_func); + if (ret < 0) + mlog_errno(ret); + else if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); if (ret < 0) mlog_errno(ret); } @@ -5733,25 +6652,21 @@ out: ocfs2_unlock_and_free_pages(pages, numpages); } -static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) { int numpages, ret = 0; - struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned long index; loff_t last_page_bytes; BUG_ON(start > end); - BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != - (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); - numpages = 0; last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_CACHE_SHIFT; do { - pages[numpages] = grab_cache_page(mapping, index); + pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); if (!pages[numpages]) { ret = -ENOMEM; mlog_errno(ret); @@ -5774,6 +6689,17 @@ out: return ret; } +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) +{ + struct super_block *sb = inode->i_sb; + + BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != + (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); + + return ocfs2_grab_pages(inode, start, end, pages, num); +} + /* * Zero the area past i_size but still within an allocated * cluster. This avoids exposing nonzero data on subsequent file @@ -5840,32 +6766,40 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, * wait on them - the truncate_inode_pages() call later will * do that for us. */ - ret = do_sync_mapping_range(inode->i_mapping, range_start, - range_end - 1, SYNC_FILE_RANGE_WRITE); + ret = filemap_fdatawrite_range(inode->i_mapping, range_start, + range_end - 1); if (ret) mlog_errno(ret); out: - if (pages) - kfree(pages); + kfree(pages); return ret; } -static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di) +static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode, + struct ocfs2_dinode *di) { unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); - memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2)); + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2) - + xattrsize); + else + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2)); } void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di) { - ocfs2_zero_dinode_id2(inode, di); + ocfs2_zero_dinode_id2_with_xattr(inode, di); di->id2.i_list.l_tree_depth = 0; di->id2.i_list.l_next_free_rec = 0; - di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb)); + di->id2.i_list.l_count = cpu_to_le16( + ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di)); } void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) @@ -5882,15 +6816,18 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) * We clear the entire i_data structure here so that all * fields can be properly initialized. */ - ocfs2_zero_dinode_id2(inode, di); + ocfs2_zero_dinode_id2_with_xattr(inode, di); - idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb)); + idata->id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(inode->i_sb, di)); } int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { int ret, i, has_data, num_pages = 0; + int need_free = 0; + u32 bit_off, num; handle_t *handle; u64 uninitialized_var(block); struct ocfs2_inode_info *oi = OCFS2_I(inode); @@ -5899,6 +6836,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_alloc_context *data_ac = NULL; struct page **pages = NULL; loff_t end = osb->s_clustersize; + struct ocfs2_extent_tree et; + int did_quota = 0; has_data = i_size_read(inode) ? 1 : 0; @@ -5918,26 +6857,34 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } } - handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS); + handle = ocfs2_start_trans(osb, + ocfs2_inline_to_extents_credits(osb->sb)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out_unlock; } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } if (has_data) { - u32 bit_off, num; unsigned int page_end; u64 phys; - ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (ret) + goto out_commit; + did_quota = 1; + + data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; + + ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &num); if (ret) { mlog_errno(ret); @@ -5961,6 +6908,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -5971,6 +6919,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, ret = ocfs2_read_inline_data(inode, pages[0], di_bh); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -5988,6 +6937,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_dinode_new_extent_list(inode, di); ocfs2_journal_dirty(handle, di_bh); @@ -5998,10 +6948,11 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * this proves to be false, we could always re-build * the in-inode data from our pages. */ - ret = ocfs2_insert_extent(osb, handle, inode, di_bh, - 0, block, 1, 0, NULL); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); + need_free = 1; goto out_commit; } @@ -6009,6 +6960,22 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } out_commit: + if (ret < 0 && did_quota) + dquot_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + + if (need_free) { + if (data_ac->ac_which == OCFS2_AC_USE_LOCAL) + ocfs2_free_local_alloc_bits(osb, handle, data_ac, + bit_off, num); + else + ocfs2_free_clusters(handle, + data_ac->ac_inode, + data_ac->ac_bh, + ocfs2_clusters_to_blocks(osb->sb, bit_off), + num); + } + ocfs2_commit_trans(osb, handle); out_unlock: @@ -6032,22 +6999,28 @@ out: */ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_truncate_context *tc) + struct buffer_head *di_bh) { - int status, i, credits, tl_sem = 0; - u32 clusters_to_del, new_highest_cpos, range; + int status = 0, i, flags = 0; + u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff; + u64 blkno = 0; struct ocfs2_extent_list *el; - handle_t *handle = NULL; - struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_extent_rec *rec; struct ocfs2_path *path = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *root_el = &(di->id2.i_list); + u64 refcount_loc = le64_to_cpu(di->i_refcount_loc); + struct ocfs2_extent_tree et; + struct ocfs2_cached_dealloc_ctxt dealloc; - mlog_entry_void(); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ocfs2_init_dealloc_ctxt(&dealloc); new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - path = ocfs2_new_inode_path(fe_bh); + path = ocfs2_new_path(di_bh, &di->id2.i_list, + ocfs2_journal_access_di); if (!path) { status = -ENOMEM; mlog_errno(status); @@ -6068,14 +7041,17 @@ start: /* * Truncate always works against the rightmost tree branch. */ - status = ocfs2_find_path(inode, path, UINT_MAX); + status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX); if (status) { mlog_errno(status); goto bail; } - mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", - OCFS2_I(inode)->ip_clusters, path->p_tree_depth); + trace_ocfs2_commit_truncate( + (unsigned long long)OCFS2_I(inode)->ip_blkno, + new_highest_cpos, + OCFS2_I(inode)->ip_clusters, + path->p_tree_depth); /* * By now, el will point to the extent list on the bottom most @@ -6099,61 +7075,60 @@ start: } i = le16_to_cpu(el->l_next_free_rec) - 1; - range = le32_to_cpu(el->l_recs[i].e_cpos) + - ocfs2_rec_clusters(el, &el->l_recs[i]); - if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { - clusters_to_del = 0; - } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { - clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); + rec = &el->l_recs[i]; + flags = rec->e_flags; + range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); + + if (i == 0 && ocfs2_is_empty_extent(rec)) { + /* + * Lower levels depend on this never happening, but it's best + * to check it up here before changing the tree. + */ + if (root_el->l_tree_depth && rec->e_int_clusters == 0) { + ocfs2_error(inode->i_sb, "Inode %lu has an empty " + "extent record, depth %u\n", inode->i_ino, + le16_to_cpu(root_el->l_tree_depth)); + status = -EROFS; + goto bail; + } + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; + } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { + /* + * Truncate entire record. + */ + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = ocfs2_rec_clusters(el, rec); + blkno = le64_to_cpu(rec->e_blkno); } else if (range > new_highest_cpos) { - clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + - le32_to_cpu(el->l_recs[i].e_cpos)) - - new_highest_cpos; + /* + * Partial truncate. it also should be + * the last truncate we're doing. + */ + trunc_cpos = new_highest_cpos; + trunc_len = range - new_highest_cpos; + coff = new_highest_cpos - le32_to_cpu(rec->e_cpos); + blkno = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, coff); } else { + /* + * Truncate completed, leave happily. + */ status = 0; goto bail; } - mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", - clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); + phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); - mutex_lock(&tl_inode->i_mutex); - tl_sem = 1; - /* ocfs2_truncate_log_needs_flush guarantees us at least one - * record is free for use. If there isn't any, we flush to get - * an empty truncate log. */ - if (ocfs2_truncate_log_needs_flush(osb)) { - status = __ocfs2_flush_truncate_log(osb); - if (status < 0) { - mlog_errno(status); - goto bail; - } - } - - credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, - (struct ocfs2_dinode *)fe_bh->b_data, - el); - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - handle = NULL; - mlog_errno(status); - goto bail; - } - - status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, - tc, path); + status = ocfs2_remove_btree_range(inode, &et, trunc_cpos, + phys_cpos, trunc_len, flags, &dealloc, + refcount_loc); if (status < 0) { mlog_errno(status); goto bail; } - mutex_unlock(&tl_inode->i_mutex); - tl_sem = 0; - - ocfs2_commit_trans(osb, handle); - handle = NULL; - ocfs2_reinit_path(path, 1); /* @@ -6166,84 +7141,10 @@ bail: ocfs2_schedule_truncate_log_flush(osb, 1); - if (tl_sem) - mutex_unlock(&tl_inode->i_mutex); - - if (handle) - ocfs2_commit_trans(osb, handle); - - ocfs2_run_deallocs(osb, &tc->tc_dealloc); + ocfs2_run_deallocs(osb, &dealloc); ocfs2_free_path(path); - /* This will drop the ext_alloc cluster lock for us */ - ocfs2_free_truncate_context(tc); - - mlog_exit(status); - return status; -} - -/* - * Expects the inode to already be locked. - */ -int ocfs2_prepare_truncate(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_truncate_context **tc) -{ - int status; - unsigned int new_i_clusters; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; - struct buffer_head *last_eb_bh = NULL; - - mlog_entry_void(); - - *tc = NULL; - - new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, - i_size_read(inode)); - fe = (struct ocfs2_dinode *) fe_bh->b_data; - - mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" - "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters, - (unsigned long long)le64_to_cpu(fe->i_size)); - - *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); - if (!(*tc)) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); - - if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - - brelse(last_eb_bh); - status = -EIO; - goto bail; - } - } - - (*tc)->tc_last_eb_bh = last_eb_bh; - - status = 0; -bail: - if (status < 0) { - if (*tc) - ocfs2_free_truncate_context(*tc); - *tc = NULL; - } - mlog_exit_void(); return status; } @@ -6263,7 +7164,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, if (end > i_size_read(inode)) end = i_size_read(inode); - BUG_ON(start >= end); + BUG_ON(start > end); if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || @@ -6286,8 +7187,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, goto out; } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; @@ -6313,6 +7214,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); out_commit: @@ -6322,18 +7224,163 @@ out: return ret; } -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) +static int ocfs2_trim_extent(struct super_block *sb, + struct ocfs2_group_desc *gd, + u32 start, u32 count) { - /* - * The caller is responsible for completing deallocation - * before freeing the context. - */ - if (tc->tc_dealloc.c_first_suballocator != NULL) - mlog(ML_NOTICE, - "Truncate completion has non-empty dealloc context\n"); + u64 discard, bcount; + + bcount = ocfs2_clusters_to_blocks(sb, count); + discard = le64_to_cpu(gd->bg_blkno) + + ocfs2_clusters_to_blocks(sb, start); + + trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount); + + return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0); +} + +static int ocfs2_trim_group(struct super_block *sb, + struct ocfs2_group_desc *gd, + u32 start, u32 max, u32 minbits) +{ + int ret = 0, count = 0, next; + void *bitmap = gd->bg_bitmap; + + if (le16_to_cpu(gd->bg_free_bits_count) < minbits) + return 0; + + trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno), + start, max, minbits); + + while (start < max) { + start = ocfs2_find_next_zero_bit(bitmap, max, start); + if (start >= max) + break; + next = ocfs2_find_next_bit(bitmap, max, start); + + if ((next - start) >= minbits) { + ret = ocfs2_trim_extent(sb, gd, + start, next - start); + if (ret < 0) { + mlog_errno(ret); + break; + } + count += next - start; + } + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits) + break; + } + + if (ret < 0) + count = ret; - if (tc->tc_last_eb_bh) - brelse(tc->tc_last_eb_bh); + return count; +} + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + u64 start, len, trimmed, first_group, last_group, group; + int ret, cnt; + u32 first_bit, last_bit, minlen; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct buffer_head *gd_bh = NULL; + struct ocfs2_dinode *main_bm; + struct ocfs2_group_desc *gd = NULL; + + start = range->start >> osb->s_clustersize_bits; + len = range->len >> osb->s_clustersize_bits; + minlen = range->minlen >> osb->s_clustersize_bits; + + if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) + return -EINVAL; - kfree(tc); + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + mutex_lock(&main_bm_inode->i_mutex); + + ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; + + if (start >= le32_to_cpu(main_bm->i_clusters)) { + ret = -EINVAL; + goto out_unlock; + } + + len = range->len >> osb->s_clustersize_bits; + if (start + len > le32_to_cpu(main_bm->i_clusters)) + len = le32_to_cpu(main_bm->i_clusters) - start; + + trace_ocfs2_trim_fs(start, len, minlen); + + /* Determine first and last group to examine based on start and len */ + first_group = ocfs2_which_cluster_group(main_bm_inode, start); + if (first_group == osb->first_cluster_group_blkno) + first_bit = start; + else + first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); + last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); + last_bit = osb->bitmap_cpg; + + trimmed = 0; + for (group = first_group; group <= last_group;) { + if (first_bit + len >= osb->bitmap_cpg) + last_bit = osb->bitmap_cpg; + else + last_bit = first_bit + len; + + ret = ocfs2_read_group_descriptor(main_bm_inode, + main_bm, group, + &gd_bh); + if (ret < 0) { + mlog_errno(ret); + break; + } + + gd = (struct ocfs2_group_desc *)gd_bh->b_data; + cnt = ocfs2_trim_group(sb, gd, first_bit, last_bit, minlen); + brelse(gd_bh); + gd_bh = NULL; + if (cnt < 0) { + ret = cnt; + mlog_errno(ret); + break; + } + + trimmed += cnt; + len -= osb->bitmap_cpg - first_bit; + first_bit = 0; + if (group == osb->first_cluster_group_blkno) + group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); + else + group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); + } + range->len = trimmed * sb->s_blocksize; +out_unlock: + ocfs2_inode_unlock(main_bm_inode, 0); + brelse(main_bm_bh); +out_mutex: + mutex_unlock(&main_bm_inode->i_mutex); + iput(main_bm_inode); +out: + return ret; } |
