diff options
Diffstat (limited to 'fs/ocfs2')
53 files changed, 8529 insertions, 1203 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 01596079dd6..31f25ce32c9 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -28,6 +28,7 @@ ocfs2-objs := \ locks.o \ mmap.o \ namei.o \ + refcounttree.o \ resize.o \ slot_map.o \ suballoc.o \ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ab513ddaeff..38a42f5d59f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -49,10 +49,21 @@ #include "super.h" #include "uptodate.h" #include "xattr.h" +#include "refcounttree.h" #include "buffer_head_io.h" +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT, + CONTIG_LEFTRIGHT, +}; +static enum ocfs2_contig_type + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); /* * Operations for a specific extent tree type. * @@ -79,18 +90,30 @@ struct ocfs2_extent_tree_operations { * that value. new_clusters is the delta, and must be * added to the total. Required. */ - void (*eo_update_clusters)(struct inode *inode, - struct ocfs2_extent_tree *et, + void (*eo_update_clusters)(struct ocfs2_extent_tree *et, u32 new_clusters); /* + * If this extent tree is supported by an extent map, insert + * a record into the map. + */ + void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + + /* + * If this extent tree is supported by an extent map, truncate the + * map to clusters, + */ + void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et, + u32 clusters); + + /* * If ->eo_insert_check() exists, it is called before rec is * inserted into the extent tree. It is optional. 
*/ - int (*eo_insert_check)(struct inode *inode, - struct ocfs2_extent_tree *et, + int (*eo_insert_check)(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); - int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); + int (*eo_sanity_check)(struct ocfs2_extent_tree *et); /* * -------------------------------------------------------------- @@ -109,8 +132,17 @@ struct ocfs2_extent_tree_operations { * it exists. If it does not, et->et_max_leaf_clusters is set * to 0 (unlimited). Optional. */ - void (*eo_fill_max_leaf_clusters)(struct inode *inode, - struct ocfs2_extent_tree *et); + void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et); + + /* + * ->eo_extent_contig test whether the 2 ocfs2_extent_rec + * are contiguous or not. Optional. Don't need to set it if use + * ocfs2_extent_rec as the tree leaf. + */ + enum ocfs2_contig_type + (*eo_extent_contig)(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec); }; @@ -121,19 +153,22 @@ struct ocfs2_extent_tree_operations { static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, u64 blkno); -static void ocfs2_dinode_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, u32 clusters); -static int ocfs2_dinode_insert_check(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters); +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); -static int ocfs2_dinode_sanity_check(struct inode *inode, - struct ocfs2_extent_tree *et); +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct 
ocfs2_extent_tree *et); static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, .eo_update_clusters = ocfs2_dinode_update_clusters, + .eo_extent_map_insert = ocfs2_dinode_extent_map_insert, + .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate, .eo_insert_check = ocfs2_dinode_insert_check, .eo_sanity_check = ocfs2_dinode_sanity_check, .eo_fill_root_el = ocfs2_dinode_fill_root_el, @@ -156,40 +191,53 @@ static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) return le64_to_cpu(di->i_last_eb_blk); } -static void ocfs2_dinode_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et, u32 clusters) { + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); struct ocfs2_dinode *di = et->et_object; le32_add_cpu(&di->i_clusters, clusters); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); + spin_lock(&oi->ip_lock); + oi->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&oi->ip_lock); } -static int ocfs2_dinode_insert_check(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_insert_rec(inode, rec); +} + +static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode; + + ocfs2_extent_map_trunc(inode, clusters); +} + +static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci); + struct ocfs2_super *osb = 
OCFS2_SB(oi->vfs_inode.i_sb); - BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); + BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL); mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != - le32_to_cpu(rec->e_cpos)), + (oi->ip_clusters != le32_to_cpu(rec->e_cpos)), "Device %s, asking for sparse allocation: inode %llu, " "cpos %u, clusters %u\n", osb->dev_str, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - rec->e_cpos, - OCFS2_I(inode)->ip_clusters); + (unsigned long long)oi->ip_blkno, + rec->e_cpos, oi->ip_clusters); return 0; } -static int ocfs2_dinode_sanity_check(struct inode *inode, - struct ocfs2_extent_tree *et) +static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et) { struct ocfs2_dinode *di = et->et_object; @@ -229,8 +277,7 @@ static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); } -static void ocfs2_xattr_value_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, u32 clusters) { struct ocfs2_xattr_value_buf *vb = et->et_object; @@ -252,12 +299,11 @@ static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) et->et_root_el = &xb->xb_attrs.xb_root.xt_list; } -static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, - struct ocfs2_extent_tree *et) +static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et) { + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); et->et_max_leaf_clusters = - ocfs2_clusters_for_bytes(inode->i_sb, - OCFS2_MAX_XATTR_TREE_LEAF_SIZE); + ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE); } static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, @@ -277,8 +323,7 @@ static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) return le64_to_cpu(xt->xt_last_eb_blk); } -static void 
ocfs2_xattr_tree_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, u32 clusters) { struct ocfs2_xattr_block *xb = et->et_object; @@ -309,8 +354,7 @@ static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et) return le64_to_cpu(dx_root->dr_last_eb_blk); } -static void ocfs2_dx_root_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et, u32 clusters) { struct ocfs2_dx_root_block *dx_root = et->et_object; @@ -318,8 +362,7 @@ static void ocfs2_dx_root_update_clusters(struct inode *inode, le32_add_cpu(&dx_root->dr_clusters, clusters); } -static int ocfs2_dx_root_sanity_check(struct inode *inode, - struct ocfs2_extent_tree *et) +static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et) { struct ocfs2_dx_root_block *dx_root = et->et_object; @@ -343,8 +386,54 @@ static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { .eo_fill_root_el = ocfs2_dx_root_fill_root_el, }; +static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + et->et_root_el = &rb->rf_list; +} + +static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + rb->rf_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + return le64_to_cpu(rb->rf_last_eb_blk); +} + +static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_refcount_block *rb = et->et_object; + + le32_add_cpu(&rb->rf_clusters, clusters); +} + +static enum ocfs2_contig_type +ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec 
*insert_rec) +{ + return CONTIG_NONE; +} + +static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_refcount_tree_update_clusters, + .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el, + .eo_extent_contig = ocfs2_refcount_tree_extent_contig, +}; + static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh, ocfs2_journal_access_func access, void *obj, @@ -352,6 +441,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, { et->et_ops = ops; et->et_root_bh = bh; + et->et_ci = ci; et->et_root_journal_access = access; if (!obj) obj = (void *)bh->b_data; @@ -361,41 +451,49 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, if (!et->et_ops->eo_fill_max_leaf_clusters) et->et_max_leaf_clusters = 0; else - et->et_ops->eo_fill_max_leaf_clusters(inode, et); + et->et_ops->eo_fill_max_leaf_clusters(et); } void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh) { - __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di, + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di, NULL, &ocfs2_dinode_et_ops); } void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh) { - __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb, + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb, NULL, &ocfs2_xattr_tree_et_ops); } void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct ocfs2_xattr_value_buf *vb) { - __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb, + __ocfs2_init_extent_tree(et, ci, vb->vb_bh, 
vb->vb_access, vb, &ocfs2_xattr_value_et_ops); } void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh) { - __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr, + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr, NULL, &ocfs2_dx_root_et_ops); } +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb, + NULL, &ocfs2_refcount_tree_et_ops); +} + static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, u64 new_last_eb_blk) { @@ -407,78 +505,71 @@ static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) return et->et_ops->eo_get_last_eb_blk(et); } -static inline void ocfs2_et_update_clusters(struct inode *inode, - struct ocfs2_extent_tree *et, +static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et, u32 clusters) { - et->et_ops->eo_update_clusters(inode, et, clusters); + et->et_ops->eo_update_clusters(et, clusters); +} + +static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + if (et->et_ops->eo_extent_map_insert) + et->et_ops->eo_extent_map_insert(et, rec); +} + +static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et, + u32 clusters) +{ + if (et->et_ops->eo_extent_map_truncate) + et->et_ops->eo_extent_map_truncate(et, clusters); } static inline int ocfs2_et_root_journal_access(handle_t *handle, - struct inode *inode, struct ocfs2_extent_tree *et, int type) { - return et->et_root_journal_access(handle, inode, et->et_root_bh, + return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh, type); } -static inline int ocfs2_et_insert_check(struct inode *inode, - struct ocfs2_extent_tree *et, +static inline enum ocfs2_contig_type + ocfs2_et_extent_contig(struct ocfs2_extent_tree *et, + struct 
ocfs2_extent_rec *rec, + struct ocfs2_extent_rec *insert_rec) +{ + if (et->et_ops->eo_extent_contig) + return et->et_ops->eo_extent_contig(et, rec, insert_rec); + + return ocfs2_extent_rec_contig( + ocfs2_metadata_cache_get_super(et->et_ci), + rec, insert_rec); +} + +static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec) { int ret = 0; if (et->et_ops->eo_insert_check) - ret = et->et_ops->eo_insert_check(inode, et, rec); + ret = et->et_ops->eo_insert_check(et, rec); return ret; } -static inline int ocfs2_et_sanity_check(struct inode *inode, - struct ocfs2_extent_tree *et) +static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et) { int ret = 0; if (et->et_ops->eo_sanity_check) - ret = et->et_ops->eo_sanity_check(inode, et); + ret = et->et_ops->eo_sanity_check(et); return ret; } static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); - -/* - * Structures which describe a path through a btree, and functions to - * manipulate them. - * - * The idea here is to be as generic as possible with the tree - * manipulation code. 
- */ -struct ocfs2_path_item { - struct buffer_head *bh; - struct ocfs2_extent_list *el; -}; - -#define OCFS2_MAX_PATH_DEPTH 5 - -struct ocfs2_path { - int p_tree_depth; - ocfs2_journal_access_func p_root_access; - struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; -}; - -#define path_root_bh(_path) ((_path)->p_node[0].bh) -#define path_root_el(_path) ((_path)->p_node[0].el) -#define path_root_access(_path)((_path)->p_root_access) -#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) -#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) -#define path_num_items(_path) ((_path)->p_tree_depth + 1) - -static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, - u32 cpos); -static void ocfs2_adjust_rightmost_records(struct inode *inode, - handle_t *handle, +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec); /* @@ -486,7 +577,7 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode, * to build another path. Generally, this involves freeing the buffer * heads. 
*/ -static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) { int i, start = 0, depth = 0; struct ocfs2_path_item *node; @@ -515,7 +606,7 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) path->p_tree_depth = depth; } -static void ocfs2_free_path(struct ocfs2_path *path) +void ocfs2_free_path(struct ocfs2_path *path) { if (path) { ocfs2_reinit_path(path, 0); @@ -613,13 +704,13 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, return path; } -static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) { return ocfs2_new_path(path_root_bh(path), path_root_el(path), path_root_access(path)); } -static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) { return ocfs2_new_path(et->et_root_bh, et->et_root_el, et->et_root_journal_access); @@ -632,10 +723,10 @@ static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) * I don't like the way this function's name looks next to * ocfs2_journal_access_path(), but I don't have a better one. */ -static int ocfs2_path_bh_journal_access(handle_t *handle, - struct inode *inode, - struct ocfs2_path *path, - int idx) +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx) { ocfs2_journal_access_func access = path_root_access(path); @@ -645,15 +736,16 @@ static int ocfs2_path_bh_journal_access(handle_t *handle, if (idx) access = ocfs2_journal_access_eb; - return access(handle, inode, path->p_node[idx].bh, + return access(handle, ci, path->p_node[idx].bh, OCFS2_JOURNAL_ACCESS_WRITE); } /* * Convenience function to journal all components in a path. 
*/ -static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, - struct ocfs2_path *path) +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path) { int i, ret = 0; @@ -661,7 +753,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, goto out; for(i = 0; i < path_num_items(path); i++) { - ret = ocfs2_path_bh_journal_access(handle, inode, path, i); + ret = ocfs2_path_bh_journal_access(handle, ci, path, i); if (ret < 0) { mlog_errno(ret); goto out; @@ -702,17 +794,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) return ret; } -enum ocfs2_contig_type { - CONTIG_NONE = 0, - CONTIG_LEFT, - CONTIG_RIGHT, - CONTIG_LEFTRIGHT, -}; - - /* * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and - * ocfs2_extent_contig only work properly against leaf nodes! + * ocfs2_extent_rec_contig only work properly against leaf nodes! */ static int ocfs2_block_extent_contig(struct super_block *sb, struct ocfs2_extent_rec *ext, @@ -738,9 +822,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, } static enum ocfs2_contig_type - ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - struct ocfs2_extent_rec *insert_rec) + ocfs2_extent_rec_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) { u64 blkno = le64_to_cpu(insert_rec->e_blkno); @@ -753,12 +837,12 @@ static enum ocfs2_contig_type return CONTIG_NONE; if (ocfs2_extents_adjacent(ext, insert_rec) && - ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) + ocfs2_block_extent_contig(sb, ext, blkno)) return CONTIG_RIGHT; blkno = le64_to_cpu(ext->e_blkno); if (ocfs2_extents_adjacent(insert_rec, ext) && - ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) + ocfs2_block_extent_contig(sb, insert_rec, blkno)) return CONTIG_LEFT; return CONTIG_NONE; @@ -853,13 +937,13 @@ static int ocfs2_validate_extent_block(struct 
super_block *sb, return 0; } -int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, struct buffer_head **bh) { int rc; struct buffer_head *tmp = *bh; - rc = ocfs2_read_block(inode, eb_blkno, &tmp, + rc = ocfs2_read_block(ci, eb_blkno, &tmp, ocfs2_validate_extent_block); /* If ocfs2_read_block() got us a new bh, pass it up. */ @@ -874,7 +958,6 @@ int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, * How many free extents have we got before we need more meta data? */ int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct inode *inode, struct ocfs2_extent_tree *et) { int retval; @@ -889,7 +972,8 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb, last_eb_blk = ocfs2_et_get_last_eb_blk(et); if (last_eb_blk) { - retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); + retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk, + &eb_bh); if (retval < 0) { mlog_errno(retval); goto bail; @@ -913,9 +997,8 @@ bail: * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and * l_count for you */ -static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +static int ocfs2_create_new_meta_bhs(handle_t *handle, + struct ocfs2_extent_tree *et, int wanted, struct ocfs2_alloc_context *meta_ac, struct buffer_head *bhs[]) @@ -924,6 +1007,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, u16 suballoc_bit_start; u32 num_got; u64 first_blkno; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); struct ocfs2_extent_block *eb; mlog_entry_void(); @@ -949,9 +1034,10 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, mlog_errno(status); goto bail; } - ocfs2_set_new_buffer_uptodate(inode, bhs[i]); + ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]); - status = ocfs2_journal_access_eb(handle, inode, bhs[i], + status = ocfs2_journal_access_eb(handle, et->et_ci, + bhs[i], 
OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1023,7 +1109,6 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) * extent block's rightmost record. */ static int ocfs2_adjust_rightmost_branch(handle_t *handle, - struct inode *inode, struct ocfs2_extent_tree *et) { int status; @@ -1037,7 +1122,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle, return status; } - status = ocfs2_find_path(inode, path, UINT_MAX); + status = ocfs2_find_path(et->et_ci, path, UINT_MAX); if (status < 0) { mlog_errno(status); goto out; @@ -1050,7 +1135,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle, goto out; } - status = ocfs2_journal_access_path(inode, handle, path); + status = ocfs2_journal_access_path(et->et_ci, handle, path); if (status < 0) { mlog_errno(status); goto out; @@ -1059,7 +1144,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle, el = path_leaf_el(path); rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1]; - ocfs2_adjust_rightmost_records(inode, handle, path, rec); + ocfs2_adjust_rightmost_records(handle, et, path, rec); out: ocfs2_free_path(path); @@ -1068,7 +1153,7 @@ out: /* * Add an entire tree branch to our inode. eb_bh is the extent block - * to start at, if we don't want to start the branch at the dinode + * to start at, if we don't want to start the branch at the root * structure. * * last_eb_bh is required as we have to update it's next_leaf pointer @@ -1077,9 +1162,7 @@ out: * the new branch will be 'empty' in the sense that every block will * contain a single record with cluster count == 0. 
*/ -static int ocfs2_add_branch(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +static int ocfs2_add_branch(handle_t *handle, struct ocfs2_extent_tree *et, struct buffer_head *eb_bh, struct buffer_head **last_eb_bh, @@ -1123,7 +1206,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, if (root_end > new_cpos) { mlog(0, "adjust the cluster end from %u to %u\n", root_end, new_cpos); - status = ocfs2_adjust_rightmost_branch(handle, inode, et); + status = ocfs2_adjust_rightmost_branch(handle, et); if (status) { mlog_errno(status); goto bail; @@ -1139,7 +1222,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } - status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, + status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, meta_ac, new_eb_bhs); if (status < 0) { mlog_errno(status); @@ -1161,7 +1244,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); eb_el = &eb->h_list; - status = ocfs2_journal_access_eb(handle, inode, bh, + status = ocfs2_journal_access_eb(handle, et->et_ci, bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1201,20 +1284,20 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * journal_dirty erroring as it won't unless we've aborted the * handle (in which case we would never be here) so reserving * the write with journal_access is all we need to do. 
*/ - status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh, + status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } - status = ocfs2_et_root_journal_access(handle, inode, et, + status = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } if (eb_bh) { - status = ocfs2_journal_access_eb(handle, inode, eb_bh, + status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1274,9 +1357,7 @@ bail: * returns back the new extent block so you can add a branch to it * after this call. */ -static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +static int ocfs2_shift_tree_depth(handle_t *handle, struct ocfs2_extent_tree *et, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) @@ -1290,7 +1371,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, mlog_entry_void(); - status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, &new_eb_bh); if (status < 0) { mlog_errno(status); @@ -1304,7 +1385,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, eb_el = &eb->h_list; root_el = et->et_root_el; - status = ocfs2_journal_access_eb(handle, inode, new_eb_bh, + status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1323,7 +1404,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - status = ocfs2_et_root_journal_access(handle, inode, et, + status = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1379,9 +1460,7 @@ bail: * * return status < 0 indicates an error. 
*/ -static int ocfs2_find_branch_target(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_extent_tree *et, +static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, struct buffer_head **target_bh) { int status = 0, i; @@ -1399,19 +1478,21 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { - ocfs2_error(inode->i_sb, "Dinode %llu has empty " + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty " "extent list (next_free_rec == 0)", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); status = -EIO; goto bail; } i = le16_to_cpu(el->l_next_free_rec) - 1; blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { - ocfs2_error(inode->i_sb, "Dinode %llu has extent " + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has extent " "list where extent # %d has no physical " "block start", - (unsigned long long)OCFS2_I(inode)->ip_blkno, i); + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); status = -EIO; goto bail; } @@ -1419,7 +1500,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, brelse(bh); bh = NULL; - status = ocfs2_read_extent_block(inode, blkno, &bh); + status = ocfs2_read_extent_block(et->et_ci, blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1460,20 +1541,18 @@ bail: * * *last_eb_bh will be updated by ocfs2_add_branch(). 
*/ -static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, - struct ocfs2_extent_tree *et, int *final_depth, - struct buffer_head **last_eb_bh, +static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, + int *final_depth, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int ret, shift; struct ocfs2_extent_list *el = et->et_root_el; int depth = le16_to_cpu(el->l_tree_depth); - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *bh = NULL; BUG_ON(meta_ac == NULL); - shift = ocfs2_find_branch_target(osb, inode, et, &bh); + shift = ocfs2_find_branch_target(et, &bh); if (shift < 0) { ret = shift; mlog_errno(ret); @@ -1490,8 +1569,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to * ocfs2_add_branch). */ - ret = ocfs2_shift_tree_depth(osb, handle, inode, et, - meta_ac, &bh); + ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1517,7 +1595,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ mlog(0, "add branch. bh = %p\n", bh); - ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, + ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, meta_ac); if (ret < 0) { mlog_errno(ret); @@ -1687,7 +1765,7 @@ set_and_inc: * * The array index of the subtree root is passed back. */ -static int ocfs2_find_subtree_root(struct inode *inode, +static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, struct ocfs2_path *left, struct ocfs2_path *right) { @@ -1705,10 +1783,10 @@ static int ocfs2_find_subtree_root(struct inode *inode, * The caller didn't pass two adjacent paths. 
*/ mlog_bug_on_msg(i > left->p_tree_depth, - "Inode %lu, left depth %u, right depth %u\n" + "Owner %llu, left depth %u, right depth %u\n" "left leaf blk %llu, right leaf blk %llu\n", - inode->i_ino, left->p_tree_depth, - right->p_tree_depth, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + left->p_tree_depth, right->p_tree_depth, (unsigned long long)path_leaf_bh(left)->b_blocknr, (unsigned long long)path_leaf_bh(right)->b_blocknr); } while (left->p_node[i].bh->b_blocknr == @@ -1725,7 +1803,7 @@ typedef void (path_insert_t)(void *, struct buffer_head *); * This code can be called with a cpos larger than the tree, in which * case it will return the rightmost path. */ -static int __ocfs2_find_path(struct inode *inode, +static int __ocfs2_find_path(struct ocfs2_caching_info *ci, struct ocfs2_extent_list *root_el, u32 cpos, path_insert_t *func, void *data) { @@ -1736,15 +1814,14 @@ static int __ocfs2_find_path(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; - struct ocfs2_inode_info *oi = OCFS2_I(inode); el = root_el; while (el->l_tree_depth) { if (le16_to_cpu(el->l_next_free_rec) == 0) { - ocfs2_error(inode->i_sb, - "Inode %llu has empty extent list at " + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has empty extent list at " "depth %u\n", - (unsigned long long)oi->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth)); ret = -EROFS; goto out; @@ -1767,10 +1844,10 @@ static int __ocfs2_find_path(struct inode *inode, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (blkno == 0) { - ocfs2_error(inode->i_sb, - "Inode %llu has bad blkno in extent list " + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad blkno in extent list " "at depth %u (index %d)\n", - (unsigned long long)oi->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth), i); ret = -EROFS; goto out; @@ -1778,7 +1855,7 @@ 
static int __ocfs2_find_path(struct inode *inode, brelse(bh); bh = NULL; - ret = ocfs2_read_extent_block(inode, blkno, &bh); + ret = ocfs2_read_extent_block(ci, blkno, &bh); if (ret) { mlog_errno(ret); goto out; @@ -1789,10 +1866,10 @@ static int __ocfs2_find_path(struct inode *inode, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { - ocfs2_error(inode->i_sb, - "Inode %llu has bad count in extent list " + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has bad count in extent list " "at block %llu (next free=%u, count=%u)\n", - (unsigned long long)oi->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr, le16_to_cpu(el->l_next_free_rec), le16_to_cpu(el->l_count)); @@ -1836,14 +1913,14 @@ static void find_path_ins(void *data, struct buffer_head *bh) ocfs2_path_insert_eb(fp->path, fp->index, bh); fp->index++; } -static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, - u32 cpos) +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, u32 cpos) { struct find_path_data data; data.index = 1; data.path = path; - return __ocfs2_find_path(inode, path_root_el(path), cpos, + return __ocfs2_find_path(ci, path_root_el(path), cpos, find_path_ins, &data); } @@ -1868,13 +1945,14 @@ static void find_leaf_ins(void *data, struct buffer_head *bh) * * This function doesn't handle non btree extent lists. 
*/ -int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, - u32 cpos, struct buffer_head **leaf_bh) +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh) { int ret; struct buffer_head *bh = NULL; - ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); + ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh); if (ret) { mlog_errno(ret); goto out; @@ -1980,7 +2058,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, * - When we've adjusted the last extent record in the left path leaf and the * 1st extent record in the right path leaf during cross extent block merge. */ -static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, +static void ocfs2_complete_edge_insert(handle_t *handle, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index) @@ -2058,8 +2136,8 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, mlog_errno(ret); } -static int ocfs2_rotate_subtree_right(struct inode *inode, - handle_t *handle, +static int ocfs2_rotate_subtree_right(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index) @@ -2075,10 +2153,10 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, left_el = path_leaf_el(left_path); if (left_el->l_next_free_rec != left_el->l_count) { - ocfs2_error(inode->i_sb, + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Inode %llu has non-full interior leaf node %llu" "(next free = %u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)left_leaf_bh->b_blocknr, le16_to_cpu(left_el->l_next_free_rec)); return -EROFS; @@ -2094,7 +2172,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != 
right_path->p_node[subtree_index].bh); - ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, subtree_index); if (ret) { mlog_errno(ret); @@ -2102,14 +2180,14 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, } for(i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, i); if (ret) { mlog_errno(ret); @@ -2123,7 +2201,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, /* This is a code error, not a disk corruption. */ mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " "because rightmost leaf block %llu is empty\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)right_leaf_bh->b_blocknr); ocfs2_create_empty_extent(right_el); @@ -2157,8 +2235,8 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, goto out; } - ocfs2_complete_edge_insert(inode, handle, left_path, right_path, - subtree_index); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); out: return ret; @@ -2248,10 +2326,18 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, int op_credits, struct ocfs2_path *path) { + int ret; int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; - if (handle->h_buffer_credits < credits) - return ocfs2_extend_trans(handle, credits); + if (handle->h_buffer_credits < credits) { + ret = ocfs2_extend_trans(handle, + credits - handle->h_buffer_credits); + if (ret) + return ret; + + if (unlikely(handle->h_buffer_credits < credits)) + return ocfs2_extend_trans(handle, credits); + } return 0; } @@ -2321,8 +2407,8 @@ 
static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) * *ret_left_path will contain a valid path which can be passed to * ocfs2_insert_path(). */ -static int ocfs2_rotate_tree_right(struct inode *inode, - handle_t *handle, +static int ocfs2_rotate_tree_right(handle_t *handle, + struct ocfs2_extent_tree *et, enum ocfs2_split_type split, u32 insert_cpos, struct ocfs2_path *right_path, @@ -2331,6 +2417,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, int ret, start, orig_credits = handle->h_buffer_credits; u32 cpos; struct ocfs2_path *left_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); *ret_left_path = NULL; @@ -2341,7 +2428,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, goto out; } - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); if (ret) { mlog_errno(ret); goto out; @@ -2379,7 +2466,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, mlog(0, "Rotating a tree: ins. 
cpos: %u, left path cpos: %u\n", insert_cpos, cpos); - ret = ocfs2_find_path(inode, left_path, cpos); + ret = ocfs2_find_path(et->et_ci, left_path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -2387,10 +2474,11 @@ static int ocfs2_rotate_tree_right(struct inode *inode, mlog_bug_on_msg(path_leaf_bh(left_path) == path_leaf_bh(right_path), - "Inode %lu: error during insert of %u " + "Owner %llu: error during insert of %u " "(left path cpos %u) results in two identical " "paths ending at %llu\n", - inode->i_ino, insert_cpos, cpos, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + insert_cpos, cpos, (unsigned long long) path_leaf_bh(left_path)->b_blocknr); @@ -2416,7 +2504,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, goto out_ret_path; } - start = ocfs2_find_subtree_root(inode, left_path, right_path); + start = ocfs2_find_subtree_root(et, left_path, right_path); mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", start, @@ -2430,7 +2518,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, goto out; } - ret = ocfs2_rotate_subtree_right(inode, handle, left_path, + ret = ocfs2_rotate_subtree_right(handle, et, left_path, right_path, start); if (ret) { mlog_errno(ret); @@ -2462,8 +2550,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, */ ocfs2_mv_path(right_path, left_path); - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, - &cpos); + ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos); if (ret) { mlog_errno(ret); goto out; @@ -2477,7 +2564,8 @@ out_ret_path: return ret; } -static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, +static int ocfs2_update_edge_lengths(handle_t *handle, + struct ocfs2_extent_tree *et, int subtree_index, struct ocfs2_path *path) { int i, idx, ret; @@ -2502,7 +2590,7 @@ static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_journal_access_path(inode, handle, path); + ret = 
ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); goto out; @@ -2532,7 +2620,8 @@ out: return ret; } -static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, +static void ocfs2_unlink_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_cached_dealloc_ctxt *dealloc, struct ocfs2_path *path, int unlink_start) { @@ -2554,12 +2643,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, mlog(ML_ERROR, "Inode %llu, attempted to remove extent block " "%llu with %u records\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(el->l_next_free_rec)); ocfs2_journal_dirty(handle, bh); - ocfs2_remove_from_cache(inode, bh); + ocfs2_remove_from_cache(et->et_ci, bh); continue; } @@ -2572,11 +2661,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, if (ret) mlog_errno(ret); - ocfs2_remove_from_cache(inode, bh); + ocfs2_remove_from_cache(et->et_ci, bh); } } -static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, +static void ocfs2_unlink_subtree(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index, @@ -2607,17 +2697,17 @@ static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, root_bh); ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); - ocfs2_unlink_path(inode, handle, dealloc, right_path, + ocfs2_unlink_path(handle, et, dealloc, right_path, subtree_index + 1); } -static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, +static int ocfs2_rotate_subtree_left(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, int subtree_index, struct ocfs2_cached_dealloc_ctxt *dealloc, - int *deleted, - struct ocfs2_extent_tree *et) + int *deleted) { int ret, i, 
del_right_subtree = 0, right_has_empty = 0; struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); @@ -2653,7 +2743,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, return -EAGAIN; if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { - ret = ocfs2_journal_access_eb(handle, inode, + ret = ocfs2_journal_access_eb(handle, et->et_ci, path_leaf_bh(right_path), OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -2672,7 +2762,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, * We have to update i_last_eb_blk during the meta * data delete. */ - ret = ocfs2_et_root_journal_access(handle, inode, et, + ret = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -2688,7 +2778,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, */ BUG_ON(right_has_empty && !del_right_subtree); - ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, subtree_index); if (ret) { mlog_errno(ret); @@ -2696,14 +2786,14 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, } for(i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, i); if (ret) { mlog_errno(ret); @@ -2740,9 +2830,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, mlog_errno(ret); if (del_right_subtree) { - ocfs2_unlink_subtree(inode, handle, left_path, right_path, + ocfs2_unlink_subtree(handle, et, left_path, right_path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, 
left_path); if (ret) { mlog_errno(ret); @@ -2766,7 +2856,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, *deleted = 1; } else - ocfs2_complete_edge_insert(inode, handle, left_path, right_path, + ocfs2_complete_edge_insert(handle, left_path, right_path, subtree_index); out: @@ -2852,8 +2942,8 @@ out: return ret; } -static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, - handle_t *handle, +static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path) { int ret; @@ -2863,7 +2953,7 @@ static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, if (!ocfs2_is_empty_extent(&el->l_recs[0])) return 0; - ret = ocfs2_path_bh_journal_access(handle, inode, path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, path_num_items(path) - 1); if (ret) { mlog_errno(ret); @@ -2880,24 +2970,24 @@ out: return ret; } -static int __ocfs2_rotate_tree_left(struct inode *inode, - handle_t *handle, int orig_credits, +static int __ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, + int orig_credits, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_path **empty_extent_path, - struct ocfs2_extent_tree *et) + struct ocfs2_path **empty_extent_path) { int ret, subtree_root, deleted; u32 right_cpos; struct ocfs2_path *left_path = NULL; struct ocfs2_path *right_path = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); *empty_extent_path = NULL; - ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, - &right_cpos); + ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); if (ret) { mlog_errno(ret); goto out; @@ -2920,13 +3010,13 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, } while (right_cpos) { - ret = ocfs2_find_path(inode, right_path, right_cpos); + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); if 
(ret) { mlog_errno(ret); goto out; } - subtree_root = ocfs2_find_subtree_root(inode, left_path, + subtree_root = ocfs2_find_subtree_root(et, left_path, right_path); mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", @@ -2946,16 +3036,16 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, * Caller might still want to make changes to the * tree root, so re-add it to the journal here. */ - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, 0); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_rotate_subtree_left(inode, handle, left_path, + ret = ocfs2_rotate_subtree_left(handle, et, left_path, right_path, subtree_root, - dealloc, &deleted, et); + dealloc, &deleted); if (ret == -EAGAIN) { /* * The rotation has to temporarily stop due to @@ -2982,7 +3072,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, ocfs2_mv_path(left_path, right_path); - ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &right_cpos); if (ret) { mlog_errno(ret); @@ -2997,10 +3087,10 @@ out: return ret; } -static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, +static int ocfs2_remove_rightmost_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_extent_tree *et) + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret, subtree_index; u32 cpos; @@ -3009,7 +3099,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, struct ocfs2_extent_list *el; - ret = ocfs2_et_sanity_check(inode, et); + ret = ocfs2_et_sanity_check(et); if (ret) goto out; /* @@ -3024,13 +3114,14 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_journal_access_path(inode, handle, path); + ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); goto out; } 
- ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + path, &cpos); if (ret) { mlog_errno(ret); goto out; @@ -3048,23 +3139,23 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_find_path(inode, left_path, cpos); + ret = ocfs2_find_path(et->et_ci, left_path, cpos); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access_path(inode, handle, left_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); if (ret) { mlog_errno(ret); goto out; } - subtree_index = ocfs2_find_subtree_root(inode, left_path, path); + subtree_index = ocfs2_find_subtree_root(et, left_path, path); - ocfs2_unlink_subtree(inode, handle, left_path, path, + ocfs2_unlink_subtree(handle, et, left_path, path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + ret = ocfs2_update_edge_lengths(handle, et, subtree_index, left_path); if (ret) { mlog_errno(ret); @@ -3078,10 +3169,10 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, * 'path' is also the leftmost path which * means it must be the only one. This gets * handled differently because we want to - * revert the inode back to having extents + * revert the root back to having extents * in-line. */ - ocfs2_unlink_path(inode, handle, dealloc, path, 1); + ocfs2_unlink_path(handle, et, dealloc, path, 1); el = et->et_root_el; el->l_tree_depth = 0; @@ -3114,10 +3205,10 @@ out: * the rightmost tree leaf record is removed so the caller is * responsible for detecting and correcting that. 
*/ -static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, +static int ocfs2_rotate_tree_left(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_extent_tree *et) + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret, orig_credits = handle->h_buffer_credits; struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; @@ -3134,8 +3225,7 @@ rightmost_no_delete: * Inline extents. This is trivially handled, so do * it up front. */ - ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, - path); + ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path); if (ret) mlog_errno(ret); goto out; @@ -3151,7 +3241,7 @@ rightmost_no_delete: * * 1) is handled via ocfs2_rotate_rightmost_leaf_left() * 2a) we need the left branch so that we can update it with the unlink - * 2b) we need to bring the inode back to inline extents. + * 2b) we need to bring the root back to inline extents. */ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; @@ -3167,9 +3257,9 @@ rightmost_no_delete: if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = -EIO; - ocfs2_error(inode->i_sb, - "Inode %llu has empty extent block at %llu", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has empty extent block at %llu", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; } @@ -3183,8 +3273,8 @@ rightmost_no_delete: * nonempty list. */ - ret = ocfs2_remove_rightmost_path(inode, handle, path, - dealloc, et); + ret = ocfs2_remove_rightmost_path(handle, et, path, + dealloc); if (ret) mlog_errno(ret); goto out; @@ -3195,8 +3285,8 @@ rightmost_no_delete: * and restarting from there. 
*/ try_rotate: - ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, - dealloc, &restart_path, et); + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path, + dealloc, &restart_path); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -3206,9 +3296,9 @@ try_rotate: tmp_path = restart_path; restart_path = NULL; - ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, + ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, tmp_path, dealloc, - &restart_path, et); + &restart_path); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -3259,7 +3349,7 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, } } -static int ocfs2_get_right_path(struct inode *inode, +static int ocfs2_get_right_path(struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path **ret_right_path) { @@ -3276,8 +3366,8 @@ static int ocfs2_get_right_path(struct inode *inode, left_el = path_leaf_el(left_path); BUG_ON(left_el->l_next_free_rec != left_el->l_count); - ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, - &right_cpos); + ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + left_path, &right_cpos); if (ret) { mlog_errno(ret); goto out; @@ -3293,7 +3383,7 @@ static int ocfs2_get_right_path(struct inode *inode, goto out; } - ret = ocfs2_find_path(inode, right_path, right_cpos); + ret = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (ret) { mlog_errno(ret); goto out; @@ -3313,9 +3403,9 @@ out: * For index == l_count - 1, the "next" means the 1st extent rec of the * next extent block. 
*/ -static int ocfs2_merge_rec_right(struct inode *inode, - struct ocfs2_path *left_path, +static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *split_rec, int index) { @@ -3336,7 +3426,7 @@ static int ocfs2_merge_rec_right(struct inode *inode, if (index == le16_to_cpu(el->l_next_free_rec) - 1 && le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { /* we meet with a cross extent block merge. */ - ret = ocfs2_get_right_path(inode, left_path, &right_path); + ret = ocfs2_get_right_path(et, left_path, &right_path); if (ret) { mlog_errno(ret); goto out; @@ -3355,8 +3445,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, le16_to_cpu(left_rec->e_leaf_clusters) != le32_to_cpu(right_rec->e_cpos)); - subtree_index = ocfs2_find_subtree_root(inode, - left_path, right_path); + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, handle->h_buffer_credits, @@ -3369,7 +3459,7 @@ static int ocfs2_merge_rec_right(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, subtree_index); if (ret) { mlog_errno(ret); @@ -3378,14 +3468,14 @@ static int ocfs2_merge_rec_right(struct inode *inode, for (i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, i); if (ret) { mlog_errno(ret); @@ -3398,7 +3488,7 @@ static int ocfs2_merge_rec_right(struct inode *inode, right_rec = &el->l_recs[index + 1]; } - ret = 
ocfs2_path_bh_journal_access(handle, inode, left_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, path_num_items(left_path) - 1); if (ret) { mlog_errno(ret); @@ -3409,7 +3499,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, le32_add_cpu(&right_rec->e_cpos, -split_clusters); le64_add_cpu(&right_rec->e_blkno, - -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); + -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); ocfs2_cleanup_merge(el, index); @@ -3423,8 +3514,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, if (ret) mlog_errno(ret); - ocfs2_complete_edge_insert(inode, handle, left_path, - right_path, subtree_index); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); } out: if (right_path) @@ -3432,7 +3523,7 @@ out: return ret; } -static int ocfs2_get_left_path(struct inode *inode, +static int ocfs2_get_left_path(struct ocfs2_extent_tree *et, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) { @@ -3445,7 +3536,7 @@ static int ocfs2_get_left_path(struct inode *inode, /* This function shouldn't be called for non-trees. */ BUG_ON(right_path->p_tree_depth == 0); - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), right_path, &left_cpos); if (ret) { mlog_errno(ret); @@ -3462,7 +3553,7 @@ static int ocfs2_get_left_path(struct inode *inode, goto out; } - ret = ocfs2_find_path(inode, left_path, left_cpos); + ret = ocfs2_find_path(et->et_ci, left_path, left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -3485,12 +3576,11 @@ out: * remove the rightmost leaf extent block in the right_path and change * the right path to indicate the new rightmost path. 
*/ -static int ocfs2_merge_rec_left(struct inode *inode, - struct ocfs2_path *right_path, +static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_extent_tree *et, int index) { int ret, i, subtree_index = 0, has_empty_extent = 0; @@ -3508,7 +3598,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, right_rec = &el->l_recs[index]; if (index == 0) { /* we meet with a cross extent block merge. */ - ret = ocfs2_get_left_path(inode, right_path, &left_path); + ret = ocfs2_get_left_path(et, right_path, &left_path); if (ret) { mlog_errno(ret); goto out; @@ -3524,8 +3614,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, le16_to_cpu(left_rec->e_leaf_clusters) != le32_to_cpu(split_rec->e_cpos)); - subtree_index = ocfs2_find_subtree_root(inode, - left_path, right_path); + subtree_index = ocfs2_find_subtree_root(et, left_path, + right_path); ret = ocfs2_extend_rotate_transaction(handle, subtree_index, handle->h_buffer_credits, @@ -3538,7 +3628,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, subtree_index); if (ret) { mlog_errno(ret); @@ -3547,14 +3637,14 @@ static int ocfs2_merge_rec_left(struct inode *inode, for (i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_path_bh_journal_access(handle, inode, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path, i); if (ret) { mlog_errno(ret); @@ -3567,7 +3657,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, has_empty_extent = 
1; } - ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path, path_num_items(right_path) - 1); if (ret) { mlog_errno(ret); @@ -3586,7 +3676,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, le32_add_cpu(&right_rec->e_cpos, split_clusters); le64_add_cpu(&right_rec->e_blkno, - ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); + ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci), + split_clusters)); le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); ocfs2_cleanup_merge(el, index); @@ -3608,9 +3699,9 @@ static int ocfs2_merge_rec_left(struct inode *inode, if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && le16_to_cpu(el->l_next_free_rec) == 1) { - ret = ocfs2_remove_rightmost_path(inode, handle, + ret = ocfs2_remove_rightmost_path(handle, et, right_path, - dealloc, et); + dealloc); if (ret) { mlog_errno(ret); goto out; @@ -3622,7 +3713,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, ocfs2_mv_path(right_path, left_path); left_path = NULL; } else - ocfs2_complete_edge_insert(inode, handle, left_path, + ocfs2_complete_edge_insert(handle, left_path, right_path, subtree_index); } out: @@ -3631,15 +3722,13 @@ out: return ret; } -static int ocfs2_try_to_merge_extent(struct inode *inode, - handle_t *handle, +static int ocfs2_try_to_merge_extent(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, int split_index, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_merge_ctxt *ctxt, - struct ocfs2_extent_tree *et) - + struct ocfs2_merge_ctxt *ctxt) { int ret = 0; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -3655,8 +3744,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * extents - having more than one in a leaf is * illegal. 
*/ - ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -3685,8 +3773,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * prevoius extent block. It is more efficient and easier * if we do merge_right first and merge_left later. */ - ret = ocfs2_merge_rec_right(inode, path, - handle, split_rec, + ret = ocfs2_merge_rec_right(path, handle, et, split_rec, split_index); if (ret) { mlog_errno(ret); @@ -3699,8 +3786,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); /* The merge left us with an empty extent, remove it. */ - ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -3712,18 +3798,15 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * Note that we don't pass split_rec here on purpose - * we've merged it into the rec already. */ - ret = ocfs2_merge_rec_left(inode, path, - handle, rec, - dealloc, et, - split_index); + ret = ocfs2_merge_rec_left(path, handle, et, rec, + dealloc, split_index); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); /* * Error from this last rotate is not critical, so * print but don't bubble it up. @@ -3740,19 +3823,16 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * the record on the left (hence the left merge). 
*/ if (ctxt->c_contig_type == CONTIG_RIGHT) { - ret = ocfs2_merge_rec_left(inode, - path, - handle, split_rec, - dealloc, et, + ret = ocfs2_merge_rec_left(path, handle, et, + split_rec, dealloc, split_index); if (ret) { mlog_errno(ret); goto out; } } else { - ret = ocfs2_merge_rec_right(inode, - path, - handle, split_rec, + ret = ocfs2_merge_rec_right(path, handle, + et, split_rec, split_index); if (ret) { mlog_errno(ret); @@ -3765,8 +3845,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * The merge may have left an empty extent in * our leaf. Try to rotate it away. */ - ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, + dealloc); if (ret) mlog_errno(ret); ret = 0; @@ -3812,10 +3892,10 @@ static void ocfs2_subtract_from_rec(struct super_block *sb, * list. If this leaf is part of an allocation tree, it is assumed * that the tree above has been prepared. */ -static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, +static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *insert_rec, struct ocfs2_extent_list *el, - struct ocfs2_insert_type *insert, - struct inode *inode) + struct ocfs2_insert_type *insert) { int i = insert->ins_contig_index; unsigned int range; @@ -3827,7 +3907,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); BUG_ON(i == -1); rec = &el->l_recs[i]; - ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec, + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + insert->ins_split, rec, insert_rec); goto rotate; } @@ -3869,10 +3950,10 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= le16_to_cpu(el->l_count), - "inode %lu, depth %u, count %u, next free %u, " + "owner %llu, depth %u, count %u, next free %u, " "rec.cpos %u, rec.clusters %u, " 
"insert.cpos %u, insert.clusters %u\n", - inode->i_ino, + ocfs2_metadata_cache_owner(et->et_ci), le16_to_cpu(el->l_tree_depth), le16_to_cpu(el->l_count), le16_to_cpu(el->l_next_free_rec), @@ -3900,8 +3981,8 @@ rotate: ocfs2_rotate_leaf(el, insert_rec); } -static void ocfs2_adjust_rightmost_records(struct inode *inode, - handle_t *handle, +static void ocfs2_adjust_rightmost_records(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec) { @@ -3919,9 +4000,9 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode, next_free = le16_to_cpu(el->l_next_free_rec); if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has a bad extent list", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); ret = -EIO; return; } @@ -3941,7 +4022,8 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode, } } -static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, +static int ocfs2_append_rec_to_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *insert_rec, struct ocfs2_path *right_path, struct ocfs2_path **ret_left_path) @@ -3969,8 +4051,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { u32 left_cpos; - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, - &left_cpos); + ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci), + right_path, &left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -3992,7 +4074,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_find_path(inode, left_path, left_cpos); + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -4005,13 +4088,13 @@ static int 
ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, } } - ret = ocfs2_journal_access_path(inode, handle, right_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); if (ret) { mlog_errno(ret); goto out; } - ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec); + ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec); *ret_left_path = left_path; ret = 0; @@ -4022,7 +4105,7 @@ out: return ret; } -static void ocfs2_split_record(struct inode *inode, +static void ocfs2_split_record(struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, struct ocfs2_extent_rec *split_rec, @@ -4095,7 +4178,8 @@ static void ocfs2_split_record(struct inode *inode, } rec = &el->l_recs[index]; - ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec); + ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci), + split, rec, split_rec); ocfs2_rotate_leaf(insert_el, split_rec); } @@ -4107,8 +4191,8 @@ static void ocfs2_split_record(struct inode *inode, * in. left_path should only be passed in if we need to update that * portion of the tree after an edge insert. */ -static int ocfs2_insert_path(struct inode *inode, - handle_t *handle, +static int ocfs2_insert_path(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *left_path, struct ocfs2_path *right_path, struct ocfs2_extent_rec *insert_rec, @@ -4134,7 +4218,7 @@ static int ocfs2_insert_path(struct inode *inode, goto out; } - ret = ocfs2_journal_access_path(inode, handle, left_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); if (ret < 0) { mlog_errno(ret); goto out; @@ -4145,7 +4229,7 @@ static int ocfs2_insert_path(struct inode *inode, * Pass both paths to the journal. The majority of inserts * will be touching all components anyway. 
*/ - ret = ocfs2_journal_access_path(inode, handle, right_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, right_path); if (ret < 0) { mlog_errno(ret); goto out; @@ -4157,7 +4241,7 @@ static int ocfs2_insert_path(struct inode *inode, * of splits, but it's easier to just let one separate * function sort it all out. */ - ocfs2_split_record(inode, left_path, right_path, + ocfs2_split_record(et, left_path, right_path, insert_rec, insert->ins_split); /* @@ -4171,8 +4255,8 @@ static int ocfs2_insert_path(struct inode *inode, if (ret) mlog_errno(ret); } else - ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), - insert, inode); + ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path), + insert); ret = ocfs2_journal_dirty(handle, leaf_bh); if (ret) @@ -4185,10 +4269,10 @@ static int ocfs2_insert_path(struct inode *inode, * * XXX: Should we extend the transaction here? */ - subtree_index = ocfs2_find_subtree_root(inode, left_path, + subtree_index = ocfs2_find_subtree_root(et, left_path, right_path); - ocfs2_complete_edge_insert(inode, handle, left_path, - right_path, subtree_index); + ocfs2_complete_edge_insert(handle, left_path, right_path, + subtree_index); } ret = 0; @@ -4196,8 +4280,7 @@ out: return ret; } -static int ocfs2_do_insert_extent(struct inode *inode, - handle_t *handle, +static int ocfs2_do_insert_extent(handle_t *handle, struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *insert_rec, struct ocfs2_insert_type *type) @@ -4210,7 +4293,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, el = et->et_root_el; - ret = ocfs2_et_root_journal_access(handle, inode, et, + ret = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4218,7 +4301,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, } if (le16_to_cpu(el->l_tree_depth) == 0) { - ocfs2_insert_at_leaf(insert_rec, el, type, inode); + ocfs2_insert_at_leaf(et, insert_rec, el, type); goto out_update_clusters; } @@ -4241,7 
+4324,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, cpos = UINT_MAX; } - ret = ocfs2_find_path(inode, right_path, cpos); + ret = ocfs2_find_path(et->et_ci, right_path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -4260,7 +4343,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, * can wind up skipping both of these two special cases... */ if (rotate) { - ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split, + ret = ocfs2_rotate_tree_right(handle, et, type->ins_split, le32_to_cpu(insert_rec->e_cpos), right_path, &left_path); if (ret) { @@ -4272,7 +4355,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, * ocfs2_rotate_tree_right() might have extended the * transaction without re-journaling our tree root. */ - ret = ocfs2_et_root_journal_access(handle, inode, et, + ret = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4280,7 +4363,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, } } else if (type->ins_appending == APPEND_TAIL && type->ins_contig != CONTIG_LEFT) { - ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, + ret = ocfs2_append_rec_to_path(handle, et, insert_rec, right_path, &left_path); if (ret) { mlog_errno(ret); @@ -4288,7 +4371,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, } } - ret = ocfs2_insert_path(inode, handle, left_path, right_path, + ret = ocfs2_insert_path(handle, et, left_path, right_path, insert_rec, type); if (ret) { mlog_errno(ret); @@ -4297,7 +4380,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, out_update_clusters: if (type->ins_split == SPLIT_NONE) - ocfs2_et_update_clusters(inode, et, + ocfs2_et_update_clusters(et, le16_to_cpu(insert_rec->e_leaf_clusters)); ret = ocfs2_journal_dirty(handle, et->et_root_bh); @@ -4312,7 +4395,8 @@ out: } static enum ocfs2_contig_type -ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, +ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, 
+ struct ocfs2_path *path, struct ocfs2_extent_list *el, int index, struct ocfs2_extent_rec *split_rec) { @@ -4324,12 +4408,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, struct ocfs2_path *left_path = NULL, *right_path = NULL; struct buffer_head *bh; struct ocfs2_extent_block *eb; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); if (index > 0) { rec = &el->l_recs[index - 1]; } else if (path->p_tree_depth > 0) { - status = ocfs2_find_cpos_for_left_leaf(inode->i_sb, - path, &left_cpos); + status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); if (status) goto out; @@ -4338,7 +4422,8 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (!left_path) goto out; - status = ocfs2_find_path(inode, left_path, left_cpos); + status = ocfs2_find_path(et->et_ci, left_path, + left_cpos); if (status) goto out; @@ -4348,7 +4433,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, le16_to_cpu(new_el->l_count)) { bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; - ocfs2_error(inode->i_sb, + ocfs2_error(sb, "Extent block #%llu has an " "invalid l_next_free_rec of " "%d. 
It should have " @@ -4373,7 +4458,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (split_rec->e_cpos == el->l_recs[index].e_cpos) ret = CONTIG_RIGHT; } else { - ret = ocfs2_extent_contig(inode, rec, split_rec); + ret = ocfs2_et_extent_contig(et, rec, split_rec); } } @@ -4382,8 +4467,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, rec = &el->l_recs[index + 1]; else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && path->p_tree_depth > 0) { - status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, - path, &right_cpos); + status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); if (status) goto out; @@ -4394,7 +4478,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (!right_path) goto out; - status = ocfs2_find_path(inode, right_path, right_cpos); + status = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (status) goto out; @@ -4404,7 +4488,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; - ocfs2_error(inode->i_sb, + ocfs2_error(sb, "Extent block #%llu has an " "invalid l_next_free_rec of %d", (unsigned long long)le64_to_cpu(eb->h_blkno), @@ -4419,7 +4503,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (rec) { enum ocfs2_contig_type contig_type; - contig_type = ocfs2_extent_contig(inode, rec, split_rec); + contig_type = ocfs2_et_extent_contig(et, rec, split_rec); if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) ret = CONTIG_LEFTRIGHT; @@ -4436,11 +4520,10 @@ out: return ret; } -static void ocfs2_figure_contig_type(struct inode *inode, +static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, struct ocfs2_insert_type *insert, struct ocfs2_extent_list *el, - struct ocfs2_extent_rec *insert_rec, - struct ocfs2_extent_tree *et) + 
struct ocfs2_extent_rec *insert_rec) { int i; enum ocfs2_contig_type contig_type = CONTIG_NONE; @@ -4448,8 +4531,8 @@ static void ocfs2_figure_contig_type(struct inode *inode, BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], - insert_rec); + contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i], + insert_rec); if (contig_type != CONTIG_NONE) { insert->ins_contig_index = i; break; @@ -4530,8 +4613,7 @@ set_tail_append: * All of the information is stored on the ocfs2_insert_type * structure. */ -static int ocfs2_figure_insert_type(struct inode *inode, - struct ocfs2_extent_tree *et, +static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, struct ocfs2_extent_rec *insert_rec, int *free_records, @@ -4555,7 +4637,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * ocfs2_figure_insert_type() and ocfs2_add_branch() * may want it later. */ - ret = ocfs2_read_extent_block(inode, + ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &bh); if (ret) { @@ -4578,7 +4660,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, le16_to_cpu(el->l_next_free_rec); if (!insert->ins_tree_depth) { - ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); + ocfs2_figure_contig_type(et, insert, el, insert_rec); ocfs2_figure_appending_type(insert, el, insert_rec); return 0; } @@ -4596,7 +4678,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * us the rightmost tree path. This is accounted for below in * the appending code. */ - ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); + ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos)); if (ret) { mlog_errno(ret); goto out; @@ -4612,7 +4694,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * into two types of appends: simple record append, or a * rotate inside the tail leaf. 
*/ - ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); + ocfs2_figure_contig_type(et, insert, el, insert_rec); /* * The insert code isn't quite ready to deal with all cases of @@ -4657,13 +4739,11 @@ out: } /* - * Insert an extent into an inode btree. + * Insert an extent into a btree. * - * The caller needs to update fe->i_clusters + * The caller needs to update the owning btree's cluster count. */ -int ocfs2_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +int ocfs2_insert_extent(handle_t *handle, struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, @@ -4677,21 +4757,22 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; - mlog(0, "add %u clusters at position %u to inode %llu\n", - new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); + mlog(0, "add %u clusters at position %u to owner %llu\n", + new_clusters, cpos, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); memset(&rec, 0, sizeof(rec)); rec.e_cpos = cpu_to_le32(cpos); rec.e_blkno = cpu_to_le64(start_blk); rec.e_leaf_clusters = cpu_to_le16(new_clusters); rec.e_flags = flags; - status = ocfs2_et_insert_check(inode, et, &rec); + status = ocfs2_et_insert_check(et, &rec); if (status) { mlog_errno(status); goto bail; } - status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, + status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec, &free_records, &insert); if (status < 0) { mlog_errno(status); @@ -4705,7 +4786,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, free_records, insert.ins_tree_depth); if (insert.ins_contig == CONTIG_NONE && free_records == 0) { - status = ocfs2_grow_tree(inode, handle, et, + status = ocfs2_grow_tree(handle, et, &insert.ins_tree_depth, &last_eb_bh, meta_ac); if (status) { @@ -4715,11 +4796,11 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, } /* Finally, we can add clusters. This might rotate the tree for us. 
*/ - status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); + status = ocfs2_do_insert_extent(handle, et, &rec, &insert); if (status < 0) mlog_errno(status); - else if (et->et_ops == &ocfs2_dinode_et_ops) - ocfs2_extent_map_insert_rec(inode, &rec); + else + ocfs2_et_extent_map_insert(et, &rec); bail: brelse(last_eb_bh); @@ -4735,13 +4816,11 @@ bail: * it is not limited to the file storage. Any extent tree can use this * function if it implements the proper ocfs2_extent_tree. */ -int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, - struct inode *inode, +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, u32 *logical_offset, u32 clusters_to_add, int mark_unwritten, - struct ocfs2_extent_tree *et, - handle_t *handle, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret) @@ -4752,13 +4831,15 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, u32 bit_off, num_bits; u64 block; u8 flags = 0; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); BUG_ON(!clusters_to_add); if (mark_unwritten) flags = OCFS2_EXT_UNWRITTEN; - free_extents = ocfs2_num_free_extents(osb, inode, et); + free_extents = ocfs2_num_free_extents(osb, et); if (free_extents < 0) { status = free_extents; mlog_errno(status); @@ -4795,7 +4876,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, BUG_ON(num_bits > clusters_to_add); /* reserve our write early -- insert_extent may update the tree root */ - status = ocfs2_et_root_journal_access(handle, inode, et, + status = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -4803,10 +4884,10 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, } block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for inode %llu\n", - num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = 
ocfs2_insert_extent(osb, handle, inode, et, - *logical_offset, block, + mlog(0, "Allocating %u clusters at block %u for owner %llu\n", + num_bits, bit_off, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); + status = ocfs2_insert_extent(handle, et, *logical_offset, block, num_bits, flags, meta_ac); if (status < 0) { mlog_errno(status); @@ -4856,10 +4937,9 @@ static void ocfs2_make_right_split_rec(struct super_block *sb, split_rec->e_flags = rec->e_flags; } -static int ocfs2_split_and_insert(struct inode *inode, - handle_t *handle, - struct ocfs2_path *path, +static int ocfs2_split_and_insert(handle_t *handle, struct ocfs2_extent_tree *et, + struct ocfs2_path *path, struct buffer_head **last_eb_bh, int split_index, struct ocfs2_extent_rec *orig_split_rec, @@ -4892,7 +4972,7 @@ leftright: if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, et, + ret = ocfs2_grow_tree(handle, et, &depth, last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); @@ -4921,8 +5001,8 @@ leftright: */ insert.ins_split = SPLIT_RIGHT; - ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, - &rec); + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &tmprec, insert_range, &rec); split_rec = tmprec; @@ -4930,7 +5010,7 @@ leftright: do_leftright = 1; } - ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); if (ret) { mlog_errno(ret); goto out; @@ -4946,7 +5026,7 @@ leftright: ocfs2_reinit_path(path, 1); cpos = le32_to_cpu(split_rec.e_cpos); - ret = ocfs2_find_path(inode, path, cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -4961,8 +5041,8 @@ out: return ret; } -static int ocfs2_replace_extent_rec(struct inode *inode, - handle_t *handle, +static int ocfs2_replace_extent_rec(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct 
ocfs2_extent_list *el, int split_index, @@ -4970,7 +5050,7 @@ static int ocfs2_replace_extent_rec(struct inode *inode, { int ret; - ret = ocfs2_path_bh_journal_access(handle, inode, path, + ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path, path_num_items(path) - 1); if (ret) { mlog_errno(ret); @@ -4985,9 +5065,8 @@ out: } /* - * Mark part or all of the extent record at split_index in the leaf - * pointed to by path as written. This removes the unwritten - * extent flag. + * Split part or all of the extent record at split_index in the leaf + * pointed to by path. Merge with the contiguous extent record if needed. * * Care is taken to handle contiguousness so as to not grow the tree. * @@ -5004,14 +5083,13 @@ out: * have been brought into cache (and pinned via the journal), so the * extra overhead is not expressed in terms of disk reads. */ -static int __ocfs2_mark_extent_written(struct inode *inode, - struct ocfs2_extent_tree *et, - handle_t *handle, - struct ocfs2_path *path, - int split_index, - struct ocfs2_extent_rec *split_rec, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) { int ret = 0; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -5020,12 +5098,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode, struct ocfs2_merge_ctxt ctxt; struct ocfs2_extent_list *rightmost_el; - if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) { - ret = -EIO; - mlog_errno(ret); - goto out; - } - if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { @@ -5034,19 +5106,19 @@ static int __ocfs2_mark_extent_written(struct inode *inode, goto 
out; } - ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, + ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, split_index, split_rec); /* * The core merge / split code wants to know how much room is - * left in this inodes allocation tree, so we pass the + * left in this allocation tree, so we pass the * rightmost extent list. */ if (path->p_tree_depth) { struct ocfs2_extent_block *eb; - ret = ocfs2_read_extent_block(inode, + ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); if (ret) { @@ -5073,19 +5145,18 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (ctxt.c_contig_type == CONTIG_NONE) { if (ctxt.c_split_covers_rec) - ret = ocfs2_replace_extent_rec(inode, handle, - path, el, + ret = ocfs2_replace_extent_rec(handle, et, path, el, split_index, split_rec); else - ret = ocfs2_split_and_insert(inode, handle, path, et, + ret = ocfs2_split_and_insert(handle, et, path, &last_eb_bh, split_index, split_rec, meta_ac); if (ret) mlog_errno(ret); } else { - ret = ocfs2_try_to_merge_extent(inode, handle, path, + ret = ocfs2_try_to_merge_extent(handle, et, path, split_index, split_rec, - dealloc, &ctxt, et); + dealloc, &ctxt); if (ret) mlog_errno(ret); } @@ -5096,46 +5167,31 @@ out: } /* - * Mark the already-existing extent at cpos as written for len clusters. + * Change the flags of the already-existing extent at cpos for len clusters. + * + * new_flags: the flags we want to set. + * clear_flags: the flags we want to clear. + * phys: the new physical offset we want this new extent starts from. * * If the existing extent is larger than the request, initiate a * split. An attempt will be made at merging with adjacent extents. * * The caller is responsible for passing down meta_ac if we'll need it. 
*/ -int ocfs2_mark_extent_written(struct inode *inode, - struct ocfs2_extent_tree *et, - handle_t *handle, u32 cpos, u32 len, u32 phys, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_cached_dealloc_ctxt *dealloc) +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags) { int ret, index; - u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); + u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys); struct ocfs2_extent_rec split_rec; struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el; - - mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n", - inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno); - - if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - ret = -EROFS; - goto out; - } - - /* - * XXX: This should be fixed up so that we just re-insert the - * next extent records. - * - * XXX: This is a hack on the extent tree, maybe it should be - * an op? 
- */ - if (et->et_ops == &ocfs2_dinode_et_ops) - ocfs2_extent_map_trunc(inode, 0); + struct ocfs2_extent_rec *rec; left_path = ocfs2_new_path_from_et(et); if (!left_path) { @@ -5144,7 +5200,7 @@ int ocfs2_mark_extent_written(struct inode *inode, goto out; } - ret = ocfs2_find_path(inode, left_path, cpos); + ret = ocfs2_find_path(et->et_ci, left_path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -5153,34 +5209,102 @@ int ocfs2_mark_extent_written(struct inode *inode, index = ocfs2_search_extent_list(el, cpos); if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " + ocfs2_error(sb, + "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); + (unsigned long long) + ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; goto out; } + ret = -EIO; + rec = &el->l_recs[index]; + if (new_flags && (rec->e_flags & new_flags)) { + mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " + "extent that already had them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + new_flags); + goto out; + } + + if (clear_flags && !(rec->e_flags & clear_flags)) { + mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " + "extent that didn't have them", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + clear_flags); + goto out; + } + memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); split_rec.e_cpos = cpu_to_le32(cpos); split_rec.e_leaf_clusters = cpu_to_le16(len); split_rec.e_blkno = cpu_to_le64(start_blkno); - split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; - split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; - - ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, - index, &split_rec, meta_ac, - dealloc); + split_rec.e_flags = rec->e_flags; + if (new_flags) + split_rec.e_flags |= new_flags; + if (clear_flags) + split_rec.e_flags &= ~clear_flags; + + ret = 
ocfs2_split_extent(handle, et, left_path, + index, &split_rec, meta_ac, + dealloc); if (ret) mlog_errno(ret); out: ocfs2_free_path(left_path); return ret; + } -static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, - handle_t *handle, struct ocfs2_path *path, +/* + * Mark the already-existing extent at cpos as written for len clusters. + * This removes the unwritten extent flag. + * + * If the existing extent is larger than the request, initiate a + * split. An attempt will be made at merging with adjacent extents. + * + * The caller is responsible for passing down meta_ac if we'll need it. + */ +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n", + inode->i_ino, cpos, len, phys); + + if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " + "that are being written to, but the feature bit " + "is not set in the super block.", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EROFS; + goto out; + } + + /* + * XXX: This should be fixed up so that we just re-insert the + * next extent records. 
+ */ + ocfs2_et_extent_map_truncate(et, 0); + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + 0, OCFS2_EXT_UNWRITTEN); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et, + struct ocfs2_path *path, int index, u32 new_range, struct ocfs2_alloc_context *meta_ac) { @@ -5197,11 +5321,12 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, */ el = path_leaf_el(path); rec = &el->l_recs[index]; - ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); + ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci), + &split_rec, new_range, rec); depth = path->p_tree_depth; if (depth > 0) { - ret = ocfs2_read_extent_block(inode, + ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); if (ret < 0) { @@ -5224,7 +5349,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, + ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); @@ -5238,7 +5363,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, insert.ins_split = SPLIT_RIGHT; insert.ins_tree_depth = depth; - ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); + ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert); if (ret) mlog_errno(ret); @@ -5247,23 +5372,23 @@ out: return ret; } -static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, +static int ocfs2_truncate_rec(handle_t *handle, + struct ocfs2_extent_tree *et, struct ocfs2_path *path, int index, struct ocfs2_cached_dealloc_ctxt *dealloc, - u32 cpos, u32 len, - struct ocfs2_extent_tree *et) + u32 cpos, u32 len) { int ret; u32 left_cpos, rec_range, trunc_range; int wants_rotate = 0, 
is_rightmost_tree_rec = 0; - struct super_block *sb = inode->i_sb; + struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el = path_leaf_el(path); struct ocfs2_extent_rec *rec; struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -5295,14 +5420,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, * by this leaf and the one to it's left. * * There are two cases we can skip: - * 1) Path is the leftmost one in our inode tree. + * 1) Path is the leftmost one in our btree. * 2) The leaf is rightmost and will be empty after * we remove the extent record - the rotate code * knows how to update the newly formed edge. */ - ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, - &left_cpos); + ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -5316,7 +5440,8 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_find_path(inode, left_path, left_cpos); + ret = ocfs2_find_path(et->et_ci, left_path, + left_cpos); if (ret) { mlog_errno(ret); goto out; @@ -5332,13 +5457,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, goto out; } - ret = ocfs2_journal_access_path(inode, handle, path); + ret = ocfs2_journal_access_path(et->et_ci, handle, path); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access_path(inode, handle, left_path); + ret = ocfs2_journal_access_path(et->et_ci, handle, left_path); if (ret) { mlog_errno(ret); goto out; @@ -5361,7 +5486,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, * be deleted by the rotate code. 
*/ rec = &el->l_recs[next_free - 1]; - ocfs2_adjust_rightmost_records(inode, handle, path, + ocfs2_adjust_rightmost_records(handle, et, path, rec); } } else if (le32_to_cpu(rec->e_cpos) == cpos) { @@ -5373,11 +5498,12 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, /* Remove rightmost portion of the record */ le16_add_cpu(&rec->e_leaf_clusters, -len); if (is_rightmost_tree_rec) - ocfs2_adjust_rightmost_records(inode, handle, path, rec); + ocfs2_adjust_rightmost_records(handle, et, path, rec); } else { /* Caller should have trapped this. */ - mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " - "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, + mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) " + "(%u, %u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), le32_to_cpu(rec->e_cpos), le16_to_cpu(rec->e_leaf_clusters), cpos, len); BUG(); @@ -5386,14 +5512,14 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, if (left_path) { int subtree_index; - subtree_index = ocfs2_find_subtree_root(inode, left_path, path); - ocfs2_complete_edge_insert(inode, handle, left_path, path, + subtree_index = ocfs2_find_subtree_root(et, left_path, path); + ocfs2_complete_edge_insert(handle, left_path, path, subtree_index); } ocfs2_journal_dirty(handle, path_leaf_bh(path)); - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); + ret = ocfs2_rotate_tree_left(handle, et, path, dealloc); if (ret) { mlog_errno(ret); goto out; @@ -5404,9 +5530,9 @@ out: return ret; } -int ocfs2_remove_extent(struct inode *inode, +int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et, - u32 cpos, u32 len, handle_t *handle, + u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) { @@ -5416,7 +5542,11 @@ int ocfs2_remove_extent(struct inode *inode, struct ocfs2_extent_list *el; struct ocfs2_path *path = NULL; - ocfs2_extent_map_trunc(inode, 0); + /* 
+ * XXX: Why are we truncating to 0 instead of wherever this + * affects us? + */ + ocfs2_et_extent_map_truncate(et, 0); path = ocfs2_new_path_from_et(et); if (!path) { @@ -5425,7 +5555,7 @@ int ocfs2_remove_extent(struct inode *inode, goto out; } - ret = ocfs2_find_path(inode, path, cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -5434,10 +5564,11 @@ int ocfs2_remove_extent(struct inode *inode, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5464,20 +5595,21 @@ int ocfs2_remove_extent(struct inode *inode, BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); - mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " + mlog(0, "Owner %llu, remove (cpos %u, len %u). 
Existing index %d " "(cpos %u, len %u)\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos, len, index, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { - ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len, et); + ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, + cpos, len); if (ret) { mlog_errno(ret); goto out; } } else { - ret = ocfs2_split_tree(inode, et, handle, path, index, + ret = ocfs2_split_tree(handle, et, path, index, trunc_range, meta_ac); if (ret) { mlog_errno(ret); @@ -5490,7 +5622,7 @@ int ocfs2_remove_extent(struct inode *inode, */ ocfs2_reinit_path(path, 1); - ret = ocfs2_find_path(inode, path, cpos); + ret = ocfs2_find_path(et->et_ci, path, cpos); if (ret) { mlog_errno(ret); goto out; @@ -5499,9 +5631,9 @@ int ocfs2_remove_extent(struct inode *inode, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { - ocfs2_error(inode->i_sb, - "Inode %llu: split at cpos %u lost record.", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu: split at cpos %u lost record.", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; goto out; @@ -5515,18 +5647,18 @@ int ocfs2_remove_extent(struct inode *inode, rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); if (rec_range != trunc_range) { - ocfs2_error(inode->i_sb, - "Inode %llu: error after split at cpos %u" + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu: error after split at cpos %u" "trunc len %u, existing record is (%u,%u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos, len, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = 
-EROFS; goto out; } - ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len, et); + ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, + cpos, len); if (ret) { mlog_errno(ret); goto out; @@ -5573,7 +5705,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto out; } - ret = ocfs2_et_root_journal_access(handle, inode, et, + ret = ocfs2_et_root_journal_access(handle, et, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -5583,14 +5715,13 @@ int ocfs2_remove_btree_range(struct inode *inode, vfs_dq_free_space_nodirty(inode, ocfs2_clusters_to_bytes(inode->i_sb, len)); - ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, - dealloc); + ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc); if (ret) { mlog_errno(ret); goto out_commit; } - ocfs2_et_update_clusters(inode, et, -len); + ocfs2_et_update_clusters(et, -len); ret = ocfs2_journal_dirty(handle, et->et_root_bh); if (ret) { @@ -5690,7 +5821,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -5752,7 +5883,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, while (i >= 0) { /* Caller has given us at least enough credits to * update the truncate log dinode */ - status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -6010,7 +6141,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, tl->tl_used = 0; ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); - status = ocfs2_write_block(osb, tl_bh, tl_inode); + status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode)); if (status < 0) { mlog_errno(status); goto bail; @@ -6400,9 +6531,9 @@ 
ocfs2_find_per_slot_free_list(int type, return fl; } -static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, - int type, int slot, u64 blkno, - unsigned int bit) +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 blkno, + unsigned int bit) { int ret; struct ocfs2_per_slot_free_list *fl; @@ -6518,7 +6649,7 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode, goto out; } - ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); + ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh); if (ret) { mlog_errno(ret); goto out; @@ -6551,7 +6682,7 @@ out: */ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, handle_t *handle, struct ocfs2_truncate_context *tc, - u32 clusters_to_del, u64 *delete_start) + u32 clusters_to_del, u64 *delete_start, u8 *flags) { int ret, i, index = path->p_tree_depth; u32 new_edge = 0; @@ -6561,6 +6692,7 @@ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, struct ocfs2_extent_rec *rec; *delete_start = 0; + *flags = 0; while (index >= 0) { bh = path->p_node[index].bh; @@ -6648,6 +6780,7 @@ find_tail_record: *delete_start = le64_to_cpu(rec->e_blkno) + ocfs2_clusters_to_blocks(inode->i_sb, le16_to_cpu(rec->e_leaf_clusters)); + *flags = rec->e_flags; /* * If it's now empty, remove this record. 
@@ -6719,7 +6852,7 @@ delete: mlog(0, "deleting this extent block.\n"); - ocfs2_remove_from_cache(inode, bh); + ocfs2_remove_from_cache(INODE_CACHE(inode), bh); BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); @@ -6747,7 +6880,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, struct buffer_head *fe_bh, handle_t *handle, struct ocfs2_truncate_context *tc, - struct ocfs2_path *path) + struct ocfs2_path *path, + struct ocfs2_alloc_context *meta_ac) { int status; struct ocfs2_dinode *fe; @@ -6755,6 +6889,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, struct ocfs2_extent_list *el; struct buffer_head *last_eb_bh = NULL; u64 delete_blk = 0; + u8 rec_flags; fe = (struct ocfs2_dinode *) fe_bh->b_data; @@ -6769,14 +6904,14 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, * Each component will be touched, so we might as well journal * here to avoid having to handle errors later. */ - status = ocfs2_journal_access_path(inode, handle, path); + status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path); if (status < 0) { mlog_errno(status); goto bail; } if (last_eb_bh) { - status = ocfs2_journal_access_eb(handle, inode, last_eb_bh, + status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -6810,7 +6945,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, inode->i_blocks = ocfs2_inode_sector_count(inode); status = ocfs2_trim_tree(inode, path, handle, tc, - clusters_to_del, &delete_blk); + clusters_to_del, &delete_blk, &rec_flags); if (status) { mlog_errno(status); goto bail; @@ -6842,8 +6977,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } if (delete_blk) { - status = ocfs2_truncate_log_append(osb, handle, delete_blk, - clusters_to_del); + if (rec_flags & OCFS2_EXT_REFCOUNTED) + status = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(osb->sb, + delete_blk), + clusters_to_del, 
meta_ac, + &tc->tc_dealloc, 1); + else + status = ocfs2_truncate_log_append(osb, handle, + delete_blk, + clusters_to_del); if (status < 0) { mlog_errno(status); goto bail; @@ -6863,9 +7006,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) return 0; } -static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, - unsigned int from, unsigned int to, - struct page *page, int zero, u64 *phys) +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys) { int ret, partial = 0; @@ -6933,20 +7076,16 @@ out: ocfs2_unlock_and_free_pages(pages, numpages); } -static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, - struct page **pages, int *num) +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) { int numpages, ret = 0; - struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned long index; loff_t last_page_bytes; BUG_ON(start > end); - BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != - (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); - numpages = 0; last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_CACHE_SHIFT; @@ -6974,6 +7113,17 @@ out: return ret; } +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num) +{ + struct super_block *sb = inode->i_sb; + + BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != + (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); + + return ocfs2_grab_pages(inode, start, end, pages, num); +} + /* * Zero the area past i_size but still within an allocated * cluster. 
This avoids exposing nonzero data on subsequent file @@ -7138,7 +7288,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, goto out_unlock; } - ret = ocfs2_journal_access_di(handle, inode, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -7218,9 +7368,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * this proves to be false, we could always re-build * the in-inode data from our pages. */ - ocfs2_init_dinode_extent_tree(&et, inode, di_bh); - ret = ocfs2_insert_extent(osb, handle, inode, &et, - 0, block, 1, 0, NULL); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); goto out_commit; @@ -7262,11 +7411,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, { int status, i, credits, tl_sem = 0; u32 clusters_to_del, new_highest_cpos, range; + u64 blkno = 0; struct ocfs2_extent_list *el; handle_t *handle = NULL; struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_path *path = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_refcount_tree *ref_tree = NULL; mlog_entry_void(); @@ -7292,10 +7444,12 @@ start: goto bail; } + credits = 0; + /* * Truncate always works against the rightmost tree branch. 
*/ - status = ocfs2_find_path(inode, path, UINT_MAX); + status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX); if (status) { mlog_errno(status); goto bail; @@ -7332,10 +7486,15 @@ start: clusters_to_del = 0; } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); + blkno = le64_to_cpu(el->l_recs[i].e_blkno); } else if (range > new_highest_cpos) { clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + le32_to_cpu(el->l_recs[i].e_cpos)) - new_highest_cpos; + blkno = le64_to_cpu(el->l_recs[i].e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, + ocfs2_rec_clusters(el, &el->l_recs[i]) - + clusters_to_del); } else { status = 0; goto bail; @@ -7344,6 +7503,29 @@ start: mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); + if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) { + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & + OCFS2_HAS_REFCOUNT_FL)); + + status = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, NULL); + if (status) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh, + blkno, + clusters_to_del, + &credits, + &meta_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + mutex_lock(&tl_inode->i_mutex); tl_sem = 1; /* ocfs2_truncate_log_needs_flush guarantees us at least one @@ -7357,7 +7539,7 @@ start: } } - credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, + credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, (struct ocfs2_dinode *)fe_bh->b_data, el); handle = ocfs2_start_trans(osb, credits); @@ -7369,7 +7551,7 @@ start: } status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, - tc, path); + tc, path, meta_ac); if (status < 0) { mlog_errno(status); goto bail; @@ -7383,6 +7565,16 @@ start: ocfs2_reinit_path(path, 1); + if (meta_ac) { + 
ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + + if (ref_tree) { + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + ref_tree = NULL; + } + /* * The check above will catch the case where we've truncated * away all allocation. @@ -7399,6 +7591,12 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + ocfs2_run_deallocs(osb, &tc->tc_dealloc); ocfs2_free_path(path); @@ -7445,7 +7643,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_extent_block(inode, + status = ocfs2_read_extent_block(INODE_CACHE(inode), le64_to_cpu(fe->i_last_eb_blk), &last_eb_bh); if (status < 0) { @@ -7507,7 +7705,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, goto out; } - ret = ocfs2_journal_access_di(handle, inode, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 353254ba29e..9c122d57446 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -45,7 +45,8 @@ * * ocfs2_extent_tree contains info for the root of the b-tree, it must have a * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree - * functions. With metadata ecc, we now call different journal_access + * functions. It needs the ocfs2_caching_info structure associated with + * I/O on the tree. With metadata ecc, we now call different journal_access * functions for each type of metadata, so it must have the * root_journal_access function. 
* ocfs2_extent_tree_operations abstract the normal operations we do for @@ -56,6 +57,7 @@ struct ocfs2_extent_tree { struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; + struct ocfs2_caching_info *et_ci; ocfs2_journal_access_func et_root_journal_access; void *et_object; unsigned int et_max_leaf_clusters; @@ -66,31 +68,32 @@ struct ocfs2_extent_tree { * specified object buffer. */ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh); void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh); struct ocfs2_xattr_value_buf; void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct ocfs2_xattr_value_buf *vb); void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh); +void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ci, + struct buffer_head *bh); /* * Read an extent block into *bh. If *bh is NULL, a bh will be * allocated. This is a cached read. The extent block will be validated * with ocfs2_validate_extent_block(). 
*/ -int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, +int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, struct buffer_head **bh); struct ocfs2_alloc_context; -int ocfs2_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, +int ocfs2_insert_extent(handle_t *handle, struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, @@ -103,25 +106,36 @@ enum ocfs2_alloc_restarted { RESTART_TRANS, RESTART_META }; -int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, - struct inode *inode, +int ocfs2_add_clusters_in_btree(handle_t *handle, + struct ocfs2_extent_tree *et, u32 *logical_offset, u32 clusters_to_add, int mark_unwritten, - struct ocfs2_extent_tree *et, - handle_t *handle, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); struct ocfs2_cached_dealloc_ctxt; +struct ocfs2_path; +int ocfs2_split_extent(handle_t *handle, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + int split_index, + struct ocfs2_extent_rec *split_rec, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_mark_extent_written(struct inode *inode, struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); -int ocfs2_remove_extent(struct inode *inode, - struct ocfs2_extent_tree *et, - u32 cpos, u32 len, handle_t *handle, +int ocfs2_change_extent_flag(handle_t *handle, + struct ocfs2_extent_tree *et, + u32 cpos, u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int new_flags, int clear_flags); +int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et, + u32 cpos, u32 len, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_remove_btree_range(struct inode *inode, @@ -130,7 +144,6 @@ int ocfs2_remove_btree_range(struct 
inode *inode, struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct inode *inode, struct ocfs2_extent_tree *et); /* @@ -195,6 +208,9 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) } int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, u64 blkno, unsigned int bit); +int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + int type, int slot, u64 blkno, + unsigned int bit); static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) { return c->c_global_allocator != NULL; @@ -222,8 +238,9 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, unsigned int start, unsigned int end, int trunc); -int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, - u32 cpos, struct buffer_head **leaf_bh); +int ocfs2_find_leaf(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *root_el, u32 cpos, + struct buffer_head **leaf_bh); int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); /* @@ -254,4 +271,50 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) return !rec->e_leaf_clusters; } +int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, + struct page **pages, int *num); +void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, + unsigned int from, unsigned int to, + struct page *page, int zero, u64 *phys); +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. 
+ */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; + +#define OCFS2_MAX_PATH_DEPTH 5 + +struct ocfs2_path { + int p_tree_depth; + ocfs2_journal_access_func p_root_access; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; + +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_root_access(_path)((_path)->p_root_access) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) + +void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root); +void ocfs2_free_path(struct ocfs2_path *path); +int ocfs2_find_path(struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + u32 cpos); +struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path); +struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et); +int ocfs2_path_bh_journal_access(handle_t *handle, + struct ocfs2_caching_info *ci, + struct ocfs2_path *path, + int idx); +int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path); #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8a1e61545f4..deb2b132ae5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -44,6 +44,7 @@ #include "suballoc.h" #include "super.h" #include "symlink.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -126,8 +127,8 @@ bail: return err; } -static int ocfs2_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { int err = 0; unsigned int ext_flags; @@ -590,6 +591,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } + /* We should already CoW the refcounted extent. 
*/ + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); /* * get_more_blocks() expects us to describe a hole by clearing * the mapped bit on bh_result(). @@ -687,6 +690,10 @@ static ssize_t ocfs2_direct_IO(int rw, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; + /* Fallback to buffered I/O if we are appending. */ + if (i_size_read(inode) <= offset) + return 0; + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, @@ -1259,7 +1266,8 @@ static int ocfs2_write_cluster(struct address_space *mapping, goto out; } } else if (unwritten) { - ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); ret = ocfs2_mark_extent_written(inode, &et, wc->w_handle, cpos, 1, phys, meta_ac, &wc->w_dealloc); @@ -1448,6 +1456,9 @@ static int ocfs2_populate_write_desc(struct inode *inode, goto out; } + /* We should already CoW the refcountd extent. */ + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + /* * Assume worst case - that we're writing in * the middle of the extent. 
@@ -1528,7 +1539,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, goto out; } - ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { ocfs2_commit_trans(osb, handle); @@ -1699,6 +1710,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, goto out; } + ret = ocfs2_check_range_for_refcount(inode, pos, len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } else if (ret == 1) { + ret = ocfs2_refcount_cow(inode, di_bh, + wc->w_cpos, wc->w_clen, UINT_MAX); + if (ret) { + mlog_errno(ret); + goto out; + } + } + ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, &extents_to_split); if (ret) { @@ -1726,7 +1750,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), clusters_to_alloc, extents_to_split); - ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), + wc->w_di_bh); ret = ocfs2_lock_allocators(inode, &et, clusters_to_alloc, extents_to_split, &data_ac, &meta_ac); @@ -1773,7 +1798,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * We don't want this to fail in ocfs2_write_end(), so do it * here. 
*/ - ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1997,4 +2022,5 @@ const struct address_space_operations ocfs2_aops = { .releasepage = ocfs2_releasepage, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 503e49232e1..c48e93ffc51 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -57,6 +57,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct buffer_head *di_bh); int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); +int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); /* all ocfs2_dio_end_io()'s fault */ #define ocfs2_iocb_is_rw_locked(iocb) \ test_bit(0, (unsigned long *)&iocb->private) diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 15c8e6deee2..d43d34a1dd3 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -52,12 +52,12 @@ enum ocfs2_state_bits { BUFFER_FNS(NeedsValidate, needs_validate); int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, - struct inode *inode) + struct ocfs2_caching_info *ci) { int ret = 0; - mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n", - (unsigned long long)bh->b_blocknr, inode); + mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n", + (unsigned long long)bh->b_blocknr, ci); BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); BUG_ON(buffer_jbd(bh)); @@ -70,7 +70,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, goto out; } - mutex_lock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_lock(ci); lock_buffer(bh); set_buffer_uptodate(bh); @@ -85,7 +85,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, wait_on_buffer(bh); if 
(buffer_uptodate(bh)) { - ocfs2_set_buffer_uptodate(inode, bh); + ocfs2_set_buffer_uptodate(ci, bh); } else { /* We don't need to remove the clustered uptodate * information for this bh as it's not marked locally @@ -94,7 +94,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, put_bh(bh); } - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_unlock(ci); out: mlog_exit(ret); return ret; @@ -177,7 +177,7 @@ bail: return status; } -int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, +int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, struct buffer_head *bh)) @@ -185,11 +185,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, int status = 0; int i, ignore_cache = 0; struct buffer_head *bh; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); - mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n", - inode, (unsigned long long)block, nr, flags); + mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n", + ci, (unsigned long long)block, nr, flags); - BUG_ON(!inode); + BUG_ON(!ci); BUG_ON((flags & OCFS2_BH_READAHEAD) && (flags & OCFS2_BH_IGNORE_CACHE)); @@ -212,12 +213,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, goto bail; } - mutex_lock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_lock(ci); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { - bhs[i] = sb_getblk(inode->i_sb, block++); + bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_unlock(ci); status = -EIO; mlog_errno(status); goto bail; @@ -250,11 +251,11 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, * before our is-it-in-flight check. 
*/ - if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) { + if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { mlog(ML_UPTODATE, - "bh (%llu), inode %llu not uptodate\n", + "bh (%llu), owner %llu not uptodate\n", (unsigned long long)bh->b_blocknr, - (unsigned long long)OCFS2_I(inode)->ip_blkno); + (unsigned long long)ocfs2_metadata_cache_owner(ci)); /* We're using ignore_cache here to say * "go to disk" */ ignore_cache = 1; @@ -283,7 +284,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, * previously submitted request than we are * done here. */ if ((flags & OCFS2_BH_READAHEAD) - && ocfs2_buffer_read_ahead(inode, bh)) + && ocfs2_buffer_read_ahead(ci, bh)) continue; lock_buffer(bh); @@ -305,7 +306,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, * buffer lock. */ if (!(flags & OCFS2_BH_IGNORE_CACHE) && !(flags & OCFS2_BH_READAHEAD) - && ocfs2_buffer_uptodate(inode, bh)) { + && ocfs2_buffer_uptodate(ci, bh)) { unlock_buffer(bh); continue; } @@ -327,7 +328,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, if (!(flags & OCFS2_BH_READAHEAD)) { /* We know this can't have changed as we hold the - * inode sem. Avoid doing any work on the bh if the + * owner sem. Avoid doing any work on the bh if the * journal has it. */ if (!buffer_jbd(bh)) wait_on_buffer(bh); @@ -351,7 +352,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, * that better not have changed */ BUG_ON(buffer_jbd(bh)); clear_buffer_needs_validate(bh); - status = validate(inode->i_sb, bh); + status = validate(sb, bh); if (status) { put_bh(bh); bhs[i] = NULL; @@ -363,9 +364,9 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. 
*/ - ocfs2_set_buffer_uptodate(inode, bh); + ocfs2_set_buffer_uptodate(ci, bh); } - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_unlock(ci); mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", (unsigned long long)block, nr, @@ -399,7 +400,7 @@ static void ocfs2_check_super_or_backup(struct super_block *sb, /* * Write super block and backups doesn't need to collaborate with journal, - * so we don't need to lock ip_io_mutex and inode doesn't need to bea passed + * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed * into this function. */ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index c75d682dadd..b97bcc6dde7 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -33,7 +33,7 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh, int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, - struct inode *inode); + struct ocfs2_caching_info *ci); int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int nr, struct buffer_head *bhs[]); @@ -44,7 +44,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, * be set even for a READAHEAD call, as it marks the buffer for later * validation. 
*/ -int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, +int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, struct buffer_head *bh)); @@ -55,7 +55,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, #define OCFS2_BH_IGNORE_CACHE 1 #define OCFS2_BH_READAHEAD 8 -static inline int ocfs2_read_block(struct inode *inode, u64 off, +static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off, struct buffer_head **bh, int (*validate)(struct super_block *sb, struct buffer_head *bh)) @@ -68,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off, goto bail; } - status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate); + status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate); bail: return status; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 09cc25d0461..c452d116b89 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf, } #endif /* CONFIG_DEBUG_FS */ -static struct file_operations o2hb_debug_fops = { +static const struct file_operations o2hb_debug_fops = { .open = o2hb_debug_open, .release = o2hb_debug_release, .read = o2hb_debug_read, diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 96df5416993..1cd2934de61 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -111,6 +111,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(EXPORT), define_mask(XATTR), define_mask(QUOTA), + define_mask(REFCOUNT), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 696c32e5071..9b4d11726cf 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -113,6 +113,7 @@ #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations 
*/ #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ #define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ +#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index f8424874fa0..da794bc07a6 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -163,7 +163,7 @@ static void nst_seq_stop(struct seq_file *seq, void *v) { } -static struct seq_operations nst_seq_ops = { +static const struct seq_operations nst_seq_ops = { .start = nst_seq_start, .next = nst_seq_next, .stop = nst_seq_stop, @@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations nst_seq_fops = { +static const struct file_operations nst_seq_fops = { .open = nst_fop_open, .read = seq_read, .llseek = seq_lseek, @@ -344,7 +344,7 @@ static void sc_seq_stop(struct seq_file *seq, void *v) { } -static struct seq_operations sc_seq_ops = { +static const struct seq_operations sc_seq_ops = { .start = sc_seq_start, .next = sc_seq_next, .stop = sc_seq_stop, @@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations sc_seq_fops = { +static const struct file_operations sc_seq_fops = { .open = sc_fop_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index b358f3bf896..28c3ec23879 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -176,7 +176,7 @@ static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle, struct ocfs2_dx_root_block *dx_root; struct ocfs2_dir_block_trailer *trailer; - ret = 
ocfs2_journal_access_dr(handle, dir, dx_root_bh, + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -564,7 +564,8 @@ static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys, int ret; struct buffer_head *tmp = *bh; - ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block); + ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp, + ocfs2_validate_dir_block); if (ret) { mlog_errno(ret); goto out; @@ -622,7 +623,8 @@ static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, u64 blkno = le64_to_cpu(di->i_dx_root); struct buffer_head *tmp = *dx_root_bh; - ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root); + ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, + ocfs2_validate_dx_root); /* If ocfs2_read_block() got us a new bh, pass it up. */ if (!ret && !*dx_root_bh) @@ -662,7 +664,8 @@ static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, int ret; struct buffer_head *tmp = *dx_leaf_bh; - ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf); + ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp, + ocfs2_validate_dx_leaf); /* If ocfs2_read_block() got us a new bh, pass it up. 
*/ if (!ret && !*dx_leaf_bh) @@ -680,7 +683,7 @@ static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num, { int ret; - ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0, + ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0, ocfs2_validate_dx_leaf); if (ret) mlog_errno(ret); @@ -802,7 +805,8 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, struct ocfs2_extent_rec *rec = NULL; if (el->l_tree_depth) { - ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh); + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash, + &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -1133,7 +1137,8 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle, if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) access = ocfs2_journal_access_di; - ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE); + ret = access(handle, INODE_CACHE(dir), de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -1176,7 +1181,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, goto bail; } if (de == de_del) { - status = access(handle, dir, bh, + status = access(handle, INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { status = -EIO; @@ -1326,7 +1331,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, * the entry count needs to be updated. Also, we might be * adding to the start of the free list. 
*/ - ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1334,7 +1339,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir, } if (!ocfs2_dx_root_inline(dx_root)) { - ret = ocfs2_journal_access_dl(handle, dir, + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), lookup->dl_dx_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -1493,7 +1498,7 @@ static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle, int ret; struct ocfs2_dx_leaf *dx_leaf; - ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1523,7 +1528,7 @@ static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle, struct ocfs2_dx_root_block *dx_root; struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; - ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1645,11 +1650,13 @@ int __ocfs2_add_entry(handle_t *handle, */ if (ocfs2_free_list_at_root(lookup)) { bh = lookup->dl_dx_root_bh; - retval = ocfs2_journal_access_dr(handle, dir, bh, + retval = ocfs2_journal_access_dr(handle, + INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_WRITE); } else { bh = lookup->dl_prev_leaf_bh; - retval = ocfs2_journal_access_db(handle, dir, bh, + retval = ocfs2_journal_access_db(handle, + INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_WRITE); } if (retval) { @@ -1700,11 +1707,13 @@ int __ocfs2_add_entry(handle_t *handle, } if (insert_bh == parent_fe_bh) - status = ocfs2_journal_access_di(handle, dir, + status = ocfs2_journal_access_di(handle, + INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); else { - status = ocfs2_journal_access_db(handle, dir, + status = 
ocfs2_journal_access_db(handle, + INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -2280,7 +2289,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, struct ocfs2_inline_data *data = &di->id2.i_data; unsigned int size = le16_to_cpu(data->id_count); - ret = ocfs2_journal_access_di(handle, inode, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -2332,9 +2341,9 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, goto bail; } - ocfs2_set_new_buffer_uptodate(inode, new_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); - status = ocfs2_journal_access_db(handle, inode, new_bh, + status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -2418,9 +2427,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, ret = -EIO; goto out; } - ocfs2_set_new_buffer_uptodate(dir, dx_root_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh); - ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); @@ -2454,7 +2463,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, if (ret) mlog_errno(ret); - ret = ocfs2_journal_access_di(handle, dir, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); @@ -2495,9 +2504,9 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb, } dx_leaves[i] = bh; - ocfs2_set_new_buffer_uptodate(dir, bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh); - ret = ocfs2_journal_access_dl(handle, dir, bh, + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); @@ -2582,7 +2591,6 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir, { 
int ret; u64 phys_blkno; - struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, num_dx_leaves, &phys_blkno); @@ -2591,7 +2599,7 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir, goto out; } - ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0, + ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0, meta_ac); if (ret) mlog_errno(ret); @@ -2895,7 +2903,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, struct ocfs2_extent_tree dx_et; int did_quota = 0, bytes_allocated = 0; - ocfs2_init_dinode_extent_tree(&et, dir, di_bh); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh); alloc = ocfs2_clusters_for_bytes(sb, bytes); dx_alloc = 0; @@ -3005,9 +3013,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out_commit; } - ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh); - ret = ocfs2_journal_access_db(handle, dir, dirdata_bh, + ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); @@ -3060,7 +3068,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * We let the later dirent insert modify c/mtime - to the user * the data hasn't changed. */ - ret = ocfs2_journal_access_di(handle, dir, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); @@ -3085,7 +3093,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * This should never fail as our extent list is empty and all * related blocks have been journaled already. 
*/ - ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len, + ret = ocfs2_insert_extent(handle, &et, 0, blkno, len, 0, NULL); if (ret) { mlog_errno(ret); @@ -3117,8 +3125,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_dx_dir_index_root_block(dir, dx_root_bh, dirdata_bh); } else { - ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh); - ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0, + ocfs2_init_dx_root_extent_tree(&dx_et, + INODE_CACHE(dir), + dx_root_bh); + ret = ocfs2_insert_extent(handle, &dx_et, 0, dx_insert_blkno, 1, 0, NULL); if (ret) mlog_errno(ret); @@ -3138,7 +3148,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, } blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); - ret = ocfs2_insert_extent(osb, handle, dir, &et, 1, + ret = ocfs2_insert_extent(handle, &et, 1, blkno, len, 0, NULL); if (ret) { mlog_errno(ret); @@ -3337,8 +3347,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_lock(&OCFS2_I(dir)->ip_lock); if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { spin_unlock(&OCFS2_I(dir)->ip_lock); - ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh); - num_free_extents = ocfs2_num_free_extents(osb, dir, &et); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), + parent_fe_bh); + num_free_extents = ocfs2_num_free_extents(osb, &et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); @@ -3387,9 +3398,9 @@ do_extend: goto bail; } - ocfs2_set_new_buffer_uptodate(dir, new_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh); - status = ocfs2_journal_access_db(handle, dir, new_bh, + status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -3829,7 +3840,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)leaf_blkno, 
insert_hash); - ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; /* @@ -3885,7 +3896,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, } did_quota = 1; - ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -3949,7 +3960,8 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir, } for (i = 0; i < num_dx_leaves; i++) { - ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i], + ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), + orig_dx_leaves[i], OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4165,7 +4177,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, * failure to add the dx_root_bh to the journal won't result * us losing clusters. */ - ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, + ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4207,9 +4219,8 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir, /* This should never fail considering we start with an empty * dx_root. 
*/ - ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); - ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, - insert_blkno, 1, 0, NULL); + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); + ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL); if (ret) mlog_errno(ret); did_quota = 0; @@ -4469,7 +4480,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, goto out_unlock; } - ret = ocfs2_journal_access_di(handle, dir, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4532,7 +4543,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) if (ocfs2_dx_root_inline(dx_root)) goto remove_index; - ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); + ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); /* XXX: What if dr_clusters is too large? */ while (le32_to_cpu(dx_root->dr_clusters)) { @@ -4565,7 +4576,7 @@ remove_index: goto out; } - ocfs2_remove_from_cache(dir, dx_root_bh); + ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh); out: ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &dealloc); diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 81eff8e5832..01cf8cc3d28 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 75997b4deaf..ca96bce50e1 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index df52f706f66..42b0bad7a61 
100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -27,7 +27,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/sysctl.h> #include <linux/spinlock.h> #include <linux/debugfs.h> @@ -479,7 +478,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_purgelist_fops = { +static const struct file_operations debug_purgelist_fops = { .open = debug_purgelist_open, .release = debug_buffer_release, .read = debug_buffer_read, @@ -539,7 +538,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_mle_fops = { +static const struct file_operations debug_mle_fops = { .open = debug_mle_open, .release = debug_buffer_release, .read = debug_buffer_read, @@ -683,7 +682,7 @@ static int lockres_seq_show(struct seq_file *s, void *v) return 0; } -static struct seq_operations debug_lockres_ops = { +static const struct seq_operations debug_lockres_ops = { .start = lockres_seq_start, .stop = lockres_seq_stop, .next = lockres_seq_next, @@ -742,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static struct file_operations debug_lockres_fops = { +static const struct file_operations debug_lockres_fops = { .open = debug_lockres_open, .release = debug_lockres_release, .read = seq_read, @@ -926,7 +925,7 @@ bail: return -ENOMEM; } -static struct file_operations debug_state_fops = { +static const struct file_operations debug_state_fops = { .open = debug_state_open, .release = debug_buffer_release, .read = debug_buffer_read, diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 4d9e6b288dd..0334000676d 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -28,7 +28,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/delay.h> diff --git 
a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 83a9f2972ac..437698e9465 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index f8b653fcd4d..83bcaf266b3 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 43e6e328056..d9fa3d22e17 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index d490b66ad9d..52ec020ea78 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> @@ -212,14 +211,18 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, spin_lock(&dlm->spinlock); } + spin_lock(&res->spinlock); if (!list_empty(&res->purge)) { mlog(0, "removing lockres %.*s:%p from purgelist, " "master = %d\n", res->lockname.len, res->lockname.name, res, master); list_del_init(&res->purge); + spin_unlock(&res->spinlock); dlm_lockres_put(res); dlm->purge_count--; - } + } else + spin_unlock(&res->spinlock); + __dlm_unhash_lockres(res); /* lockres is not in the hash now. 
drop the flag and wake up diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 756f5b0998e..00f53b2aea7 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -30,7 +30,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/random.h> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 110bb57c46a..0d38d67194c 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -53,6 +53,7 @@ #include "super.h" #include "uptodate.h" #include "quota.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -110,6 +111,11 @@ static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) /* This aids in debugging situations where a bad LVB might be involved. 
*/ @@ -278,6 +284,12 @@ static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, }; +static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = { + .check_downconvert = ocfs2_check_refcount_downconvert, + .downconvert_worker = ocfs2_refcount_convert_worker, + .flags = 0, +}; + static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || @@ -306,6 +318,12 @@ static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_re return (struct ocfs2_mem_dqinfo *)lockres->l_priv; } +static inline struct ocfs2_refcount_tree * +ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res) +{ + return container_of(res, struct ocfs2_refcount_tree, rf_lockres); +} + static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) { if (lockres->l_ops->get_osb) @@ -693,6 +711,17 @@ void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, info); } +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno, + unsigned int generation) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno, + generation, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT, + &ocfs2_refcount_block_lops, osb); +} + void ocfs2_lock_res_free(struct ocfs2_lock_res *res) { mlog_entry_void(); @@ -1548,8 +1577,10 @@ int ocfs2_rw_lock(struct inode *inode, int write) (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); - if (ocfs2_mount_local(osb)) + if (ocfs2_mount_local(osb)) { + mlog_exit(0); return 0; + } lockres = &OCFS2_I(inode)->ip_rw_lockres; @@ -2127,7 +2158,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, /* This will discard any caching information we might have had * for the inode metadata. 
*/ - ocfs2_metadata_cache_purge(inode); + ocfs2_metadata_cache_purge(INODE_CACHE(inode)); ocfs2_extent_map_trunc(inode, 0); @@ -3009,6 +3040,7 @@ static void ocfs2_unlock_ast(void *opaque, int error) "unlock_action %d\n", error, lockres->l_name, lockres->l_unlock_action); spin_unlock_irqrestore(&lockres->l_lock, flags); + mlog_exit_void(); return; } @@ -3495,11 +3527,11 @@ out: return UNBLOCK_CONTINUE; } -static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, - int new_level) +static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci, + struct ocfs2_lock_res *lockres, + int new_level) { - struct inode *inode = ocfs2_lock_res_inode(lockres); - int checkpointed = ocfs2_inode_fully_checkpointed(inode); + int checkpointed = ocfs2_ci_fully_checkpointed(ci); BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); @@ -3507,10 +3539,18 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, if (checkpointed) return 1; - ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb)); + ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci))); return 0; } +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level); +} + static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres) { struct inode *inode = ocfs2_lock_res_inode(lockres); @@ -3640,6 +3680,26 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, return UNBLOCK_CONTINUE_POST; } +static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + struct ocfs2_refcount_tree *tree = + ocfs2_lock_res_refcount_tree(lockres); + + return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level); +} + +static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct 
ocfs2_refcount_tree *tree = + ocfs2_lock_res_refcount_tree(lockres); + + ocfs2_metadata_cache_purge(&tree->rf_ci); + + return UNBLOCK_CONTINUE; +} + static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) { struct ocfs2_qinfo_lvb *lvb; @@ -3752,6 +3812,37 @@ bail: return status; } +int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex) +{ + int status; + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres; + struct ocfs2_super *osb = lockres->l_priv; + + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} + +void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex) +{ + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres; + struct ocfs2_super *osb = lockres->l_priv; + + if (!ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); +} + /* * This is the filesystem locking protocol. It provides the lock handling * hooks for the underlying DLM. It has a maximum version number. 
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 7553836931d..d1ce48e1b3d 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -101,6 +101,9 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_mem_dqinfo; void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_mem_dqinfo *info); +void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_super *osb, u64 ref_blkno, + unsigned int generation); void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); @@ -148,6 +151,9 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock); void ocfs2_file_unlock(struct file *file); int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); +struct ocfs2_refcount_tree; +int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); +void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f2bb1a04d25..843db64e9d4 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); + ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -353,11 +353,11 @@ static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, * eb_bh is NULL. Otherwise, eb_bh should point to the extent block * containing el. 
*/ -static int ocfs2_figure_hole_clusters(struct inode *inode, - struct ocfs2_extent_list *el, - struct buffer_head *eb_bh, - u32 v_cluster, - u32 *num_clusters) +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters) { int ret, i; struct buffer_head *next_eb_bh = NULL; @@ -375,7 +375,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode, if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) goto no_more_extents; - ret = ocfs2_read_extent_block(inode, + ret = ocfs2_read_extent_block(ci, le64_to_cpu(eb->h_next_leaf_blk), &next_eb_bh); if (ret) { @@ -428,7 +428,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, tree_height = le16_to_cpu(el->l_tree_depth); if (tree_height > 0) { - ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -455,7 +456,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, * field. 
*/ if (hole_len) { - ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, + ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode), + el, eb_bh, v_cluster, &len); if (ret) { mlog_errno(ret); @@ -539,7 +541,8 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb, int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, - struct ocfs2_extent_list *el) + struct ocfs2_extent_list *el, + unsigned int *extent_flags) { int ret = 0, i; struct buffer_head *eb_bh = NULL; @@ -548,7 +551,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 coff; if (el->l_tree_depth) { - ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -590,6 +594,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, *p_cluster = *p_cluster + coff; if (num_clusters) *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + + if (extent_flags) + *extent_flags = rec->e_flags; } out: if (eb_bh) @@ -862,8 +869,8 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); } - rc = ocfs2_read_blocks(inode, p_block, count, bhs + done, - flags, validate); + rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count, + bhs + done, flags, validate); if (rc) { mlog_errno(rc); break; diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index b7dd9731b46..e79d41c2c90 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -55,12 +55,18 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, - struct ocfs2_extent_list *el); + struct ocfs2_extent_list *el, + unsigned int *extent_flags); int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, 
struct buffer_head *bh)); +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters); static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, struct buffer_head **bh, int (*validate)(struct super_block *sb, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 221c5e98957..89fc8ee1f5a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -59,6 +59,7 @@ #include "xattr.h" #include "acl.h" #include "quota.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -259,7 +260,7 @@ int ocfs2_update_inode_atime(struct inode *inode, goto out; } - ret = ocfs2_journal_access_di(handle, inode, bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -334,6 +335,39 @@ out: return ret; } +static int ocfs2_cow_file_pos(struct inode *inode, + struct buffer_head *fe_bh, + u64 offset) +{ + int status; + u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + /* + * If the new offset is aligned to the range of the cluster, there is + * no space for ocfs2_zero_range_for_truncate to fill, so no need to + * CoW either. 
+ */ + if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0) + return 0; + + status = ocfs2_get_clusters(inode, cpos, &phys, + &num_clusters, &ext_flags); + if (status) { + mlog_errno(status); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); + +out: + return status; +} + static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh, @@ -346,6 +380,17 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, mlog_entry_void(); + /* + * We need to CoW the cluster contains the offset if it is reflinked + * since we will call ocfs2_zero_range_for_truncate later which will + * write "0" from offset to the end of the cluster. + */ + status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size); + if (status) { + mlog_errno(status); + return status; + } + /* TODO: This needs to actually orphan the inode in this * transaction. */ @@ -356,7 +401,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, goto out; } - status = ocfs2_journal_access_di(handle, inode, fe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -486,6 +531,8 @@ bail_unlock_sem: up_write(&OCFS2_I(inode)->ip_alloc_sem); bail: + if (!status && OCFS2_I(inode)->ip_clusters == 0) + status = ocfs2_try_remove_refcount_tree(inode, di_bh); mlog_exit(status); return status; @@ -515,11 +562,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, int ret; struct ocfs2_extent_tree et; - ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); - ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, - clusters_to_add, mark_unwritten, - &et, handle, - data_ac, meta_ac, reason_ret); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh); + ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset, + clusters_to_add, mark_unwritten, + data_ac, meta_ac, reason_ret); 
return ret; } @@ -564,7 +610,7 @@ restart_all: (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), clusters_to_add); - ocfs2_init_dinode_extent_tree(&et, inode, bh); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, &data_ac, &meta_ac); if (status) { @@ -593,7 +639,7 @@ restarted_transaction: /* reserve a write to the file entry early on - that we if we * run out of credits in the allocation path, we can still * update i_size. */ - status = ocfs2_journal_access_di(handle, inode, bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1131,7 +1177,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode, goto out; } - ret = ocfs2_journal_access_di(handle, inode, bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); @@ -1395,7 +1441,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, struct address_space *mapping = inode->i_mapping; struct ocfs2_extent_tree et; - ocfs2_init_dinode_extent_tree(&et, inode, di_bh); + ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); ocfs2_init_dealloc_ctxt(&dealloc); if (byte_len == 0) @@ -1657,6 +1703,70 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, OCFS2_IOC_RESVSP64, &sr, change_size); } +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || + !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + return 0; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = 
ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + +static int ocfs2_prepare_inode_for_refcount(struct inode *inode, + loff_t pos, size_t count, + int *meta_level) +{ + int ret; + struct buffer_head *di_bh = NULL; + u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 clusters = + ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + *meta_level = 1; + + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); + if (ret) + mlog_errno(ret); +out: + brelse(di_bh); + return ret; +} + static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, @@ -1713,6 +1823,22 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, end = saved_pos + count; + ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); + if (ret == 1) { + ocfs2_inode_unlock(inode, meta_level); + meta_level = -1; + + ret = ocfs2_prepare_inode_for_refcount(inode, + saved_pos, + count, + &meta_level); + } + + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + /* * Skip the O_DIRECT checks if we don't need * them. 
@@ -1759,7 +1885,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, *ppos = saved_pos; out_unlock: - ocfs2_inode_unlock(inode, meta_level); + if (meta_level >= 0) + ocfs2_inode_unlock(inode, meta_level); out: return ret; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 172f9fbc9fc..d66cf4f7c70 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -69,4 +69,6 @@ int ocfs2_update_inode_atime(struct inode *inode, int ocfs2_change_file_space(struct file *file, unsigned int cmd, struct ocfs2_space_resv *sr); +int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, + size_t count); #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 4dc8890ba31..0297fb8982b 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -53,6 +53,7 @@ #include "sysfile.h" #include "uptodate.h" #include "xattr.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -562,7 +563,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, goto out; } - status = ocfs2_journal_access_di(handle, inode, fe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -646,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode, } /* set the inodes dtime */ - status = ocfs2_journal_access_di(handle, inode, di_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -662,7 +664,7 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail_commit; } - ocfs2_remove_from_cache(inode, di_bh); + ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); vfs_dq_free_inode(inode); status = ocfs2_free_dinode(handle, inode_alloc_inode, @@ -781,6 +783,12 @@ static int ocfs2_wipe_inode(struct inode *inode, goto bail_unlock_dir; } + status = ocfs2_remove_refcount_tree(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + status = 
ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, orphan_dir_bh); if (status < 0) @@ -1112,13 +1120,14 @@ void ocfs2_clear_inode(struct inode *inode) ocfs2_lock_res_free(&oi->ip_inode_lockres); ocfs2_lock_res_free(&oi->ip_open_lockres); - ocfs2_metadata_cache_purge(inode); + ocfs2_metadata_cache_exit(INODE_CACHE(inode)); - mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, + mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached, "Clear inode of %llu, inode has %u cache items\n", - (unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); + (unsigned long long)oi->ip_blkno, + INODE_CACHE(inode)->ci_num_cached); - mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), + mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE), "Clear inode of %llu, inode has a bad flag\n", (unsigned long long)oi->ip_blkno); @@ -1145,9 +1154,7 @@ void ocfs2_clear_inode(struct inode *inode) (unsigned long long)oi->ip_blkno, oi->ip_open_count); /* Clear all other flags. */ - oi->ip_flags = OCFS2_INODE_CACHE_INLINE; - oi->ip_created_trans = 0; - oi->ip_last_trans = 0; + oi->ip_flags = 0; oi->ip_dir_start_lookup = 0; oi->ip_blkno = 0ULL; @@ -1239,7 +1246,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle, mlog_entry("(inode %llu)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_journal_access_di(handle, inode, bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1380,8 +1387,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int rc; struct buffer_head *tmp = *bh; - rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp, - flags, ocfs2_validate_inode_block); + rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, ocfs2_validate_inode_block); /* If ocfs2_read_blocks() got us a new bh, pass it up. 
*/ if (!rc && !*bh) @@ -1394,3 +1401,56 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) { return ocfs2_read_inode_block_full(inode, bh, 0); } + + +static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + return oi->ip_blkno; +} + +static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + return oi->vfs_inode.i_sb; +} + +static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + spin_lock(&oi->ip_lock); +} + +static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + spin_unlock(&oi->ip_lock); +} + +static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + mutex_lock(&oi->ip_io_mutex); +} + +static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_inode_info *oi = cache_info_to_inode(ci); + + mutex_unlock(&oi->ip_io_mutex); +} + +const struct ocfs2_caching_operations ocfs2_inode_caching_ops = { + .co_owner = ocfs2_inode_cache_owner, + .co_get_super = ocfs2_inode_cache_get_super, + .co_cache_lock = ocfs2_inode_cache_lock, + .co_cache_unlock = ocfs2_inode_cache_unlock, + .co_io_lock = ocfs2_inode_cache_io_lock, + .co_io_unlock = ocfs2_inode_cache_io_unlock, +}; + diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ea71525aad4..ba4fe07b293 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -60,12 +60,6 @@ struct ocfs2_inode_info u32 ip_dir_start_lookup; - /* next two are protected by trans_inc_lock */ - /* which transaction were we created on? Zero if none. */ - unsigned long ip_created_trans; - /* last transaction we were a part of. 
*/ - unsigned long ip_last_trans; - struct ocfs2_caching_info ip_metadata_cache; struct ocfs2_extent_map ip_extent_map; @@ -106,8 +100,6 @@ struct ocfs2_inode_info #define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 /* Does someone have the file open O_DIRECT */ #define OCFS2_INODE_OPEN_DIRECT 0x00000040 -/* Indicates that the metadata cache should be used as an array. */ -#define OCFS2_INODE_CACHE_INLINE 0x00000080 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { @@ -120,6 +112,12 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) extern struct kmem_cache *ocfs2_inode_cache; extern const struct address_space_operations ocfs2_aops; +extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops; + +static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode) +{ + return &OCFS2_I(inode)->ip_metadata_cache; +} void ocfs2_clear_inode(struct inode *inode); void ocfs2_delete_inode(struct inode *inode); @@ -172,4 +170,10 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh); /* The same, but can be passed OCFS2_BH_* flags */ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags); + +static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache); +} + #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 467b413bec2..31fbb061951 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -21,6 +21,7 @@ #include "ocfs2_fs.h" #include "ioctl.h" #include "resize.h" +#include "refcounttree.h" #include <linux/ext2_fs.h> @@ -115,6 +116,9 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int status; struct ocfs2_space_resv sr; struct ocfs2_new_group_input input; + struct reflink_arguments args; + const char *old_path, *new_path; + bool preserve; switch (cmd) { case OCFS2_IOC_GETFLAGS: @@ -160,6 +164,15 @@ long 
ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return -EFAULT; return ocfs2_group_add(inode, &input); + case OCFS2_IOC_REFLINK: + if (copy_from_user(&args, (struct reflink_arguments *)arg, + sizeof(args))) + return -EFAULT; + old_path = (const char *)(unsigned long)args.old_path; + new_path = (const char *)(unsigned long)args.new_path; + preserve = (args.preserve != 0); + + return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); default: return -ENOTTY; } @@ -182,6 +195,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: + case OCFS2_IOC_REFLINK: break; default: return -ENOIOCTLCMD; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index c48b93ac6b6..54c16b66327 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -48,6 +48,7 @@ #include "slot_map.h" #include "super.h" #include "sysfile.h" +#include "uptodate.h" #include "quota.h" #include "buffer_head_io.h" @@ -554,6 +555,14 @@ static struct ocfs2_triggers eb_triggers = { .ot_offset = offsetof(struct ocfs2_extent_block, h_check), }; +static struct ocfs2_triggers rb_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), +}; + static struct ocfs2_triggers gd_triggers = { .ot_triggers = { .t_commit = ocfs2_commit_trigger, @@ -601,14 +610,16 @@ static struct ocfs2_triggers dl_triggers = { }; static int __ocfs2_journal_access(handle_t *handle, - struct inode *inode, + struct ocfs2_caching_info *ci, struct buffer_head *bh, struct ocfs2_triggers *triggers, int type) { int status; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci)); - BUG_ON(!inode); + BUG_ON(!ci || !ci->ci_ops); BUG_ON(!handle); BUG_ON(!bh); @@ -627,15 +638,15 @@ static int __ocfs2_journal_access(handle_t *handle, BUG(); } - /* Set the current transaction information 
on the inode so + /* Set the current transaction information on the ci so * that the locking code knows whether it can drop it's locks - * on this inode or not. We're protected from the commit + * on this ci or not. We're protected from the commit * thread updating the current transaction id until * ocfs2_commit_trans() because ocfs2_start_trans() took * j_trans_barrier for us. */ - ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); + ocfs2_set_ci_lock_trans(osb->journal, ci); - mutex_lock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_lock(ci); switch (type) { case OCFS2_JOURNAL_ACCESS_CREATE: case OCFS2_JOURNAL_ACCESS_WRITE: @@ -650,9 +661,9 @@ static int __ocfs2_journal_access(handle_t *handle, status = -EINVAL; mlog(ML_ERROR, "Uknown access type!\n"); } - if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers) + if (!status && ocfs2_meta_ecc(osb) && triggers) jbd2_journal_set_triggers(bh, &triggers->ot_triggers); - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + ocfs2_metadata_cache_io_unlock(ci); if (status < 0) mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", @@ -662,66 +673,65 @@ static int __ocfs2_journal_access(handle_t *handle, return status; } -int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, - struct buffer_head *bh, int type) +int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &di_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type); } -int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &eb_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type); } -int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, +int 
ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &gd_triggers, + return __ocfs2_journal_access(handle, ci, bh, &rb_triggers, type); } -int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &db_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type); } -int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &xb_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type); } -int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &dq_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type); } -int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &dr_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type); } -int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, &dl_triggers, - type); + return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type); +} + +int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type) +{ 
+ return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type); } -int ocfs2_journal_access(handle_t *handle, struct inode *inode, +int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type) { - return __ocfs2_journal_access(handle, inode, bh, NULL, type); + return __ocfs2_journal_access(handle, ci, bh, NULL, type); } int ocfs2_journal_dirty(handle_t *handle, @@ -898,7 +908,7 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, ocfs2_bump_recovery_generation(fe); ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); - status = ocfs2_write_block(osb, bh, journal->j_inode); + status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode)); if (status < 0) mlog_errno(status); @@ -1642,7 +1652,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, ocfs2_get_recovery_generation(fe); ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); - status = ocfs2_write_block(osb, bh, inode); + status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); if (status < 0) mlog_errno(status); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 2c3222aec62..3f74e09b0d8 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -90,56 +90,66 @@ static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j) return old_id; } -static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal, - struct inode *inode) +static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal, + struct ocfs2_caching_info *ci) { spin_lock(&trans_inc_lock); - OCFS2_I(inode)->ip_last_trans = journal->j_trans_id; + ci->ci_last_trans = journal->j_trans_id; spin_unlock(&trans_inc_lock); } /* Used to figure out whether it's safe to drop a metadata lock on an - * inode. Returns true if all the inodes changes have been + * cached object. Returns true if all the object's changes have been * checkpointed to disk. 
You should be holding the spinlock on the * metadata lock while calling this to be sure that nobody can take * the lock and put it on another transaction. */ -static inline int ocfs2_inode_fully_checkpointed(struct inode *inode) +static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci) { int ret; - struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal; + struct ocfs2_journal *journal = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; spin_lock(&trans_inc_lock); - ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans); + ret = time_after(journal->j_trans_id, ci->ci_last_trans); spin_unlock(&trans_inc_lock); return ret; } -/* convenience function to check if an inode is still new (has never - * hit disk) Will do you a favor and set created_trans = 0 when you've - * been checkpointed. returns '1' if the inode is still new. */ -static inline int ocfs2_inode_is_new(struct inode *inode) +/* convenience function to check if an object backed by struct + * ocfs2_caching_info is still new (has never hit disk) Will do you a + * favor and set created_trans = 0 when you've + * been checkpointed. returns '1' if the ci is still new. */ +static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci) { int ret; + struct ocfs2_journal *journal = + OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal; + spin_lock(&trans_inc_lock); + ret = !(time_after(journal->j_trans_id, ci->ci_created_trans)); + if (!ret) + ci->ci_created_trans = 0; + spin_unlock(&trans_inc_lock); + return ret; +} + +/* Wrapper for inodes so we can check system files */ +static inline int ocfs2_inode_is_new(struct inode *inode) +{ /* System files are never "new" as they're written out by * mkfs. This helps us early during mount, before we have the * journal open and j_trans_id could be junk. 
*/ if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) return 0; - spin_lock(&trans_inc_lock); - ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id, - OCFS2_I(inode)->ip_created_trans)); - if (!ret) - OCFS2_I(inode)->ip_created_trans = 0; - spin_unlock(&trans_inc_lock); - return ret; + + return ocfs2_ci_is_new(INODE_CACHE(inode)); } -static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, - struct inode *inode) +static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, + struct ocfs2_caching_info *ci) { spin_lock(&trans_inc_lock); - OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id; + ci->ci_created_trans = osb->journal->j_trans_id; spin_unlock(&trans_inc_lock); } @@ -200,7 +210,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) if (ocfs2_mount_local(osb)) return; - if (!ocfs2_inode_fully_checkpointed(inode)) { + if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) { /* WARNING: This only kicks off a single * checkpoint. If someone races you and adds more * metadata to the journal, you won't know, and will @@ -210,7 +220,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) ocfs2_start_checkpoint(osb); wait_event(osb->journal->j_checkpointed, - ocfs2_inode_fully_checkpointed(inode)); + ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))); } } @@ -266,31 +276,34 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks); /* ocfs2_inode */ -int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_extent_block */ -int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci, + struct buffer_head *bh, int type); +/* ocfs2_refcount_block */ +int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_group_desc */ -int 
ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_xattr_block */ -int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* quota blocks */ -int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* dirblock */ -int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_dx_root_block */ -int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* ocfs2_dx_leaf */ -int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode, +int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* Anything that has no ecc */ -int ocfs2_journal_access(handle_t *handle, struct inode *inode, +int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); /* @@ -477,6 +490,23 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb) return credits; } +/* inode update, new refcount block and its allocation credits. */ +#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \ + + OCFS2_SUBALLOC_ALLOC) + +/* inode and the refcount block update. */ +#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* + * inode and the refcount block update. + * It doesn't include the credits for sub alloc change. + * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added. 
+ */ +#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* 2 metadata alloc, 2 new blocks and root refcount block */ +#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3) + /* * Please note that the caller must make sure that root_el is the root * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index bac7e6abaf4..ac10f83edb9 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -297,8 +297,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) } memcpy(alloc_copy, alloc, bh->b_size); - status = ocfs2_journal_access_di(handle, local_alloc_inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_commit; @@ -392,7 +392,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, ocfs2_clear_local_alloc(alloc); ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check); - status = ocfs2_write_block(osb, alloc_bh, inode); + status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode)); if (status < 0) mlog_errno(status); @@ -678,7 +678,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, * delete bits from it! 
*/ *num_bits = bits_wanted; - status = ocfs2_journal_access_di(handle, local_alloc_inode, + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), osb->local_alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { @@ -1156,7 +1157,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, } memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); - status = ocfs2_journal_access_di(handle, local_alloc_inode, + status = ocfs2_journal_access_di(handle, + INODE_CACHE(local_alloc_inode), osb->local_alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index b606496b72e..39737613424 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -202,7 +202,7 @@ out: return ret; } -static struct vm_operations_struct ocfs2_file_vm_ops = { +static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, }; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8601f934010..f010b22b1c4 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -69,7 +69,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct inode *dir, struct inode *inode, - struct dentry *dentry, dev_t dev, struct buffer_head **new_fe_bh, struct buffer_head *parent_fe_bh, @@ -78,7 +77,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, - struct inode *inode, + u64 blkno, char *name, struct ocfs2_dir_lookup_result *lookup); @@ -358,8 +357,12 @@ static int ocfs2_mknod(struct inode *dir, } did_quota_inode = 1; + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, + inode->i_mode, (unsigned long)dev, dentry->d_name.len, + dentry->d_name.name); + /* do the real work now. 
*/ - status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev, + status = ocfs2_mknod_locked(osb, dir, inode, dev, &new_fe_bh, parent_fe_bh, handle, inode_ac); if (status < 0) { @@ -375,7 +378,8 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = ocfs2_journal_access_di(handle, dir, parent_fe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(dir), + parent_fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -465,7 +469,6 @@ leave: static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct inode *dir, struct inode *inode, - struct dentry *dentry, dev_t dev, struct buffer_head **new_fe_bh, struct buffer_head *parent_fe_bh, @@ -479,10 +482,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, u16 suballoc_bit; u16 feat; - mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, - inode->i_mode, (unsigned long)dev, dentry->d_name.len, - dentry->d_name.name); - *new_fe_bh = NULL; status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, @@ -507,9 +506,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, mlog_errno(status); goto leave; } - ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh); - status = ocfs2_journal_access_di(handle, inode, *new_fe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + *new_fe_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -565,7 +565,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, } ocfs2_populate_inode(inode, fe, 1); - ocfs2_inode_set_new(osb, inode); + ocfs2_ci_set_new(osb, INODE_CACHE(inode)); if (!ocfs2_mount_local(osb)) { status = ocfs2_create_new_inode_locks(inode); if (status < 0) @@ -682,7 +682,7 @@ static int ocfs2_link(struct dentry *old_dentry, goto out_unlock_inode; } - err = ocfs2_journal_access_di(handle, inode, fe_bh, + err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (err < 0) { 
mlog_errno(err); @@ -850,7 +850,8 @@ static int ocfs2_unlink(struct inode *dir, } if (inode_is_unlinkable(inode)) { - status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + OCFS2_I(inode)->ip_blkno, orphan_name, &orphan_insert); if (status < 0) { mlog_errno(status); @@ -866,7 +867,7 @@ static int ocfs2_unlink(struct inode *dir, goto leave; } - status = ocfs2_journal_access_di(handle, inode, fe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1241,9 +1242,8 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, - new_inode, - orphan_name, - &orphan_insert); + OCFS2_I(new_inode)->ip_blkno, + orphan_name, &orphan_insert); if (status < 0) { mlog_errno(status); goto bail; @@ -1284,7 +1284,8 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } } - status = ocfs2_journal_access_di(handle, new_inode, newfe_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode), + newfe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1331,7 +1332,8 @@ static int ocfs2_rename(struct inode *old_dir, old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); - status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh, + status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), + old_inode_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status >= 0) { old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; @@ -1407,9 +1409,10 @@ static int ocfs2_rename(struct inode *old_dir, (int)old_dir_nlink, old_dir->i_nlink); } else { struct ocfs2_dinode *fe; - status = ocfs2_journal_access_di(handle, old_dir, - old_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, + INODE_CACHE(old_dir), + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); fe = (struct 
ocfs2_dinode *) old_dir_bh->b_data; ocfs2_set_links_count(fe, old_dir->i_nlink); status = ocfs2_journal_dirty(handle, old_dir_bh); @@ -1527,9 +1530,11 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, mlog_errno(status); goto bail; } - ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), + bhs[virtual]); - status = ocfs2_journal_access(handle, inode, bhs[virtual], + status = ocfs2_journal_access(handle, INODE_CACHE(inode), + bhs[virtual], OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1692,7 +1697,11 @@ static int ocfs2_symlink(struct inode *dir, } did_quota_inode = 1; - status = ocfs2_mknod_locked(osb, dir, inode, dentry, + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, + inode->i_mode, dentry->d_name.len, + dentry->d_name.name); + + status = ocfs2_mknod_locked(osb, dir, inode, 0, &new_fe_bh, parent_fe_bh, handle, inode_ac); if (status < 0) { @@ -1842,7 +1851,7 @@ bail: static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, - struct inode *inode, + u64 blkno, char *name, struct ocfs2_dir_lookup_result *lookup) { @@ -1850,7 +1859,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh = NULL; int status = 0; - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + status = ocfs2_blkno_stringify(blkno, name); if (status < 0) { mlog_errno(status); return status; @@ -1917,7 +1926,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh, + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -2002,7 +2013,9 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh, + status = ocfs2_journal_access_di(handle, + 
INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -2028,6 +2041,274 @@ leave: return status; } +int ocfs2_create_inode_in_orphan(struct inode *dir, + int mode, + struct inode **new_inode) +{ + int status, did_quota_inode = 0; + struct inode *inode = NULL; + struct inode *orphan_dir = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *parent_di_bh = NULL; + struct buffer_head *new_di_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + + status = ocfs2_inode_lock(dir, &parent_di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + /* + * We give the orphan dir the root blkno to fake an orphan name, + * and allocate enough space for our insertion. + */ + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, + osb->root_blkno, + orphan_name, &orphan_insert); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* reserve an inode spot */ + status = ocfs2_reserve_new_inode(osb, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + /* We don't use standard VFS wrapper because we don't want vfs_dq_init + * to be called. */ + if (sb_any_quota_active(osb->sb) && + osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { + status = -EDQUOT; + goto leave; + } + did_quota_inode = 1; + + /* do the real work now. 
*/ + status = ocfs2_mknod_locked(osb, dir, inode, + 0, &new_di_bh, parent_di_bh, handle, + inode_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + di = (struct ocfs2_dinode *)new_di_bh->b_data; + status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, + &orphan_insert, orphan_dir); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* get open lock so that other nodes can't remove it from orphan dir. */ + status = ocfs2_open_lock(inode); + if (status < 0) + mlog_errno(status); + +leave: + if (status < 0 && did_quota_inode) + vfs_dq_free_inode(inode); + if (handle) + ocfs2_commit_trans(osb, handle); + + if (orphan_dir) { + /* This was locked for us in ocfs2_prepare_orphan_dir() */ + ocfs2_inode_unlock(orphan_dir, 1); + mutex_unlock(&orphan_dir->i_mutex); + iput(orphan_dir); + } + + if (status == -ENOSPC) + mlog(0, "Disk is full\n"); + + if ((status < 0) && inode) { + clear_nlink(inode); + iput(inode); + } + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + brelse(new_di_bh); + + if (!status) + *new_inode = inode; + + ocfs2_free_dir_lookup_result(&orphan_insert); + + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + return status; +} + +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + int status = 0; + struct buffer_head *parent_di_bh = NULL; + handle_t *handle = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct ocfs2_dinode *dir_di, *di; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dir_lookup_result lookup = { NULL, }; + + mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry, + dentry->d_name.len, dentry->d_name.name); + + status = ocfs2_inode_lock(dir, &parent_di_bh, 1); + if (status < 0) { + if (status != -ENOENT) +
mlog_errno(status); + return status; + } + + dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + if (!dir_di->i_links_count) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. */ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + dentry->d_name.name, + dentry->d_name.len, &lookup); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto leave; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mlog_errno(status); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + goto leave; + } + + status = ocfs2_read_inode_block(inode, &di_bh); + if (status < 0) { + mlog_errno(status); + goto orphan_unlock; + } + + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto orphan_unlock; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), + di_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); + di->i_orphaned_slot = 0; + ocfs2_journal_dirty(handle, di_bh); + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_di_bh, + &lookup); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + status = ocfs2_dentry_attach_lock(dentry, inode, + 
OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto out_commit; + } + + insert_inode_hash(inode); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); + status = 0; +out_commit: + ocfs2_commit_trans(osb, handle); +orphan_unlock: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); +leave: + + ocfs2_inode_unlock(dir, 1); + + brelse(di_bh); + brelse(parent_di_bh); + brelse(orphan_dir_bh); + + ocfs2_free_dir_lookup_result(&lookup); + + mlog_exit(status); + + return status; +} + const struct inode_operations ocfs2_dir_iops = { .create = ocfs2_create, .lookup = ocfs2_lookup, diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 688aef64c87..e5d059d4f11 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -35,5 +35,11 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct inode *orphan_dir_inode, struct inode *inode, struct buffer_head *orphan_dir_bh); +int ocfs2_create_inode_in_orphan(struct inode *dir, + int mode, + struct inode **new_inode); +int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct inode *new_inode, + struct dentry *new_dentry); #endif /* OCFS2_NAMEI_H */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 39e1d5a3950..eae40460242 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -51,20 +51,51 @@ /* For struct ocfs2_blockcheck_stats */ #include "blockcheck.h" + +/* Caching of metadata buffers */ + /* Most user visible OCFS2 inodes will have very few pieces of * metadata, but larger files (including bitmaps, etc) must be taken * into account when designing an access scheme. We allow a small * amount of inlined blocks to be stored on an array and grow the * structure into a rb tree when necessary. 
*/ -#define OCFS2_INODE_MAX_CACHE_ARRAY 2 +#define OCFS2_CACHE_INFO_MAX_ARRAY 2 + +/* Flags for ocfs2_caching_info */ + +enum ocfs2_caching_info_flags { + /* Indicates that the metadata cache is using the inline array */ + OCFS2_CACHE_FL_INLINE = 1<<1, +}; +struct ocfs2_caching_operations; struct ocfs2_caching_info { + /* + * The parent structure provides the locks, but because the + * parent structure can differ, it provides locking operations + * to struct ocfs2_caching_info. + */ + const struct ocfs2_caching_operations *ci_ops; + + /* next two are protected by trans_inc_lock */ + /* which transaction were we created on? Zero if none. */ + unsigned long ci_created_trans; + /* last transaction we were a part of. */ + unsigned long ci_last_trans; + + /* Cache structures */ + unsigned int ci_flags; unsigned int ci_num_cached; union { - sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY]; + sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY]; struct rb_root ci_tree; } ci_cache; }; +/* + * Need this prototype here instead of in uptodate.h because journal.h + * uses it. + */ +struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci); /* this limits us to 256 nodes * if we need more, we can do a kmalloc for the map */ @@ -377,12 +408,17 @@ struct ocfs2_super /* the group we used to allocate inodes. */ u64 osb_inode_alloc_group; + + /* rb tree root for refcount lock. 
*/ + struct rb_root osb_rf_lock_tree; + struct ocfs2_refcount_tree *osb_ref_tree_lru; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) /* Useful typedef for passing around journal access functions */ -typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode, +typedef int (*ocfs2_journal_access_func)(handle_t *handle, + struct ocfs2_caching_info *ci, struct buffer_head *bh, int type); static inline int ocfs2_should_order_data(struct inode *inode) @@ -480,6 +516,13 @@ static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n) ocfs2_set_links_count(di, links); } +static inline int ocfs2_refcount_tree(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock @@ -578,6 +621,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) #define OCFS2_IS_VALID_DX_LEAF(ptr) \ (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) +#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \ + (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE)) + static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 7ab6e9e5e77..e9431e4a5e7 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -68,6 +68,7 @@ #define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" #define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" #define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" +#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -98,7 +99,8 @@ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ | OCFS2_FEATURE_INCOMPAT_XATTR \ | OCFS2_FEATURE_INCOMPAT_META_ECC \ - | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) + | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ + | 
OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -160,6 +162,9 @@ /* Metadata checksum and error correction */ #define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 +/* Refcount tree support */ +#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -223,6 +228,7 @@ #define OCFS2_HAS_XATTR_FL (0x0002) #define OCFS2_INLINE_XATTR_FL (0x0004) #define OCFS2_INDEXED_DIR_FL (0x0008) +#define OCFS2_HAS_REFCOUNT_FL (0x0010) /* Inode attributes, keep in sync with EXT2 */ #define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ @@ -241,8 +247,11 @@ /* * Extent record flags (e_node.leaf.flags) */ -#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but - * unwritten */ +#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but + * unwritten */ +#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference + * counted in an associated + * refcount tree */ /* * ioctl commands @@ -292,6 +301,15 @@ struct ocfs2_new_group_input { #define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) #define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) +/* Used to pass 2 file names to reflink. 
*/ +struct reflink_arguments { + __u64 old_path; + __u64 new_path; + __u64 preserve; +}; +#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) + + /* * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) */ @@ -717,7 +735,8 @@ struct ocfs2_dinode { __le64 i_xattr_loc; /*80*/ struct ocfs2_block_check i_check; /* Error checking */ /*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ - __le64 i_reserved2[5]; +/*90*/ __le64 i_refcount_loc; + __le64 i_reserved2[4]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -901,6 +920,60 @@ struct ocfs2_group_desc /*40*/ __u8 bg_bitmap[0]; }; +struct ocfs2_refcount_rec { +/*00*/ __le64 r_cpos; /* Physical offset, in clusters */ + __le32 r_clusters; /* Clusters covered by this extent */ + __le32 r_refcount; /* Reference count of this extent */ +/*10*/ +}; +#define OCFS2_32BIT_POS_MASK (0xffffffffULL) + +#define OCFS2_REFCOUNT_LEAF_FL (0x00000001) +#define OCFS2_REFCOUNT_TREE_FL (0x00000002) + +struct ocfs2_refcount_list { +/*00*/ __le16 rl_count; /* Maximum number of entries possible + in rl_recs */ + __le16 rl_used; /* Current number of used records */ + __le32 rl_reserved2; + __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_rec) */ +/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */ +}; + + +struct ocfs2_refcount_block { +/*00*/ __u8 rf_signature[8]; /* Signature for verification */ + __le16 rf_suballoc_slot; /* Slot suballocator this block + belongs to */ + __le16 rf_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 rf_fs_generation; /* Must match superblock */ +/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */ + __le64 rf_parent; /* Parent block, only valid if + OCFS2_REFCOUNT_LEAF_FL is set in + rf_flags */ +/*20*/ struct ocfs2_block_check rf_check; /* Error checking */ + __le64 rf_last_eb_blk; /* Pointer to last extent block */ +/*30*/ __le32 rf_count; /* Number of inodes sharing this + refcount tree */ + __le32 rf_flags;
/* See the flags above */ + __le32 rf_clusters; /* clusters covered by refcount tree. */ + __le32 rf_cpos; /* cluster offset in refcount tree.*/ +/*40*/ __le32 rf_generation; /* generation number. all be the same + * for the same refcount tree. */ + __le32 rf_reserved0; + __le64 rf_reserved1[7]; +/*80*/ union { + struct ocfs2_refcount_list rf_records; /* List of refcount + records */ + struct ocfs2_extent_list rf_list; /* Extent record list, + only valid if + OCFS2_REFCOUNT_TREE_FL + is set in rf_flags */ + }; +/* Actual on-disk size is one block */ +}; + /* * On disk extended attribute structure for OCFS2. */ @@ -1312,6 +1385,32 @@ static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } + +static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_records.rl_recs); + + return size / sizeof(struct ocfs2_refcount_rec); +} + +static inline u32 +ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec) +{ + return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; +} #else static inline int ocfs2_fast_symlink_chars(int blocksize) { diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index c212cf5a2bd..d277aabf5df 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -49,6 +49,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_QINFO, OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_LOCK_TYPE_ORPHAN_SCAN, + OCFS2_LOCK_TYPE_REFCOUNT, OCFS2_NUM_LOCK_TYPES }; @@ -89,6 +90,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_ORPHAN_SCAN: c = 'P'; break; + case OCFS2_LOCK_TYPE_REFCOUNT: + c = 'T'; + break; default: c = '\0'; } @@ -110,6 +114,7 @@ static 
char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_QINFO] = "Quota", [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", + [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 3fb96fcd4c8..e5df9d170b0 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -109,7 +109,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); int ocfs2_read_quota_block(struct inode *inode, u64 v_block, struct buffer_head **bh); -extern struct dquot_operations ocfs2_quota_operations; +extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; int ocfs2_quota_setup(void); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 44f2a5e1d04..b437dc0c4ca 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -154,7 +154,7 @@ static int ocfs2_get_quota_block(struct inode *inode, int block, err = -EIO; mlog_errno(err); } - return err;; + return err; } /* Read data from global quotafile - avoid pagecache and such because we cannot @@ -253,8 +253,9 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, flush_dcache_page(bh->b_page); set_buffer_uptodate(bh); unlock_buffer(bh); - ocfs2_set_buffer_uptodate(gqinode, bh); - err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type); + ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh); + err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh, + ja_type); if (err < 0) { brelse(bh); goto out; @@ -849,7 +850,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot) kmem_cache_free(ocfs2_dquot_cachep, dquot); } -struct dquot_operations ocfs2_quota_operations = { +const struct dquot_operations ocfs2_quota_operations = { .initialize = dquot_initialize, .drop = dquot_drop, .alloc_space = dquot_alloc_space, diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 
bdb09cb6e1f..1a2c50a759f 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -108,7 +108,7 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, mlog_errno(status); return status; } - status = ocfs2_journal_access_dq(handle, inode, bh, + status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -510,7 +510,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, goto out_commit; } /* Release local quota file entry */ - status = ocfs2_journal_access_dq(handle, lqinode, + status = ocfs2_journal_access_dq(handle, + INODE_CACHE(lqinode), qbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -619,7 +620,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, mlog_errno(status); goto out_bh; } - status = ocfs2_journal_access_dq(handle, lqinode, bh, + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -993,8 +995,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( goto out_trans; } dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; - ocfs2_set_new_buffer_uptodate(lqinode, bh); - status = ocfs2_journal_access_dq(handle, lqinode, bh, + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1027,8 +1029,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out_trans; } - ocfs2_set_new_buffer_uptodate(lqinode, dbh); - status = ocfs2_journal_access_dq(handle, lqinode, dbh, + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh); + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1131,7 +1133,7 @@ static struct ocfs2_quota_chunk 
*ocfs2_extend_local_quota_file( mlog_errno(status); goto out; } - ocfs2_set_new_buffer_uptodate(lqinode, bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh); /* Local quota info, chunk header and the new block we initialize */ handle = ocfs2_start_trans(OCFS2_SB(sb), @@ -1143,7 +1145,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( goto out; } /* Zero created block */ - status = ocfs2_journal_access_dq(handle, lqinode, bh, + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); @@ -1158,7 +1160,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( goto out_trans; } /* Update chunk header */ - status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, + status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), + chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1292,7 +1295,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot) goto out; } - status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type], + status = ocfs2_journal_access_dq(handle, + INODE_CACHE(sb_dqopt(sb)->files[type]), od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c new file mode 100644 index 00000000000..60287fc56bc --- /dev/null +++ b/fs/ocfs2/refcounttree.c @@ -0,0 +1,4313 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.c + * + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/sort.h> +#define MLOG_MASK_PREFIX ML_REFCOUNT +#include <cluster/masklog.h> +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "suballoc.h" +#include "journal.h" +#include "uptodate.h" +#include "super.h" +#include "buffer_head_io.h" +#include "blockcheck.h" +#include "refcounttree.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "aops.h" +#include "xattr.h" +#include "namei.h" + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/swap.h> +#include <linux/security.h> +#include <linux/fsnotify.h> +#include <linux/quotaops.h> +#include <linux/namei.h> +#include <linux/mount.h> + +struct ocfs2_cow_context { + struct inode *inode; + u32 cow_start; + u32 cow_len; + struct ocfs2_extent_tree data_et; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + void *cow_object; + struct ocfs2_post_refcount *post_refcount; + int extra_credits; + int (*get_clusters)(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags); + int (*cow_duplicate_clusters)(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +}; + +static inline struct ocfs2_refcount_tree * +cache_info_to_refcount(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_refcount_tree, rf_ci); +} + +static int ocfs2_validate_refcount_block(struct super_block *sb, + struct buffer_head *bh) +{ 
+ int rc; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)bh->b_data; + + mlog(0, "Validating refcount block %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for refcount block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + + if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { + ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + return -EINVAL; + } + + if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid " + "rf_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, + u64 rb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, rb_blkno, &tmp, + ocfs2_validate_refcount_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. 
*/ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + return rf->rf_blkno; +} + +static struct super_block * +ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + return rf->rf_sb; +} + +static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + spin_lock(&rf->rf_lock); +} + +static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + spin_unlock(&rf->rf_lock); +} + +static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + mutex_lock(&rf->rf_io_mutex); +} + +static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + mutex_unlock(&rf->rf_io_mutex); +} + +static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = { + .co_owner = ocfs2_refcount_cache_owner, + .co_get_super = ocfs2_refcount_cache_get_super, + .co_cache_lock = ocfs2_refcount_cache_lock, + .co_cache_unlock = ocfs2_refcount_cache_unlock, + .co_io_lock = ocfs2_refcount_cache_io_lock, + .co_io_unlock = ocfs2_refcount_cache_io_unlock, +}; + +static struct ocfs2_refcount_tree * +ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno) +{ + struct rb_node *n = osb->osb_rf_lock_tree.rb_node; + struct ocfs2_refcount_tree *tree = NULL; + + while (n) { + tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node); + + if (blkno < tree->rf_blkno) + n = n->rb_left; + else if (blkno > tree->rf_blkno) + n = n->rb_right; + else + return tree; + } + + return NULL; +} + +/* osb_lock is already locked. 
*/ +static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new) +{ + u64 rf_blkno = new->rf_blkno; + struct rb_node *parent = NULL; + struct rb_node **p = &osb->osb_rf_lock_tree.rb_node; + struct ocfs2_refcount_tree *tmp; + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_refcount_tree, + rf_node); + + if (rf_blkno < tmp->rf_blkno) + p = &(*p)->rb_left; + else if (rf_blkno > tmp->rf_blkno) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate refcount block %llu found!\n", + (unsigned long long)rf_blkno); + BUG(); + } + } + + rb_link_node(&new->rf_node, parent, p); + rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree); +} + +static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree) +{ + ocfs2_metadata_cache_exit(&tree->rf_ci); + ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres); + ocfs2_lock_res_free(&tree->rf_lockres); + kfree(tree); +} + +static inline void +ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree); + if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree) + osb->osb_ref_tree_lru = NULL; +} + +static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + spin_lock(&osb->osb_lock); + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + spin_unlock(&osb->osb_lock); +} + +void ocfs2_kref_remove_refcount_tree(struct kref *kref) +{ + struct ocfs2_refcount_tree *tree = + container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); + + ocfs2_free_refcount_tree(tree); +} + +static inline void +ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree) +{ + kref_get(&tree->rf_getcnt); +} + +static inline void +ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree) +{ + kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree); +} + +static inline void 
ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new, + struct super_block *sb) +{ + ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops); + mutex_init(&new->rf_io_mutex); + new->rf_sb = sb; + spin_lock_init(&new->rf_lock); +} + +static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new, + u64 rf_blkno, u32 generation) +{ + init_rwsem(&new->rf_sem); + ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, + rf_blkno, generation); +} + +static struct ocfs2_refcount_tree* +ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno) +{ + struct ocfs2_refcount_tree *new; + + new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS); + if (!new) + return NULL; + + new->rf_blkno = rf_blkno; + kref_init(&new->rf_getcnt); + ocfs2_init_refcount_tree_ci(new, osb->sb); + + return new; +} + +static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, + struct ocfs2_refcount_tree **ret_tree) +{ + int ret = 0; + struct ocfs2_refcount_tree *tree, *new = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *ref_rb; + + spin_lock(&osb->osb_lock); + if (osb->osb_ref_tree_lru && + osb->osb_ref_tree_lru->rf_blkno == rf_blkno) + tree = osb->osb_ref_tree_lru; + else + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + spin_unlock(&osb->osb_lock); + + new = ocfs2_allocate_refcount_tree(osb, rf_blkno); + if (!new) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + /* + * We need the generation to create the refcount tree lock and since + * it isn't changed during the tree modification, we are safe here to + * read without protection. + * We also have to purge the cache after we create the lock since the + * refcount block may have the stale data. It can only be trusted when + * we hold the refcount lock. 
+ */ + ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_metadata_cache_exit(&new->rf_ci); + kfree(new); + return ret; + } + + ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + new->rf_generation = le32_to_cpu(ref_rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new, rf_blkno, + new->rf_generation); + ocfs2_metadata_cache_purge(&new->rf_ci); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + ocfs2_insert_refcount_tree(osb, new); + + tree = new; + new = NULL; + +out: + *ret_tree = tree; + + osb->osb_ref_tree_lru = tree; + + spin_unlock(&osb->osb_lock); + + if (new) + ocfs2_free_refcount_tree(new); + + brelse(ref_root_bh); + return ret; +} + +static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + di = (struct ocfs2_dinode *)di_bh->b_data; + *ref_blkno = le64_to_cpu(di->i_refcount_loc); + brelse(di_bh); +out: + return ret; +} + +static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + int ret; + + ret = ocfs2_refcount_lock(tree, rw); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rw) + down_write(&tree->rf_sem); + else + down_read(&tree->rf_sem); + +out: + return ret; +} + +/* + * Lock the refcount tree pointed by ref_blkno and return the tree. + * In most case, we lock the tree and read the refcount block. + * So read it here if the caller really needs it. + * + * If the tree has been re-created by other node, it will free the + * old one and re-create it. 
+ */ +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret, delete_tree = 0; + struct ocfs2_refcount_tree *tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + +again: + ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_refcount_tree_get(tree); + + ret = __ocfs2_lock_refcount_tree(osb, tree, rw); + if (ret) { + mlog_errno(ret); + ocfs2_refcount_tree_put(tree); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_unlock_refcount_tree(osb, tree, rw); + ocfs2_refcount_tree_put(tree); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + /* + * If the refcount block has been freed and re-created, we may need + * to recreate the refcount tree also. + * + * Here we just remove the tree from the rb-tree, and the last + * kref holder will unlock and delete this refcount_tree. + * Then we goto "again" and ocfs2_get_refcount_tree will create + * the new refcount tree for us. + */ + if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) { + if (!tree->rf_removed) { + ocfs2_erase_refcount_tree_from_list(osb, tree); + tree->rf_removed = 1; + delete_tree = 1; + } + + ocfs2_unlock_refcount_tree(osb, tree, rw); + /* + * We get an extra reference when we create the refcount + * tree, so another put will destroy it. 
+ */ + if (delete_tree) + ocfs2_refcount_tree_put(tree); + brelse(ref_root_bh); + ref_root_bh = NULL; + goto again; + } + + *ret_tree = tree; + if (ref_bh) { + *ref_bh = ref_root_bh; + ref_root_bh = NULL; + } +out: + brelse(ref_root_bh); + return ret; +} + +int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret; + u64 ref_blkno; + + ret = ocfs2_get_refcount_block(inode, &ref_blkno); + if (ret) { + mlog_errno(ret); + return ret; + } + + return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, + rw, ret_tree, ref_bh); +} + +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + if (rw) + up_write(&tree->rf_sem); + else + up_read(&tree->rf_sem); + + ocfs2_refcount_unlock(tree, rw); + ocfs2_refcount_tree_put(tree); +} + +void ocfs2_purge_refcount_trees(struct ocfs2_super *osb) +{ + struct rb_node *node; + struct ocfs2_refcount_tree *tree; + struct rb_root *root = &osb->osb_rf_lock_tree; + + while ((node = rb_last(root)) != NULL) { + tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); + + mlog(0, "Purge tree %llu\n", + (unsigned long long) tree->rf_blkno); + + rb_erase(&tree->rf_node, root); + ocfs2_free_refcount_tree(tree); + } +} + +/* + * Create a refcount tree for an inode. + * We take for granted that the inode is already locked. 
+ */ +static int ocfs2_create_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + mlog(0, "create tree for inode %lu\n", inode->i_ino); + + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno); + if (!new_tree) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_refcount_block. 
*/ + rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(rb, 0, inode->i_sb->s_blocksize); + strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); + rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); + rb->rf_blkno = cpu_to_le64(first_blkno); + rb->rf_count = cpu_to_le32(1); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); + spin_lock(&osb->osb_lock); + rb->rf_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + ocfs2_journal_dirty(handle, new_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(first_blkno); + spin_unlock(&oi->ip_lock); + + mlog(0, "created tree for inode %lu, refblock %llu\n", + inode->i_ino, (unsigned long long)first_blkno); + + ocfs2_journal_dirty(handle, di_bh); + + /* + * We have to init the tree lock here since it will use + * the generation number to create it. + */ + new_tree->rf_generation = le32_to_cpu(rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno, + new_tree->rf_generation); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, first_blkno); + + /* + * We've just created a new refcount tree in this block. If + * we found a refcount tree on the ocfs2_super, it must be + * one we just deleted. We free the old tree before + * inserting the new tree. 
+ */ + BUG_ON(tree && tree->rf_generation == new_tree->rf_generation); + if (tree) + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + ocfs2_insert_refcount_tree(osb, new_tree); + spin_unlock(&osb->osb_lock); + new_tree = NULL; + if (tree) + ocfs2_refcount_tree_put(tree); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (new_tree) { + ocfs2_metadata_cache_exit(&new_tree->rf_ci); + kfree(new_tree); + } + + brelse(new_bh); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +static int ocfs2_set_refcount_tree(struct inode *inode, + struct buffer_head *di_bh, + u64 refcount_loc) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *ref_tree; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + le32_add_cpu(&rb->rf_count, 1); + + ocfs2_journal_dirty(handle, ref_root_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(refcount_loc); + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, 
di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + return ret; +} + +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) +{ + int ret, delete_tree = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_block *rb; + struct inode *alloc_inode = NULL; + struct buffer_head *alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS; + u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); + u16 bit = 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + return 0; + + BUG_ON(!ref_blkno); + ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + rb = (struct ocfs2_refcount_block *)blk_bh->b_data; + + /* + * If we are the last user, we need to free the block. + * So lock the allocator ahead. 
+ */ + if (le32_to_cpu(rb->rf_count) == 1) { + blk = le64_to_cpu(rb->rf_blkno); + bit = le16_to_cpu(rb->rf_suballoc_bit); + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot)); + if (!alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + credits += OCFS2_SUBALLOC_FREE; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = 0; + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); + + le32_add_cpu(&rb->rf_count , -1); + ocfs2_journal_dirty(handle, blk_bh); + + if (!rb->rf_count) { + delete_tree = 1; + ocfs2_erase_refcount_tree_from_list(osb, ref_tree); + ret = ocfs2_free_suballoc_bits(handle, alloc_inode, + alloc_bh, bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + if (alloc_inode) { + ocfs2_inode_unlock(alloc_inode, 1); + brelse(alloc_bh); + } +out_mutex: + if (alloc_inode) { + mutex_unlock(&alloc_inode->i_mutex); + iput(alloc_inode); + } +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + if (delete_tree) + ocfs2_refcount_tree_put(ref_tree); + brelse(blk_bh); + + return ret; +} + +static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci, + struct 
buffer_head *ref_leaf_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index) +{ + int i = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = NULL; + + for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) { + rec = &rb->rf_records.rl_recs[i]; + + if (le64_to_cpu(rec->r_cpos) + + le32_to_cpu(rec->r_clusters) <= cpos) + continue; + else if (le64_to_cpu(rec->r_cpos) > cpos) + break; + + /* ok, cpos fail in this rec. Just return. */ + if (ret_rec) + *ret_rec = *rec; + goto out; + } + + if (ret_rec) { + /* We meet with a hole here, so fake the rec. */ + ret_rec->r_cpos = cpu_to_le64(cpos); + ret_rec->r_refcount = 0; + if (i < le16_to_cpu(rb->rf_records.rl_used) && + le64_to_cpu(rec->r_cpos) < cpos + len) + ret_rec->r_clusters = + cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos); + else + ret_rec->r_clusters = cpu_to_le32(len); + } + +out: + *index = i; +} + +/* + * Try to remove refcount tree. The mechanism is: + * 1) Check whether i_clusters == 0, if no, exit. + * 2) check whether we have i_xattr_loc in dinode. if yes, exit. + * 3) Check whether we have inline xattr stored outside, if yes, exit. + * 4) Remove the tree. 
+ */ +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&oi->ip_xattr_sem); + down_write(&oi->ip_alloc_sem); + + if (oi->ip_clusters) + goto out; + + if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) + goto out; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && + ocfs2_has_inline_xattr_value_outside(inode, di)) + goto out; + + ret = ocfs2_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); +out: + up_write(&oi->ip_alloc_sem); + up_write(&oi->ip_xattr_sem); + return 0; +} + +/* + * Given a cpos and len, try to find the refcount record which contains cpos. + * 1. If cpos can be found in one refcount record, return the record. + * 2. If cpos can't be found, return a fake record which start from cpos + * and end at a small value between cpos+len and start of the next record. + * This fake record has r_refcount = 0. 
+ */ +static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index, + struct buffer_head **ret_bh) +{ + int ret = 0, i, found; + u32 low_cpos; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *tmp, *rec = NULL; + struct ocfs2_extent_block *eb; + struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) { + ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_root_bh; + get_bh(ref_root_bh); + return 0; + } + + el = &rb->rf_list; + low_cpos = cpos & OCFS2_32BIT_POS_MASK; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(sb, + "refcount tree %llu has non zero tree " + "depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= low_cpos) { + found = 1; + break; + } + } + + /* adjust len when we have ocfs2_extent_rec after it. 
*/ + if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { + tmp = &el->l_recs[i+1]; + + if (le32_to_cpu(tmp->e_cpos) < cpos + len) + len = le32_to_cpu(tmp->e_cpos) - cpos; + } + + ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_leaf_bh; +out: + brelse(eb_bh); + return ret; +} + +enum ocfs2_ref_rec_contig { + REF_CONTIG_NONE = 0, + REF_CONTIG_LEFT, + REF_CONTIG_RIGHT, + REF_CONTIG_LEFTRIGHT, +}; + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb, + int index) +{ + if ((rb->rf_records.rl_recs[index].r_refcount == + rb->rf_records.rl_recs[index + 1].r_refcount) && + (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) + + le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) == + le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos))) + return REF_CONTIG_RIGHT; + + return REF_CONTIG_NONE; +} + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE; + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 1) + ret = ocfs2_refcount_rec_adjacent(rb, index); + + if (index > 0) { + enum ocfs2_ref_rec_contig tmp; + + tmp = ocfs2_refcount_rec_adjacent(rb, index - 1); + + if (tmp == REF_CONTIG_RIGHT) { + if (ret == REF_CONTIG_RIGHT) + ret = REF_CONTIG_LEFTRIGHT; + else + ret = REF_CONTIG_LEFT; + } + } + + return ret; +} + +static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb, + int index) +{ + BUG_ON(rb->rf_records.rl_recs[index].r_refcount != + rb->rf_records.rl_recs[index+1].r_refcount); + + le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters, + le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters)); + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 2) + memmove(&rb->rf_records.rl_recs[index + 1], + 
&rb->rf_records.rl_recs[index + 2], + sizeof(struct ocfs2_refcount_rec) * + (le16_to_cpu(rb->rf_records.rl_used) - index - 2)); + + memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + le16_add_cpu(&rb->rf_records.rl_used, -1); +} + +/* + * Merge the refcount rec if we are contiguous with the adjacent recs. + */ +static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig contig = + ocfs2_refcount_rec_contig(rb, index); + + if (contig == REF_CONTIG_NONE) + return; + + if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) { + BUG_ON(index == 0); + index--; + } + + ocfs2_rotate_refcount_rec_left(rb, index); + + if (contig == REF_CONTIG_LEFTRIGHT) + ocfs2_rotate_refcount_rec_left(rb, index); +} + +/* + * Change the refcount indexed by "index" in ref_bh. + * If refcount reaches 0, remove it. + */ +static int ocfs2_change_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + int index, int merge, int change) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_rec *rec = &rl->rl_recs[index]; + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "change index %d, old count %u, change %d\n", index, + le32_to_cpu(rec->r_refcount), change); + le32_add_cpu(&rec->r_refcount, change); + + if (!rec->r_refcount) { + if (index != le16_to_cpu(rl->rl_used) - 1) { + memmove(rec, rec + 1, + (le16_to_cpu(rl->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + } + + le16_add_cpu(&rl->rl_used, -1); + } else if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ret = 
ocfs2_journal_dirty(handle, ref_leaf_bh); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static int ocfs2_expand_inline_ref_root(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head **ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; + struct ocfs2_refcount_block *root_rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_bh = sb_getblk(sb, blkno); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Initialize ocfs2_refcount_block. + * It should contain the same information as the old root. + * so just memcpy it and change the corresponding field. + */ + memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); + + new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; + new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); + new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + new_rb->rf_blkno = cpu_to_le64(blkno); + new_rb->rf_cpos = cpu_to_le32(0); + new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); + new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); + ocfs2_journal_dirty(handle, new_bh); + + /* Now change the root. 
*/ + memset(&root_rb->rf_list, 0, sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_list)); + root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb)); + root_rb->rf_clusters = cpu_to_le32(1); + root_rb->rf_list.l_next_free_rec = cpu_to_le16(1); + root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL); + + ocfs2_journal_dirty(handle, ref_root_bh); + + mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno, + le16_to_cpu(new_rb->rf_records.rl_used)); + + *ref_leaf_bh = new_bh; + new_bh = NULL; +out: + brelse(new_bh); + return ret; +} + +static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev, + struct ocfs2_refcount_rec *next) +{ + if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <= + ocfs2_get_ref_rec_low_cpos(next)) + return 1; + + return 0; +} + +static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b) +{ + const struct ocfs2_refcount_rec *l = a, *r = b; + u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l); + u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r); + + if (l_cpos > r_cpos) + return 1; + if (l_cpos < r_cpos) + return -1; + return 0; +} + +static int cmp_refcount_rec_by_cpos(const void *a, const void *b) +{ + const struct ocfs2_refcount_rec *l = a, *r = b; + u64 l_cpos = le64_to_cpu(l->r_cpos); + u64 r_cpos = le64_to_cpu(r->r_cpos); + + if (l_cpos > r_cpos) + return 1; + if (l_cpos < r_cpos) + return -1; + return 0; +} + +static void swap_refcount_rec(void *a, void *b, int size) +{ + struct ocfs2_refcount_rec *l = a, *r = b, tmp; + + tmp = *(struct ocfs2_refcount_rec *)l; + *(struct ocfs2_refcount_rec *)l = + *(struct ocfs2_refcount_rec *)r; + *(struct ocfs2_refcount_rec *)r = tmp; +} + +/* + * The refcount cpos are ordered by their 64bit cpos, + * But we will use the low 32 bit to be the e_cpos in the b-tree. 
+ * So we need to make sure that this pos isn't intersected with others. + * + * Note: The refcount block is already sorted by their low 32 bit cpos, + * So just try the middle pos first, and we will exit when we find + * the good position. + */ +static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl, + u32 *split_pos, int *split_index) +{ + int num_used = le16_to_cpu(rl->rl_used); + int delta, middle = num_used / 2; + + for (delta = 0; delta < middle; delta++) { + /* Let's check delta earlier than middle */ + if (ocfs2_refcount_rec_no_intersect( + &rl->rl_recs[middle - delta - 1], + &rl->rl_recs[middle - delta])) { + *split_index = middle - delta; + break; + } + + /* For even counts, don't walk off the end */ + if ((middle + delta + 1) == num_used) + continue; + + /* Now try delta past middle */ + if (ocfs2_refcount_rec_no_intersect( + &rl->rl_recs[middle + delta], + &rl->rl_recs[middle + delta + 1])) { + *split_index = middle + delta + 1; + break; + } + } + + if (delta >= middle) + return -ENOSPC; + + *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]); + return 0; +} + +static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, + struct buffer_head *new_bh, + u32 *split_cpos) +{ + int split_index = 0, num_moved, ret; + u32 cpos = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_block *new_rb = + (struct ocfs2_refcount_block *)new_bh->b_data; + struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; + + mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n", + (unsigned long long)ref_leaf_bh->b_blocknr, + le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); + + /* + * XXX: Improvement later. + * If we know all the high 32 bit cpos is the same, no need to sort. + * + * In order to make the whole process safe, we do: + * 1. 
sort the entries by their low 32 bit cpos first so that we can + * find the split cpos easily. + * 2. call ocfs2_insert_extent to insert the new refcount block. + * 3. move the refcount rec to the new block. + * 4. sort the entries by their 64 bit cpos. + * 5. dirty the new_rb and rb. + */ + sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_low_cpos, swap_refcount_rec); + + ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); + if (ret) { + mlog_errno(ret); + return ret; + } + + new_rb->rf_cpos = cpu_to_le32(cpos); + + /* move refcount records starting from split_index to the new block. */ + num_moved = le16_to_cpu(rl->rl_used) - split_index; + memcpy(new_rl->rl_recs, &rl->rl_recs[split_index], + num_moved * sizeof(struct ocfs2_refcount_rec)); + + /*ok, remove the entries we just moved over to the other block. */ + memset(&rl->rl_recs[split_index], 0, + num_moved * sizeof(struct ocfs2_refcount_rec)); + + /* change old and new rl_used accordingly. 
*/ + le16_add_cpu(&rl->rl_used, -num_moved); + new_rl->rl_used = cpu_to_le32(num_moved); + + sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_cpos, swap_refcount_rec); + + sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), + sizeof(struct ocfs2_refcount_rec), + cmp_refcount_rec_by_cpos, swap_refcount_rec); + + *split_cpos = cpos; + return 0; +} + +static int ocfs2_new_leaf_refcount_block(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got, new_cpos; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *root_rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; + struct ocfs2_extent_tree ref_et; + + BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)); + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + new_bh = sb_getblk(sb, blkno); + if (new_bh == NULL) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + ocfs2_set_new_buffer_uptodate(ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Initialize ocfs2_refcount_block. 
*/ + new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(new_rb, 0, sb->s_blocksize); + strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); + new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + new_rb->rf_blkno = cpu_to_le64(blkno); + new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); + new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); + new_rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); + new_rb->rf_generation = root_rb->rf_generation; + + ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_journal_dirty(handle, ref_leaf_bh); + ocfs2_journal_dirty(handle, new_bh); + + ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); + + mlog(0, "insert new leaf block %llu at %u\n", + (unsigned long long)new_bh->b_blocknr, new_cpos); + + /* Insert the new leaf block with the specific offset cpos. */ + ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, + 1, 0, meta_ac); + if (ret) + mlog_errno(ret); + +out: + brelse(new_bh); + return ret; +} + +static int ocfs2_expand_refcount_tree(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct buffer_head *expand_bh = NULL; + + if (ref_root_bh == ref_leaf_bh) { + /* + * the old root bh hasn't been expanded to a b-tree, + * so expand it first. 
+ */ + ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh, + &expand_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + expand_bh = ref_leaf_bh; + get_bh(expand_bh); + } + + + /* Now add a new refcount block into the tree.*/ + ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh, + expand_bh, meta_ac); + if (ret) + mlog_errno(ret); +out: + brelse(expand_bh); + return ret; +} + +/* + * Adjust the extent rec in b-tree representing ref_leaf_bh. + * + * Only called when we have inserted a new refcount rec at index 0 + * which means ocfs2_extent_rec.e_cpos may need some change. + */ +static int ocfs2_adjust_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *rec) +{ + int ret = 0, i; + u32 new_cpos, old_cpos; + struct ocfs2_path *path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + struct ocfs2_extent_list *el; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + old_cpos = le32_to_cpu(rb->rf_cpos); + new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; + if (old_cpos <= new_cpos) + goto out; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + path = ocfs2_new_path_from_et(&et); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, path, old_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * 2 more credits, one for the leaf refcount block, one for + * the extent block contains the extent rec. 
+ */ + ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path), + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* change the leaf extent block first. */ + el = path_leaf_el(path); + + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) + if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos) + break; + + BUG_ON(i == le16_to_cpu(el->l_next_free_rec)); + + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + + /* change the r_cpos in the leaf block. */ + rb->rf_cpos = cpu_to_le32(new_cpos); + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); + ocfs2_journal_dirty(handle, ref_leaf_bh); + +out: + ocfs2_free_path(path); + return ret; +} + +static int ocfs2_insert_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *rec, + int index, int merge, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rf_list = &rb->rf_records; + struct buffer_head *new_bh = NULL; + + BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); + + if (rf_list->rl_used == rf_list->rl_count) { + u64 cpos = le64_to_cpu(rec->r_cpos); + u32 len = le32_to_cpu(rec->r_clusters); + + ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, NULL, &index, + &new_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ref_leaf_bh = new_bh; + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + rf_list = &rb->rf_records; + } + + ret = 
ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (index < le16_to_cpu(rf_list->rl_used)) + memmove(&rf_list->rl_recs[index + 1], + &rf_list->rl_recs[index], + (le16_to_cpu(rf_list->rl_used) - index) * + sizeof(struct ocfs2_refcount_rec)); + + mlog(0, "insert refcount record start %llu, len %u, count %u " + "to leaf block %llu at index %d\n", + (unsigned long long)le64_to_cpu(rec->r_cpos), + le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount), + (unsigned long long)ref_leaf_bh->b_blocknr, index); + + rf_list->rl_recs[index] = *rec; + + le16_add_cpu(&rf_list->rl_used, 1); + + if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ret = ocfs2_journal_dirty(handle, ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (index == 0) { + ret = ocfs2_adjust_refcount_rec(handle, ci, + ref_root_bh, + ref_leaf_bh, rec); + if (ret) + mlog_errno(ret); + } +out: + brelse(new_bh); + return ret; +} + +/* + * Split the refcount_rec indexed by "index" in ref_leaf_bh. + * This is much simple than our b-tree code. + * split_rec is the new refcount rec we want to insert. + * If split_rec->r_refcount > 0, we are changing the refcount(in case we + * increase refcount or decrease a refcount to non-zero). + * If split_rec->r_refcount == 0, we are punching a hole in current refcount + * rec( in case we decrease a refcount to zero). 
+ */ +static int ocfs2_split_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_refcount_rec *split_rec, + int index, int merge, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret, recs_need; + u32 len; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rf_list = &rb->rf_records; + struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index]; + struct ocfs2_refcount_rec *tail_rec = NULL; + struct buffer_head *new_bh = NULL; + + BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); + + mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n", + le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters), + le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters)); + + /* + * If we just need to split the header or tail clusters, + * no more recs are needed, just split is OK. + * Otherwise we at least need one new recs. + */ + if (!split_rec->r_refcount && + (split_rec->r_cpos == orig_rec->r_cpos || + le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters) == + le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) + recs_need = 0; + else + recs_need = 1; + + /* + * We need one more rec if we split in the middle and the new rec have + * some refcount in it. + */ + if (split_rec->r_refcount && + (split_rec->r_cpos != orig_rec->r_cpos && + le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters) != + le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) + recs_need++; + + /* If the leaf block don't have enough record, expand it. 
*/ + if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { + struct ocfs2_refcount_rec tmp_rec; + u64 cpos = le64_to_cpu(orig_rec->r_cpos); + len = le32_to_cpu(orig_rec->r_clusters); + ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have to re-get it since now cpos may be moved to + * another leaf block. + */ + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &tmp_rec, &index, + &new_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ref_leaf_bh = new_bh; + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + rf_list = &rb->rf_records; + orig_rec = &rf_list->rl_recs[index]; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We have calculated out how many new records we need and store + * in recs_need, so spare enough space first by moving the records + * after "index" to the end. + */ + if (index != le16_to_cpu(rf_list->rl_used) - 1) + memmove(&rf_list->rl_recs[index + 1 + recs_need], + &rf_list->rl_recs[index + 1], + (le16_to_cpu(rf_list->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + + len = (le64_to_cpu(orig_rec->r_cpos) + + le32_to_cpu(orig_rec->r_clusters)) - + (le64_to_cpu(split_rec->r_cpos) + + le32_to_cpu(split_rec->r_clusters)); + + /* + * If we have "len", the we will split in the tail and move it + * to the end of the space we have just spared. + */ + if (len) { + tail_rec = &rf_list->rl_recs[index + recs_need]; + + memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); + le64_add_cpu(&tail_rec->r_cpos, + le32_to_cpu(tail_rec->r_clusters) - len); + tail_rec->r_clusters = le32_to_cpu(len); + } + + /* + * If the split pos isn't the same as the original one, we need to + * split in the head. 
+ * + * Note: We have the chance that split_rec.r_refcount = 0, + * recs_need = 0 and len > 0, which means we just cut the head from + * the orig_rec and in that case we have done some modification in + * orig_rec above, so the check for r_cpos is faked. + */ + if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) { + len = le64_to_cpu(split_rec->r_cpos) - + le64_to_cpu(orig_rec->r_cpos); + orig_rec->r_clusters = cpu_to_le32(len); + index++; + } + + le16_add_cpu(&rf_list->rl_used, recs_need); + + if (split_rec->r_refcount) { + rf_list->rl_recs[index] = *split_rec; + mlog(0, "insert refcount record start %llu, len %u, count %u " + "to leaf block %llu at index %d\n", + (unsigned long long)le64_to_cpu(split_rec->r_cpos), + le32_to_cpu(split_rec->r_clusters), + le32_to_cpu(split_rec->r_refcount), + (unsigned long long)ref_leaf_bh->b_blocknr, index); + + if (merge) + ocfs2_refcount_rec_merge(rb, index); + } + + ret = ocfs2_journal_dirty(handle, ref_leaf_bh); + if (ret) + mlog_errno(ret); + +out: + brelse(new_bh); + return ret; +} + +static int __ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, int merge, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0, index; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_refcount_rec rec; + unsigned int set_len = 0; + + mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len); + + while (len) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + set_len = le32_to_cpu(rec.r_clusters); + + /* + * Here we may meet with 3 situations: + * + * 1. If we find an already existing record, and the length + * is the same, cool, we just need to increase the r_refcount + * and it is OK. + * 2. 
If we find a hole, just insert it with r_refcount = 1. + * 3. If we are in the middle of one extent record, split + * it. + */ + if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && + set_len <= len) { + mlog(0, "increase refcount rec, start %llu, len %u, " + "count %u\n", (unsigned long long)cpos, set_len, + le32_to_cpu(rec.r_refcount)); + ret = ocfs2_change_refcount_rec(handle, ci, + ref_leaf_bh, index, + merge, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (!rec.r_refcount) { + rec.r_refcount = cpu_to_le32(1); + + mlog(0, "insert refcount rec, start %llu, len %u\n", + (unsigned long long)le64_to_cpu(rec.r_cpos), + set_len); + ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, + ref_leaf_bh, + &rec, index, + merge, meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } else { + set_len = min((u64)(cpos + len), + le64_to_cpu(rec.r_cpos) + set_len) - cpos; + rec.r_cpos = cpu_to_le64(cpos); + rec.r_clusters = cpu_to_le32(set_len); + le32_add_cpu(&rec.r_refcount, 1); + + mlog(0, "split refcount rec, start %llu, " + "len %u, count %u\n", + (unsigned long long)le64_to_cpu(rec.r_cpos), + set_len, le32_to_cpu(rec.r_refcount)); + ret = ocfs2_split_refcount_rec(handle, ci, + ref_root_bh, ref_leaf_bh, + &rec, index, merge, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += set_len; + len -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +static int ocfs2_remove_refcount_extent(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_extent_tree et; + + BUG_ON(rb->rf_records.rl_used); + + 
ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), + 1, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_remove_from_cache(ci, ref_leaf_bh); + + /* + * add the freed block to the dealloc so that it will be freed + * when we run dealloc. + */ + ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot), + le64_to_cpu(rb->rf_blkno), + le16_to_cpu(rb->rf_suballoc_bit)); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + le32_add_cpu(&rb->rf_clusters, -1); + + /* + * check whether we need to restore the root refcount block if + * there is no leaf extent block at atll. + */ + if (!rb->rf_list.l_next_free_rec) { + BUG_ON(rb->rf_clusters); + + mlog(0, "reset refcount tree root %llu to be a record block.\n", + (unsigned long long)ref_root_bh->b_blocknr); + + rb->rf_flags = 0; + rb->rf_parent = 0; + rb->rf_cpos = 0; + memset(&rb->rf_records, 0, sb->s_blocksize - + offsetof(struct ocfs2_refcount_block, rf_records)); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); + } + + ocfs2_journal_dirty(handle, ref_root_bh); + +out: + return ret; +} + +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + return __ocfs2_increase_refcount(handle, ci, ref_root_bh, + cpos, len, 1, + meta_ac, dealloc); +} + +static int ocfs2_decrease_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head *ref_leaf_bh, + int index, u64 cpos, unsigned int len, + struct ocfs2_alloc_context *meta_ac, + struct 
ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index]; + + BUG_ON(cpos < le64_to_cpu(rec->r_cpos)); + BUG_ON(cpos + len > + le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); + + if (cpos == le64_to_cpu(rec->r_cpos) && + len == le32_to_cpu(rec->r_clusters)) + ret = ocfs2_change_refcount_rec(handle, ci, + ref_leaf_bh, index, 1, -1); + else { + struct ocfs2_refcount_rec split = *rec; + split.r_cpos = cpu_to_le64(cpos); + split.r_clusters = cpu_to_le32(len); + + le32_add_cpu(&split.r_refcount, -1); + + mlog(0, "split refcount rec, start %llu, " + "len %u, count %u, original start %llu, len %u\n", + (unsigned long long)le64_to_cpu(split.r_cpos), + len, le32_to_cpu(split.r_refcount), + (unsigned long long)le64_to_cpu(rec->r_cpos), + le32_to_cpu(rec->r_clusters)); + ret = ocfs2_split_refcount_rec(handle, ci, + ref_root_bh, ref_leaf_bh, + &split, index, 1, + meta_ac, dealloc); + } + + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Remove the leaf refcount block if it contains no refcount record. 
*/ + if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) { + ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh, + ref_leaf_bh, meta_ac, + dealloc); + if (ret) + mlog_errno(ret); + } + +out: + return ret; +} + +static int __ocfs2_decrease_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete) +{ + int ret = 0, index = 0; + struct ocfs2_refcount_rec rec; + unsigned int r_count = 0, r_len; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *ref_leaf_bh = NULL; + + mlog(0, "Tree owner %llu, decrease refcount start %llu, " + "len %u, delete %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)cpos, len, delete); + + while (len) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, len, &rec, &index, + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + r_count = le32_to_cpu(rec.r_refcount); + BUG_ON(r_count == 0); + if (!delete) + BUG_ON(r_count > 1); + + r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + + ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh, + ref_leaf_bh, index, + cpos, r_len, + meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le32_to_cpu(rec.r_refcount) == 1 && delete) { + ret = ocfs2_cache_cluster_dealloc(dealloc, + ocfs2_clusters_to_blocks(sb, cpos), + r_len); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += r_len; + len -= r_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* Caller must hold refcount tree lock. 
*/
int ocfs2_decrease_refcount(struct inode *inode,
			    handle_t *handle, u32 cpos, u32 len,
			    struct ocfs2_alloc_context *meta_ac,
			    struct ocfs2_cached_dealloc_ctxt *dealloc,
			    int delete)
{
	int status;
	u64 root_blkno;
	struct ocfs2_refcount_tree *ref_tree;
	struct buffer_head *root_bh = NULL;

	/* The inode must already be attached to a refcount tree. */
	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));

	/* Locate the tree, load its root block, then do the real work. */
	status = ocfs2_get_refcount_block(inode, &root_blkno);
	if (status)
		goto bail;

	status = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), root_blkno,
					 &ref_tree);
	if (status)
		goto bail;

	status = ocfs2_read_refcount_block(&ref_tree->rf_ci,
					   ref_tree->rf_blkno, &root_bh);
	if (status)
		goto bail;

	status = __ocfs2_decrease_refcount(handle, &ref_tree->rf_ci, root_bh,
					   cpos, len, meta_ac, dealloc,
					   delete);

bail:
	/* Every failure above is logged exactly once, here. */
	if (status)
		mlog_errno(status);
	brelse(root_bh);
	return status;
}

/*
 * Mark the already-existing extent at cpos as refcounted for len clusters.
 * This adds the refcount extent flag.
 *
 * If the existing extent is larger than the request, initiate a
 * split. An attempt will be made at merging with adjacent extents.
 *
 * The caller is responsible for passing down meta_ac if we'll need it.
+ */ +static int ocfs2_mark_extent_refcounted(struct inode *inode, + struct ocfs2_extent_tree *et, + handle_t *handle, u32 cpos, + u32 len, u32 phys, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + + mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n", + inode->i_ino, cpos, len, phys); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + ret = ocfs2_change_extent_flag(handle, et, cpos, + len, phys, meta_ac, dealloc, + OCFS2_EXT_REFCOUNTED, 0); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + +/* + * Given some contiguous physical clusters, calculate what we need + * for modifying their refcount. + */ +static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 start_cpos, + u32 clusters, + int *meta_add, + int *credits) +{ + int ret = 0, index, ref_blocks = 0, recs_add = 0; + u64 cpos = start_cpos; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_rec rec; + struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; + u32 len; + + mlog(0, "start_cpos %llu, clusters %u\n", + (unsigned long long)start_cpos, clusters); + while (clusters) { + ret = ocfs2_get_refcount_rec(ci, ref_root_bh, + cpos, clusters, &rec, + &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (ref_leaf_bh != prev_bh) { + /* + * Now we encounter a new leaf block, so calculate + * whether we need to extend the old leaf. 
+ */ + if (prev_bh) { + rb = (struct ocfs2_refcount_block *) + prev_bh->b_data; + + if (le64_to_cpu(rb->rf_records.rl_used) + + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + } + + recs_add = 0; + *credits += 1; + brelse(prev_bh); + prev_bh = ref_leaf_bh; + get_bh(prev_bh); + } + + rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + + mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu," + "rec->r_clusters %u, rec->r_refcount %u, index %d\n", + recs_add, (unsigned long long)cpos, clusters, + (unsigned long long)le64_to_cpu(rec.r_cpos), + le32_to_cpu(rec.r_clusters), + le32_to_cpu(rec.r_refcount), index); + + len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - cpos; + /* + * If the refcount rec already exist, cool. We just need + * to check whether there is a split. Otherwise we just need + * to increase the refcount. + * If we will insert one, increases recs_add. + * + * We record all the records which will be inserted to the + * same refcount block, so that we can tell exactly whether + * we need a new refcount block or not. + */ + if (rec.r_refcount) { + /* Check whether we need a split at the beginning. */ + if (cpos == start_cpos && + cpos != le64_to_cpu(rec.r_cpos)) + recs_add++; + + /* Check whether we need a split in the end. */ + if (cpos + clusters < le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) + recs_add++; + } else + recs_add++; + + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + clusters -= len; + cpos += len; + } + + if (prev_bh) { + rb = (struct ocfs2_refcount_block *)prev_bh->b_data; + + if (le64_to_cpu(rb->rf_records.rl_used) + recs_add > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + } + + if (!ref_blocks) + goto out; + + mlog(0, "we need ref_blocks %d\n", ref_blocks); + *meta_add += ref_blocks; + *credits += ref_blocks; + + /* + * So we may need ref_blocks to insert into the tree. 
+ * That also means we need to change the b-tree and add that number + * of records since we never merge them. + * We need one more block for expansion since the new created leaf + * block is also full and needs split. + */ + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + *meta_add += ocfs2_extend_meta_needed(et.et_root_el); + *credits += ocfs2_calc_extend_credits(sb, + et.et_root_el, + ref_blocks); + } else { + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + *meta_add += 1; + } + +out: + brelse(ref_leaf_bh); + brelse(prev_bh); + return ret; +} + +/* + * For refcount tree, we will decrease some contiguous clusters + * refcount count, so just go through it to see how many blocks + * we gonna touch and whether we need to create new blocks. + * + * Normally the refcount blocks store these refcount should be + * continguous also, so that we can get the number easily. + * As for meta_ac, we will at most add split 2 refcount record and + * 2 more refcount block, so just check it in a rough way. + * + * Caller must hold refcount tree lock. 
+ */ +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + struct buffer_head *di_bh, + u64 phys_blkno, + u32 clusters, + int *credits, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, ref_blocks = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *tree; + u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + ret = -EROFS; + goto out; + } + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_refcount_loc), &tree); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, + le64_to_cpu(di->i_refcount_loc), + &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, + &tree->rf_ci, + ref_root_bh, + start_cpos, clusters, + &ref_blocks, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "reserve new metadata %d, credits = %d\n", + ref_blocks, *credits); + + if (ref_blocks) { + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + ref_blocks, meta_ac); + if (ret) + mlog_errno(ret); + } + +out: + brelse(ref_root_bh); + return ret; +} + +#define MAX_CONTIG_BYTES 1048576 + +static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) +{ + return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); +} + +static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb) +{ + return ~(ocfs2_cow_contig_clusters(sb) - 1); +} + +/* + * Given an extent that starts at 'start' and an I/O that starts at 'cpos', + * find an offset (start + (n * contig_clusters)) that is closest to cpos + * 
while still being less than or equal to it. + * + * The goal is to break the extent at a multiple of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, + unsigned int start, + unsigned int cpos) +{ + BUG_ON(start > cpos); + + return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); +} + +/* + * Given a cluster count of len, pad it out so that it is a multiple + * of contig_clusters. + */ +static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, + unsigned int len) +{ + unsigned int padded = + (len + (ocfs2_cow_contig_clusters(sb) - 1)) & + ocfs2_cow_contig_mask(sb); + + /* Did we wrap? */ + if (padded < len) + padded = UINT_MAX; + + return padded; +} + +/* + * Calculate out the start and number of virtual clusters we need to to CoW. + * + * cpos is vitual start cluster position we want to do CoW in a + * file and write_len is the cluster length. + * max_cpos is the place where we want to stop CoW intentionally. + * + * Normal we will start CoW from the beginning of extent record cotaining cpos. + * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we + * get good I/O from the resulting extent tree. 
+ */ +static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, + struct ocfs2_extent_list *el, + u32 cpos, + u32 write_len, + u32 max_cpos, + u32 *cow_start, + u32 *cow_len) +{ + int ret = 0; + int tree_height = le16_to_cpu(el->l_tree_depth), i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb = NULL; + struct ocfs2_extent_rec *rec; + unsigned int want_clusters, rec_end = 0; + int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); + int leaf_clusters; + + BUG_ON(cpos + write_len > max_cpos); + + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + *cow_len = 0; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + + if (ocfs2_is_empty_extent(rec)) { + mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " + "index %d\n", inode->i_ino, i); + continue; + } + + if (le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters) <= cpos) + continue; + + if (*cow_len == 0) { + /* + * We should find a refcounted record in the + * first pass. + */ + BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); + *cow_start = le32_to_cpu(rec->e_cpos); + } + + /* + * If we encounter a hole, a non-refcounted record or + * pass the max_cpos, stop the search. 
+ */ + if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || + (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || + (max_cpos <= le32_to_cpu(rec->e_cpos))) + break; + + leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); + rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; + if (rec_end > max_cpos) { + rec_end = max_cpos; + leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); + } + + /* + * How many clusters do we actually need from + * this extent? First we see how many we actually + * need to complete the write. If that's smaller + * than contig_clusters, we try for contig_clusters. + */ + if (!*cow_len) + want_clusters = write_len; + else + want_clusters = (cpos + write_len) - + (*cow_start + *cow_len); + if (want_clusters < contig_clusters) + want_clusters = contig_clusters; + + /* + * If the write does not cover the whole extent, we + * need to calculate how we're going to split the extent. + * We try to do it on contig_clusters boundaries. + * + * Any extent smaller than contig_clusters will be + * CoWed in its entirety. + */ + if (leaf_clusters <= contig_clusters) + *cow_len += leaf_clusters; + else if (*cow_len || (*cow_start == cpos)) { + /* + * This extent needs to be CoW'd from its + * beginning, so all we have to do is compute + * how many clusters to grab. We align + * want_clusters to the edge of contig_clusters + * to get better I/O. + */ + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + + if (leaf_clusters < want_clusters) + *cow_len += leaf_clusters; + else + *cow_len += want_clusters; + } else if ((*cow_start + contig_clusters) >= + (cpos + write_len)) { + /* + * Breaking off contig_clusters at the front + * of the extent will cover our write. That's + * easy. + */ + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= contig_clusters) { + /* + * Breaking off contig_clusters at the tail of + * this extent will cover cpos. 
+ */ + *cow_start = rec_end - contig_clusters; + *cow_len = contig_clusters; + } else if ((rec_end - cpos) <= want_clusters) { + /* + * While we can't fit the entire write in this + * extent, we know that the write goes from cpos + * to the end of the extent. Break that off. + * We try to break it at some multiple of + * contig_clusters from the front of the extent. + * Failing that (ie, cpos is within + * contig_clusters of the front), we'll CoW the + * entire extent. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + *cow_len = rec_end - *cow_start; + } else { + /* + * Ok, the entire write lives in the middle of + * this extent. Let's try to slice the extent up + * nicely. Optimally, our CoW region starts at + * m*contig_clusters from the beginning of the + * extent and goes for n*contig_clusters, + * covering the entire write. + */ + *cow_start = ocfs2_cow_align_start(inode->i_sb, + *cow_start, cpos); + + want_clusters = (cpos + write_len) - *cow_start; + want_clusters = ocfs2_cow_align_length(inode->i_sb, + want_clusters); + if (*cow_start + want_clusters <= rec_end) + *cow_len = want_clusters; + else + *cow_len = rec_end - *cow_start; + } + + /* Have we covered our entire write yet? */ + if ((*cow_start + *cow_len) >= (cpos + write_len)) + break; + + /* + * If we reach the end of the extent block and don't get enough + * clusters, continue with the next extent block if possible. 
+ */ + if (i + 1 == le16_to_cpu(el->l_next_free_rec) && + eb && eb->h_next_leaf_blk) { + brelse(eb_bh); + eb_bh = NULL; + + ret = ocfs2_read_extent_block(INODE_CACHE(inode), + le64_to_cpu(eb->h_next_leaf_blk), + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + i = -1; + } + } + +out: + brelse(eb_bh); + return ret; +} + +/* + * Prepare meta_ac, data_ac and calculate credits when we want to add some + * num_clusters in data_tree "et" and change the refcount for the old + * clusters(starting form p_cluster) in the refcount tree. + * + * Note: + * 1. since we may split the old tree, so we at most will need num_clusters + 2 + * more new leaf records. + * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so + * just give data_ac = NULL. + */ +static int ocfs2_lock_refcount_allocators(struct super_block *sb, + u32 p_cluster, u32 num_clusters, + struct ocfs2_extent_tree *et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac, + int *credits) +{ + int ret = 0, meta_add = 0; + int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < num_clusters + 2) + meta_add = + ocfs2_extend_meta_needed(et->et_root_el); + + *credits += ocfs2_calc_extend_credits(sb, et->et_root_el, + num_clusters + 2); + + ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, + p_cluster, num_clusters, + &meta_add, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n", + meta_add, num_clusters, *credits); + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (data_ac) { + ret = ocfs2_reserve_clusters(OCFS2_SB(sb), 
num_clusters, + data_ac); + if (ret) + mlog_errno(ret); + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) +{ + BUG_ON(buffer_dirty(bh)); + + clear_buffer_mapped(bh); + + return 0; +} + +static int ocfs2_duplicate_clusters_by_page(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len) +{ + int ret = 0, partial; + struct ocfs2_caching_info *ci = context->data_et.et_ci; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); + struct page *page; + pgoff_t page_index; + unsigned int from, to; + loff_t offset, end, map_end; + struct address_space *mapping = context->inode->i_mapping; + + mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, + new_cluster, new_len, cpos); + + offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); + + while (offset < end) { + page_index = offset >> PAGE_CACHE_SHIFT; + map_end = (page_index + 1) << PAGE_CACHE_SHIFT; + if (map_end > end) + map_end = end; + + /* from, to is the offset within the page. */ + from = offset & (PAGE_CACHE_SIZE - 1); + to = PAGE_CACHE_SIZE; + if (map_end & (PAGE_CACHE_SIZE - 1)) + to = map_end & (PAGE_CACHE_SIZE - 1); + + page = grab_cache_page(mapping, page_index); + + /* This page can't be dirtied before we CoW it out. 
*/
+		BUG_ON(PageDirty(page));
+
+		if (!PageUptodate(page)) {	/* read the current contents in first */
+			ret = block_read_full_page(page, ocfs2_get_block);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock;
+			}
+			lock_page(page);	/* NOTE(review): assumes the read left the page unlocked - confirm */
+		}
+
+		if (page_has_buffers(page)) {	/* drop the existing buffer mappings */
+			ret = walk_page_buffers(handle, page_buffers(page),
+						from, to, &partial,
+						ocfs2_clear_cow_buffer);
+			if (ret) {
+				mlog_errno(ret);
+				goto unlock;
+			}
+		}
+
+		ocfs2_map_and_dirty_page(context->inode,
+					 handle, from, to,
+					 page, 0, &new_block);
+		mark_page_accessed(page);
+unlock:
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+		offset = map_end;	/* advance even on error; the check below ends the loop */
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+/*
+ * Duplicate new_len clusters block by block through the journal: every new
+ * block is obtained with sb_getblk(), journalled with
+ * OCFS2_JOURNAL_ACCESS_CREATE, filled by memcpy() from the matching old
+ * block and dirtied in the handle.  cpos is not used by this variant.
+ *
+ * Returns 0 on success or a negative errno (-EIO if sb_getblk() fails);
+ * the copy stops at the first error.
+ */
+static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
+					   struct ocfs2_cow_context *context,
+					   u32 cpos, u32 old_cluster,
+					   u32 new_cluster, u32 new_len)
+{
+	int ret = 0;
+	struct super_block *sb = context->inode->i_sb;
+	struct ocfs2_caching_info *ci = context->data_et.et_ci;
+	int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
+	u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
+	u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct buffer_head *old_bh = NULL;
+	struct buffer_head *new_bh = NULL;
+
+	mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
+	     new_cluster, new_len);
+
+	for (i = 0; i < blocks; i++, old_block++, new_block++) {
+		new_bh = sb_getblk(osb->sb, new_block);
+		if (new_bh == NULL) {
+			ret = -EIO;
+			mlog_errno(ret);
+			break;
+		}
+
+		ocfs2_set_new_buffer_uptodate(ci, new_bh);
+
+		ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_journal_access(handle, ci, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
+		ret = ocfs2_journal_dirty(handle, new_bh);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		brelse(new_bh);
+		brelse(old_bh);
+		new_bh = NULL;	/* keeps the brelse() calls after the loop harmless */
+		old_bh = NULL;
+	}
+
+	brelse(new_bh);
+	brelse(old_bh);
+	return ret;
+}
+/*
+ * Clear OCFS2_EXT_REFCOUNTED on the len clusters at logical offset cpos in
+ * extent tree et.  A replacement record mapping them to p_cluster, carrying
+ * ext_flags minus the refcounted bit, is built and handed to
+ * ocfs2_split_extent().
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, -EROFS (after
+ * ocfs2_error()) if no record for cpos is found in the leaf, or the error
+ * from the path lookup or the split.
+ */
+static int ocfs2_clear_ext_refcount(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    u32 cpos, u32 p_cluster, u32 len,
+				    unsigned int ext_flags,
+				    struct ocfs2_alloc_context *meta_ac,
+				    struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret, index;
+	struct ocfs2_extent_rec replace_rec;
+	struct ocfs2_path *path = NULL;
+	struct ocfs2_extent_list *el;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
+	u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
+
+	mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
+	     (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
+
+	/* The record is built with little-endian fields, as stored on disk. */
+	memset(&replace_rec, 0, sizeof(replace_rec));
+	replace_rec.e_cpos = cpu_to_le32(cpos);
+	replace_rec.e_leaf_clusters = cpu_to_le16(len);
+	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
+								   p_cluster));
+	replace_rec.e_flags = ext_flags;
+	replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
+
+	path = ocfs2_new_path_from_et(et);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_path(et->et_ci, path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+
+	index = ocfs2_search_extent_list(el, cpos);
+	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+		ocfs2_error(sb,
+			    "Inode %llu has an extent at cpos %u which can no "
+			    "longer be found.\n",
+			    (unsigned long long)ino, cpos);
+		ret = -EROFS;
+		goto out;
+	}
+
+	ret = ocfs2_split_extent(handle, et, path, index,
+				 &replace_rec, meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_free_path(path);	/* also reached with path == NULL */
+	return ret;
+}
+/*
+ * Switch the len clusters at cpos over to the physical clusters at new:
+ * the data is copied from old through context->cow_duplicate_clusters
+ * unless the extent is unwritten, then the extent record is rewritten
+ * without the refcounted flag.
+ */
+static int ocfs2_replace_clusters(handle_t *handle,
+				  struct ocfs2_cow_context *context,
+				  u32 cpos, u32 old,
+				  u32 new, u32 len,
+				  unsigned int ext_flags)
+{
+	int ret;
+	struct ocfs2_caching_info *ci = context->data_et.et_ci;
+	u64 ino = ocfs2_metadata_cache_owner(ci);
+
+	mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
+	     (unsigned long
long)ino, cpos, old, new, len, ext_flags);
+
+	/* If the old clusters are unwritten, there is no data to duplicate. */
+	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+		ret = context->cow_duplicate_clusters(handle, context, cpos,
+						      old, new, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
+				       cpos, new, len, ext_flags,
+				       context->meta_ac, &context->dealloc);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
+}
+
+/*
+ * Start writeback for the byte range backing clusters
+ * [cpos, cpos + num_clusters) and wait for each page in it to finish.
+ * Nothing is done when ocfs2_should_order_data() is true for the inode.
+ *
+ * Returns 0 on success, the error from filemap_fdatawrite_range(),
+ * -ENOMEM if a page cannot be obtained, or -EIO if a page ends up with
+ * PageError set.
+ */
+static int ocfs2_cow_sync_writeback(struct super_block *sb,
+				    struct ocfs2_cow_context *context,
+				    u32 cpos, u32 num_clusters)
+{
+	int ret = 0;
+	loff_t offset, end, map_end;
+	pgoff_t page_index;
+	struct page *page;
+
+	if (ocfs2_should_order_data(context->inode))
+		return 0;
+
+	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
+	/* Widen first: shifting a u32 cluster count can overflow 32 bits. */
+	end = offset + ((loff_t)num_clusters <<
+			OCFS2_SB(sb)->s_clustersize_bits);
+
+	ret = filemap_fdatawrite_range(context->inode->i_mapping,
+				       offset, end - 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	while (offset < end) {
+		page_index = offset >> PAGE_CACHE_SHIFT;
+		/* Shift in loff_t; pgoff_t can be narrower than a file offset. */
+		map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+		if (map_end > end)
+			map_end = end;
+
+		page = grab_cache_page(context->inode->i_mapping, page_index);
+		if (!page) {
+			/* Report the failed allocation instead of BUG()ing. */
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			break;
+		}
+
+		wait_on_page_writeback(page);
+		if (PageError(page)) {
+			ret = -EIO;
+			mlog_errno(ret);
+		} else
+			mark_page_accessed(page);
+
+		unlock_page(page);
+		page_cache_release(page);
+		page = NULL;
+		offset = map_end;
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * get_clusters callback for CoW on inode data: a thin wrapper around
+ * ocfs2_get_clusters() on context->inode.
+ */
+static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
+				 u32 v_cluster, u32 *p_cluster,
+				 u32 *num_clusters,
+				 unsigned int *extent_flags)
+{
+	return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
+				  num_clusters, extent_flags);
+}
+
+/*
+ * Make the num_clusters clusters at logical offset cpos (physical
+ * p_cluster) writable inside one transaction.  The range is walked one
+ * refcount record at a time: where the refcount is 1 only the refcounted
+ * flag is cleared, otherwise new clusters are claimed, the data is
+ * duplicated and the old clusters' refcount is decreased.
+ *
+ * Returns 0 on success or a negative errno.  context->data_ac and
+ * context->meta_ac are released before returning.
+ */
+static int ocfs2_make_clusters_writable(struct super_block *sb,
+					struct ocfs2_cow_context *context,
+					u32 cpos, u32 p_cluster,
+					u32 num_clusters, unsigned int e_flags)
+{
+	int ret, delete, index, credits = 0;
+	
u32 new_bit, new_len; + unsigned int set_len; + struct ocfs2_super *osb = OCFS2_SB(sb); + handle_t *handle; + struct buffer_head *ref_leaf_bh = NULL; + struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; + struct ocfs2_refcount_rec rec; + + mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n", + cpos, p_cluster, num_clusters, e_flags); + + ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, + &context->data_et, + ref_ci, + context->ref_root_bh, + &context->meta_ac, + &context->data_ac, &credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (context->post_refcount) + credits += context->post_refcount->credits; + + credits += context->extra_credits; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + while (num_clusters) { + ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, + p_cluster, num_clusters, + &rec, &index, &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + BUG_ON(!rec.r_refcount); + set_len = min((u64)p_cluster + num_clusters, + le64_to_cpu(rec.r_cpos) + + le32_to_cpu(rec.r_clusters)) - p_cluster; + + /* + * There are many different situation here. + * 1. If refcount == 1, remove the flag and don't COW. + * 2. If refcount > 1, allocate clusters. + * Here we may not allocate r_len once at a time, so continue + * until we reach num_clusters. 
+ */ + if (le32_to_cpu(rec.r_refcount) == 1) { + delete = 0; + ret = ocfs2_clear_ext_refcount(handle, + &context->data_et, + cpos, p_cluster, + set_len, e_flags, + context->meta_ac, + &context->dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } else { + delete = 1; + + ret = __ocfs2_claim_clusters(osb, handle, + context->data_ac, + 1, set_len, + &new_bit, &new_len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_replace_clusters(handle, context, + cpos, p_cluster, new_bit, + new_len, e_flags); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + set_len = new_len; + } + + ret = __ocfs2_decrease_refcount(handle, ref_ci, + context->ref_root_bh, + p_cluster, set_len, + context->meta_ac, + &context->dealloc, delete); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + cpos += set_len; + p_cluster += set_len; + num_clusters -= set_len; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + } + + /* handle any post_cow action. */ + if (context->post_refcount && context->post_refcount->func) { + ret = context->post_refcount->func(context->inode, handle, + context->post_refcount->para); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Here we should write the new page out first if we are + * in write-back mode. 
+ */ + if (context->get_clusters == ocfs2_di_get_clusters) { + ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (context->data_ac) { + ocfs2_free_alloc_context(context->data_ac); + context->data_ac = NULL; + } + if (context->meta_ac) { + ocfs2_free_alloc_context(context->meta_ac); + context->meta_ac = NULL; + } + brelse(ref_leaf_bh); + + return ret; +} + +static int ocfs2_replace_cow(struct ocfs2_cow_context *context) +{ + int ret = 0; + struct inode *inode = context->inode; + u32 cow_start = context->cow_start, cow_len = context->cow_len; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { + ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + "tree, but the feature bit is not set in the " + "super block.", inode->i_ino); + return -EROFS; + } + + ocfs2_init_dealloc_ctxt(&context->dealloc); + + while (cow_len) { + ret = context->get_clusters(context, cow_start, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); + + if (cow_len < num_clusters) + num_clusters = cow_len; + + ret = ocfs2_make_clusters_writable(inode->i_sb, context, + cow_start, p_cluster, + num_clusters, ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + cow_len -= num_clusters; + cow_start += num_clusters; + } + + if (ocfs2_dealloc_has_cluster(&context->dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &context->dealloc); + } + + return ret; +} + +/* + * Starting at cpos, try to CoW write_len clusters. Don't CoW + * past max_cpos. This will stop when it runs into a hole or an + * unrefcounted extent. 
+ */
+static int ocfs2_refcount_cow_hunk(struct inode *inode,
+				   struct buffer_head *di_bh,
+				   u32 cpos, u32 write_len, u32 max_cpos)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_cow_context *ctxt = NULL;
+	struct ocfs2_refcount_tree *tree;
+	struct buffer_head *root_bh = NULL;
+	u32 start = 0, len = 0;
+	int status;
+
+	/* Only inodes that already carry a refcount tree may get here. */
+	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+
+	/* Work out which cluster range really has to be CoWed. */
+	status = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
+						 cpos, write_len, max_cpos,
+						 &start, &len);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
+	     "cow_len %u\n", inode->i_ino,
+	     cpos, write_len, start, len);
+
+	BUG_ON(len == 0);
+
+	ctxt = kzalloc(sizeof(*ctxt), GFP_NOFS);
+	if (!ctxt) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = ocfs2_lock_refcount_tree(osb,
+					  le64_to_cpu(di->i_refcount_loc),
+					  1, &tree, &root_bh);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	/* Inode data is duplicated through the page cache. */
+	ctxt->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
+	ctxt->get_clusters = ocfs2_di_get_clusters;
+	ctxt->ref_root_bh = root_bh;
+	ctxt->ref_tree = tree;
+	ctxt->cow_len = len;
+	ctxt->cow_start = start;
+	ctxt->inode = inode;
+
+	ocfs2_init_dinode_extent_tree(&ctxt->data_et,
+				      INODE_CACHE(inode), di_bh);
+
+	status = ocfs2_replace_cow(ctxt);
+	if (status)
+		mlog_errno(status);
+
+	/*
+	 * Whether or not the CoW succeeded, the cached extent map can no
+	 * longer be trusted from the start of the CoW range onwards, so
+	 * drop it.
+	 */
+	ocfs2_extent_map_trunc(inode, start);
+
+	ocfs2_unlock_refcount_tree(osb, tree, 1);
+	brelse(root_bh);
+bail:
+	kfree(ctxt);
+	return status;
+}
+
+/*
+ * CoW any and all clusters between cpos and cpos+write_len.
+ * Don't CoW past max_cpos. If this returns successfully, all + * clusters between cpos and cpos+write_len are safe to modify. + */ +int ocfs2_refcount_cow(struct inode *inode, + struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos) +{ + int ret = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + while (write_len) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + break; + } + + if (write_len < num_clusters) + num_clusters = write_len; + + if (ext_flags & OCFS2_EXT_REFCOUNTED) { + ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, + num_clusters, max_cpos); + if (ret) { + mlog_errno(ret); + break; + } + } + + write_len -= num_clusters; + cpos += num_clusters; + } + + return ret; +} + +static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags) +{ + struct inode *inode = context->inode; + struct ocfs2_xattr_value_root *xv = context->cow_object; + + return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster, + num_clusters, &xv->xr_list, + extent_flags); +} + +/* + * Given a xattr value root, calculate the most meta/credits we need for + * refcount tree change if we truncate it to 0. 
+ */
+int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
+				       struct ocfs2_caching_info *ref_ci,
+				       struct buffer_head *ref_root_bh,
+				       struct ocfs2_xattr_value_root *xv,
+				       int *meta_add, int *credits)
+{
+	int ret = 0, index, ref_blocks = 0;
+	u32 p_cluster, num_clusters;
+	u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
+	struct ocfs2_refcount_block *rb;
+	struct ocfs2_refcount_rec rec;
+	struct buffer_head *ref_leaf_bh = NULL;
+
+	while (cpos < clusters) {	/* one extent of the xattr value per pass */
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, &xv->xr_list,
+					       NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		cpos += num_clusters;	/* advance now; num_clusters is consumed below */
+
+		while (num_clusters) {	/* one refcount record per pass */
+			ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
+						     p_cluster, num_clusters,
+						     &rec, &index,
+						     &ref_leaf_bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			BUG_ON(!rec.r_refcount);
+
+			rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
+
+			/*
+			 * We really don't know whether the other clusters is in
+			 * this refcount block or not, so just take the worst
+			 * case that all the clusters are in this block and each
+			 * one will split a refcount rec, so totally we need
+			 * clusters * 2 new refcount rec.
+ */ + if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 > + le16_to_cpu(rb->rf_records.rl_count)) + ref_blocks++; + + *credits += 1; + brelse(ref_leaf_bh); + ref_leaf_bh = NULL; + + if (num_clusters <= le32_to_cpu(rec.r_clusters)) + break; + else + num_clusters -= le32_to_cpu(rec.r_clusters); + p_cluster += num_clusters; + } + } + + *meta_add += ref_blocks; + if (!ref_blocks) + goto out; + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + else { + struct ocfs2_extent_tree et; + + ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); + *credits += ocfs2_calc_extend_credits(inode->i_sb, + et.et_root_el, + ref_blocks); + } + +out: + brelse(ref_leaf_bh); + return ret; +} + +/* + * Do CoW for xattr. + */ +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post) +{ + int ret; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_cow_context *context = NULL; + u32 cow_start, cow_len; + + BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, + cpos, write_len, UINT_MAX, + &cow_start, &cow_len); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(cow_len == 0); + + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); + if (!context) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + context->inode = inode; + context->cow_start = cow_start; + context->cow_len = cow_len; + context->ref_tree = ref_tree; + context->ref_root_bh = ref_root_bh;; + context->cow_object = xv; + + context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; + /* We need the extra credits for duplicate_clusters by jbd. 
*/
+	context->extra_credits =
+		ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
+	context->get_clusters = ocfs2_xattr_value_get_clusters;
+	context->post_refcount = post;
+
+	ocfs2_init_xattr_value_extent_tree(&context->data_et,
+					   INODE_CACHE(inode), vb);
+
+	ret = ocfs2_replace_cow(context);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	kfree(context);	/* context may still be NULL here */
+	return ret;
+}
+
+/*
+ * Insert a new extent into the refcount tree and mark an extent rec
+ * as refcounted in the dinode tree.
+ *
+ * Both steps run in one transaction: the num_clusters clusters at cpos
+ * (physical p_cluster) are marked refcounted in data_et and their refcount
+ * is increased in the tree rooted at ref_root_bh.  If post is given, its
+ * credits are added to the transaction and post->func, when set, is called
+ * with the handle before the commit.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int ocfs2_add_refcount_flag(struct inode *inode,
+			    struct ocfs2_extent_tree *data_et,
+			    struct ocfs2_caching_info *ref_ci,
+			    struct buffer_head *ref_root_bh,
+			    u32 cpos, u32 p_cluster, u32 num_clusters,
+			    struct ocfs2_cached_dealloc_ctxt *dealloc,
+			    struct ocfs2_post_refcount *post)
+{
+	int ret;
+	handle_t *handle;
+	int credits = 1, ref_blocks = 0;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
+					       ref_ci, ref_root_bh,
+					       p_cluster, num_clusters,
+					       &ref_blocks, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "reserve new metadata %d, credits = %d\n",
+	     ref_blocks, credits);
+
+	if (ref_blocks) {	/* reserve only when new refcount blocks are needed */
+		ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
+							ref_blocks, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (post)
+		credits += post->credits;
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
+					   cpos, num_clusters, p_cluster,
+					   meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+					p_cluster, num_clusters, 0,
+					meta_ac, dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (post && post->func) {
+		ret = post->func(inode, handle, post->para);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+/*
+ * Set the inode's ctime to CURRENT_TIME, both in core and in the dinode
+ * held by di_bh, inside a transaction of its own.
+ *
+ * Returns 0 on success or the error from starting the transaction or from
+ * getting journal access; the result of ocfs2_journal_dirty() is not
+ * checked.
+ */
+static int ocfs2_change_ctime(struct inode *inode,
+			      struct buffer_head *di_bh)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	inode->i_ctime = CURRENT_TIME;
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+/*
+ * Make sure the inode has a refcount tree (one is created if
+ * OCFS2_HAS_REFCOUNT_FL is not set yet) and walk its extents from cluster 0
+ * up to the cluster count derived from i_size, adding the refcount flag to
+ * every allocated extent that does not carry it.  Extended attributes are
+ * handled as well when the inode has any, and ctime is updated if any data
+ * extent was changed.
+ *
+ * Returns 0 on success or a negative errno.  The cached extent map is
+ * emptied in every case.
+ */
+static int ocfs2_attach_refcount_tree(struct inode *inode,
+				      struct buffer_head *di_bh)
+{
+	int ret, data_changed = 0;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_refcount_tree *ref_tree;
+	unsigned int ext_flags;
+	loff_t size;
+	u32 cpos, num_clusters, clusters, p_cluster;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree di_et;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
+		ret = ocfs2_create_refcount_tree(inode, di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	BUG_ON(!di->i_refcount_loc);	/* a tree must exist by now */
+	ret = ocfs2_lock_refcount_tree(osb,
+				       le64_to_cpu(di->i_refcount_loc), 1,
+				       &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
+
+	size = i_size_read(inode);
+	clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
+
+	cpos = 0;
+	while (cpos
< clusters) { + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &ext_flags); + + if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(inode, &di_et, + &ref_tree->rf_ci, + ref_root_bh, cpos, + p_cluster, num_clusters, + &dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto unlock; + } + + data_changed = 1; + } + cpos += num_clusters; + } + + if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { + ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, + &ref_tree->rf_ci, + ref_root_bh, + &dealloc); + if (ret) { + mlog_errno(ret); + goto unlock; + } + } + + if (data_changed) { + ret = ocfs2_change_ctime(inode, di_bh); + if (ret) + mlog_errno(ret); + } + +unlock: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } +out: + /* + * Empty the extent map so that we may get the right extent + * record from the disk. 
+	 */
+	ocfs2_extent_map_trunc(inode, 0);
+
+	return ret;
+}
+
+/*
+ * Insert the extent (cpos, p_cluster, num_clusters, ext_flags) into extent
+ * tree et and increase the refcount of those physical clusters in the
+ * refcount tree rooted at ref_root_bh, all in one transaction.  No data
+ * clusters are reserved (data_ac is passed as NULL).
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ocfs2_add_refcounted_extent(struct inode *inode,
+				   struct ocfs2_extent_tree *et,
+				   struct ocfs2_caching_info *ref_ci,
+				   struct buffer_head *ref_root_bh,
+				   u32 cpos, u32 p_cluster, u32 num_clusters,
+				   unsigned int ext_flags,
+				   struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	handle_t *handle;
+	int credits = 0;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_lock_refcount_allocators(inode->i_sb,
+					     p_cluster, num_clusters,
+					     et, ref_ci,
+					     ref_root_bh, &meta_ac,
+					     NULL, &credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * The start block is passed in CPU byte order, like cpos and
+	 * num_clusters next to it.
+	 * NOTE(review): confirm ocfs2_insert_extent() converts it to the
+	 * on-disk byte order itself.
+	 */
+	ret = ocfs2_insert_extent(handle, et, cpos,
+			ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
+			num_clusters, ext_flags, meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
+				      p_cluster, num_clusters,
+				      meta_ac, dealloc);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+/*
+ * Give t_inode the same extents as s_inode: every extent of s_inode with
+ * a non-zero physical cluster, up to the cluster count derived from its
+ * i_size, is inserted into t_inode's dinode extent tree and its refcount
+ * is increased.
+ *
+ * Returns 0 on success or the first negative errno that was hit.
+ */
+static int ocfs2_duplicate_extent_list(struct inode *s_inode,
+				struct inode *t_inode,
+				struct buffer_head *t_bh,
+				struct ocfs2_caching_info *ref_ci,
+				struct buffer_head *ref_root_bh,
+				struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret = 0;
+	u32 p_cluster, num_clusters, clusters, cpos;
+	loff_t size;
+	unsigned int ext_flags;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
+
+	size = i_size_read(s_inode);
+	clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
+					 &num_clusters, &ext_flags);
+		if (ret) {
+			/* Do not act on outputs of a failed lookup. */
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (p_cluster) {
+			ret =
ocfs2_add_refcounted_extent(t_inode, &et, + ref_ci, ref_root_bh, + cpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + cpos += num_clusters; + } + +out: + return ret; +} + +/* + * change the new file's attributes to the src. + * + * reflink creates a snapshot of a file, that means the attributes + * must be identical except for three exceptions - nlink, ino, and ctime. + */ +static int ocfs2_complete_reflink(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh, + bool preserve) +{ + int ret; + handle_t *handle; + struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data; + loff_t size = i_size_read(s_inode); + + handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&OCFS2_I(t_inode)->ip_lock); + OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters; + OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr; + OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; + spin_unlock(&OCFS2_I(t_inode)->ip_lock); + i_size_write(t_inode, size); + + di->i_xattr_inline_size = s_di->i_xattr_inline_size; + di->i_clusters = s_di->i_clusters; + di->i_size = s_di->i_size; + di->i_dyn_features = s_di->i_dyn_features; + di->i_attr = s_di->i_attr; + + if (preserve) { + di->i_uid = s_di->i_uid; + di->i_gid = s_di->i_gid; + di->i_mode = s_di->i_mode; + + /* + * update time. + * we want mtime to appear identical to the source and + * update ctime. 
+		 */
+		t_inode->i_ctime = CURRENT_TIME;
+
+		di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
+		di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
+
+		t_inode->i_mtime = s_inode->i_mtime;
+		di->i_mtime = s_di->i_mtime;
+		di->i_mtime_nsec = s_di->i_mtime_nsec;
+	}
+
+	ocfs2_journal_dirty(handle, t_bh);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
+	return ret;
+}
+
+/*
+ * Turn t_inode into a reflink of s_inode: point it at the source's refcount
+ * tree, duplicate the source's extent list into it under the refcount tree
+ * lock, then copy the inode attributes over.
+ *
+ * Returns 0 on success or a negative errno.  Deallocations queued along
+ * the way are run before returning, on success and on failure alike.
+ */
+static int ocfs2_create_reflink_node(struct inode *s_inode,
+				     struct buffer_head *s_bh,
+				     struct inode *t_inode,
+				     struct buffer_head *t_bh,
+				     bool preserve)
+{
+	int ret;
+	struct buffer_head *ref_root_bh = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
+	struct ocfs2_refcount_tree *ref_tree;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	ret = ocfs2_set_refcount_tree(t_inode, t_bh,
+				      le64_to_cpu(di->i_refcount_loc));
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
+				       1, &ref_tree, &ref_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
+					  &ref_tree->rf_ci, ref_root_bh,
+					  &dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock_refcount;
+	}
+
+	ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve);
+	if (ret)
+		mlog_errno(ret);
+
+out_unlock_refcount:
+	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
+	brelse(ref_root_bh);
+out:
+	if (ocfs2_dealloc_has_cluster(&dealloc)) {
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+		ocfs2_run_deallocs(osb, &dealloc);
+	}
+
+	return ret;
+}
+
+/*
+ * Do the actual reflink of old_dentry's inode into new_inode: start
+ * writeback of the source's dirty pages, make sure the source has a
+ * refcount tree with all its extents flagged, build the target under its
+ * own i_mutex and cluster lock, and duplicate the xattrs if the source has
+ * any.  On success the started writeback is waited for.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __ocfs2_reflink(struct dentry *old_dentry,
+			   struct buffer_head *old_bh,
+			   struct inode *new_inode,
+			   bool preserve)
+{
+	int ret;
+	struct inode *inode = old_dentry->d_inode;
+	struct buffer_head *new_bh = NULL;
+
+	ret =
filemap_fdatawrite(inode->i_mapping);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_attach_refcount_tree(inode, old_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mutex_lock(&new_inode->i_mutex);
+	ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_create_reflink_node(inode, old_bh,
+					new_inode, new_bh, preserve);
+	if (ret) {
+		mlog_errno(ret);
+		goto inode_unlock;
+	}
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
+		ret = ocfs2_reflink_xattrs(inode, old_bh,
+					   new_inode, new_bh,
+					   preserve);
+		if (ret)
+			mlog_errno(ret);
+	}
+inode_unlock:
+	ocfs2_inode_unlock(new_inode, 1);
+	brelse(new_bh);
+out_unlock:
+	mutex_unlock(&new_inode->i_mutex);
+out:
+	if (!ret) {	/* wait for the writeback only when everything else worked */
+		ret = filemap_fdatawait(inode->i_mapping);
+		if (ret)
+			mlog_errno(ret);
+	}
+	return ret;
+}
+/*
+ * Reflink old_dentry's inode to the name new_dentry in dir.  The new inode
+ * is first created in the orphan directory, filled in by __ocfs2_reflink()
+ * while the source's cluster lock, ip_xattr_sem and ip_alloc_sem are held,
+ * and moved to its final name only if every step succeeded.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the refcount feature is not enabled
+ * on the volume, or another negative errno.
+ */
+static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
+			 struct dentry *new_dentry, bool preserve)
+{
+	int error;
+	struct inode *inode = old_dentry->d_inode;
+	struct buffer_head *old_bh = NULL;
+	struct inode *new_orphan_inode = NULL;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+					     &new_orphan_inode);
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	error = ocfs2_inode_lock(inode, &old_bh, 1);
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	error = __ocfs2_reflink(old_dentry, old_bh,
+				new_orphan_inode, preserve);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);	/* released in reverse order */
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+	ocfs2_inode_unlock(inode, 1);
+	brelse(old_bh);
+
+	if (error) {
+		mlog_errno(error);
+		goto out;
+	}
+
+	/* If the security isn't preserved, we need to re-initialize them.
*/
+	if (!preserve) {
+		error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
+		if (error)
+			mlog_errno(error);
+	}
+out:
+	if (!error) {	/* only a fully built inode leaves the orphan dir */
+		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
+						       new_dentry);
+		if (error)
+			mlog_errno(error);
+	}
+
+	if (new_orphan_inode) {
+		/*
+		 * We need to open_unlock the inode no matter whether we
+		 * succeed or not, so that other nodes can delete it later.
+		 */
+		ocfs2_open_unlock(new_orphan_inode);
+		if (error)
+			iput(new_orphan_inode);
+	}
+
+	return error;
+}
+
+/*
+ * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
+ * sys_reflink().  This will go away when vfs_reflink() exists in
+ * fs/namei.c.
+ */
+
+/*
+ * Copied from may_create in VFS.
+ *
+ * Returns -EEXIST if child already has an inode, -ENOENT if dir is a dead
+ * directory, otherwise the result of the write+exec permission check on
+ * dir.
+ */
+static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
+{
+	if (child->d_inode)
+		return -EEXIST;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
+ * Copied from user_path_parent.
+ *
+ * Looks up the parent of the user-supplied path into nd.  On success
+ * *name is set to the getname() copy of the path, which the caller has to
+ * release with putname(); on failure the copy has already been released
+ * and *name is left untouched.
+ */
+static int ocfs2_user_path_parent(const char __user *path,
+				  struct nameidata *nd, char **name)
+{
+	char *s = getname(path);
+	int error;
+
+	if (IS_ERR(s))
+		return PTR_ERR(s);
+
+	error = path_lookup(s, LOOKUP_PARENT, nd);
+	if (error)
+		putname(s);
+	else
+		*name = s;
+
+	return error;
+}
+
+/**
+ * ocfs2_vfs_reflink - Create a reference-counted link
+ *
+ * @old_dentry:        source dentry + inode
+ * @dir:       directory to create the target
+ * @new_dentry:        target dentry
+ * @preserve:  if true, preserve all file attributes
+ */
+int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
+		      struct dentry *new_dentry, bool preserve)
+{
+	struct inode *inode = old_dentry->d_inode;
+	int error;
+
+	if (!inode)
+		return -ENOENT;
+
+	error = ocfs2_may_create(dir, new_dentry);
+	if (error)
+		return error;
+
+	if (dir->i_sb != inode->i_sb)	/* source and target must share a super block */
+		return -EXDEV;
+
+	/*
+	 * A reflink to an append-only or immutable file cannot be created.
+ */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + /* Only regular files can be reflinked. */ + if (!S_ISREG(inode->i_mode)) + return -EPERM; + + /* + * If the caller wants to preserve ownership, they require the + * rights to do so. + */ + if (preserve) { + if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN)) + return -EPERM; + if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN)) + return -EPERM; + } + + /* + * If the caller is modifying any aspect of the attributes, they + * are not creating a snapshot. They need read permission on the + * file. + */ + if (!preserve) { + error = inode_permission(inode, MAY_READ); + if (error) + return error; + } + + mutex_lock(&inode->i_mutex); + vfs_dq_init(dir); + error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); + mutex_unlock(&inode->i_mutex); + if (!error) + fsnotify_create(dir, new_dentry); + return error; +} +/* + * Most codes are copied from sys_linkat. + */ +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve) +{ + struct dentry *new_dentry; + struct nameidata nd; + struct path old_path; + int error; + char *to = NULL; + + if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + error = user_path_at(AT_FDCWD, oldname, 0, &old_path); + if (error) { + mlog_errno(error); + return error; + } + + error = ocfs2_user_path_parent(newname, &nd, &to); + if (error) { + mlog_errno(error); + goto out; + } + + error = -EXDEV; + if (old_path.mnt != nd.path.mnt) + goto out_release; + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) { + mlog_errno(error); + goto out_unlock; + } + + error = mnt_want_write(nd.path.mnt); + if (error) { + mlog_errno(error); + goto out_dput; + } + + error = ocfs2_vfs_reflink(old_path.dentry, + nd.path.dentry->d_inode, + new_dentry, preserve); + mnt_drop_write(nd.path.mnt); +out_dput: + dput(new_dentry); +out_unlock: + 
mutex_unlock(&nd.path.dentry->d_inode->i_mutex); +out_release: + path_put(&nd.path); + putname(to); +out: + path_put(&old_path); + + return error; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h new file mode 100644 index 00000000000..c1d19b1d3ec --- /dev/null +++ b/fs/ocfs2/refcounttree.h @@ -0,0 +1,106 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.h + * + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef OCFS2_REFCOUNTTREE_H +#define OCFS2_REFCOUNTTREE_H + +struct ocfs2_refcount_tree { + struct rb_node rf_node; + u64 rf_blkno; + u32 rf_generation; + struct rw_semaphore rf_sem; + struct ocfs2_lock_res rf_lockres; + struct kref rf_getcnt; + int rf_removed; + + /* the following 4 fields are used by caching_info. 
*/ + struct ocfs2_caching_info rf_ci; + spinlock_t rf_lock; + struct mutex rf_io_mutex; + struct super_block *rf_sb; +}; + +void ocfs2_purge_refcount_trees(struct ocfs2_super *osb); +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **tree, + struct buffer_head **ref_bh); +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, + int rw); + +int ocfs2_decrease_refcount(struct inode *inode, + handle_t *handle, u32 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc, + int delete); +int ocfs2_prepare_refcount_change_for_del(struct inode *inode, + struct buffer_head *di_bh, + u64 phys_blkno, + u32 clusters, + int *credits, + struct ocfs2_alloc_context **meta_ac); +int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, + u32 cpos, u32 write_len, u32 max_cpos); + +typedef int (ocfs2_post_refcount_func)(struct inode *inode, + handle_t *handle, + void *para); +/* + * Some refcount callers need to do more work after we modify the data b-tree + * during a refcount operation (including CoW and adding the refcount flag), + * and make the transaction complete. So they must give us this structure so + * that we can do it within our transaction. + * + */ +struct ocfs2_post_refcount { + int credits; /* credits it needs for the journal. */ + ocfs2_post_refcount_func *func; /* real function. 
*/ + void *para; +}; + +int ocfs2_refcounted_xattr_delete_need(struct inode *inode, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_xattr_value_root *xv, + int *meta_add, int *credits); +int ocfs2_refcount_cow_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_refcount_tree *ref_tree, + struct buffer_head *ref_root_bh, + u32 cpos, u32 write_len, + struct ocfs2_post_refcount *post); +int ocfs2_add_refcount_flag(struct inode *inode, + struct ocfs2_extent_tree *data_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + u32 cpos, u32 p_cluster, u32 num_clusters, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *post); +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh); +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh); +int ocfs2_increase_refcount(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, u32 len, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_ioctl(struct inode *inode, + const char __user *oldname, + const char __user *newname, + bool preserve); +#endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 424adaa5f90..3c3d673a4d2 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", new_clusters, first_new_cluster); - ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode), + group_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; @@ -141,7 +141,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, } /* update the inode accordingly. 
*/ - ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); @@ -514,7 +514,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_unlock; } - ocfs2_set_new_buffer_uptodate(inode, group_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh); ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh); if (ret) { @@ -536,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) cl = &fe->id2.i_chain; cr = &cl->cl_recs[input->chain]; - ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode), + group_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_commit; @@ -552,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_commit; } - ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), + main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_commit; diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 40661e7824e..bfbd7e9e949 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If * this is not true, the read of -1 (UINT64_MAX) will fail. 
*/ - ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, - OCFS2_BH_IGNORE_CACHE, NULL); + ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks, + si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL); if (ret == 0) { spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); @@ -213,7 +213,7 @@ static int ocfs2_update_disk_slot(struct ocfs2_super *osb, ocfs2_update_disk_slot_old(si, slot_num, &bh); spin_unlock(&osb->osb_lock); - status = ocfs2_write_block(osb, bh, si->si_inode); + status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode)); if (status < 0) mlog_errno(status); @@ -404,8 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, (unsigned long long)blkno); bh = NULL; /* Acquire a fresh bh */ - status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, - OCFS2_BH_IGNORE_CACHE, NULL); + status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno, + 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 73a16d4666d..c30b644d957 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -310,7 +310,7 @@ int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, int rc; struct buffer_head *tmp = *bh; - rc = ocfs2_read_block(inode, gd_blkno, &tmp, + rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, ocfs2_validate_group_descriptor); if (rc) goto out; @@ -352,7 +352,7 @@ static int ocfs2_block_group_fill(handle_t *handle, } status = ocfs2_journal_access_gd(handle, - alloc_inode, + INODE_CACHE(alloc_inode), bg_bh, OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { @@ -476,7 +476,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, mlog_errno(status); goto bail; } - ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); status = ocfs2_block_group_fill(handle, alloc_inode, @@ -491,7 +491,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super 
*osb, bg = (struct ocfs2_group_desc *) bg_bh->b_data; - status = ocfs2_journal_access_di(handle, alloc_inode, + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1033,7 +1033,7 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, journal_type = OCFS2_JOURNAL_ACCESS_UNDO; status = ocfs2_journal_access_gd(handle, - alloc_inode, + INODE_CACHE(alloc_inode), group_bh, journal_type); if (status < 0) { @@ -1106,7 +1106,8 @@ static int ocfs2_relink_block_group(handle_t *handle, bg_ptr = le64_to_cpu(bg->bg_next_group); prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); - status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh, + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + prev_bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -1121,8 +1122,8 @@ static int ocfs2_relink_block_group(handle_t *handle, goto out_rollback; } - status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_rollback; @@ -1136,8 +1137,8 @@ static int ocfs2_relink_block_group(handle_t *handle, goto out_rollback; } - status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_rollback; @@ -1288,7 +1289,7 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; - ret = ocfs2_journal_access_di(handle, inode, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { 
mlog_errno(ret); @@ -1461,7 +1462,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, /* Ok, claim our bits now: set the info on dinode, chainlist * and then the group */ status = ocfs2_journal_access_di(handle, - alloc_inode, + INODE_CACHE(alloc_inode), ac->ac_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { @@ -1907,8 +1908,8 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle, if (ocfs2_is_cluster_bitmap(alloc_inode)) journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh, - journal_type); + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + group_bh, journal_type); if (status < 0) { mlog_errno(status); goto bail; @@ -1993,8 +1994,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle, goto bail; } - status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -2151,7 +2152,7 @@ int ocfs2_lock_allocators(struct inode *inode, BUG_ON(clusters_to_add != 0 && data_ac == NULL); - num_free_extents = ocfs2_num_free_extents(osb, inode, et); + num_free_extents = ocfs2_num_free_extents(osb, et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a3f8871d21f..c0e48aeebb1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -28,7 +28,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> -#include <linux/utsname.h> #include <linux/init.h> #include <linux/random.h> #include <linux/statfs.h> @@ -69,6 +68,7 @@ #include "ver.h" #include "xattr.h" #include "quota.h" +#include "refcounttree.h" #include "buffer_head_io.h" @@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, } #endif /* CONFIG_DEBUG_FS */ -static struct file_operations ocfs2_osb_debug_fops = 
{ +static const struct file_operations ocfs2_osb_debug_fops = { .open = ocfs2_osb_debug_open, .release = ocfs2_debug_release, .read = ocfs2_debug_read, @@ -965,7 +965,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type, int remount) return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED); } -static struct quotactl_ops ocfs2_quotactl_ops = { +static const struct quotactl_ops ocfs2_quotactl_ops = { .quota_on = ocfs2_quota_on, .quota_off = ocfs2_quota_off, .quota_sync = vfs_quota_sync, @@ -1668,8 +1668,6 @@ static void ocfs2_inode_init_once(void *data) spin_lock_init(&oi->ip_lock); ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); - oi->ip_created_trans = 0; - oi->ip_last_trans = 0; oi->ip_dir_start_lookup = 0; init_rwsem(&oi->ip_alloc_sem); @@ -1683,7 +1681,8 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); - ocfs2_metadata_cache_init(&oi->vfs_inode); + ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), + &ocfs2_inode_caching_ops); inode_init_once(&oi->vfs_inode); } @@ -1859,6 +1858,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_sync_blockdev(sb); + ocfs2_purge_refcount_trees(osb); + /* No cluster connection means we've failed during mount, so skip * all the steps which depended on that to complete. */ if (osb->cconn) { @@ -2065,6 +2066,8 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } + osb->osb_rf_lock_tree = RB_ROOT; + osb->s_feature_compat = le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); osb->s_feature_ro_compat = @@ -2490,7 +2493,8 @@ void __ocfs2_abort(struct super_block* sb, /* Force a panic(). This stinks, but it's better than letting * things continue without having a proper hard readonly * here. 
*/ - OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + if (!ocfs2_mount_local(OCFS2_SB(sb))) + OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; ocfs2_handle_error(sb); } diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 579dd1b1110..e3421030a69 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -38,7 +38,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/pagemap.h> -#include <linux/utsname.h> #include <linux/namei.h> #define MLOG_MASK_PREFIX ML_NAMEI diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 187b99ff036..b6284f235d2 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -75,15 +75,77 @@ struct ocfs2_meta_cache_item { static struct kmem_cache *ocfs2_uptodate_cachep = NULL; -void ocfs2_metadata_cache_init(struct inode *inode) +u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) { - struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + BUG_ON(!ci || !ci->ci_ops); - oi->ip_flags |= OCFS2_INODE_CACHE_INLINE; + return ci->ci_ops->co_owner(ci); +} + +struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + return ci->ci_ops->co_get_super(ci); +} + +static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_cache_lock(ci); +} + +static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_cache_unlock(ci); +} + +void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_io_lock(ci); +} + +void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + BUG_ON(!ci || !ci->ci_ops); + + ci->ci_ops->co_io_unlock(ci); +} + + +static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci, + int clear) +{ + ci->ci_flags |= OCFS2_CACHE_FL_INLINE; ci->ci_num_cached = 0; + + if (clear) { + 
ci->ci_created_trans = 0; + ci->ci_last_trans = 0; + } +} + +void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci, + const struct ocfs2_caching_operations *ops) +{ + BUG_ON(!ops); + + ci->ci_ops = ops; + ocfs2_metadata_cache_reset(ci, 1); } +void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci) +{ + ocfs2_metadata_cache_purge(ci); + ocfs2_metadata_cache_reset(ci, 1); +} + + /* No lock taken here as 'root' is not expected to be visible to other * processes. */ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) @@ -112,19 +174,20 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) * This function is a few more lines longer than necessary due to some * accounting done here, but I think it's worth tracking down those * bugs sooner -- Mark */ -void ocfs2_metadata_cache_purge(struct inode *inode) +void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci) { - struct ocfs2_inode_info *oi = OCFS2_I(inode); unsigned int tree, to_purge, purged; - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; struct rb_root root = RB_ROOT; - spin_lock(&oi->ip_lock); - tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE); + BUG_ON(!ci || !ci->ci_ops); + + ocfs2_metadata_cache_lock(ci); + tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE); to_purge = ci->ci_num_cached; - mlog(0, "Purge %u %s items from Inode %llu\n", to_purge, - tree ? "array" : "tree", (unsigned long long)oi->ip_blkno); + mlog(0, "Purge %u %s items from Owner %llu\n", to_purge, + tree ? "array" : "tree", + (unsigned long long)ocfs2_metadata_cache_owner(ci)); /* If we're a tree, save off the root so that we can safely * initialize the cache. 
We do the work to free tree members @@ -132,16 +195,17 @@ void ocfs2_metadata_cache_purge(struct inode *inode) if (tree) root = ci->ci_cache.ci_tree; - ocfs2_metadata_cache_init(inode); - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_reset(ci, 0); + ocfs2_metadata_cache_unlock(ci); purged = ocfs2_purge_copied_metadata_tree(&root); /* If possible, track the number wiped so that we can more * easily detect counting errors. Unfortunately, this is only * meaningful for trees. */ if (tree && purged != to_purge) - mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n", - (unsigned long long)oi->ip_blkno, to_purge, purged); + mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + to_purge, purged); } /* Returns the index in the cache array, -1 if not found. @@ -182,27 +246,25 @@ ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, return NULL; } -static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, +static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci, struct buffer_head *bh) { int index = -1; struct ocfs2_meta_cache_item *item = NULL; - spin_lock(&oi->ip_lock); + ocfs2_metadata_cache_lock(ci); - mlog(0, "Inode %llu, query block %llu (inline = %u)\n", - (unsigned long long)oi->ip_blkno, + mlog(0, "Owner %llu, query block %llu (inline = %u)\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long) bh->b_blocknr, - !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); + !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE)); - if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) - index = ocfs2_search_cache_array(&oi->ip_metadata_cache, - bh->b_blocknr); + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) + index = ocfs2_search_cache_array(ci, bh->b_blocknr); else - item = ocfs2_search_cache_tree(&oi->ip_metadata_cache, - bh->b_blocknr); + item = ocfs2_search_cache_tree(ci, bh->b_blocknr); - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); mlog(0, "index = %d, item = %p\n", index, item); @@ -214,7 
+276,7 @@ static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, * * This can be called under lock_buffer() */ -int ocfs2_buffer_uptodate(struct inode *inode, +int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh) { /* Doesn't matter if the bh is in our cache or not -- if it's @@ -230,24 +292,24 @@ int ocfs2_buffer_uptodate(struct inode *inode, /* Ok, locally the buffer is marked as up to date, now search * our cache to see if we can trust that. */ - return ocfs2_buffer_cached(OCFS2_I(inode), bh); + return ocfs2_buffer_cached(ci, bh); } -/* +/* * Determine whether a buffer is currently out on a read-ahead request. - * ip_io_sem should be held to serialize submitters with the logic here. + * ci_io_sem should be held to serialize submitters with the logic here. */ -int ocfs2_buffer_read_ahead(struct inode *inode, +int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci, struct buffer_head *bh) { - return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh); + return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh); } /* Requires ip_lock */ static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, sector_t block) { - BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY); + BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY); mlog(0, "block %llu takes position %u\n", (unsigned long long) block, ci->ci_num_cached); @@ -292,66 +354,64 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, ci->ci_num_cached++; } -static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi, - struct ocfs2_caching_info *ci) +/* co_cache_lock() must be held */ +static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci) { - assert_spin_locked(&oi->ip_lock); - - return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) && - (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY); + return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) && + (ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY); } -/* tree should be 
exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the +/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the * pointers in tree after we use them - this allows caller to detect - * when to free in case of error. */ -static void ocfs2_expand_cache(struct ocfs2_inode_info *oi, + * when to free in case of error. + * + * The co_cache_lock() must be held. */ +static void ocfs2_expand_cache(struct ocfs2_caching_info *ci, struct ocfs2_meta_cache_item **tree) { int i; - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; - mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY, - "Inode %llu, num cached = %u, should be %u\n", - (unsigned long long)oi->ip_blkno, ci->ci_num_cached, - OCFS2_INODE_MAX_CACHE_ARRAY); - mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), - "Inode %llu not marked as inline anymore!\n", - (unsigned long long)oi->ip_blkno); - assert_spin_locked(&oi->ip_lock); + mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY, + "Owner %llu, num cached = %u, should be %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY); + mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE), + "Owner %llu not marked as inline anymore!\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci)); /* Be careful to initialize the tree members *first* because * once the ci_tree is used, the array is junk... 
*/ - for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) tree[i]->c_block = ci->ci_cache.ci_array[i]; - oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE; + ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE; ci->ci_cache.ci_tree = RB_ROOT; /* this will be set again by __ocfs2_insert_cache_tree */ ci->ci_num_cached = 0; - for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) { __ocfs2_insert_cache_tree(ci, tree[i]); tree[i] = NULL; } mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n", - (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached); + (unsigned long long)ocfs2_metadata_cache_owner(ci), + ci->ci_flags, ci->ci_num_cached); } /* Slow path function - memory allocation is necessary. See the * comment above ocfs2_set_buffer_uptodate for more information. */ -static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, +static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, sector_t block, int expand_tree) { int i; - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; struct ocfs2_meta_cache_item *new = NULL; - struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] = + struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] = { NULL, }; - mlog(0, "Inode %llu, block %llu, expand = %d\n", - (unsigned long long)oi->ip_blkno, + mlog(0, "Owner %llu, block %llu, expand = %d\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)block, expand_tree); new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); @@ -364,7 +424,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, if (expand_tree) { /* Do *not* allocate an array here - the removal code * has no way of tracking that. 
*/ - for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) { tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); if (!tree[i]) { @@ -376,21 +436,21 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, } } - spin_lock(&oi->ip_lock); - if (ocfs2_insert_can_use_array(oi, ci)) { + ocfs2_metadata_cache_lock(ci); + if (ocfs2_insert_can_use_array(ci)) { mlog(0, "Someone cleared the tree underneath us\n"); /* Ok, items were removed from the cache in between * locks. Detect this and revert back to the fast path */ ocfs2_append_cache_array(ci, block); - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); goto out_free; } if (expand_tree) - ocfs2_expand_cache(oi, tree); + ocfs2_expand_cache(ci, tree); __ocfs2_insert_cache_tree(ci, new); - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); new = NULL; out_free: @@ -400,14 +460,14 @@ out_free: /* If these were used, then ocfs2_expand_cache re-set them to * NULL for us. */ if (tree[0]) { - for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) + for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) if (tree[i]) kmem_cache_free(ocfs2_uptodate_cachep, tree[i]); } } -/* Item insertion is guarded by ip_io_mutex, so the insertion path takes +/* Item insertion is guarded by co_io_lock(), so the insertion path takes * advantage of this by not rechecking for a duplicate insert during * the slow case. Additionally, if the cache needs to be bumped up to * a tree, the code will not recheck after acquiring the lock -- @@ -425,59 +485,55 @@ out_free: * Readahead buffers can be passed in here before the I/O request is * completed. 
*/ -void ocfs2_set_buffer_uptodate(struct inode *inode, +void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh) { int expand; - struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; /* The block may very well exist in our cache already, so avoid * doing any more work in that case. */ - if (ocfs2_buffer_cached(oi, bh)) + if (ocfs2_buffer_cached(ci, bh)) return; - mlog(0, "Inode %llu, inserting block %llu\n", - (unsigned long long)oi->ip_blkno, + mlog(0, "Owner %llu, inserting block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr); /* No need to recheck under spinlock - insertion is guarded by - * ip_io_mutex */ - spin_lock(&oi->ip_lock); - if (ocfs2_insert_can_use_array(oi, ci)) { + * co_io_lock() */ + ocfs2_metadata_cache_lock(ci); + if (ocfs2_insert_can_use_array(ci)) { /* Fast case - it's an array and there's a free * spot. */ ocfs2_append_cache_array(ci, bh->b_blocknr); - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); return; } expand = 0; - if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { /* We need to bump things up to a tree. */ expand = 1; } - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); - __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand); + __ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand); } /* Called against a newly allocated buffer. Most likely nobody should * be able to read this sort of metadata while it's still being - * allocated, but this is careful to take ip_io_mutex anyway. */ -void ocfs2_set_new_buffer_uptodate(struct inode *inode, + * allocated, but this is careful to take co_io_lock() anyway. 
*/ +void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh) { - struct ocfs2_inode_info *oi = OCFS2_I(inode); - /* This should definitely *not* exist in our cache */ - BUG_ON(ocfs2_buffer_cached(oi, bh)); + BUG_ON(ocfs2_buffer_cached(ci, bh)); set_buffer_uptodate(bh); - mutex_lock(&oi->ip_io_mutex); - ocfs2_set_buffer_uptodate(inode, bh); - mutex_unlock(&oi->ip_io_mutex); + ocfs2_metadata_cache_io_lock(ci); + ocfs2_set_buffer_uptodate(ci, bh); + ocfs2_metadata_cache_io_unlock(ci); } /* Requires ip_lock. */ @@ -487,7 +543,7 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, sector_t *array = ci->ci_cache.ci_array; int bytes; - BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY); + BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY); BUG_ON(index >= ci->ci_num_cached); BUG_ON(!ci->ci_num_cached); @@ -515,21 +571,19 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, ci->ci_num_cached--; } -static void ocfs2_remove_block_from_cache(struct inode *inode, +static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci, sector_t block) { int index; struct ocfs2_meta_cache_item *item = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; - spin_lock(&oi->ip_lock); - mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n", - (unsigned long long)oi->ip_blkno, + ocfs2_metadata_cache_lock(ci); + mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long) block, ci->ci_num_cached, - oi->ip_flags & OCFS2_INODE_CACHE_INLINE); + ci->ci_flags & OCFS2_CACHE_FL_INLINE); - if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { + if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { index = ocfs2_search_cache_array(ci, block); if (index != -1) ocfs2_remove_metadata_array(ci, index); @@ -538,7 +592,7 @@ static void ocfs2_remove_block_from_cache(struct inode *inode, if 
(item) ocfs2_remove_metadata_tree(ci, item); } - spin_unlock(&oi->ip_lock); + ocfs2_metadata_cache_unlock(ci); if (item) kmem_cache_free(ocfs2_uptodate_cachep, item); @@ -549,23 +603,24 @@ static void ocfs2_remove_block_from_cache(struct inode *inode, * bother reverting things to an inlined array in the case of a remove * which moves us back under the limit. */ -void ocfs2_remove_from_cache(struct inode *inode, +void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci, struct buffer_head *bh) { sector_t block = bh->b_blocknr; - ocfs2_remove_block_from_cache(inode, block); + ocfs2_remove_block_from_cache(ci, block); } /* Called when we remove xattr clusters from an inode. */ -void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, +void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci, sector_t block, u32 c_len) { - unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len; for (i = 0; i < b_len; i++, block++) - ocfs2_remove_block_from_cache(inode, block); + ocfs2_remove_block_from_cache(ci, block); } int __init init_ocfs2_uptodate_cache(void) @@ -577,7 +632,7 @@ int __init init_ocfs2_uptodate_cache(void) return -ENOMEM; mlog(0, "%u inlined cache items per inode.\n", - OCFS2_INODE_MAX_CACHE_ARRAY); + OCFS2_CACHE_INFO_MAX_ARRAY); return 0; } diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index 531b4b3a0c4..0d826fe2da0 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -26,24 +26,59 @@ #ifndef OCFS2_UPTODATE_H #define OCFS2_UPTODATE_H +/* + * The caching code relies on locking provided by the user of + * struct ocfs2_caching_info. These operations connect that up. + */ +struct ocfs2_caching_operations { + /* + * A u64 representing the owning structure. Usually this + * is the block number (i_blkno or whatnot). 
This is used so + * that caching log messages can identify the owning structure. + */ + u64 (*co_owner)(struct ocfs2_caching_info *ci); + + /* The superblock is needed during I/O. */ + struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci); + /* + * Lock and unlock the caching data. These will not sleep, and + * should probably be spinlocks. + */ + void (*co_cache_lock)(struct ocfs2_caching_info *ci); + void (*co_cache_unlock)(struct ocfs2_caching_info *ci); + + /* + * Lock and unlock for disk I/O. These will sleep, and should + * be mutexes. + */ + void (*co_io_lock)(struct ocfs2_caching_info *ci); + void (*co_io_unlock)(struct ocfs2_caching_info *ci); +}; + int __init init_ocfs2_uptodate_cache(void); void exit_ocfs2_uptodate_cache(void); -void ocfs2_metadata_cache_init(struct inode *inode); -void ocfs2_metadata_cache_purge(struct inode *inode); +void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci, + const struct ocfs2_caching_operations *ops); +void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci); + +u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci); +void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci); -int ocfs2_buffer_uptodate(struct inode *inode, +int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh); -void ocfs2_set_buffer_uptodate(struct inode *inode, +void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh); -void ocfs2_set_new_buffer_uptodate(struct inode *inode, +void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci, struct buffer_head *bh); -void ocfs2_remove_from_cache(struct inode *inode, +void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci, struct buffer_head *bh); -void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, +void ocfs2_remove_xattr_clusters_from_cache(struct 
ocfs2_caching_info *ci, sector_t block, u32 c_len); -int ocfs2_buffer_read_ahead(struct inode *inode, +int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci, struct buffer_head *bh); #endif /* OCFS2_UPTODATE_H */ diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d1a27cda984..fe3419068df 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -55,7 +55,8 @@ #include "buffer_head_io.h" #include "super.h" #include "xattr.h" - +#include "refcounttree.h" +#include "acl.h" struct ocfs2_xattr_def_value_root { struct ocfs2_xattr_value_root xv; @@ -140,7 +141,7 @@ struct ocfs2_xattr_search { int not_found; }; -static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, struct ocfs2_xattr_header *xh, int index, int *block_off, @@ -157,7 +158,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode, struct ocfs2_xattr_search *xs); static int ocfs2_xattr_tree_list_index_block(struct inode *inode, - struct ocfs2_xattr_tree_root *xt, + struct buffer_head *blk_bh, char *buffer, size_t buffer_size); @@ -170,12 +171,42 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode, struct ocfs2_xattr_search *xs, struct ocfs2_xattr_set_ctxt *ctxt); -static int ocfs2_delete_xattr_index_block(struct inode *inode, - struct buffer_head *xb_bh); +typedef int (xattr_tree_rec_func)(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para); +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *root_bh, + xattr_tree_rec_func *rec_func, + void *para); +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len, + void *para); + static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, u64 src_blk, u64 last_blk, u64 to_blk, unsigned int 
start_bucket, u32 *first_hash); +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_need, + int *credits); +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh); +static int ocfs2_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags); static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) { @@ -254,9 +285,9 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, break; } - if (!ocfs2_buffer_uptodate(bucket->bu_inode, + if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), bucket->bu_bhs[i])) - ocfs2_set_new_buffer_uptodate(bucket->bu_inode, + ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), bucket->bu_bhs[i]); } @@ -271,7 +302,7 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, { int rc; - rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno, + rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno, bucket->bu_blocks, bucket->bu_bhs, 0, NULL); if (!rc) { @@ -297,7 +328,8 @@ static int ocfs2_xattr_bucket_journal_access(handle_t *handle, int i, rc = 0; for (i = 0; i < bucket->bu_blocks; i++) { - rc = ocfs2_journal_access(handle, bucket->bu_inode, + rc = ocfs2_journal_access(handle, + INODE_CACHE(bucket->bu_inode), bucket->bu_bhs[i], type); if (rc) { mlog_errno(rc); @@ -399,7 +431,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, int rc; struct buffer_head *tmp = *bh; - rc = ocfs2_read_block(inode, xb_blkno, &tmp, + rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp, ocfs2_validate_xattr_block); /* If ocfs2_read_block() got us a new bh, pass it up. 
*/ @@ -596,15 +628,14 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, int status = 0; handle_t *handle = ctxt->handle; enum ocfs2_alloc_restarted why; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); struct ocfs2_extent_tree et; mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); - ocfs2_init_xattr_value_extent_tree(&et, inode, vb); + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); - status = vb->vb_access(handle, inode, vb->vb_bh, + status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -612,13 +643,11 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, } prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); - status = ocfs2_add_clusters_in_btree(osb, - inode, + status = ocfs2_add_clusters_in_btree(handle, + &et, &logical_start, clusters_to_add, 0, - &et, - handle, ctxt->data_ac, ctxt->meta_ac, &why); @@ -649,6 +678,7 @@ leave: static int __ocfs2_remove_xattr_range(struct inode *inode, struct ocfs2_xattr_value_buf *vb, u32 cpos, u32 phys_cpos, u32 len, + unsigned int ext_flags, struct ocfs2_xattr_set_ctxt *ctxt) { int ret; @@ -656,16 +686,16 @@ static int __ocfs2_remove_xattr_range(struct inode *inode, handle_t *handle = ctxt->handle; struct ocfs2_extent_tree et; - ocfs2_init_xattr_value_extent_tree(&et, inode, vb); + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); - ret = vb->vb_access(handle, inode, vb->vb_bh, + ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac, + ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac, &ctxt->dealloc); if (ret) { mlog_errno(ret); @@ -680,7 +710,14 @@ static int __ocfs2_remove_xattr_range(struct inode *inode, goto out; } - ret = 
ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len); + if (ext_flags & OCFS2_EXT_REFCOUNTED) + ret = ocfs2_decrease_refcount(inode, handle, + ocfs2_blocks_to_clusters(inode->i_sb, + phys_blkno), + len, ctxt->meta_ac, &ctxt->dealloc, 1); + else + ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, + phys_blkno, len); if (ret) mlog_errno(ret); @@ -695,6 +732,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, struct ocfs2_xattr_set_ctxt *ctxt) { int ret = 0; + unsigned int ext_flags; u32 trunc_len, cpos, phys_cpos, alloc_size; u64 block; @@ -706,7 +744,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, while (trunc_len) { ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, &alloc_size, - &vb->vb_xv->xr_list); + &vb->vb_xv->xr_list, &ext_flags); if (ret) { mlog_errno(ret); goto out; @@ -717,15 +755,15 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, ret = __ocfs2_remove_xattr_range(inode, vb, cpos, phys_cpos, alloc_size, - ctxt); + ext_flags, ctxt); if (ret) { mlog_errno(ret); goto out; } block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - ocfs2_remove_xattr_clusters_from_cache(inode, block, - alloc_size); + ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), + block, alloc_size); cpos += alloc_size; trunc_len -= alloc_size; } @@ -810,6 +848,23 @@ static int ocfs2_xattr_list_entries(struct inode *inode, return result; } +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_xattr_header *xh; + int i; + + xh = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) + if (!ocfs2_xattr_is_local(&xh->xh_entries[i])) + return 1; + + return 0; +} + static int ocfs2_xattr_ibody_list(struct inode *inode, struct ocfs2_dinode *di, char *buffer, @@ -855,11 +910,9 @@ static int ocfs2_xattr_block_list(struct inode *inode, struct ocfs2_xattr_header *header = 
&xb->xb_attrs.xb_header; ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size); - } else { - struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; - ret = ocfs2_xattr_tree_list_index_block(inode, xt, + } else + ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh, buffer, buffer_size); - } brelse(blk_bh); @@ -961,7 +1014,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode, cpos = 0; while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, - &num_clusters, el); + &num_clusters, el, NULL); if (ret) { mlog_errno(ret); goto out; @@ -970,7 +1023,8 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode, blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); /* Copy ocfs2_xattr_value */ for (i = 0; i < num_clusters * bpc; i++, blkno++) { - ret = ocfs2_read_block(inode, blkno, &bh, NULL); + ret = ocfs2_read_block(INODE_CACHE(inode), blkno, + &bh, NULL); if (ret) { mlog_errno(ret); goto out; @@ -1085,7 +1139,7 @@ static int ocfs2_xattr_block_get(struct inode *inode, i = xs->here - xs->header->xh_entries; if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, bucket_xh(xs->bucket), i, &block_off, @@ -1183,7 +1237,7 @@ static int ocfs2_xattr_get(struct inode *inode, static int __ocfs2_xattr_set_value_outside(struct inode *inode, handle_t *handle, - struct ocfs2_xattr_value_root *xv, + struct ocfs2_xattr_value_buf *vb, const void *value, int value_len) { @@ -1194,28 +1248,34 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); u64 blkno; struct buffer_head *bh = NULL; + unsigned int ext_flags; + struct ocfs2_xattr_value_root *xv = vb->vb_xv; BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, - &num_clusters, &xv->xr_list); + &num_clusters, 
&xv->xr_list, + &ext_flags); if (ret) { mlog_errno(ret); goto out; } + BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED); + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); for (i = 0; i < num_clusters * bpc; i++, blkno++) { - ret = ocfs2_read_block(inode, blkno, &bh, NULL); + ret = ocfs2_read_block(INODE_CACHE(inode), blkno, + &bh, NULL); if (ret) { mlog_errno(ret); goto out; } ret = ocfs2_journal_access(handle, - inode, + INODE_CACHE(inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { @@ -1266,7 +1326,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode, void *val = xs->base + offs; size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; - ret = vb->vb_access(handle, inode, vb->vb_bh, + ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1294,7 +1354,7 @@ static int ocfs2_xattr_update_entry(struct inode *inode, { int ret; - ret = vb->vb_access(handle, inode, vb->vb_bh, + ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1355,7 +1415,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, mlog_errno(ret); return ret; } - ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv, + ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb, xi->value, xi->value_len); if (ret < 0) mlog_errno(ret); @@ -1594,7 +1654,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, ret = __ocfs2_xattr_set_value_outside(inode, handle, - vb.vb_xv, + &vb, xi->value, xi->value_len); if (ret < 0) @@ -1615,7 +1675,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, } } - ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1623,7 +1683,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, } if (!(flag & OCFS2_INLINE_XATTR_FL)) { - ret = 
vb.vb_access(handle, inode, vb.vb_bh, + ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1700,51 +1760,112 @@ out: return ret; } +/* + * In xattr remove, if it is stored outside and refcounted, we may have + * the chance to split the refcount tree. So need the allocators. + */ +static int ocfs2_lock_xattr_remove_allocators(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_alloc_context **meta_ac, + int *ref_credits) +{ + int ret, meta_add = 0; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + + *ref_credits = 0; + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci, + ref_root_bh, xv, + &meta_add, ref_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), + meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} + static int ocfs2_remove_value_outside(struct inode*inode, struct ocfs2_xattr_value_buf *vb, - struct ocfs2_xattr_header *header) + struct ocfs2_xattr_header *header, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) { - int ret = 0, i; + int ret = 0, i, ref_credits; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + void *val; ocfs2_init_dealloc_ctxt(&ctxt.dealloc); - ctxt.handle = ocfs2_start_trans(osb, - ocfs2_remove_extent_credits(osb->sb)); - if (IS_ERR(ctxt.handle)) { - ret = PTR_ERR(ctxt.handle); - mlog_errno(ret); - goto out; - } - for (i = 0; i < le16_to_cpu(header->xh_count); i++) { struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; - if (!ocfs2_xattr_is_local(entry)) { - void *val; + if 
(ocfs2_xattr_is_local(entry)) + continue; - val = (void *)header + - le16_to_cpu(entry->xe_name_offset); - vb->vb_xv = (struct ocfs2_xattr_value_root *) - (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); - ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); - if (ret < 0) { - mlog_errno(ret); - break; - } + val = (void *)header + + le16_to_cpu(entry->xe_name_offset); + vb->vb_xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); + + ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv, + ref_ci, ref_root_bh, + &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, ref_credits + + ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); + if (ret < 0) { + mlog_errno(ret); + break; + } + + ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; } } - ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &ctxt.dealloc); -out: return ret; } static int ocfs2_xattr_ibody_remove(struct inode *inode, - struct buffer_head *di_bh) + struct buffer_head *di_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) { struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; @@ -1759,13 +1880,21 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode, ((void *)di + inode->i_sb->s_blocksize - le16_to_cpu(di->i_xattr_inline_size)); - ret = ocfs2_remove_value_outside(inode, &vb, header); + ret = ocfs2_remove_value_outside(inode, &vb, header, + ref_ci, ref_root_bh); return ret; } +struct ocfs2_rm_xattr_bucket_para { + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; +}; + static int ocfs2_xattr_block_remove(struct inode *inode, - struct buffer_head *blk_bh) + struct 
buffer_head *blk_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) { struct ocfs2_xattr_block *xb; int ret = 0; @@ -1773,19 +1902,29 @@ static int ocfs2_xattr_block_remove(struct inode *inode, .vb_bh = blk_bh, .vb_access = ocfs2_journal_access_xb, }; + struct ocfs2_rm_xattr_bucket_para args = { + .ref_ci = ref_ci, + .ref_root_bh = ref_root_bh, + }; xb = (struct ocfs2_xattr_block *)blk_bh->b_data; if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); - ret = ocfs2_remove_value_outside(inode, &vb, header); + ret = ocfs2_remove_value_outside(inode, &vb, header, + ref_ci, ref_root_bh); } else - ret = ocfs2_delete_xattr_index_block(inode, blk_bh); + ret = ocfs2_iterate_xattr_index_block(inode, + blk_bh, + ocfs2_rm_xattr_cluster, + &args); return ret; } static int ocfs2_xattr_free_block(struct inode *inode, - u64 block) + u64 block, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh) { struct inode *xb_alloc_inode; struct buffer_head *xb_alloc_bh = NULL; @@ -1803,7 +1942,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, goto out; } - ret = ocfs2_xattr_block_remove(inode, blk_bh); + ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1863,6 +2002,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) { struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_refcount_tree *ref_tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_caching_info *ref_ci = NULL; handle_t *handle; int ret; @@ -1872,8 +2014,21 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return 0; + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_refcount_loc), 
+ 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + ref_ci = &ref_tree->rf_ci; + + } + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { - ret = ocfs2_xattr_ibody_remove(inode, di_bh); + ret = ocfs2_xattr_ibody_remove(inode, di_bh, + ref_ci, ref_root_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1882,7 +2037,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) if (di->i_xattr_loc) { ret = ocfs2_xattr_free_block(inode, - le64_to_cpu(di->i_xattr_loc)); + le64_to_cpu(di->i_xattr_loc), + ref_ci, ref_root_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1896,7 +2052,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - ret = ocfs2_journal_access_di(handle, inode, di_bh, + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -1916,6 +2072,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: + if (ref_tree) + ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1); + brelse(ref_root_bh); return ret; } @@ -2083,6 +2242,84 @@ cleanup: return ret; } +static int ocfs2_create_xattr_block(handle_t *handle, + struct inode *inode, + struct buffer_head *inode_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_bh, + int indexed) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *new_bh = NULL; + struct ocfs2_xattr_block *xblk; + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret < 0) { + mlog_errno(ret); + 
goto end; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh); + + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), + new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + + /* Initialize ocfs2_xattr_block */ + xblk = (struct ocfs2_xattr_block *)new_bh->b_data; + memset(xblk, 0, inode->i_sb->s_blocksize); + strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); + xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); + xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); + xblk->xb_blkno = cpu_to_le64(first_blkno); + + if (indexed) { + struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16( + ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED); + } + + ret = ocfs2_journal_dirty(handle, new_bh); + if (ret < 0) { + mlog_errno(ret); + goto end; + } + di->i_xattr_loc = cpu_to_le64(first_blkno); + ocfs2_journal_dirty(handle, inode_bh); + + *ret_bh = new_bh; + new_bh = NULL; + +end: + brelse(new_bh); + return ret; +} + /* * ocfs2_xattr_block_set() * @@ -2095,63 +2332,24 @@ static int ocfs2_xattr_block_set(struct inode *inode, struct ocfs2_xattr_set_ctxt *ctxt) { struct buffer_head *new_bh = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; handle_t *handle = ctxt->handle; struct ocfs2_xattr_block *xblk = NULL; - u16 suballoc_bit_start; - u32 num_got; - u64 first_blkno; int ret; if (!xs->xattr_bh) { - ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret < 0) { - mlog_errno(ret); - goto end; - } - - ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1, - 
&suballoc_bit_start, &num_got, - &first_blkno); - if (ret < 0) { - mlog_errno(ret); - goto end; - } - - new_bh = sb_getblk(inode->i_sb, first_blkno); - ocfs2_set_new_buffer_uptodate(inode, new_bh); - - ret = ocfs2_journal_access_xb(handle, inode, new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret < 0) { + ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh, + ctxt->meta_ac, &new_bh, 0); + if (ret) { mlog_errno(ret); goto end; } - /* Initialize ocfs2_xattr_block */ xs->xattr_bh = new_bh; - xblk = (struct ocfs2_xattr_block *)new_bh->b_data; - memset(xblk, 0, inode->i_sb->s_blocksize); - strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); - xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); - xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); - xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); - xblk->xb_blkno = cpu_to_le64(first_blkno); - + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; xs->header = &xblk->xb_attrs.xb_header; xs->base = (void *)xs->header; xs->end = (void *)xblk + inode->i_sb->s_blocksize; xs->here = xs->header->xh_entries; - - ret = ocfs2_journal_dirty(handle, new_bh); - if (ret < 0) { - mlog_errno(ret); - goto end; - } - di->i_xattr_loc = cpu_to_le64(first_blkno); - ocfs2_journal_dirty(handle, xs->inode_bh); } else xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; @@ -2273,7 +2471,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode, old_in_xb = 1; if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, bucket_xh(xbs->bucket), i, &block_off, &name_offset); @@ -2428,6 +2626,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode, struct ocfs2_xattr_search *xis, struct ocfs2_xattr_search *xbs, struct ocfs2_xattr_set_ctxt *ctxt, + int extra_meta, int *credits) { int clusters_add, meta_add, ret; @@ -2444,6 +2643,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode, return ret; } + meta_add 
+= extra_meta; mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " "credits = %d\n", xi->name, meta_add, clusters_add, *credits); @@ -2598,7 +2798,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, if (!ret) { /* Update inode ctime. */ - ret = ocfs2_journal_access_di(ctxt->handle, inode, + ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode), xis->inode_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -2711,10 +2911,11 @@ int ocfs2_xattr_set(struct inode *inode, { struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - int ret, credits; + int ret, credits, ref_meta = 0, ref_credits = 0; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + struct ocfs2_refcount_tree *ref_tree = NULL; struct ocfs2_xattr_info xi = { .name_index = name_index, @@ -2779,6 +2980,17 @@ int ocfs2_xattr_set(struct inode *inode, goto cleanup; } + /* Check whether the value is refcounted and do some prepartion. */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + (!xis.not_found || !xbs.not_found)) { + ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, + &xis, &xbs, &ref_tree, + &ref_meta, &ref_credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } mutex_lock(&tl_inode->i_mutex); @@ -2793,7 +3005,7 @@ int ocfs2_xattr_set(struct inode *inode, mutex_unlock(&tl_inode->i_mutex); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, - &xbs, &ctxt, &credits); + &xbs, &ctxt, ref_meta, &credits); if (ret) { mlog_errno(ret); goto cleanup; @@ -2801,7 +3013,7 @@ int ocfs2_xattr_set(struct inode *inode, /* we need to update inode's ctime field, so add credit for it. 
*/ credits += OCFS2_INODE_UPDATE_CREDITS; - ctxt.handle = ocfs2_start_trans(osb, credits); + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); if (IS_ERR(ctxt.handle)) { ret = PTR_ERR(ctxt.handle); mlog_errno(ret); @@ -2819,8 +3031,16 @@ int ocfs2_xattr_set(struct inode *inode, if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &ctxt.dealloc); + cleanup: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); up_write(&OCFS2_I(inode)->ip_xattr_sem); + if (!value && !ret) { + ret = ocfs2_try_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); + } ocfs2_inode_unlock(inode, 1); cleanup_nolock: brelse(di_bh); @@ -2849,7 +3069,8 @@ static int ocfs2_xattr_get_rec(struct inode *inode, u64 e_blkno = 0; if (el->l_tree_depth) { - ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh); + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash, + &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -2931,7 +3152,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode, if (cmp) continue; - ret = ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh, i, &block_off, @@ -3175,7 +3396,7 @@ struct ocfs2_xattr_tree_list { size_t result; }; -static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, +static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb, struct ocfs2_xattr_header *xh, int index, int *block_off, @@ -3188,8 +3409,8 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); - *block_off = name_offset >> inode->i_sb->s_blocksize_bits; - *new_offset = name_offset % inode->i_sb->s_blocksize; + *block_off = name_offset >> sb->s_blocksize_bits; + *new_offset = name_offset % sb->s_blocksize; return 0; } @@ -3209,7 +3430,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, prefix = ocfs2_xattr_prefix(type); if (prefix) { - ret = 
ocfs2_xattr_bucket_get_name_value(inode, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, bucket_xh(bucket), i, &block_off, @@ -3232,22 +3453,19 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, return ret; } -static int ocfs2_xattr_tree_list_index_block(struct inode *inode, - struct ocfs2_xattr_tree_root *xt, - char *buffer, - size_t buffer_size) +static int ocfs2_iterate_xattr_index_block(struct inode *inode, + struct buffer_head *blk_bh, + xattr_tree_rec_func *rec_func, + void *para) { - struct ocfs2_extent_list *el = &xt->xt_list; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; int ret = 0; u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; u64 p_blkno = 0; - struct ocfs2_xattr_tree_list xl = { - .buffer = buffer, - .buffer_size = buffer_size, - .result = 0, - }; - if (le16_to_cpu(el->l_next_free_rec) == 0) + if (!el->l_next_free_rec || !rec_func) return 0; while (name_hash > 0) { @@ -3255,16 +3473,15 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode, &e_cpos, &num_clusters, el); if (ret) { mlog_errno(ret); - goto out; + break; } - ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, - ocfs2_list_xattr_bucket, - &xl); + ret = rec_func(inode, blk_bh, p_blkno, e_cpos, + num_clusters, para); if (ret) { if (ret != -ERANGE) mlog_errno(ret); - goto out; + break; } if (e_cpos == 0) @@ -3273,6 +3490,37 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode, name_hash = e_cpos - 1; } + return ret; + +} + +static int ocfs2_list_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_list_xattr_bucket, para); +} + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct buffer_head *blk_bh, + char *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_xattr_tree_list xl = { 
+ .buffer = buffer, + .buffer_size = buffer_size, + .result = 0, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_list_xattr_tree_rec, &xl); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = xl.result; out: return ret; @@ -3426,7 +3674,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, */ down_write(&oi->ip_alloc_sem); - ret = ocfs2_journal_access_xb(handle, inode, xb_bh, + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -4263,9 +4511,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, prev_cpos, (unsigned long long)bucket_blkno(first)); - ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); - ret = ocfs2_journal_access_xb(handle, inode, root_bh, + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); @@ -4319,7 +4567,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", num_bits, (unsigned long long)block, v_start); - ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, + ret = ocfs2_insert_extent(handle, &et, v_start, block, num_bits, 0, ctxt->meta_ac); if (ret < 0) { mlog_errno(ret); @@ -4798,10 +5046,13 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, struct ocfs2_xattr_entry *xe = xs->here; struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); void *base; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); - ret = ocfs2_xattr_bucket_get_name_value(inode, xh, + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh, xe - xh->xh_entries, &block_off, &offset); @@ -4814,8 +5065,10 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, 
xv = (struct ocfs2_xattr_value_root *)(base + offset + OCFS2_XATTR_SIZE(xe->xe_name_len)); + vb.vb_xv = xv; + vb.vb_bh = xs->bucket->bu_bhs[block_off]; ret = __ocfs2_xattr_set_value_outside(inode, handle, - xv, val, value_len); + &vb, val, value_len); if (ret) mlog_errno(ret); out: @@ -4826,7 +5079,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, struct buffer_head *root_bh, u64 blkno, u32 cpos, - u32 len) + u32 len, + void *para) { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -4838,14 +5092,22 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_extent_tree et; - ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + ret = ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_delete_xattr_in_bucket, para); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); ocfs2_init_dealloc_ctxt(&dealloc); mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", cpos, len, (unsigned long long)blkno); - ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len); + ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno, + len); ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); if (ret) { @@ -4870,14 +5132,14 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, goto out; } - ret = ocfs2_journal_access_xb(handle, inode, root_bh, + ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; } - ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, + ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac, &dealloc); if (ret) { mlog_errno(ret); @@ -5220,7 +5482,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, struct ocfs2_xattr_bucket *bucket, void *para) { - int ret = 0; + int ret = 0, ref_credits; struct ocfs2_xattr_header *xh = bucket_xh(bucket); u16 i; struct ocfs2_xattr_entry 
*xe; @@ -5228,7 +5490,9 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; int credits = ocfs2_remove_extent_credits(osb->sb) + ocfs2_blocks_per_xattr_bucket(inode->i_sb); - + struct ocfs2_xattr_value_root *xv; + struct ocfs2_rm_xattr_bucket_para *args = + (struct ocfs2_rm_xattr_bucket_para *)para; ocfs2_init_dealloc_ctxt(&ctxt.dealloc); @@ -5237,7 +5501,16 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, if (ocfs2_xattr_is_local(xe)) continue; - ctxt.handle = ocfs2_start_trans(osb, credits); + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, + i, &xv, NULL); + + ret = ocfs2_lock_xattr_remove_allocators(inode, xv, + args->ref_ci, + args->ref_root_bh, + &ctxt.meta_ac, + &ref_credits); + + ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits); if (IS_ERR(ctxt.handle)) { ret = PTR_ERR(ctxt.handle); mlog_errno(ret); @@ -5248,57 +5521,1439 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, i, 0, &ctxt); ocfs2_commit_trans(osb, ctxt.handle); + if (ctxt.meta_ac) { + ocfs2_free_alloc_context(ctxt.meta_ac); + ctxt.meta_ac = NULL; + } if (ret) { mlog_errno(ret); break; } } + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &ctxt.dealloc); return ret; } -static int ocfs2_delete_xattr_index_block(struct inode *inode, - struct buffer_head *xb_bh) +/* + * Whenever we modify a xattr value root in the bucket(e.g, CoW + * or change the extent record flag), we need to recalculate + * the metaecc for the whole bucket. So it is done here. + * + * Note: + * We have to give the extra credits for the caller. 
+ */ +static int ocfs2_xattr_bucket_post_refcount(struct inode *inode, + handle_t *handle, + void *para) +{ + int ret; + struct ocfs2_xattr_bucket *bucket = + (struct ocfs2_xattr_bucket *)para; + + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_xattr_bucket_journal_dirty(handle, bucket); + + return 0; +} + +/* + * Special action we need if the xattr value is refcounted. + * + * 1. If the xattr is refcounted, lock the tree. + * 2. CoW the xattr if we are setting the new value and the value + * will be stored outside. + * 3. In other case, decrease_refcount will work for us, so just + * lock the refcount tree, calculate the meta and credits is OK. + * + * We have to do CoW before ocfs2_init_xattr_set_ctxt since + * currently CoW is a completed transaction, while this function + * will also lock the allocators and let us deadlock. So we will + * CoW the whole xattr value. + */ +static int ocfs2_prepare_refcount_xattr(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_refcount_tree **ref_tree, + int *meta_add, + int *credits) { - struct ocfs2_xattr_block *xb = - (struct ocfs2_xattr_block *)xb_bh->b_data; - struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; int ret = 0; - u32 name_hash = UINT_MAX, e_cpos, num_clusters; - u64 p_blkno; + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_entry *xe; + char *base; + u32 p_cluster, num_clusters; + unsigned int ext_flags; + int name_offset, name_len; + struct ocfs2_xattr_value_buf vb; + struct ocfs2_xattr_bucket *bucket = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_post_refcount refcount; + struct ocfs2_post_refcount *p = NULL; + struct buffer_head *ref_root_bh = NULL; - if (le16_to_cpu(el->l_next_free_rec) == 0) - return 0; + if (!xis->not_found) { + xe = xis->here; + name_offset 
= le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + vb.vb_bh = xis->inode_bh; + vb.vb_access = ocfs2_journal_access_di; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; - while (name_hash > 0) { - ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, - &e_cpos, &num_clusters, el); + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + base = bucket_block(xbs->bucket, block_off); + vb.vb_bh = xbs->bucket->bu_bhs[block_off]; + vb.vb_access = ocfs2_journal_access; + + if (ocfs2_meta_ecc(osb)) { + /*create parameters for ocfs2_post_refcount. */ + bucket = xbs->bucket; + refcount.credits = bucket->bu_blocks; + refcount.para = bucket; + refcount.func = + ocfs2_xattr_bucket_post_refcount; + p = &refcount; + } + } else { + base = xbs->base; + vb.vb_bh = xbs->xattr_bh; + vb.vb_access = ocfs2_journal_access_xb; + } + } + + if (ocfs2_xattr_is_local(xe)) + goto out; + + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + + ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster, + &num_clusters, &vb.vb_xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We just need to check the 1st extent record, since we always + * CoW the whole xattr. So there shouldn't be a xattr with + * some REFCOUNT extent recs after the 1st one. 
+ */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) + goto out; + + ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), + 1, ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * If we are deleting the xattr or the new size will be stored inside, + * cool, leave it there, the xattr truncate process will remove them + * for us(it still needs the refcount tree lock and the meta, credits). + * And the worse case is that every cluster truncate will split the + * refcount tree, and make the original extent become 3. So we will need + * 2 * cluster more extent recs at most. + */ + if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) { + + ret = ocfs2_refcounted_xattr_delete_need(inode, + &(*ref_tree)->rf_ci, + ref_root_bh, vb.vb_xv, + meta_add, credits); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_refcount_cow_xattr(inode, di, &vb, + *ref_tree, ref_root_bh, 0, + le32_to_cpu(vb.vb_xv->xr_clusters), p); + if (ret) + mlog_errno(ret); + +out: + brelse(ref_root_bh); + return ret; +} + +/* + * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root. + * The physical clusters will be added to refcount tree. 
+ */ +static int ocfs2_xattr_value_attach_refcount(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + struct ocfs2_extent_tree *value_et, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_post_refcount *refcount) +{ + int ret = 0; + u32 clusters = le32_to_cpu(xv->xr_clusters); + u32 cpos, p_cluster, num_clusters; + struct ocfs2_extent_list *el = &xv->xr_list; + unsigned int ext_flags; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el, &ext_flags); + + cpos += num_clusters; + if ((ext_flags & OCFS2_EXT_REFCOUNTED)) + continue; + + BUG_ON(!p_cluster); + + ret = ocfs2_add_refcount_flag(inode, value_et, + ref_ci, ref_root_bh, + cpos - num_clusters, + p_cluster, num_clusters, + dealloc, refcount); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +/* + * Given a normal ocfs2_xattr_header, refcount all the entries which + * have value stored outside. + * Used for xattrs stored in inode and ocfs2_xattr_block. 
+ */ +static int ocfs2_xattr_attach_refcount_normal(struct inode *inode, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_header *header, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_extent_tree et; + int i, ret = 0; + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + xe = &header->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + xv = (struct ocfs2_xattr_value_root *)((void *)header + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + vb->vb_xv = xv; + ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et, + ref_ci, ref_root_bh, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static int ocfs2_xattr_inline_attach_refcount(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *) + (fe_bh->b_data + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + struct ocfs2_xattr_value_buf vb = { + .vb_bh = fe_bh, + .vb_access = ocfs2_journal_access_di, + }; + + return ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_ci, ref_root_bh, dealloc); +} + +struct ocfs2_xattr_tree_value_refcount_para { + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; + struct ocfs2_cached_dealloc_ctxt *dealloc; +}; + +static int ocfs2_get_xattr_tree_value_root(struct super_block *sb, + struct ocfs2_xattr_bucket *bucket, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **bh) +{ + int ret, block_off, name_offset; + struct ocfs2_xattr_header *xh = 
bucket_xh(bucket); + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + void *base; + + ret = ocfs2_xattr_bucket_get_name_value(sb, + bucket_xh(bucket), + offset, + &block_off, + &name_offset); + if (ret) { + mlog_errno(ret); + goto out; + } + + base = bucket_block(bucket, block_off); + + *xv = (struct ocfs2_xattr_value_root *)(base + name_offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (bh) + *bh = bucket->bu_bhs[block_off]; +out: + return ret; +} + +/* + * For a given xattr bucket, refcount all the entries which + * have value stored outside. + */ +static int ocfs2_xattr_bucket_value_refcount(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int i, ret = 0; + struct ocfs2_extent_tree et; + struct ocfs2_xattr_tree_value_refcount_para *ref = + (struct ocfs2_xattr_tree_value_refcount_para *)para; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + struct ocfs2_post_refcount refcount = { + .credits = bucket->bu_blocks, + .para = bucket, + .func = ocfs2_xattr_bucket_post_refcount, + }; + struct ocfs2_post_refcount *p = NULL; + + /* We only need post_refcount if we support metaecc. 
*/ + if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb))) + p = &refcount; + + mlog(0, "refcount bucket %llu, count = %u\n", + (unsigned long long)bucket_blkno(bucket), + le16_to_cpu(xh->xh_count)); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i, + &vb.vb_xv, &vb.vb_bh); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_init_xattr_value_extent_tree(&et, + INODE_CACHE(inode), &vb); + + ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv, + &et, ref->ref_ci, + ref->ref_root_bh, + ref->dealloc, p); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; + +} + +static int ocfs2_refcount_xattr_tree_rec(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, u32 cpos, u32 len, void *para) +{ + return ocfs2_iterate_xattr_buckets(inode, blkno, len, + ocfs2_xattr_bucket_value_refcount, + para); +} + +static int ocfs2_xattr_block_attach_refcount(struct inode *inode, + struct buffer_head *blk_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; + + ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header, + ref_ci, ref_root_bh, + dealloc); + } else { + struct ocfs2_xattr_tree_value_refcount_para para = { + .ref_ci = ref_ci, + .ref_root_bh = ref_root_bh, + .dealloc = dealloc, + }; + + ret = ocfs2_iterate_xattr_index_block(inode, blk_bh, + ocfs2_refcount_xattr_tree_rec, + &para); + } + + return ret; +} + +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct
buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; + struct buffer_head *blk_bh = NULL; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh, + ref_ci, ref_root_bh, + dealloc); if (ret) { mlog_errno(ret); goto out; } + } + + if (!di->i_xattr_loc) + goto out; + + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci, + ref_root_bh, dealloc); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); +out: + + return ret; +} + +typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe); +/* + * Store the information we need in xattr reflink. + * old_bh and new_bh are inode bh for the old and new inode. + */ +struct ocfs2_xattr_reflink { + struct inode *old_inode; + struct inode *new_inode; + struct buffer_head *old_bh; + struct buffer_head *new_bh; + struct ocfs2_caching_info *ref_ci; + struct buffer_head *ref_root_bh; + struct ocfs2_cached_dealloc_ctxt *dealloc; + should_xattr_reflinked *xattr_reflinked; +}; + +/* + * Given a xattr header and xe offset, + * return the proper xv and the corresponding bh. + * xattr in inode, block and xattr tree have different implementations. + */ +typedef int (get_xattr_value_root)(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para); + +/* + * Calculate all the xattr value root metadata stored in this xattr header and + * credits we need if we create them from scratch. + * We use get_xattr_value_root so that all types of xattr container can use it.
*/ +static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int *metas, int *credits, + int *num_recs, + get_xattr_value_root *func, + void *para) +{ + int i, ret = 0; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe; + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + *metas += le16_to_cpu(xv->xr_list.l_tree_depth) * + le16_to_cpu(xv->xr_list.l_next_free_rec); + + *credits += ocfs2_calc_extend_credits(sb, + &def_xv.xv.xr_list, + le32_to_cpu(xv->xr_clusters)); + + /* + * If the value is a tree with depth > 0, we don't descend + * into the extent blocks, so just calculate a maximum record num. + */ + if (!xv->xr_list.l_tree_depth) + *num_recs += xv->xr_list.l_next_free_rec; + else + *num_recs += ocfs2_clusters_for_bytes(sb, + XATTR_SIZE_MAX); + } + + return ret; +} + +/* Used by xattr inode and block to return the right xv and buffer_head. */ +static int ocfs2_get_xattr_value_root(struct super_block *sb, + struct buffer_head *bh, + struct ocfs2_xattr_header *xh, + int offset, + struct ocfs2_xattr_value_root **xv, + struct buffer_head **ret_bh, + void *para) +{ + struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset]; + + *xv = (struct ocfs2_xattr_value_root *)((void *)xh + + le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len)); + + if (ret_bh) + *ret_bh = bh; + + return 0; +} + +/* + * Lock the meta_ac and calculate how many credits we need for reflink xattrs. + * It is only used for inline xattr and xattr block.
*/ +static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb, + struct ocfs2_xattr_header *xh, + struct buffer_head *ref_root_bh, + int *credits, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, meta_add = 0, num_recs = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + *credits = 0; + + ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh, + &meta_add, credits, &num_recs, + ocfs2_get_xattr_value_root, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * We need to add/modify num_recs in refcount tree, so just calculate + * an approximate number we need for refcount tree change. + * Sometimes we need to split the tree, and after split, half recs + * will be moved to the new block, and a new block can only provide + * half number of recs. So we multiply new blocks by 2. + */ + num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2; + meta_add += num_recs; + *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += le16_to_cpu(rb->rf_list.l_tree_depth) * + le16_to_cpu(rb->rf_list.l_next_free_rec) + 1; + else + *credits += 1; + + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac); + if (ret) + mlog_errno(ret); + +out: + return ret; +} - ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, - ocfs2_delete_xattr_in_bucket, - NULL); +/* + * Given a xattr header, reflink all the xattrs in this container. + * It can be used for inode, block and bucket. + * + * NOTE: + * Before we call this function, the caller has memcpy the xattr in + * old_xh to the new_xh. + * + * If args.xattr_reflinked is set, call it to decide whether the xe should + * be reflinked or not. If not, remove it from the new xattr header.
*/ +static int ocfs2_reflink_xattr_header(handle_t *handle, + struct ocfs2_xattr_reflink *args, + struct buffer_head *old_bh, + struct ocfs2_xattr_header *xh, + struct buffer_head *new_bh, + struct ocfs2_xattr_header *new_xh, + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_alloc_context *meta_ac, + get_xattr_value_root *func, + void *para) +{ + int ret = 0, i, j; + struct super_block *sb = args->old_inode->i_sb; + struct buffer_head *value_bh; + struct ocfs2_xattr_entry *xe, *last; + struct ocfs2_xattr_value_root *xv, *new_xv; + struct ocfs2_extent_tree data_et; + u32 clusters, cpos, p_cluster, num_clusters; + unsigned int ext_flags = 0; + + mlog(0, "reflink xattr in container %llu, count = %u\n", + (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count)); + + last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)]; + for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) { + xe = &xh->xh_entries[i]; + + if (args->xattr_reflinked && !args->xattr_reflinked(xe)) { + xe = &new_xh->xh_entries[j]; + + le16_add_cpu(&new_xh->xh_count, -1); + if (new_xh->xh_count) { + memmove(xe, xe + 1, + (void *)last - (void *)xe); + memset(last, 0, + sizeof(struct ocfs2_xattr_entry)); + } + + /* + * We don't want j to increase in the next round since + * it is already moved ahead. + */ + j--; + continue; + } + + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = func(sb, old_bh, xh, i, &xv, NULL, para); + if (ret) { + mlog_errno(ret); + break; + } + + ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * For the xattr which has l_tree_depth = 0, all the extent + * recs have already been copied to the new xh with the + * appropriate OCFS2_EXT_REFCOUNTED flag, so we just need to + * increase the refcount in the refcount tree. + * + * For the xattr which has l_tree_depth > 0, we need + * to initialize it to the empty default value root, + * and then insert the extents one by one.
+ */ + if (xv->xr_list.l_tree_depth) { + memcpy(new_xv, &def_xv, sizeof(def_xv)); + vb->vb_xv = new_xv; + vb->vb_bh = value_bh; + ocfs2_init_xattr_value_extent_tree(&data_et, + INODE_CACHE(args->new_inode), vb); + } + + clusters = le32_to_cpu(xv->xr_clusters); + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(args->old_inode, + cpos, + &p_cluster, + &num_clusters, + &xv->xr_list, + &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!p_cluster); + + if (xv->xr_list.l_tree_depth) { + ret = ocfs2_insert_extent(handle, + &data_et, cpos, + ocfs2_clusters_to_blocks( + args->old_inode->i_sb, + p_cluster), + num_clusters, ext_flags, + meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_increase_refcount(handle, args->ref_ci, + args->ref_root_bh, + p_cluster, num_clusters, + meta_ac, args->dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + cpos += num_clusters; + } + } + +out: + return ret; +} + +static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) +{ + int ret = 0, credits = 0; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data; + int inline_size = le16_to_cpu(di->i_xattr_inline_size); + int header_off = osb->sb->s_blocksize - inline_size; + struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *) + (args->old_bh->b_data + header_off); + struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *) + (args->new_bh->b_data + header_off); + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_inode_info *new_oi; + struct ocfs2_dinode *new_di; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = args->new_bh, + .vb_access = ocfs2_journal_access_di, + }; + + ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh, + &credits, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + 
ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode), + args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + memcpy(args->new_bh->b_data + header_off, + args->old_bh->b_data + header_off, inline_size); + + new_di = (struct ocfs2_dinode *)args->new_bh->b_data; + new_di->i_xattr_inline_size = cpu_to_le16(inline_size); + + ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh, + args->new_bh, new_xh, &vb, meta_ac, + ocfs2_get_xattr_value_root, NULL); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_oi = OCFS2_I(args->new_inode); + spin_lock(&new_oi->ip_lock); + new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; + new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); + spin_unlock(&new_oi->ip_lock); + + ocfs2_journal_dirty(handle, args->new_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} +

/*
 * Create one empty xattr block for @inode inside its own transaction.
 *
 * Reserves a single metadata block, starts a transaction with
 * OCFS2_XATTR_BLOCK_CREATE_CREDITS and hands the work to
 * ocfs2_create_xattr_block(), which reports the new block through @ret_bh.
 * @fe_bh is passed through to that helper and its block number is logged;
 * @indexed is passed through unchanged.
 *
 * Returns 0 on success or the negative error code of the failing step.
 * The reserved allocation context is always released before returning
 * once the reservation itself has succeeded.
 */
static int ocfs2_create_empty_xattr_block(struct inode *inode,
					  struct buffer_head *fe_bh,
					  struct buffer_head **ret_bh,
					  int indexed)
{
	int ret;
	handle_t *handle;
	struct ocfs2_alloc_context *meta_ac;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
	if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	mlog(0, "create new xattr block for inode %llu, index = %d\n",
	     (unsigned long long)fe_bh->b_blocknr, indexed);
	ret = ocfs2_create_xattr_block(handle, inode, fe_bh,
				       meta_ac, ret_bh, indexed);
	if (ret)
		mlog_errno(ret);

	/* The transaction is committed whether or not the create worked. */
	ocfs2_commit_trans(osb, handle);
out:
	ocfs2_free_alloc_context(meta_ac);
	return ret;
}

/*
 * Reflink the xattrs stored in a non-indexed xattr block.
 *
 * Everything from the xattr header to the end of the block is copied from
 * @blk_bh to @new_blk_bh, then ocfs2_reflink_xattr_header() is called on the
 * old/new header pair. If the new inode does not yet carry
 * OCFS2_HAS_XATTR_FL, the flag is set in both the in-memory inode info and
 * the on-disk dinode (under ip_lock) and the dinode buffer is dirtied in the
 * same transaction.
 *
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args,
				     struct buffer_head *blk_bh,
				     struct buffer_head *new_blk_bh)
{
	int ret = 0, credits = 0;
	handle_t *handle;
	struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode);
	struct ocfs2_dinode *new_di;
	struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb);
	int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
	struct ocfs2_xattr_block *xb =
			(struct ocfs2_xattr_block *)blk_bh->b_data;
	struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header;
	struct ocfs2_xattr_block *new_xb =
			(struct ocfs2_xattr_block *)new_blk_bh->b_data;
	struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header;
	/*
	 * Start from NULL and only free when set, exactly as
	 * ocfs2_reflink_xattr_inline() does for the same allocator call.
	 * NOTE(review): whether ocfs2_reflink_lock_xattr_allocators() always
	 * fills this in on success is not visible here - confirm.
	 */
	struct ocfs2_alloc_context *meta_ac = NULL;
	struct ocfs2_xattr_value_buf vb = {
		.vb_bh = new_blk_bh,
		.vb_access = ocfs2_journal_access_xb,
	};

	ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
						  &credits, &meta_ac);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	/* One more credit in case we need to add xattr flags in new inode. */
	handle = ocfs2_start_trans(osb, credits + 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	/* The dinode is only journalled when its flags will be changed below. */
	if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
		ret = ocfs2_journal_access_di(handle,
					      INODE_CACHE(args->new_inode),
					      args->new_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}
	}

	ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode),
				      new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* Copy the header and everything after it up to the end of the block. */
	memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off,
	       osb->sb->s_blocksize - header_off);

	ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh,
					 new_blk_bh, new_xh, &vb, meta_ac,
					 ocfs2_get_xattr_value_root, NULL);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ocfs2_journal_dirty(handle, new_blk_bh);

	if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
		new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
		spin_lock(&new_oi->ip_lock);
		new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
		new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
		spin_unlock(&new_oi->ip_lock);

		ocfs2_journal_dirty(handle, args->new_bh);
	}

out_commit:
	ocfs2_commit_trans(osb, handle);

out:
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);
	return ret;
}

/*
 * State shared by the callbacks that reflink an indexed xattr tree:
 * the overall reflink arguments, the old and new xattr block buffers,
 * and one scratch bucket for each side.
 */
struct ocfs2_reflink_xattr_tree_args {
	struct ocfs2_xattr_reflink *reflink;
	struct buffer_head *old_blk_bh;
	struct buffer_head *new_blk_bh;
	struct ocfs2_xattr_bucket *old_bucket;
	struct ocfs2_xattr_bucket *new_bucket;
};

/*
 * NOTE:
 * We have to handle the case that both old bucket and new bucket
 * will call this function to get the right ret_bh.
 * So the caller must give us the right bh: a @bh equal to the first
 * buffer of the old bucket selects the old bucket, anything else
 * selects the new bucket. @xh is not used.
 */
static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb,
					struct buffer_head *bh,
					struct ocfs2_xattr_header *xh,
					int offset,
					struct ocfs2_xattr_value_root **xv,
					struct buffer_head **ret_bh,
					void *para)
{
	struct ocfs2_reflink_xattr_tree_args *args =
			(struct ocfs2_reflink_xattr_tree_args *)para;
	struct ocfs2_xattr_bucket *bucket;

	if (bh == args->old_bucket->bu_bhs[0])
		bucket = args->old_bucket;
	else
		bucket = args->new_bucket;

	return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
					       xv, ret_bh);
}

/* Running totals gathered while walking the buckets of one extent record. */
struct ocfs2_value_tree_metas {
	int num_metas;
	int credits;
	int num_recs;
};

/*
 * Value-root lookup callback used while counting metadata: @para is the
 * bucket being examined; @bh and @xh are not used.
 */
static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb,
					struct buffer_head *bh,
					struct ocfs2_xattr_header *xh,
					int offset,
					struct ocfs2_xattr_value_root **xv,
					struct buffer_head **ret_bh,
					void *para)
{
	struct ocfs2_xattr_bucket *bucket =
				(struct ocfs2_xattr_bucket *)para;

	return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
					       xv, ret_bh);
}

+static int ocfs2_calc_value_tree_metas(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + struct ocfs2_value_tree_metas *metas = + (struct
ocfs2_value_tree_metas *)para; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data; + + /* Add the credits for this bucket first. */ + metas->credits += bucket->bu_blocks; + return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0], + xh, &metas->num_metas, + &metas->credits, &metas->num_recs, + ocfs2_value_tree_metas_in_bucket, + bucket); +} + +/* + * Given a xattr extent rec starting from blkno and having len clusters, + * iterate all the buckets calculate how much metadata we need for reflinking + * all the ocfs2_xattr_value_root and lock the allocators accordingly. + */ +static int ocfs2_lock_reflink_xattr_rec_allocators( + struct ocfs2_reflink_xattr_tree_args *args, + struct ocfs2_extent_tree *xt_et, + u64 blkno, u32 len, int *credits, + struct ocfs2_alloc_context **meta_ac, + struct ocfs2_alloc_context **data_ac) +{ + int ret, num_free_extents; + struct ocfs2_value_tree_metas metas; + struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb); + struct ocfs2_refcount_block *rb; + + memset(&metas, 0, sizeof(metas)); + + ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len, + ocfs2_calc_value_tree_metas, &metas); + if (ret) { + mlog_errno(ret); + goto out; + } + + *credits = metas.credits; + + /* + * Calculate we need for refcount tree change. + * + * We need to add/modify num_recs in refcount tree, so just calculate + * an approximate number we need for refcount tree change. + * Sometimes we need to split the tree, and after split, half recs + * will be moved to the new block, and a new block can only provide + * half number of recs. So we multiple new blocks by 2. + * In the end, we have to add credits for modifying the already + * existed refcount block. 
+ */ + rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data; + metas.num_recs = + (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) / + ocfs2_refcount_recs_per_rb(osb->sb) * 2; + metas.num_metas += metas.num_recs; + *credits += metas.num_recs + + metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; + if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) + *credits += le16_to_cpu(rb->rf_list.l_tree_depth) * + le16_to_cpu(rb->rf_list.l_next_free_rec) + 1; + else + *credits += 1; + + /* count in the xattr tree change. */ + num_free_extents = ocfs2_num_free_extents(osb, xt_et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + if (num_free_extents < len) + metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el); + + *credits += ocfs2_calc_extend_credits(osb->sb, + xt_et->et_root_el, len); + + if (metas.num_metas) { + ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas, + meta_ac); if (ret) { mlog_errno(ret); goto out; } + } - ret = ocfs2_rm_xattr_cluster(inode, xb_bh, - p_blkno, e_cpos, num_clusters); + if (len) { + ret = ocfs2_reserve_clusters(osb, len, data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + meta_ac = NULL; + } + } + + return ret; +} + +static int ocfs2_reflink_xattr_buckets(handle_t *handle, + u64 blkno, u64 new_blkno, u32 clusters, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_reflink_xattr_tree_args *args) +{ + int i, j, ret = 0; + struct super_block *sb = args->reflink->old_inode->i_sb; + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); + u32 num_buckets = clusters * bpc; + int bpb = args->old_bucket->bu_blocks; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; + + for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) { + ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno); if (ret) { 
mlog_errno(ret); break; } - if (e_cpos == 0) + ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); + if (ret) { + mlog_errno(ret); break; + } - name_hash = e_cpos - 1; + /* + * The real bucket num in this series of blocks is stored + * in the 1st bucket. + */ + if (i == 0) + num_buckets = le16_to_cpu( + bucket_xh(args->old_bucket)->xh_num_buckets); + + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + break; + } + + for (j = 0; j < bpb; j++) + memcpy(bucket_block(args->new_bucket, j), + bucket_block(args->old_bucket, j), + sb->s_blocksize); + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + + ret = ocfs2_reflink_xattr_header(handle, args->reflink, + args->old_bucket->bu_bhs[0], + bucket_xh(args->old_bucket), + args->new_bucket->bu_bhs[0], + bucket_xh(args->new_bucket), + &vb, meta_ac, + ocfs2_get_reflink_xattr_value_root, + args); + if (ret) { + mlog_errno(ret); + break; + } + + /* + * Re-access and dirty the bucket to calculate metaecc. + * Because we may extend the transaction in reflink_xattr_header + * which will let the already accessed block gone. + */ + ret = ocfs2_xattr_bucket_journal_access(handle, + args->new_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + break; + } + + ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket); + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + } + + ocfs2_xattr_bucket_relse(args->old_bucket); + ocfs2_xattr_bucket_relse(args->new_bucket); + return ret; +} +/* + * Create the same xattr extent record in the new inode's xattr tree. 
+ */
/*
 * Per-extent-record callback: @para is the shared
 * struct ocfs2_reflink_xattr_tree_args. Allocators are locked for the
 * record, @len clusters are claimed, the buckets starting at @blkno are
 * reflinked into the claimed space, and a matching extent record is
 * inserted at @cpos in the new xattr tree. @root_bh is not used.
 *
 * Returns 0 on success or a negative error code; both allocation contexts
 * are released on every exit path.
 */
static int ocfs2_reflink_xattr_rec(struct inode *inode,
				   struct buffer_head *root_bh,
				   u64 blkno,
				   u32 cpos,
				   u32 len,
				   void *para)
{
	struct ocfs2_reflink_xattr_tree_args *tree_args = para;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_alloc_context *meta_ac = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_extent_tree new_et;
	handle_t *trans;
	u32 first_cluster, claimed;
	u64 dst_blkno;
	int trans_credits = 0;
	int ret;

	/* The extent tree being grown is rooted in the new xattr block. */
	ocfs2_init_xattr_tree_extent_tree(&new_et,
				INODE_CACHE(tree_args->reflink->new_inode),
				tree_args->new_blk_bh);

	ret = ocfs2_lock_reflink_xattr_rec_allocators(tree_args, &new_et,
						      blkno, len,
						      &trans_credits,
						      &meta_ac, &data_ac);
	if (ret) {
		mlog_errno(ret);
		goto free_ac;
	}

	trans = ocfs2_start_trans(osb, trans_credits);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		mlog_errno(ret);
		goto free_ac;
	}

	ret = ocfs2_claim_clusters(osb, trans, data_ac,
				   len, &first_cluster, &claimed);
	if (ret) {
		mlog_errno(ret);
		goto commit;
	}

	dst_blkno = ocfs2_clusters_to_blocks(osb->sb, first_cluster);

	mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
	     (unsigned long long)blkno, (unsigned long long)dst_blkno, len);
	ret = ocfs2_reflink_xattr_buckets(trans, blkno, dst_blkno, len,
					  meta_ac, data_ac, tree_args);
	if (ret) {
		mlog_errno(ret);
		goto commit;
	}

	mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
	     (unsigned long long)dst_blkno, len, cpos);
	ret = ocfs2_insert_extent(trans, &new_et, cpos, dst_blkno,
				  len, 0, meta_ac);
	if (ret)
		mlog_errno(ret);

commit:
	ocfs2_commit_trans(osb, trans);

free_ac:
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);
	return ret;
}

+/* + * Create reflinked xattr buckets. + * We will add bucket one by one, and refcount all the xattrs in the bucket + * if they are stored outside.
+ */ +static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh, + struct buffer_head *new_blk_bh) +{ + int ret; + struct ocfs2_reflink_xattr_tree_args para; + + memset(¶, 0, sizeof(para)); + para.reflink = args; + para.old_blk_bh = blk_bh; + para.new_blk_bh = new_blk_bh; + + para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode); + if (!para.old_bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode); + if (!para.new_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh, + ocfs2_reflink_xattr_rec, + ¶); + if (ret) + mlog_errno(ret); + +out: + ocfs2_xattr_bucket_free(para.old_bucket); + ocfs2_xattr_bucket_free(para.new_bucket); + return ret; +} + +static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args, + struct buffer_head *blk_bh) +{ + int ret, indexed = 0; + struct buffer_head *new_blk_bh = NULL; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)blk_bh->b_data; + + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) + indexed = 1; + + ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh, + &new_blk_bh, indexed); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) + ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh); + else + ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh); + if (ret) + mlog_errno(ret); + +out: + brelse(new_blk_bh); + return ret; +} + +static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe) +{ + int type = ocfs2_xattr_get_type(xe); + + return type != OCFS2_XATTR_INDEX_SECURITY && + type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS && + type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; +} + +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh, + bool 
preserve_security) +{ + int ret; + struct ocfs2_xattr_reflink args; + struct ocfs2_inode_info *oi = OCFS2_I(old_inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh = NULL; + + ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb), + le64_to_cpu(di->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_init_dealloc_ctxt(&dealloc); + + args.old_inode = old_inode; + args.new_inode = new_inode; + args.old_bh = old_bh; + args.new_bh = new_bh; + args.ref_ci = &ref_tree->rf_ci; + args.ref_root_bh = ref_root_bh; + args.dealloc = &dealloc; + if (preserve_security) + args.xattr_reflinked = NULL; + else + args.xattr_reflinked = ocfs2_reflink_xattr_no_security; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_reflink_xattr_inline(&args); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + } + + if (!di->i_xattr_loc) + goto out_unlock; + + ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_reflink_xattr_in_block(&args, blk_bh); + if (ret) + mlog_errno(ret); + + brelse(blk_bh); + +out_unlock: + ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb), + ref_tree, 1); + brelse(ref_root_bh); + + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1); + ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc); } out: @@ -5306,6 +6961,51 @@ out: } /* + * Initialize security and acl for a already created inode. + * Used for reflink a non-preserve-security file. + * + * It uses common api like ocfs2_xattr_set, so the caller + * must not hold any lock expect i_mutex. 
+ */ +int ocfs2_init_security_and_acl(struct inode *dir, + struct inode *inode) +{ + int ret = 0; + struct buffer_head *dir_bh = NULL; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + + ret = ocfs2_init_security_get(inode, dir, &si); + if (!ret) { + ret = ocfs2_xattr_security_set(inode, si.name, + si.value, si.value_len, + XATTR_CREATE); + if (ret) { + mlog_errno(ret); + goto leave; + } + } else if (ret != -EOPNOTSUPP) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_inode_lock(dir, &dir_bh, 0); + if (ret) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); + if (ret) + mlog_errno(ret); + + ocfs2_inode_unlock(dir, 0); + brelse(dir_bh); +leave: + return ret; +} +/* * 'security' attributes support */ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 1ca7e9a1b7b..08e36389f56 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -55,6 +55,8 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *, int, const char *, const void *, size_t, int, struct ocfs2_alloc_context *, struct ocfs2_alloc_context *); +int ocfs2_has_inline_xattr_value_outside(struct inode *inode, + struct ocfs2_dinode *di); int ocfs2_xattr_remove(struct inode *, struct buffer_head *); int ocfs2_init_security_get(struct inode *, struct inode *, struct ocfs2_security_xattr_info *); @@ -83,5 +85,16 @@ struct ocfs2_xattr_value_buf { struct ocfs2_xattr_value_root *vb_xv; }; - +int ocfs2_xattr_attach_refcount_tree(struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_caching_info *ref_ci, + struct buffer_head *ref_root_bh, + struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_reflink_xattrs(struct inode *old_inode, + struct buffer_head *old_bh, + struct inode *new_inode, + struct buffer_head *new_bh, + bool preserve_security); +int ocfs2_init_security_and_acl(struct inode *dir, + struct inode *inode); #endif /* 
OCFS2_XATTR_H */ |