diff options
Diffstat (limited to 'fs/ocfs2/refcounttree.c')
-rw-r--r-- | fs/ocfs2/refcounttree.c | 4313 |
1 files changed, 4313 insertions, 0 deletions
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c new file mode 100644 index 00000000000..60287fc56bc --- /dev/null +++ b/fs/ocfs2/refcounttree.c @@ -0,0 +1,4313 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * refcounttree.c + * + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/sort.h> +#define MLOG_MASK_PREFIX ML_REFCOUNT +#include <cluster/masklog.h> +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "suballoc.h" +#include "journal.h" +#include "uptodate.h" +#include "super.h" +#include "buffer_head_io.h" +#include "blockcheck.h" +#include "refcounttree.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "aops.h" +#include "xattr.h" +#include "namei.h" + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/swap.h> +#include <linux/security.h> +#include <linux/fsnotify.h> +#include <linux/quotaops.h> +#include <linux/namei.h> +#include <linux/mount.h> + +struct ocfs2_cow_context { + struct inode *inode; + u32 cow_start; + u32 cow_len; + struct ocfs2_extent_tree data_et; + struct ocfs2_refcount_tree *ref_tree; + struct buffer_head *ref_root_bh; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; + void *cow_object; + struct ocfs2_post_refcount *post_refcount; + int extra_credits; + int (*get_clusters)(struct ocfs2_cow_context *context, + u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, + unsigned int *extent_flags); + int (*cow_duplicate_clusters)(handle_t *handle, + struct ocfs2_cow_context *context, + u32 cpos, u32 old_cluster, + u32 new_cluster, u32 new_len); +}; + +static inline struct ocfs2_refcount_tree * +cache_info_to_refcount(struct ocfs2_caching_info *ci) +{ + return container_of(ci, struct ocfs2_refcount_tree, rf_ci); +} + +static int ocfs2_validate_refcount_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)bh->b_data; + + mlog(0, "Validating refcount block %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for refcount block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + + if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { + ocfs2_error(sb, + "Refcount block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); + return -EINVAL; + } + + if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid rf_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Refcount block #%llu has an invalid " + "rf_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, + u64 rb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(ci, rb_blkno, &tmp, + ocfs2_validate_refcount_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + return rf->rf_blkno; +} + +static struct super_block * +ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + return rf->rf_sb; +} + +static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + spin_lock(&rf->rf_lock); +} + +static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + spin_unlock(&rf->rf_lock); +} + +static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + mutex_lock(&rf->rf_io_mutex); +} + +static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci) +{ + struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); + + mutex_unlock(&rf->rf_io_mutex); +} + +static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = { + .co_owner = ocfs2_refcount_cache_owner, + .co_get_super = ocfs2_refcount_cache_get_super, + .co_cache_lock = ocfs2_refcount_cache_lock, + .co_cache_unlock = ocfs2_refcount_cache_unlock, + .co_io_lock = ocfs2_refcount_cache_io_lock, + .co_io_unlock = ocfs2_refcount_cache_io_unlock, +}; + +static struct ocfs2_refcount_tree * +ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno) +{ + struct rb_node *n = osb->osb_rf_lock_tree.rb_node; + struct ocfs2_refcount_tree *tree = NULL; + + while (n) { + tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node); + + if (blkno < tree->rf_blkno) + n = n->rb_left; + else if (blkno > tree->rf_blkno) + n = n->rb_right; + else + return tree; + } + + return NULL; +} + +/* osb_lock is already locked. */ +static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new) +{ + u64 rf_blkno = new->rf_blkno; + struct rb_node *parent = NULL; + struct rb_node **p = &osb->osb_rf_lock_tree.rb_node; + struct ocfs2_refcount_tree *tmp; + + while (*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_refcount_tree, + rf_node); + + if (rf_blkno < tmp->rf_blkno) + p = &(*p)->rb_left; + else if (rf_blkno > tmp->rf_blkno) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate refcount block %llu found!\n", + (unsigned long long)rf_blkno); + BUG(); + } + } + + rb_link_node(&new->rf_node, parent, p); + rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree); +} + +static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree) +{ + ocfs2_metadata_cache_exit(&tree->rf_ci); + ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres); + ocfs2_lock_res_free(&tree->rf_lockres); + kfree(tree); +} + +static inline void +ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree); + if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree) + osb->osb_ref_tree_lru = NULL; +} + +static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree) +{ + spin_lock(&osb->osb_lock); + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + spin_unlock(&osb->osb_lock); +} + +void ocfs2_kref_remove_refcount_tree(struct kref *kref) +{ + struct ocfs2_refcount_tree *tree = + container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); + + ocfs2_free_refcount_tree(tree); +} + +static inline void +ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree) +{ + kref_get(&tree->rf_getcnt); +} + +static inline void +ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree) +{ + kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree); +} + +static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new, + struct super_block *sb) +{ + ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops); + mutex_init(&new->rf_io_mutex); + new->rf_sb = sb; + spin_lock_init(&new->rf_lock); +} + +static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *new, + u64 rf_blkno, u32 generation) +{ + init_rwsem(&new->rf_sem); + ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, + rf_blkno, generation); +} + +static struct ocfs2_refcount_tree* +ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno) +{ + struct ocfs2_refcount_tree *new; + + new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS); + if (!new) + return NULL; + + new->rf_blkno = rf_blkno; + kref_init(&new->rf_getcnt); + ocfs2_init_refcount_tree_ci(new, osb->sb); + + return new; +} + +static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, + struct ocfs2_refcount_tree **ret_tree) +{ + int ret = 0; + struct ocfs2_refcount_tree *tree, *new = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *ref_rb; + + spin_lock(&osb->osb_lock); + if (osb->osb_ref_tree_lru && + osb->osb_ref_tree_lru->rf_blkno == rf_blkno) + tree = osb->osb_ref_tree_lru; + else + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + spin_unlock(&osb->osb_lock); + + new = ocfs2_allocate_refcount_tree(osb, rf_blkno); + if (!new) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + /* + * We need the generation to create the refcount tree lock and since + * it isn't changed during the tree modification, we are safe here to + * read without protection. + * We also have to purge the cache after we create the lock since the + * refcount block may have the stale data. It can only be trusted when + * we hold the refcount lock. + */ + ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_metadata_cache_exit(&new->rf_ci); + kfree(new); + return ret; + } + + ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + new->rf_generation = le32_to_cpu(ref_rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new, rf_blkno, + new->rf_generation); + ocfs2_metadata_cache_purge(&new->rf_ci); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, rf_blkno); + if (tree) + goto out; + + ocfs2_insert_refcount_tree(osb, new); + + tree = new; + new = NULL; + +out: + *ret_tree = tree; + + osb->osb_ref_tree_lru = tree; + + spin_unlock(&osb->osb_lock); + + if (new) + ocfs2_free_refcount_tree(new); + + brelse(ref_root_bh); + return ret; +} + +static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + + di = (struct ocfs2_dinode *)di_bh->b_data; + *ref_blkno = le64_to_cpu(di->i_refcount_loc); + brelse(di_bh); +out: + return ret; +} + +static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + int ret; + + ret = ocfs2_refcount_lock(tree, rw); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rw) + down_write(&tree->rf_sem); + else + down_read(&tree->rf_sem); + +out: + return ret; +} + +/* + * Lock the refcount tree pointed by ref_blkno and return the tree. + * In most case, we lock the tree and read the refcount block. + * So read it here if the caller really needs it. + * + * If the tree has been re-created by other node, it will free the + * old one and re-create it. + */ +int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, + u64 ref_blkno, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret, delete_tree = 0; + struct ocfs2_refcount_tree *tree = NULL; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + +again: + ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); + if (ret) { + mlog_errno(ret); + return ret; + } + + ocfs2_refcount_tree_get(tree); + + ret = __ocfs2_lock_refcount_tree(osb, tree, rw); + if (ret) { + mlog_errno(ret); + ocfs2_refcount_tree_put(tree); + goto out; + } + + ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, + &ref_root_bh); + if (ret) { + mlog_errno(ret); + ocfs2_unlock_refcount_tree(osb, tree, rw); + ocfs2_refcount_tree_put(tree); + goto out; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + /* + * If the refcount block has been freed and re-created, we may need + * to recreate the refcount tree also. + * + * Here we just remove the tree from the rb-tree, and the last + * kref holder will unlock and delete this refcount_tree. + * Then we goto "again" and ocfs2_get_refcount_tree will create + * the new refcount tree for us. + */ + if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) { + if (!tree->rf_removed) { + ocfs2_erase_refcount_tree_from_list(osb, tree); + tree->rf_removed = 1; + delete_tree = 1; + } + + ocfs2_unlock_refcount_tree(osb, tree, rw); + /* + * We get an extra reference when we create the refcount + * tree, so another put will destroy it. + */ + if (delete_tree) + ocfs2_refcount_tree_put(tree); + brelse(ref_root_bh); + ref_root_bh = NULL; + goto again; + } + + *ret_tree = tree; + if (ref_bh) { + *ref_bh = ref_root_bh; + ref_root_bh = NULL; + } +out: + brelse(ref_root_bh); + return ret; +} + +int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw, + struct ocfs2_refcount_tree **ret_tree, + struct buffer_head **ref_bh) +{ + int ret; + u64 ref_blkno; + + ret = ocfs2_get_refcount_block(inode, &ref_blkno); + if (ret) { + mlog_errno(ret); + return ret; + } + + return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, + rw, ret_tree, ref_bh); +} + +void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, + struct ocfs2_refcount_tree *tree, int rw) +{ + if (rw) + up_write(&tree->rf_sem); + else + up_read(&tree->rf_sem); + + ocfs2_refcount_unlock(tree, rw); + ocfs2_refcount_tree_put(tree); +} + +void ocfs2_purge_refcount_trees(struct ocfs2_super *osb) +{ + struct rb_node *node; + struct ocfs2_refcount_tree *tree; + struct rb_root *root = &osb->osb_rf_lock_tree; + + while ((node = rb_last(root)) != NULL) { + tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); + + mlog(0, "Purge tree %llu\n", + (unsigned long long) tree->rf_blkno); + + rb_erase(&tree->rf_node, root); + ocfs2_free_refcount_tree(tree); + } +} + +/* + * Create a refcount tree for an inode. + * We take for granted that the inode is already locked. + */ +static int ocfs2_create_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + mlog(0, "create tree for inode %lu\n", inode->i_ino); + + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno); + if (!new_tree) { + ret = -ENOMEM; + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); + + ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_refcount_block. */ + rb = (struct ocfs2_refcount_block *)new_bh->b_data; + memset(rb, 0, inode->i_sb->s_blocksize); + strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); + rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); + rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); + rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); + rb->rf_blkno = cpu_to_le64(first_blkno); + rb->rf_count = cpu_to_le32(1); + rb->rf_records.rl_count = + cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); + spin_lock(&osb->osb_lock); + rb->rf_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + ocfs2_journal_dirty(handle, new_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(first_blkno); + spin_unlock(&oi->ip_lock); + + mlog(0, "created tree for inode %lu, refblock %llu\n", + inode->i_ino, (unsigned long long)first_blkno); + + ocfs2_journal_dirty(handle, di_bh); + + /* + * We have to init the tree lock here since it will use + * the generation number to create it. + */ + new_tree->rf_generation = le32_to_cpu(rb->rf_generation); + ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno, + new_tree->rf_generation); + + spin_lock(&osb->osb_lock); + tree = ocfs2_find_refcount_tree(osb, first_blkno); + + /* + * We've just created a new refcount tree in this block. If + * we found a refcount tree on the ocfs2_super, it must be + * one we just deleted. We free the old tree before + * inserting the new tree. + */ + BUG_ON(tree && tree->rf_generation == new_tree->rf_generation); + if (tree) + ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); + ocfs2_insert_refcount_tree(osb, new_tree); + spin_unlock(&osb->osb_lock); + new_tree = NULL; + if (tree) + ocfs2_refcount_tree_put(tree); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + if (new_tree) { + ocfs2_metadata_cache_exit(&new_tree->rf_ci); + kfree(new_tree); + } + + brelse(new_bh); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +static int ocfs2_set_refcount_tree(struct inode *inode, + struct buffer_head *di_bh, + u64 refcount_loc) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_block *rb; + struct ocfs2_refcount_tree *ref_tree; + + BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; + le32_add_cpu(&rb->rf_count, 1); + + ocfs2_journal_dirty(handle, ref_root_bh); + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = cpu_to_le64(refcount_loc); + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + + return ret; +} + +int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) +{ + int ret, delete_tree = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_refcount_block *rb; + struct inode *alloc_inode = NULL; + struct buffer_head *alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS; + u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); + u16 bit = 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + return 0; + + BUG_ON(!ref_blkno); + ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + rb = (struct ocfs2_refcount_block *)blk_bh->b_data; + + /* + * If we are the last user, we need to free the block. + * So lock the allocator ahead. + */ + if (le32_to_cpu(rb->rf_count) == 1) { + blk = le64_to_cpu(rb->rf_blkno); + bit = le16_to_cpu(rb->rf_suballoc_bit); + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(rb->rf_suballoc_slot)); + if (!alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); + if (ret) { + mlog_errno(ret); + goto out_mutex; + } + + credits += OCFS2_SUBALLOC_FREE; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + di->i_refcount_loc = 0; + spin_unlock(&oi->ip_lock); + ocfs2_journal_dirty(handle, di_bh); + + le32_add_cpu(&rb->rf_count , -1); + ocfs2_journal_dirty(handle, blk_bh); + + if (!rb->rf_count) { + delete_tree = 1; + ocfs2_erase_refcount_tree_from_list(osb, ref_tree); + ret = ocfs2_free_suballoc_bits(handle, alloc_inode, + alloc_bh, bit, bg_blkno, 1); + if (ret) + mlog_errno(ret); + } + +out_commit: + ocfs2_commit_trans(osb, handle); +out_unlock: + if (alloc_inode) { + ocfs2_inode_unlock(alloc_inode, 1); + brelse(alloc_bh); + } +out_mutex: + if (alloc_inode) { + mutex_unlock(&alloc_inode->i_mutex); + iput(alloc_inode); + } +out: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + if (delete_tree) + ocfs2_refcount_tree_put(ref_tree); + brelse(blk_bh); + + return ret; +} + +static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index) +{ + int i = 0; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_rec *rec = NULL; + + for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) { + rec = &rb->rf_records.rl_recs[i]; + + if (le64_to_cpu(rec->r_cpos) + + le32_to_cpu(rec->r_clusters) <= cpos) + continue; + else if (le64_to_cpu(rec->r_cpos) > cpos) + break; + + /* ok, cpos fail in this rec. Just return. */ + if (ret_rec) + *ret_rec = *rec; + goto out; + } + + if (ret_rec) { + /* We meet with a hole here, so fake the rec. */ + ret_rec->r_cpos = cpu_to_le64(cpos); + ret_rec->r_refcount = 0; + if (i < le16_to_cpu(rb->rf_records.rl_used) && + le64_to_cpu(rec->r_cpos) < cpos + len) + ret_rec->r_clusters = + cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos); + else + ret_rec->r_clusters = cpu_to_le32(len); + } + +out: + *index = i; +} + +/* + * Try to remove refcount tree. The mechanism is: + * 1) Check whether i_clusters == 0, if no, exit. + * 2) check whether we have i_xattr_loc in dinode. if yes, exit. + * 3) Check whether we have inline xattr stored outside, if yes, exit. + * 4) Remove the tree. + */ +int ocfs2_try_remove_refcount_tree(struct inode *inode, + struct buffer_head *di_bh) +{ + int ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&oi->ip_xattr_sem); + down_write(&oi->ip_alloc_sem); + + if (oi->ip_clusters) + goto out; + + if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) + goto out; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && + ocfs2_has_inline_xattr_value_outside(inode, di)) + goto out; + + ret = ocfs2_remove_refcount_tree(inode, di_bh); + if (ret) + mlog_errno(ret); +out: + up_write(&oi->ip_alloc_sem); + up_write(&oi->ip_xattr_sem); + return 0; +} + +/* + * Given a cpos and len, try to find the refcount record which contains cpos. + * 1. If cpos can be found in one refcount record, return the record. + * 2. If cpos can't be found, return a fake record which start from cpos + * and end at a small value between cpos+len and start of the next record. + * This fake record has r_refcount = 0. + */ +static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + u64 cpos, unsigned int len, + struct ocfs2_refcount_rec *ret_rec, + int *index, + struct buffer_head **ret_bh) +{ + int ret = 0, i, found; + u32 low_cpos; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *tmp, *rec = NULL; + struct ocfs2_extent_block *eb; + struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_root_bh->b_data; + + if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) { + ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_root_bh; + get_bh(ref_root_bh); + return 0; + } + + el = &rb->rf_list; + low_cpos = cpos & OCFS2_32BIT_POS_MASK; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(sb, + "refcount tree %llu has non zero tree " + "depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + found = 0; + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= low_cpos) { + found = 1; + break; + } + } + + /* adjust len when we have ocfs2_extent_rec after it. */ + if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { + tmp = &el->l_recs[i+1]; + + if (le32_to_cpu(tmp->e_cpos) < cpos + len) + len = le32_to_cpu(tmp->e_cpos) - cpos; + } + + ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), + &ref_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len, + ret_rec, index); + *ret_bh = ref_leaf_bh; +out: + brelse(eb_bh); + return ret; +} + +enum ocfs2_ref_rec_contig { + REF_CONTIG_NONE = 0, + REF_CONTIG_LEFT, + REF_CONTIG_RIGHT, + REF_CONTIG_LEFTRIGHT, +}; + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb, + int index) +{ + if ((rb->rf_records.rl_recs[index].r_refcount == + rb->rf_records.rl_recs[index + 1].r_refcount) && + (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) + + le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) == + le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos))) + return REF_CONTIG_RIGHT; + + return REF_CONTIG_NONE; +} + +static enum ocfs2_ref_rec_contig + ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE; + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 1) + ret = ocfs2_refcount_rec_adjacent(rb, index); + + if (index > 0) { + enum ocfs2_ref_rec_contig tmp; + + tmp = ocfs2_refcount_rec_adjacent(rb, index - 1); + + if (tmp == REF_CONTIG_RIGHT) { + if (ret == REF_CONTIG_RIGHT) + ret = REF_CONTIG_LEFTRIGHT; + else + ret = REF_CONTIG_LEFT; + } + } + + return ret; +} + +static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb, + int index) +{ + BUG_ON(rb->rf_records.rl_recs[index].r_refcount != + rb->rf_records.rl_recs[index+1].r_refcount); + + le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters, + le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters)); + + if (index < le16_to_cpu(rb->rf_records.rl_used) - 2) + memmove(&rb->rf_records.rl_recs[index + 1], + &rb->rf_records.rl_recs[index + 2], + sizeof(struct ocfs2_refcount_rec) * + (le16_to_cpu(rb->rf_records.rl_used) - index - 2)); + + memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + le16_add_cpu(&rb->rf_records.rl_used, -1); +} + +/* + * Merge the refcount rec if we are contiguous with the adjacent recs. + */ +static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb, + int index) +{ + enum ocfs2_ref_rec_contig contig = + ocfs2_refcount_rec_contig(rb, index); + + if (contig == REF_CONTIG_NONE) + return; + + if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) { + BUG_ON(index == 0); + index--; + } + + ocfs2_rotate_refcount_rec_left(rb, index); + + if (contig == REF_CONTIG_LEFTRIGHT) + ocfs2_rotate_refcount_rec_left(rb, index); +} + +/* + * Change the refcount indexed by "index" in ref_bh. + * If refcount reaches 0, remove it. + */ +static int ocfs2_change_refcount_rec(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_leaf_bh, + int index, int merge, int change) +{ + int ret; + struct ocfs2_refcount_block *rb = + (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; + struct ocfs2_refcount_list *rl = &rb->rf_records; + struct ocfs2_refcount_rec *rec = &rl->rl_recs[index]; + + ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "change index %d, old count %u, change %d\n", index, + le32_to_cpu(rec->r_refcount), change); + le32_add_cpu(&rec->r_refcount, change); + + if (!rec->r_refcount) { + if (index != le16_to_cpu(rl->rl_used) - 1) { + memmove(rec, rec + 1, + (le16_to_cpu(rl->rl_used) - index - 1) * + sizeof(struct ocfs2_refcount_rec)); + memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1], + 0, sizeof(struct ocfs2_refcount_rec)); + } + + le16_add_cpu(&rl->rl_used, -1); + } else if (merge) + ocfs2_refcount_rec_merge(rb, index); + + ret = ocfs2_journal_dirty(handle, ref_leaf_bh); + if (ret) + mlog_errno(ret); +out: + return ret; +} + +static int ocfs2_expand_inline_ref_root(handle_t *handle, + struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct buffer_head **ref_leaf_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int ret; + u16 suballoc_bit_start; + u32 num_got; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct buffer_head *new_bh = NULL; + struct ocfs2_refcount_block *new_rb; |