diff options
Diffstat (limited to 'fs/ocfs2/suballoc.c')
| -rw-r--r-- | fs/ocfs2/suballoc.c | 2197 |
1 files changed, 1656 insertions, 541 deletions
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 72c198a004d..0cb889a17ae 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -29,12 +29,12 @@ #include <linux/slab.h> #include <linux/highmem.h> -#define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "inode.h" #include "journal.h" @@ -43,9 +43,42 @@ #include "super.h" #include "sysfile.h" #include "uptodate.h" +#include "ocfs2_trace.h" #include "buffer_head_io.h" +#define NOT_ALLOC_NEW_GROUP 0 +#define ALLOC_NEW_GROUP 0x1 +#define ALLOC_GROUPS_FROM_GLOBAL 0x2 + +#define OCFS2_MAX_TO_STEAL 1024 + +struct ocfs2_suballoc_result { + u64 sr_bg_blkno; /* The bg we allocated from. Set + to 0 when a block group is + contiguous. */ + u64 sr_bg_stable_blkno; /* + * Doesn't change, always + * set to target block + * group descriptor + * block. + */ + u64 sr_blkno; /* The first allocated block */ + unsigned int sr_bit_offset; /* The bit in the bg */ + unsigned int sr_bits; /* How many bits we claimed */ +}; + +static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) +{ + if (res->sr_blkno == 0) + return 0; + + if (res->sr_bg_blkno) + return res->sr_bg_blkno; + + return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); +} + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); @@ -53,43 +86,33 @@ static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, u64 group_blkno, + unsigned int group_clusters, u16 my_chain, struct ocfs2_chain_list *cl); static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, - struct buffer_head *bh); + struct buffer_head *bh, + u64 max_block, + u64 *last_alloc_group, + int flags); static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, - u16 *bit_off, u16 *bits_found); + u64 max_block, + struct ocfs2_suballoc_result *res); static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, - u16 *bit_off, u16 *bits_found); -static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, - struct ocfs2_alloc_context *ac, + u64 max_block, + struct ocfs2_suballoc_result *res); +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno); + struct ocfs2_suballoc_result *res); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); -static inline int ocfs2_block_group_set_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits); -static inline int ocfs2_block_group_clear_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits); - static int ocfs2_relink_block_group(handle_t *handle, struct inode *alloc_inode, struct buffer_head *fe_bh, @@ -105,8 +128,12 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, u64 data_blkno, u64 *bg_blkno, u16 *bg_bit_off); +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + int flags, + struct ocfs2_alloc_context **ac); -void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) { struct inode *inode = ac->ac_inode; @@ -117,9 +144,20 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) mutex_unlock(&inode->i_mutex); iput(inode); + ac->ac_inode = NULL; } - if (ac->ac_bh) - brelse(ac->ac_bh); + brelse(ac->ac_bh); + ac->ac_bh = NULL; + ac->ac_resv = NULL; + if (ac->ac_find_loc_priv) { + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; + } +} + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) +{ + ocfs2_free_ac_resource(ac); kfree(ac); } @@ -128,77 +166,223 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); } -/* somewhat more expensive than our other checks, so use sparingly. */ -int ocfs2_check_group_descriptor(struct super_block *sb, - struct ocfs2_dinode *di, - struct ocfs2_group_desc *gd) +#define do_error(fmt, ...) \ + do{ \ + if (resize) \ + mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ + else \ + ocfs2_error(sb, fmt, ##__VA_ARGS__); \ + } while (0) + +static int ocfs2_validate_gd_self(struct super_block *sb, + struct buffer_head *bh, + int resize) { - unsigned int max_bits; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd); - return -EIO; + do_error("Group descriptor #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + gd->bg_signature); + return -EINVAL; + } + + if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { + do_error("Group descriptor #%llu has an invalid bg_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_blkno)); + return -EINVAL; } + if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { + do_error("Group descriptor #%llu has an invalid " + "fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(gd->bg_generation)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { + do_error("Group descriptor #%llu has bit count %u but " + "claims that %u are free", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + le16_to_cpu(gd->bg_free_bits_count)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { + do_error("Group descriptor #%llu has bit count %u but " + "max bitmap bits of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + 8 * le16_to_cpu(gd->bg_size)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_validate_gd_parent(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh, + int resize) +{ + unsigned int max_bits; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + if (di->i_blkno != gd->bg_parent_dinode) { - ocfs2_error(sb, "Group descriptor # %llu has bad parent " - "pointer (%llu, expected %llu)", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), - (unsigned long long)le64_to_cpu(di->i_blkno)); - return -EIO; + do_error("Group descriptor #%llu has bad parent " + "pointer (%llu, expected %llu)", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), + (unsigned long long)le64_to_cpu(di->i_blkno)); + return -EINVAL; } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { - ocfs2_error(sb, "Group descriptor # %llu has bit count of %u", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits)); - return -EIO; + do_error("Group descriptor #%llu has bit count of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits)); + return -EINVAL; } - if (le16_to_cpu(gd->bg_chain) >= - le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { - ocfs2_error(sb, "Group descriptor # %llu has bad chain %u", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_chain)); - return -EIO; + /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ + if ((le16_to_cpu(gd->bg_chain) > + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || + ((le16_to_cpu(gd->bg_chain) == + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { + do_error("Group descriptor #%llu has bad chain %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_chain)); + return -EINVAL; } - if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { - ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " - "claims that %u are free", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - le16_to_cpu(gd->bg_free_bits_count)); - return -EIO; - } + return 0; +} - if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { - ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " - "max bitmap bits of %u", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - 8 * le16_to_cpu(gd->bg_size)); - return -EIO; +#undef do_error + +/* + * This version only prints errors. It does not fail the filesystem, and + * exists only for resize. + */ +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) { + mlog(ML_ERROR, + "Checksum failed for group descriptor %llu\n", + (unsigned long long)bh->b_blocknr); + } else + rc = ocfs2_validate_gd_self(sb, bh, 1); + if (!rc) + rc = ocfs2_validate_gd_parent(sb, di, bh, 1); + + return rc; +} + +static int ocfs2_validate_group_descriptor(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + trace_ocfs2_validate_group_descriptor( + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) + return rc; + + /* + * Errors after here are fatal. + */ + + return ocfs2_validate_gd_self(sb, bh, 0); +} + +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, + ocfs2_validate_group_descriptor); + if (rc) + goto out; + + rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); + if (rc) { + brelse(tmp); + goto out; } - return 0; + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!*bh) + *bh = tmp; + +out: + return rc; +} + +static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb, + struct ocfs2_group_desc *bg, + struct ocfs2_chain_list *cl, + u64 p_blkno, unsigned int clusters) +{ + struct ocfs2_extent_list *el = &bg->bg_list; + struct ocfs2_extent_rec *rec; + + BUG_ON(!ocfs2_supports_discontig_bg(osb)); + if (!el->l_next_free_rec) + el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb)); + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)]; + rec->e_blkno = cpu_to_le64(p_blkno); + rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) / + le16_to_cpu(cl->cl_bpc)); + rec->e_leaf_clusters = cpu_to_le16(clusters); + le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&bg->bg_free_bits_count, + clusters * le16_to_cpu(cl->cl_bpc)); + le16_add_cpu(&el->l_next_free_rec, 1); } static int ocfs2_block_group_fill(handle_t *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, u64 group_blkno, + unsigned int group_clusters, u16 my_chain, struct ocfs2_chain_list *cl) { int status = 0; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct super_block * sb = alloc_inode->i_sb; - mlog_entry_void(); - if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " "b_blocknr (%llu)", @@ -208,10 +392,10 @@ static int ocfs2_block_group_fill(handle_t *handle, goto bail; } - status = ocfs2_journal_access(handle, - alloc_inode, - bg_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -220,19 +404,23 @@ static int ocfs2_block_group_fill(handle_t *handle, memset(bg, 0, sb->s_blocksize); strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); - bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); - bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, + osb->s_feature_incompat)); bg->bg_chain = cpu_to_le16(my_chain); bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); bg->bg_blkno = cpu_to_le64(group_blkno); + if (group_clusters == le16_to_cpu(cl->cl_cpg)) + bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + else + ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno, + group_clusters); + /* set the 1st bit in the bitmap to account for the descriptor block */ ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); - status = ocfs2_journal_dirty(handle, bg_bh); - if (status < 0) - mlog_errno(status); + ocfs2_journal_dirty(handle, bg_bh); /* There is no need to zero out or otherwise initialize the * other blocks in a group - All valid FS metadata in a block @@ -240,7 +428,8 @@ static int ocfs2_block_group_fill(handle_t *handle, * allocation time. */ bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -258,32 +447,263 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) return best; } +static struct buffer_head * +ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + struct buffer_head *bg_bh; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + + status = ocfs2_claim_clusters(handle, ac, + le16_to_cpu(cl->cl_cpg), &bit_off, + &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_block_group_alloc_contig( + (unsigned long long)bg_blkno, alloc_rec); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + brelse(bg_bh); + mlog_errno(status); + } + +bail: + return status ? ERR_PTR(status) : bg_bh; +} + +static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb, + handle_t *handle, + struct ocfs2_alloc_context *ac, + unsigned int min_bits, + u32 *bit_off, u32 *num_bits) +{ + int status = 0; + + while (min_bits) { + status = ocfs2_claim_clusters(handle, ac, min_bits, + bit_off, num_bits); + if (status != -ENOSPC) + break; + + min_bits >>= 1; + } + + return status; +} + +static int ocfs2_block_group_grow_discontig(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl, + unsigned int min_bits) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_group_desc *bg = + (struct ocfs2_group_desc *)bg_bh->b_data; + unsigned int needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + u32 p_cpos, clusters; + u64 p_blkno; + struct ocfs2_extent_list *el = &bg->bg_list; + + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count))) { + if (min_bits > needed) + min_bits = needed; + status = ocfs2_block_group_claim_bits(osb, handle, ac, + min_bits, &p_cpos, + &clusters); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos); + ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno, + clusters); + + min_bits = clusters; + needed = le16_to_cpu(cl->cl_cpg) - + le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc); + } + + if (needed > 0) { + /* + * We have used up all the extent rec but can't fill up + * the cpg. So bail out. + */ + status = -ENOSPC; + goto bail; + } + + ocfs2_journal_dirty(handle, bg_bh); + +bail: + return status; +} + +static void ocfs2_bg_alloc_cleanup(handle_t *handle, + struct ocfs2_alloc_context *cluster_ac, + struct inode *alloc_inode, + struct buffer_head *bg_bh) +{ + int i, ret; + struct ocfs2_group_desc *bg; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + if (!bg_bh) + return; + + bg = (struct ocfs2_group_desc *)bg_bh->b_data; + el = &bg->bg_list; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode, + cluster_ac->ac_bh, + le64_to_cpu(rec->e_blkno), + le16_to_cpu(rec->e_leaf_clusters)); + if (ret) + mlog_errno(ret); + /* Try all the clusters to free */ + } + + ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh); + brelse(bg_bh); +} + +static struct buffer_head * +ocfs2_block_group_alloc_discontig(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_alloc_context *ac, + struct ocfs2_chain_list *cl) +{ + int status; + u32 bit_off, num_bits; + u64 bg_blkno; + unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1; + struct buffer_head *bg_bh = NULL; + unsigned int alloc_rec = ocfs2_find_smallest_chain(cl); + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + + if (!ocfs2_supports_discontig_bg(osb)) { + status = -ENOSPC; + goto bail; + } + + status = ocfs2_extend_trans(handle, + ocfs2_calc_bg_discontig_credits(osb->sb)); + if (status) { + mlog_errno(status); + goto bail; + } + + /* + * We're going to be grabbing from multiple cluster groups. + * We don't have enough credits to relink them all, and the + * cluster groups will be staying in cache for the duration of + * this operation. + */ + ac->ac_disable_chain_relink = 1; + + /* Claim the first region */ + status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits, + &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + min_bits = num_bits; + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + trace_ocfs2_block_group_alloc_discontig( + (unsigned long long)bg_blkno, alloc_rec); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh); + + status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh, + bg_blkno, num_bits, alloc_rec, cl); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_grow_discontig(handle, alloc_inode, + bg_bh, ac, cl, min_bits); + if (status) + mlog_errno(status); + +bail: + if (status) + ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh); + return status ? ERR_PTR(status) : bg_bh; +} + /* * We expect the block group allocator to already be locked. */ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, - struct buffer_head *bh) + struct buffer_head *bh, + u64 max_block, + u64 *last_alloc_group, + int flags) { int status, credits; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; struct ocfs2_chain_list *cl; struct ocfs2_alloc_context *ac = NULL; handle_t *handle = NULL; - u32 bit_off, num_bits; u16 alloc_rec; - u64 bg_blkno; struct buffer_head *bg_bh = NULL; struct ocfs2_group_desc *bg; BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); - mlog_entry_void(); - cl = &fe->id2.i_chain; - status = ocfs2_reserve_clusters(osb, - le16_to_cpu(cl->cl_cpg), - &ac); + status = ocfs2_reserve_clusters_with_limit(osb, + le16_to_cpu(cl->cl_cpg), + max_block, flags, &ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -300,57 +720,40 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, goto bail; } - status = ocfs2_claim_clusters(osb, - handle, - ac, - le16_to_cpu(cl->cl_cpg), - &bit_off, - &num_bits); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto bail; - } - - alloc_rec = ocfs2_find_smallest_chain(cl); - - /* setup the group */ - bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "new descriptor, record %u, at block %llu\n", - alloc_rec, (unsigned long long)bg_blkno); - - bg_bh = sb_getblk(osb->sb, bg_blkno); - if (!bg_bh) { - status = -EIO; - mlog_errno(status); - goto bail; + if (last_alloc_group && *last_alloc_group != 0) { + trace_ocfs2_block_group_alloc( + (unsigned long long)*last_alloc_group); + ac->ac_last_group = *last_alloc_group; } - ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); - status = ocfs2_block_group_fill(handle, - alloc_inode, - bg_bh, - bg_blkno, - alloc_rec, - cl); - if (status < 0) { - mlog_errno(status); + bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, + ac, cl); + if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC)) + bg_bh = ocfs2_block_group_alloc_discontig(handle, + alloc_inode, + ac, cl); + if (IS_ERR(bg_bh)) { + status = PTR_ERR(bg_bh); + bg_bh = NULL; + if (status != -ENOSPC) + mlog_errno(status); goto bail; } - bg = (struct ocfs2_group_desc *) bg_bh->b_data; - status = ocfs2_journal_access(handle, alloc_inode, - bh, OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } + alloc_rec = le16_to_cpu(bg->bg_chain); le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, le16_to_cpu(bg->bg_free_bits_count)); - le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); - cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, + le16_to_cpu(bg->bg_bits)); + cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno; if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) le16_add_cpu(&cl->cl_next_free_rec, 1); @@ -359,11 +762,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); - status = ocfs2_journal_dirty(handle, bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, bh); spin_lock(&OCFS2_I(alloc_inode)->ip_lock); OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); @@ -372,8 +771,14 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); status = 0; + + /* save the new last alloc group so that the caller can cache it. */ + if (last_alloc_group) + *last_alloc_group = ac->ac_last_group; + bail: if (handle) ocfs2_commit_trans(osb, handle); @@ -381,17 +786,19 @@ bail: if (ac) ocfs2_free_alloc_context(ac); - if (bg_bh) - brelse(bg_bh); + brelse(bg_bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, int type, - u32 slot) + u32 slot, + u64 *last_alloc_group, + int flags) { int status; u32 bits_wanted = ac->ac_bits_wanted; @@ -400,8 +807,6 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_dinode *fe; u32 free_bits; - mlog_entry_void(); - alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); if (!alloc_inode) { mlog_errno(-EINVAL); @@ -420,13 +825,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, } ac->ac_inode = alloc_inode; + ac->ac_alloc_slot = slot; fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); - status = -EIO; - goto bail; - } + + /* The bh was validated by the inode read inside + * ocfs2_inode_lock(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", (unsigned long long)le64_to_cpu(fe->i_blkno)); @@ -440,13 +846,22 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, if (bits_wanted > free_bits) { /* cluster bitmap never grows */ if (ocfs2_is_cluster_bitmap(alloc_inode)) { - mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", - bits_wanted, free_bits); + trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted, + free_bits); + status = -ENOSPC; + goto bail; + } + + if (!(flags & ALLOC_NEW_GROUP)) { + trace_ocfs2_reserve_suballoc_bits_no_new_group( + slot, bits_wanted, free_bits); status = -ENOSPC; goto bail; } - status = ocfs2_block_group_alloc(osb, alloc_inode, bh); + status = ocfs2_block_group_alloc(osb, alloc_inode, bh, + ac->ac_max_block, + last_alloc_group, flags); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -463,19 +878,120 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, get_bh(bh); ac->ac_bh = bh; bail: - if (bh) - brelse(bh); + brelse(bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } -int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, - struct ocfs2_dinode *fe, - struct ocfs2_alloc_context **ac) +static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_inodes_stolen, 0); +} + +static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb) +{ + spin_lock(&osb->osb_lock); + osb->s_meta_steal_slot = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + atomic_set(&osb->s_num_meta_stolen, 0); +} + +void ocfs2_init_steal_slots(struct ocfs2_super *osb) +{ + ocfs2_init_inode_steal_slot(osb); + ocfs2_init_meta_steal_slot(osb); +} + +static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type) +{ + spin_lock(&osb->osb_lock); + if (type == INODE_ALLOC_SYSTEM_INODE) + osb->s_inode_steal_slot = slot; + else if (type == EXTENT_ALLOC_SYSTEM_INODE) + osb->s_meta_steal_slot = slot; + spin_unlock(&osb->osb_lock); +} + +static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type) +{ + int slot = OCFS2_INVALID_SLOT; + + spin_lock(&osb->osb_lock); + if (type == INODE_ALLOC_SYSTEM_INODE) + slot = osb->s_inode_steal_slot; + else if (type == EXTENT_ALLOC_SYSTEM_INODE) + slot = osb->s_meta_steal_slot; + spin_unlock(&osb->osb_lock); + + return slot; +} + +static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) +{ + return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb) +{ + return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_steal_resource(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + int type) +{ + int i, status = -ENOSPC; + int slot = __ocfs2_get_steal_slot(osb, type); + + /* Start to steal resource from the first slot after ours. */ + if (slot == OCFS2_INVALID_SLOT) + slot = osb->slot_num + 1; + + for (i = 0; i < osb->max_slots; i++, slot++) { + if (slot == osb->max_slots) + slot = 0; + + if (slot == osb->slot_num) + continue; + + status = ocfs2_reserve_suballoc_bits(osb, ac, + type, + (u32)slot, NULL, + NOT_ALLOC_NEW_GROUP); + if (status >= 0) { + __ocfs2_set_steal_slot(osb, slot, type); + break; + } + + ocfs2_free_ac_resource(ac); + } + + return status; +} + +static int ocfs2_steal_inode(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE); +} + +static int ocfs2_steal_meta(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE); +} + +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac) { int status; - u32 slot; + int slot = ocfs2_get_meta_steal_slot(osb); *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { @@ -484,13 +1000,36 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, goto bail; } - (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); + (*ac)->ac_bits_wanted = blocks; (*ac)->ac_which = OCFS2_AC_USE_META; - slot = osb->slot_num; (*ac)->ac_group_search = ocfs2_block_group_search; + if (slot != OCFS2_INVALID_SLOT && + atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL) + goto extent_steal; + + atomic_set(&osb->s_num_meta_stolen, 0); status = ocfs2_reserve_suballoc_bits(osb, (*ac), - EXTENT_ALLOC_SYSTEM_INODE, slot); + EXTENT_ALLOC_SYSTEM_INODE, + (u32)osb->slot_num, NULL, + ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP); + + + if (status >= 0) { + status = 0; + if (slot != OCFS2_INVALID_SLOT) + ocfs2_init_meta_steal_slot(osb); + goto bail; + } else if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + ocfs2_free_ac_resource(*ac); + +extent_steal: + status = ocfs2_steal_meta(osb, *ac); + atomic_inc(&osb->s_num_meta_stolen); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -504,14 +1043,26 @@ bail: *ac = NULL; } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_new_metadata_blocks(osb, + ocfs2_extend_meta_needed(root_el), + ac); +} + int ocfs2_reserve_new_inode(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac) { int status; + int slot = ocfs2_get_inode_steal_slot(osb); + u64 alloc_group; *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { @@ -525,9 +1076,61 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, (*ac)->ac_group_search = ocfs2_block_group_search; + /* + * stat(2) can't handle i_ino > 32bits, so we tell the + * lower levels not to allocate us a block group past that + * limit. The 'inode64' mount option avoids this behavior. + */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) + (*ac)->ac_max_block = (u32)~0U; + + /* + * slot is set when we successfully steal inode from other nodes. + * It is reset in 3 places: + * 1. when we flush the truncate log + * 2. when we complete local alloc recovery. + * 3. when we successfully allocate from our own slot. + * After it is set, we will go on stealing inodes until we find the + * need to check our slots to see whether there is some space for us. + */ + if (slot != OCFS2_INVALID_SLOT && + atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL) + goto inode_steal; + + atomic_set(&osb->s_num_inodes_stolen, 0); + alloc_group = osb->osb_inode_alloc_group; status = ocfs2_reserve_suballoc_bits(osb, *ac, INODE_ALLOC_SYSTEM_INODE, - osb->slot_num); + (u32)osb->slot_num, + &alloc_group, + ALLOC_NEW_GROUP | + ALLOC_GROUPS_FROM_GLOBAL); + if (status >= 0) { + status = 0; + + spin_lock(&osb->osb_lock); + osb->osb_inode_alloc_group = alloc_group; + spin_unlock(&osb->osb_lock); + trace_ocfs2_reserve_new_inode_new_group( + (unsigned long long)alloc_group); + + /* + * Some inodes must be freed by us, so try to allocate + * from our own next time. + */ + if (slot != OCFS2_INVALID_SLOT) + ocfs2_init_inode_steal_slot(osb); + goto bail; + } else if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + ocfs2_free_ac_resource(*ac); + +inode_steal: + status = ocfs2_steal_inode(osb, *ac); + atomic_inc(&osb->s_num_inodes_stolen); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -541,7 +1144,8 @@ bail: *ac = NULL; } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -557,7 +1161,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, status = ocfs2_reserve_suballoc_bits(osb, ac, GLOBAL_BITMAP_SYSTEM_INODE, - OCFS2_INVALID_SLOT); + OCFS2_INVALID_SLOT, NULL, + ALLOC_NEW_GROUP); if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -570,14 +1175,13 @@ bail: /* Callers don't need to care which bitmap (local alloc or main) to * use so we figure it out for them, but unfortunately this clutters * things a bit. */ -int ocfs2_reserve_clusters(struct ocfs2_super *osb, - u32 bits_wanted, - struct ocfs2_alloc_context **ac) +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + int flags, + struct ocfs2_alloc_context **ac) { int status; - mlog_entry_void(); - *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { status = -ENOMEM; @@ -586,24 +1190,17 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, } (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_max_block = max_block; status = -ENOSPC; - if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { + if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) && + ocfs2_alloc_should_use_local(osb, bits_wanted)) { status = ocfs2_reserve_local_alloc_bits(osb, bits_wanted, *ac); if ((status < 0) && (status != -ENOSPC)) { mlog_errno(status); goto bail; - } else if (status == -ENOSPC) { - /* reserve_local_bits will return enospc with - * the local alloc inode still locked, so we - * can change this safely here. */ - mlog(0, "Disabling local alloc\n"); - /* We set to OCFS2_LA_DISABLED so that umount - * can clean up what's left of the local - * allocation */ - osb->local_alloc_state = OCFS2_LA_DISABLED; } } @@ -623,10 +1220,19 @@ bail: *ac = NULL; } - mlog_exit(status); + if (status) + mlog_errno(status); return status; } +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, + ALLOC_NEW_GROUP, ac); +} + /* * More or less lifted from ext3. I'll leave their description below: * @@ -653,32 +1259,39 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr) { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + int ret; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) return 0; - if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data) + + if (!buffer_jbd(bg_bh)) return 1; + jbd_lock_bh_state(bg_bh); bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; - return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; + jbd_unlock_bh_state(bg_bh); + + return ret; } static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, struct buffer_head *bg_bh, unsigned int bits_wanted, unsigned int total_bits, - u16 *bit_off, - u16 *bits_found) + struct ocfs2_suballoc_result *res) { void *bitmap; u16 best_offset, best_size; int offset, start, found, status = 0; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); - return -EIO; - } + /* Callers got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); found = start = best_offset = best_size = 0; bitmap = bg->bg_bitmap; @@ -713,14 +1326,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, } } - /* XXX: I think the first clause is equivalent to the second - * - jlbec */ - if (found == bits_wanted) { - *bit_off = start - found; - *bits_found = found; - } else if (best_size) { - *bit_off = best_offset; - *bits_found = best_size; + if (best_size) { + res->sr_bit_offset = best_offset; + res->sr_bits = best_size; } else { status = -ENOSPC; /* No error log here -- see the comment above @@ -730,7 +1338,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, return status; } -static inline int ocfs2_block_group_set_bits(handle_t *handle, +int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, @@ -741,44 +1349,40 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, void *bitmap = bg->bg_bitmap; int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; - mlog_entry_void(); - - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; - goto bail; - } + /* All callers get the descriptor via + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); - mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, - num_bits); + trace_ocfs2_block_group_set_bits(bit_off, num_bits); if (ocfs2_is_cluster_bitmap(alloc_inode)) journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - status = ocfs2_journal_access(handle, - alloc_inode, - group_bh, - journal_type); + status = ocfs2_journal_access_gd(handle, + INODE_CACHE(alloc_inode), + group_bh, + journal_type); if (status < 0) { mlog_errno(status); goto bail; } le16_add_cpu(&bg->bg_free_bits_count, -num_bits); - + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); - status = ocfs2_journal_dirty(handle, - group_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, group_bh); bail: - mlog_exit(status); return status; } @@ -811,91 +1415,59 @@ static int ocfs2_relink_block_group(handle_t *handle, int status; /* there is a really tiny chance the journal calls could fail, * but we wouldn't want inconsistent blocks in *any* case. */ - u64 fe_ptr, bg_ptr, prev_bg_ptr; + u64 bg_ptr, prev_bg_ptr; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); - status = -EIO; - goto out; - } - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; - goto out; - } - if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg); - status = -EIO; - goto out; - } + /* The caller got these descriptors from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); - mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", - (unsigned long long)le64_to_cpu(fe->i_blkno), chain, - (unsigned long long)le64_to_cpu(bg->bg_blkno), - (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); + trace_ocfs2_relink_block_group( + (unsigned long long)le64_to_cpu(fe->i_blkno), chain, + (unsigned long long)le64_to_cpu(bg->bg_blkno), + (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); - fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); bg_ptr = le64_to_cpu(bg->bg_next_group); prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); - status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + prev_bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) + goto out; prev_bg->bg_next_group = bg->bg_next_group; + ocfs2_journal_dirty(handle, prev_bg_bh); - status = ocfs2_journal_dirty(handle, prev_bg_bh); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } - - status = ocfs2_journal_access(handle, alloc_inode, bg_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) + goto out_rollback_prev_bg; bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; + ocfs2_journal_dirty(handle, bg_bh); - status = ocfs2_journal_dirty(handle, bg_bh); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } - - status = ocfs2_journal_access(handle, alloc_inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) + goto out_rollback_bg; fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; + ocfs2_journal_dirty(handle, fe_bh); - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto out_rollback; - } - - status = 0; -out_rollback: - if (status < 0) { - fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); - bg->bg_next_group = cpu_to_le64(bg_ptr); - prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); - } out: - mlog_exit(status); + if (status < 0) + mlog_errno(status); return status; + +out_rollback_bg: + bg->bg_next_group = cpu_to_le64(bg_ptr); +out_rollback_prev_bg: + prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); + goto out; } static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, @@ -909,12 +1481,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, - u16 *bit_off, u16 *bits_found) + u64 max_block, + struct ocfs2_suballoc_result *res) { int search = -ENOSPC; int ret; + u64 blkoff; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; - u16 tmp_off, tmp_found; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); unsigned int max_bits, gd_cluster_off; BUG_ON(!ocfs2_is_cluster_bitmap(inode)); @@ -923,7 +1497,7 @@ static int ocfs2_cluster_group_search(struct inode *inode, max_bits = le16_to_cpu(gd->bg_bits); /* Tail groups in cluster bitmaps which aren't cpg - * aligned are prone to partial extention by a failed + * aligned are prone to partial extension by a failed * fs resize. If the file system resize never got to * update the dinode cluster count, then we don't want * to trust any clusters past it, regardless of what @@ -933,27 +1507,42 @@ static int ocfs2_cluster_group_search(struct inode *inode, if ((gd_cluster_off + max_bits) > OCFS2_I(inode)->ip_clusters) { max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; - mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - OCFS2_I(inode)->ip_clusters, max_bits); + trace_ocfs2_cluster_group_search_wrong_max_bits( + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + OCFS2_I(inode)->ip_clusters, max_bits); } ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, - max_bits, - &tmp_off, &tmp_found); + max_bits, res); if (ret) return ret; + if (max_block) { + blkoff = ocfs2_clusters_to_blocks(inode->i_sb, + gd_cluster_off + + res->sr_bit_offset + + res->sr_bits); + trace_ocfs2_cluster_group_search_max_block( + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + return -ENOSPC; + } + /* ocfs2_block_group_find_clear_bits() might * return success, but we still want to return * -ENOSPC unless it found the minimum number * of bits. */ - if (min_bits <= tmp_found) { - *bit_off = tmp_off; - *bits_found = tmp_found; + if (min_bits <= res->sr_bits) search = 0; /* success */ + else if (res->sr_bits) { + /* + * Don't show bits which we'll be returning + * for allocation to the local alloc bitmap. + */ + ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits); } } @@ -963,24 +1552,36 @@ static int ocfs2_cluster_group_search(struct inode *inode, static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, - u16 *bit_off, u16 *bits_found) + u64 max_block, + struct ocfs2_suballoc_result *res) { int ret = -ENOSPC; + u64 blkoff; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; BUG_ON(min_bits != 1); BUG_ON(ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) + if (bg->bg_free_bits_count) { ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), - bit_off, bits_found); + res); + if (!ret && max_block) { + blkoff = le64_to_cpu(bg->bg_blkno) + + res->sr_bit_offset + res->sr_bits; + trace_ocfs2_block_group_search_max_block( + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + ret = -ENOSPC; + } + } return ret; } -static int ocfs2_alloc_dinode_update_counts(struct inode *inode, +int ocfs2_alloc_dinode_update_counts(struct inode *inode, handle_t *handle, struct buffer_head *di_bh, u32 num_bits, @@ -991,8 +1592,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; @@ -1001,56 +1602,119 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode, tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); - - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret < 0) - mlog_errno(ret); + ocfs2_journal_dirty(handle, di_bh); out: return ret; } +void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl; + + cl = (struct ocfs2_chain_list *)&di->id2.i_chain; + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits); + le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits); +} + +static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, + struct ocfs2_extent_rec *rec, + struct ocfs2_chain_list *cl) +{ + unsigned int bpc = le16_to_cpu(cl->cl_bpc); + unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc; + unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc; + + if (res->sr_bit_offset < bitoff) + return 0; + if (res->sr_bit_offset >= (bitoff + bitcount)) + return 0; + res->sr_blkno = le64_to_cpu(rec->e_blkno) + + (res->sr_bit_offset - bitoff); + if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount)) + res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset; + return 1; +} + +static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac, + struct ocfs2_group_desc *bg, + struct ocfs2_suballoc_result *res) +{ + int i; + u64 bg_blkno = res->sr_bg_blkno; /* Save off */ + struct ocfs2_extent_rec *rec; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = &di->id2.i_chain; + + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) { + res->sr_blkno = 0; + return; + } + + res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset; + res->sr_bg_blkno = 0; /* Clear it for contig block groups */ + if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) || + !bg->bg_list.l_next_free_rec) + return; + + for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) { + rec = &bg->bg_list.l_recs[i]; + if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) { + res->sr_bg_blkno = bg_blkno; /* Restore */ + break; + } + } +} + static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 gd_blkno, + struct ocfs2_suballoc_result *res, u16 *bits_left) { int ret; - u16 found; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *gd; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; struct inode *alloc_inode = ac->ac_inode; - ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno, - &group_bh, OCFS2_BH_CACHED, alloc_inode); + ret = ocfs2_read_group_descriptor(alloc_inode, di, + res->sr_bg_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); return ret; } gd = (struct ocfs2_group_desc *) group_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd); - ret = -EIO; - goto out; - } - ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, - bit_off, &found); + ac->ac_max_block, res); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } - *num_bits = found; + if (!ret) + ocfs2_bg_discontig_fix_result(ac, gd, res); + + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + if (ac->ac_find_loc_only) + goto out_loc_only; ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, - *num_bits, + res->sr_bits, le16_to_cpu(gd->bg_chain)); if (ret < 0) { mlog_errno(ret); @@ -1058,10 +1722,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, - *bit_off, *num_bits); - if (ret < 0) + res->sr_bit_offset, res->sr_bits); + if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, + res->sr_bits, + le16_to_cpu(gd->bg_chain)); mlog_errno(ret); + } +out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); out: @@ -1074,14 +1743,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno, + struct ocfs2_suballoc_result *res, u16 *bits_left) { int status; - u16 chain, tmp_bits; - u32 tmp_used; + u16 chain; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; @@ -1091,53 +1757,42 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, struct ocfs2_group_desc *bg; chain = ac->ac_chain; - mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n", - bits_wanted, chain, - (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); + trace_ocfs2_search_chain_begin( + (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, + bits_wanted, chain); - status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), - le64_to_cpu(cl->cl_recs[chain].c_blkno), - &group_bh, OCFS2_BH_CACHED, alloc_inode); + status = ocfs2_read_group_descriptor(alloc_inode, fe, + le64_to_cpu(cl->cl_recs[chain].c_blkno), + &group_bh); if (status < 0) { mlog_errno(status); goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; - status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); - if (status) { - mlog_errno(status); - goto bail; - } status = -ENOSPC; /* for now, the chain search is a bit simplistic. We just use * the 1st group with any empty bits. */ while ((status = ac->ac_group_search(alloc_inode, group_bh, - bits_wanted, min_bits, bit_off, - &tmp_bits)) == -ENOSPC) { + bits_wanted, min_bits, + ac->ac_max_block, + res)) == -ENOSPC) { if (!bg->bg_next_group) break; - if (prev_group_bh) { - brelse(prev_group_bh); - prev_group_bh = NULL; - } + brelse(prev_group_bh); + prev_group_bh = NULL; + next_group = le64_to_cpu(bg->bg_next_group); prev_group_bh = group_bh; group_bh = NULL; - status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), - next_group, &group_bh, - OCFS2_BH_CACHED, alloc_inode); + status = ocfs2_read_group_descriptor(alloc_inode, fe, + next_group, &group_bh); if (status < 0) { mlog_errno(status); goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; - status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); - if (status) { - mlog_errno(status); - goto bail; - } } if (status < 0) { if (status != -ENOSPC) @@ -1145,12 +1800,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, goto bail; } - mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", - tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); + trace_ocfs2_search_chain_succ( + (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits); - *num_bits = tmp_bits; + res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); - BUG_ON(*num_bits == 0); + BUG_ON(res->sr_bits == 0); + if (!status) + ocfs2_bg_discontig_fix_result(ac, bg, res); + + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; /* * Keep track of previous block descriptor read. When @@ -1165,9 +1828,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, * Do this *after* figuring out how many bits we're taking out * of our target group. */ - if (ac->ac_allow_chain_relink && + if (!ac->ac_disable_chain_relink && (prev_group_bh) && - (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { + (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) { status = ocfs2_relink_block_group(handle, alloc_inode, ac->ac_bh, group_bh, prev_group_bh, chain); @@ -1177,24 +1840,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } } - /* Ok, claim our bits now: set the info on dinode, chainlist - * and then the group */ - status = ocfs2_journal_access(handle, - alloc_inode, - ac->ac_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } + if (ac->ac_find_loc_only) + goto out_loc_only; - tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); - fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); - - status = ocfs2_journal_dirty(handle, - ac->ac_bh); - if (status < 0) { + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (status) { mlog_errno(status); goto bail; } @@ -1203,60 +1855,58 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, alloc_inode, bg, group_bh, - *bit_off, - *num_bits); + res->sr_bit_offset, + res->sr_bits); if (status < 0) { + ocfs2_rollback_alloc_dinode_counts(alloc_inode, + ac->ac_bh, res->sr_bits, chain); mlog_errno(status); goto bail; } - mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits, - (unsigned long long)le64_to_cpu(fe->i_blkno)); + trace_ocfs2_search_chain_end( + (unsigned long long)le64_to_cpu(fe->i_blkno), + res->sr_bits); - *bg_blkno = le64_to_cpu(bg->bg_blkno); +out_loc_only: *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: - if (group_bh) - brelse(group_bh); - if (prev_group_bh) - brelse(prev_group_bh); + brelse(group_bh); + brelse(prev_group_bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } /* will give out up to bits_wanted contiguous bits. */ -static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, - struct ocfs2_alloc_context *ac, +static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, handle_t *handle, u32 bits_wanted, u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno) + struct ocfs2_suballoc_result *res) { int status; u16 victim, i; u16 bits_left = 0; - u64 hint_blkno = ac->ac_last_group; + u64 hint = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; - mlog_entry_void(); - BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); BUG_ON(!ac->ac_bh); fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); - status = -EIO; - goto bail; - } + + /* The bh was validated by the inode read during + * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { - ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " + ocfs2_error(ac->ac_inode->i_sb, + "Chain allocator dinode %llu has %u used " "bits but only %u total.", (unsigned long long)le64_to_cpu(fe->i_blkno), le32_to_cpu(fe->id1.bitmap1.i_used), @@ -1265,22 +1915,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, goto bail; } - if (hint_blkno) { + res->sr_bg_blkno = hint; + if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used - * allocation group. This helps us mantain some + * allocation group. This helps us maintain some * contiguousness across allocations. */ status = ocfs2_search_one_group(ac, handle, bits_wanted, - min_bits, bit_off, num_bits, - hint_blkno, &bits_left); - if (!status) { - /* Be careful to update *bg_blkno here as the - * caller is expecting it to be filled in, and - * ocfs2_search_one_group() won't do that for - * us. */ - *bg_blkno = hint_blkno; + min_bits, res, &bits_left); + if (!status) goto set_hint; - } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1291,25 +1935,25 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, victim = ocfs2_find_victim_chain(cl); ac->ac_chain = victim; - ac->ac_allow_chain_relink = 1; - status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off, - num_bits, bg_blkno, &bits_left); - if (!status) + status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, + res, &bits_left); + if (!status) { + hint = ocfs2_group_from_res(res); goto set_hint; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; } - mlog(0, "Search of victim chain %u came up with nothing, " - "trying all chains now.\n", victim); + trace_ocfs2_claim_suballoc_bits(victim); /* If we didn't pick a good victim, then just default to * searching each chain in order. Don't allow chain relinking * because we only calculate enough journal credits for one * relink per alloc. */ - ac->ac_allow_chain_relink = 0; + ac->ac_disable_chain_relink = 1; for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { if (i == victim) continue; @@ -1318,10 +1962,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, ac->ac_chain = i; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, - bit_off, num_bits, bg_blkno, - &bits_left); - if (!status) + res, &bits_left); + if (!status) { + hint = ocfs2_group_from_res(res); break; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1336,89 +1981,260 @@ set_hint: if (bits_left < min_bits) ac->ac_last_group = 0; else - ac->ac_last_group = *bg_blkno; + ac->ac_last_group = hint; } bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } -int ocfs2_claim_metadata(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, u32 bits_wanted, + u64 *suballoc_loc, u16 *suballoc_bit_start, unsigned int *num_bits, u64 *blkno_start) { int status; - u64 bg_blkno; + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; BUG_ON(!ac); BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); BUG_ON(ac->ac_which != OCFS2_AC_USE_META); - status = ocfs2_claim_suballoc_bits(osb, - ac, + status = ocfs2_claim_suballoc_bits(ac, handle, bits_wanted, 1, - suballoc_bit_start, - num_bits, - &bg_blkno); + &res); if (status < 0) { mlog_errno(status); goto bail; } - atomic_inc(&osb->alloc_stats.bg_allocs); + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); - *blkno_start = bg_blkno + (u64) *suballoc_bit_start; - ac->ac_bits_given += (*num_bits); + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit_start = res.sr_bit_offset; + *blkno_start = res.sr_blkno; + ac->ac_bits_given += res.sr_bits; + *num_bits = res.sr_bits; status = 0; bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } -int ocfs2_claim_new_inode(struct ocfs2_super *osb, - handle_t *handle, +static void ocfs2_init_inode_ac_group(struct inode *dir, + struct buffer_head *parent_di_bh, + struct ocfs2_alloc_context *ac) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data; + /* + * Try to allocate inodes from some specific group. + * + * If the parent dir has recorded the last group used in allocation, + * cool, use it. Otherwise if we try to allocate new inode from the + * same slot the parent dir belongs to, use the same chunk. + * + * We are very careful here to avoid the mistake of setting + * ac_last_group to a group descriptor from a different (unlocked) slot. + */ + if (OCFS2_I(dir)->ip_last_used_group && + OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot) + ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group; + else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) { + if (di->i_suballoc_loc) + ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc); + else + ac->ac_last_group = ocfs2_which_suballoc_group( + le64_to_cpu(di->i_blkno), + le16_to_cpu(di->i_suballoc_bit)); + } +} + +static inline void ocfs2_save_inode_ac_group(struct inode *dir, + struct ocfs2_alloc_context *ac) +{ + OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group; + OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; +} + +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_suballoc_result *res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + res = kzalloc(sizeof(*res), GFP_NOFS); + if (res == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + /* + * The handle started here is for chain relink. Alternatively, + * we could just disable relink for these calls. + */ + handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + /* + * This will instruct ocfs2_claim_suballoc_bits and + * ocfs2_search_one_group to search but save actual allocation + * for later. + */ + ac->ac_find_loc_only = 1; + + ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ac->ac_find_loc_priv = res; + *fe_blkno = res->sr_blkno; + ocfs2_update_inode_fsync_trans(handle, dir, 0); +out: + if (handle) + ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); + + if (ret) + kfree(res); + + return ret; +} + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno) +{ + int ret; + u16 chain; + struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* + * Since di_blkno is being passed back in, we check for any + * inconsistencies which may have happened between + * calls. These are code bugs as di_blkno is not expected to + * change once returned from ocfs2_find_new_inode_loc() + */ + BUG_ON(res->sr_blkno != di_blkno); + + ret = ocfs2_read_group_descriptor(ac->ac_inode, di, + res->sr_bg_stable_blkno, &bg_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + chain = le16_to_cpu(bg->bg_chain); + + ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, + ac->ac_inode, + bg, + bg_bh, + res->sr_bit_offset, + res->sr_bits); + if (ret < 0) { + ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, + ac->ac_bh, res->sr_bits, chain); + mlog_errno(ret); + goto out; + } + + trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno, + res->sr_bits); + + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res->sr_bits != 1); + + *suballoc_loc = res->sr_bg_blkno; + *suballoc_bit = res->sr_bit_offset; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + +out: + brelse(bg_bh); + + return ret; +} + +int ocfs2_claim_new_inode(handle_t *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, u16 *suballoc_bit, u64 *fe_blkno) { int status; - unsigned int num_bits; - u64 bg_blkno; - - mlog_entry_void(); + struct ocfs2_suballoc_result res; BUG_ON(!ac); BUG_ON(ac->ac_bits_given != 0); BUG_ON(ac->ac_bits_wanted != 1); BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); - status = ocfs2_claim_suballoc_bits(osb, - ac, + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + status = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, - suballoc_bit, - &num_bits, - &bg_blkno); + &res); if (status < 0) { mlog_errno(status); goto bail; } - atomic_inc(&osb->alloc_stats.bg_allocs); + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); - BUG_ON(num_bits != 1); + BUG_ON(res.sr_bits != 1); - *fe_blkno = bg_blkno + (u64) (*suballoc_bit); + *suballoc_loc = res.sr_bg_blkno; + *suballoc_bit = res.sr_bit_offset; + *fe_blkno = res.sr_blkno; ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); status = 0; bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } @@ -1483,8 +2299,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, * contig. allocation, set to '1' to indicate we can deal with extents * of any size. */ -int __ocfs2_claim_clusters(struct ocfs2_super *osb, - handle_t *handle, +int __ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 max_clusters, @@ -1493,10 +2308,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, { int status; unsigned int bits_wanted = max_clusters; - u64 bg_blkno = 0; - u16 bg_bit_off; - - mlog_entry_void(); + struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; + struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); @@ -1504,6 +2317,8 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, && ac->ac_which != OCFS2_AC_USE_MAIN); if (ac->ac_which == OCFS2_AC_USE_LOCAL) { + WARN_ON(min_clusters > 1); + status = ocfs2_claim_local_alloc_bits(osb, handle, ac, @@ -1526,20 +2341,19 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, if (bits_wanted > (osb->bitmap_cpg - 1)) bits_wanted = osb->bitmap_cpg - 1; - status = ocfs2_claim_suballoc_bits(osb, - ac, + status = ocfs2_claim_suballoc_bits(ac, handle, bits_wanted, min_clusters, - &bg_bit_off, - num_clusters, - &bg_blkno); + &res); if (!status) { + BUG_ON(res.sr_blkno); /* cluster alloc can't set */ *cluster_start = ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode, - bg_blkno, - bg_bit_off); + res.sr_bg_blkno, + res.sr_bit_offset); atomic_inc(&osb->alloc_stats.bitmap_data); + *num_clusters = res.sr_bits; } } if (status < 0) { @@ -1551,12 +2365,12 @@ int __ocfs2_claim_clusters(struct ocfs2_super *osb, ac->ac_bits_given += *num_clusters; bail: - mlog_exit(status); + if (status) + mlog_errno(status); return status; } -int ocfs2_claim_clusters(struct ocfs2_super *osb, - handle_t *handle, +int ocfs2_claim_clusters(handle_t *handle, struct ocfs2_alloc_context *ac, u32 min_clusters, u32 *cluster_start, @@ -1564,58 +2378,69 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb, { unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; - return __ocfs2_claim_clusters(osb, handle, ac, min_clusters, + return __ocfs2_claim_clusters(handle, ac, min_clusters, bits_wanted, cluster_start, num_clusters); } -static inline int ocfs2_block_group_clear_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits) +static int ocfs2_block_group_clear_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits, + void (*undo_fn)(unsigned int bit, + unsigned long *bmap)) { int status; unsigned int tmp; - int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; struct ocfs2_group_desc *undo_bg = NULL; - mlog_entry_void(); - - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; - goto bail; - } - - mlog(0, "off = %u, num = %u\n", bit_off, num_bits); + /* The caller got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); - if (ocfs2_is_cluster_bitmap(alloc_inode)) - journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + trace_ocfs2_block_group_clear_bits(bit_off, num_bits); - status = ocfs2_journal_access(handle, alloc_inode, group_bh, - journal_type); + BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); + status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), + group_bh, + undo_fn ? + OCFS2_JOURNAL_ACCESS_UNDO : + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } - if (ocfs2_is_cluster_bitmap(alloc_inode)) - undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data; + if (undo_fn) { + jbd_lock_bh_state(group_bh); + undo_bg = (struct ocfs2_group_desc *) + bh2jh(group_bh)->b_committed_data; + BUG_ON(!undo_bg); + } tmp = num_bits; while(tmp--) { ocfs2_clear_bit((bit_off + tmp), (unsigned long *) bg->bg_bitmap); - if (ocfs2_is_cluster_bitmap(alloc_inode)) - ocfs2_set_bit(bit_off + tmp, - (unsigned long *) undo_bg->bg_bitmap); + if (undo_fn) + undo_fn(bit_off + tmp, + (unsigned long *) undo_bg->bg_bitmap); } le16_add_cpu(&bg->bg_free_bits_count, num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + " count %u but claims %u are freed. num_bits %d", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), num_bits); + return -EROFS; + } - status = ocfs2_journal_dirty(handle, group_bh); - if (status < 0) - mlog_errno(status); + if (undo_fn) + jbd_unlock_bh_state(group_bh); + + ocfs2_journal_dirty(handle, group_bh); bail: return status; } @@ -1623,59 +2448,55 @@ bail: /* * expects the suballoc inode to already be locked. */ -int ocfs2_free_suballoc_bits(handle_t *handle, - struct inode *alloc_inode, - struct buffer_head *alloc_bh, - unsigned int start_bit, - u64 bg_blkno, - unsigned int count) +static int _ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count, + void (*undo_fn)(unsigned int bit, + unsigned long *bitmap)) { int status = 0; u32 tmp_used; - struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *group; - mlog_entry_void(); - - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); - status = -EIO; - goto bail; - } + /* The alloc_bh comes from ocfs2_free_dinode() or + * ocfs2_free_clusters(). The callers have all locked the + * allocator and gotten alloc_bh from the lock call. This + * validates the dinode buffer. Any corruption that has happened + * is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); - mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", - (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, - (unsigned long long)bg_blkno, start_bit); + trace_ocfs2_free_suballoc_bits( + (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, + (unsigned long long)bg_blkno, + start_bit, count); - status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, - alloc_inode); + status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, + &group_bh); if (status < 0) { mlog_errno(status); goto bail; } - group = (struct ocfs2_group_desc *) group_bh->b_data; - status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group); - if (status) { - mlog_errno(status); - goto bail; - } + BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); status = ocfs2_block_group_clear_bits(handle, alloc_inode, group, group_bh, - start_bit, count); + start_bit, count, undo_fn); if (status < 0) { mlog_errno(status); goto bail; } - status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1685,21 +2506,27 @@ int ocfs2_free_suballoc_bits(handle_t *handle, count); tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); - - status = ocfs2_journal_dirty(handle, alloc_bh); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_journal_dirty(handle, alloc_bh); bail: - if (group_bh) - brelse(group_bh); + brelse(group_bh); - mlog_exit(status); + if (status) + mlog_errno(status); return status; } +int ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count) +{ + return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, + start_bit, bg_blkno, count, NULL); +} + int ocfs2_free_dinode(handle_t *handle, struct inode *inode_alloc_inode, struct buffer_head *inode_alloc_bh, @@ -1709,15 +2536,19 @@ int ocfs2_free_dinode(handle_t *handle, u16 bit = le16_to_cpu(di->i_suballoc_bit); u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + if (di->i_suballoc_loc) + bg_blkno = le64_to_cpu(di->i_suballoc_loc); return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, inode_alloc_bh, bit, bg_blkno, 1); } -int ocfs2_free_clusters(handle_t *handle, - struct inode *bitmap_inode, - struct buffer_head *bitmap_bh, - u64 start_blk, - unsigned int num_clusters) +static int _ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters, + void (*undo_fn)(unsigned int bit, + unsigned long *bitmap)) { int status; u16 bg_start_bit; @@ -1726,11 +2557,8 @@ int ocfs2_free_clusters(handle_t *handle, /* You can't ever have a contiguous set of clusters * bigger than a block group bitmap so we never have to worry - * about looping on them. */ - - mlog_entry_void(); - - /* This is expensive. We can safely remove once this stuff has + * about looping on them. + * This is expensive. We can safely remove once this stuff has * gotten tested really well. */ BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); @@ -1739,21 +2567,53 @@ int ocfs2_free_clusters(handle_t *handle, ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, &bg_start_bit); - mlog(0, "want to free %u clusters starting at block %llu\n", - num_clusters, (unsigned long long)start_blk); - mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", - (unsigned long long)bg_blkno, bg_start_bit); + trace_ocfs2_free_clusters((unsigned long long)bg_blkno, + (unsigned long long)start_blk, + bg_start_bit, num_clusters); - status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, - bg_start_bit, bg_blkno, - num_clusters); - if (status < 0) + status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, + bg_start_bit, bg_blkno, + num_clusters, undo_fn); + if (status < 0) { mlog_errno(status); + goto out; + } + + ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), + num_clusters); - mlog_exit(status); +out: + if (status) + mlog_errno(status); return status; } +int ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, + start_blk, num_clusters, + _ocfs2_set_bit); +} + +/* + * Give never-used clusters back to the global bitmap. We don't need + * to protect these bits in the undo buffer. + */ +int ocfs2_release_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, + start_blk, num_clusters, + _ocfs2_clear_bit); +} + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) { printk("Block Group:\n"); @@ -1800,3 +2660,258 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); } } + +/* + * For a given allocation, determine which allocators will need to be + * accessed, and lock them, reserving the appropriate number of bits. + * + * Sparse file systems call this from ocfs2_write_begin_nolock() + * and ocfs2_allocate_unwritten_extents(). + * + * File systems which don't support holes call this from + * ocfs2_extend_allocation(). + */ +int ocfs2_lock_allocators(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *meta_ac = NULL; + if (data_ac) + *data_ac = NULL; + + BUG_ON(clusters_to_add != 0 && data_ac == NULL); + + num_free_extents = ocfs2_num_free_extents(osb, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + /* + * Sparse allocation file systems need to be more conservative + * with reserving room for expansion - the actual allocation + * happens while we've got a journal handle open so re-taking + * a cluster lock (because we ran out of room for another + * extent) will violate ordering rules. + * + * Most of the time we'll only be seeing this 1 cluster at a time + * anyway. + * + * Always lock for any unwritten extents - we might want to + * add blocks during a split. + */ + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { + ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + if (clusters_to_add == 0) + goto out; + + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + + /* + * We cannot have an error and a non null *data_ac. + */ + } + + return ret; +} + +/* + * Read the inode specified by blkno to get suballoc_slot and + * suballoc_bit. + */ +static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, + u16 *suballoc_slot, u64 *group_blkno, + u16 *suballoc_bit) +{ + int status; + struct buffer_head *inode_bh = NULL; + struct ocfs2_dinode *inode_fe; + + trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno); + + /* dirty read disk */ + status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); + if (status < 0) { + mlog(ML_ERROR, "read block %llu failed %d\n", + (unsigned long long)blkno, status); + goto bail; + } + + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(inode_fe)) { + mlog(ML_ERROR, "invalid inode %llu requested\n", + (unsigned long long)blkno); + status = -EINVAL; + goto bail; + } + + if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && + (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) { + mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n", + (unsigned long long)blkno, + (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); + status = -EINVAL; + goto bail; + } + + if (suballoc_slot) + *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); + if (suballoc_bit) + *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + if (group_blkno) + *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); + +bail: + brelse(inode_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* + * test whether bit is SET in allocator bitmap or not. on success, 0 + * is returned and *res is 1 for SET; 0 otherwise. when fails, errno + * is returned and *res is meaningless. Call this after you have + * cluster locked against suballoc, or you may get a result based on + * non-up2date contents + */ +static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, + struct inode *suballoc, + struct buffer_head *alloc_bh, + u64 group_blkno, u64 blkno, + u16 bit, int *res) +{ + struct ocfs2_dinode *alloc_di; + struct ocfs2_group_desc *group; + struct buffer_head *group_bh = NULL; + u64 bg_blkno; + int status; + + trace_ocfs2_test_suballoc_bit((unsigned long long)blkno, + (unsigned int)bit); + + alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; + if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { + mlog(ML_ERROR, "suballoc bit %u out of range of %u\n", + (unsigned int)bit, + ocfs2_bits_per_group(&alloc_di->id2.i_chain)); + status = -EINVAL; + goto bail; + } + + bg_blkno = group_blkno ? group_blkno : + ocfs2_which_suballoc_group(blkno, bit); + status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, + &group_bh); + if (status < 0) { + mlog(ML_ERROR, "read group %llu failed %d\n", + (unsigned long long)bg_blkno, status); + goto bail; + } + + group = (struct ocfs2_group_desc *) group_bh->b_data; + *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); + +bail: + brelse(group_bh); + + if (status) + mlog_errno(status); + return status; +} + +/* + * Test if the bit representing this inode (blkno) is set in the + * suballocator. + * + * On success, 0 is returned and *res is 1 for SET; 0 otherwise. + * + * In the event of failure, a negative value is returned and *res is + * meaningless. + * + * Callers must make sure to hold nfs_sync_lock to prevent + * ocfs2_delete_inode() on another node from accessing the same + * suballocator concurrently. + */ +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) +{ + int status; + u64 group_blkno = 0; + u16 suballoc_bit = 0, suballoc_slot = 0; + struct inode *inode_alloc_inode; + struct buffer_head *alloc_bh = NULL; + + trace_ocfs2_test_inode_bit((unsigned long long)blkno); + + status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, + &group_blkno, &suballoc_bit); + if (status < 0) { + mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); + goto bail; + } + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + suballoc_slot); + if (!inode_alloc_inode) { + /* the error code could be inaccurate, but we are not able to + * get the correct one. */ + status = -EINVAL; + mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", + (u32)suballoc_slot); + goto bail; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); + if (status < 0) { + mutex_unlock(&inode_alloc_inode->i_mutex); + iput(inode_alloc_inode); + mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", + (u32)suballoc_slot, status); + goto bail; + } + + status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, + group_blkno, blkno, suballoc_bit, res); + if (status < 0) + mlog(ML_ERROR, "test suballoc bit failed %d\n", status); + + ocfs2_inode_unlock(inode_alloc_inode, 0); + mutex_unlock(&inode_alloc_inode->i_mutex); + + iput(inode_alloc_inode); + brelse(alloc_bh); +bail: + if (status) + mlog_errno(status); + return status; +} |
