diff options
Diffstat (limited to 'fs/ext4/ialloc.c')
| -rw-r--r-- | fs/ext4/ialloc.c | 284 |
1 files changed, 205 insertions, 79 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index d48e8b14928..5b87fc36aab 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -70,18 +70,26 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_group_clusters_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, - EXT4_INODES_PER_GROUP(sb) / 8); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } @@ -117,6 +125,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -166,7 +176,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) trace_ext4_load_inode_bitmap(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { put_bh(bh); @@ -185,6 +195,14 @@ verify: put_bh(bh); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, bitmap_blk); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return NULL; } ext4_unlock_group(sb, block_group); @@ -221,6 +239,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; if (!sb) { printk(KERN_ERR "EXT4-fs: %s:%d: inode on " @@ -266,7 +285,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) + /* Don't bother if the inode bitmap is corrupt. */ + grp = ext4_get_group_info(sb, block_group); + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) goto error_return; BUFFER_TRACE(bitmap_bh, "get_write_access"); @@ -315,9 +336,16 @@ out: err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; - ext4_mark_super_dirty(sb); - } else + } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + } error_return: brelse(bitmap_bh); @@ -325,8 +353,8 @@ error_return: } struct orlov_stats { + __u64 free_clusters; __u32 free_inodes; - __u32 free_clusters; __u32 used_dirs; }; @@ -343,7 +371,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } @@ -427,7 +455,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4fs_dirhash(qstr->name, qstr->len, &hinfo); grp = hinfo.hash; } else - get_random_bytes(&grp, sizeof(grp)); + grp = prandom_u32(); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; @@ -626,6 +654,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) + */ +#define RECENTCY_MIN 5 +#define RECENTCY_DIRTY 30 + +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + unsigned long dtime, now; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0, recentcy = RECENTCY_MIN; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + + bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (unlikely(!bh) || !buffer_uptodate(bh)) + /* + * If the block is not in the buffer cache, then it + * must have been written out. + */ + goto out; + + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + dtime = le32_to_cpu(raw_inode->i_dtime); + now = get_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && (dtime < now) && (now < dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; +} + +/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -635,8 +708,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, - const struct qstr *qstr, __u32 goal, uid_t *owner) +struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, int handle_type, + unsigned int line_no, int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -651,6 +726,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, struct inode *ret; ext4_group_t i; ext4_group_t flex_group; + struct ext4_group_info *grp; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -665,6 +741,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode, ei = EXT4_I(inode); sbi = EXT4_SB(sb); + /* + * Initalize owners and quota early so that we don't have to account + * for quota initialization worst case in standard inode creating + * transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + if (!goal) goal = sbi->s_inode_goal; @@ -696,40 +789,98 @@ got_group: gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) - goto fail; + goto out; + + /* + * Check free inodes count before loading bitmap. + */ + if (ext4_free_inodes_count(sb, gdp) == 0) { + if (++group == ngroups) + group = 0; + continue; + } + + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (++group == ngroups) + group = 0; + continue; + } brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!inode_bitmap_bh) - goto fail; + /* Skip groups with suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (++group == ngroups) + group = 0; + continue; + } repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); - if (ino >= EXT4_INODES_PER_GROUP(sb)) { - if (++group == ngroups) - group = 0; - continue; - } + if (ino >= EXT4_INODES_PER_GROUP(sb)) + goto next_group; if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); continue; } + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, ino)) { + ino++; + goto next_inode; + } + if (!handle) { + BUG_ON(nblocks <= 0); + handle = __ext4_journal_start_sb(dir->i_sb, line_no, + handle_type, nblocks, + 0); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_std_error(sb, err); + goto out; + } + } + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); ext4_unlock_group(sb, group); ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! */ +next_inode: if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; +next_group: + if (++group == ngroups) + group = 0; } err = -ENOSPC; goto out; got: + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { @@ -740,12 +891,12 @@ got: err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { brelse(block_bitmap_bh); - goto fail; + ext4_std_error(sb, err); + goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); - brelse(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); @@ -754,27 +905,18 @@ got: ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); ext4_block_bitmap_csum_set(sb, group, gdp, - block_bitmap_bh, - EXT4_BLOCKS_PER_GROUP(sb) / - 8); + block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } } - BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, inode_bitmap_bh); - if (err) - goto fail; - - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); - if (err) - goto fail; - /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; @@ -817,35 +959,21 @@ got: } ext4_unlock_group(sb, group); - BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); - if (err) - goto fail; - BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - ext4_mark_super_dirty(sb); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - if (owner) { - inode->i_mode = mode; - i_uid_write(inode, owner[0]); - i_gid_write(inode, owner[1]); - } else if (test_opt(sb, GRPID)) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = dir->i_gid; - } else - inode_init_owner(inode, dir, mode); inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ @@ -874,7 +1002,9 @@ got: * twice. */ err = -EIO; - goto fail; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + goto out; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -884,7 +1014,6 @@ got: if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { __u32 csum; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, @@ -898,8 +1027,11 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + ei->i_inline_off = 0; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = inode; - dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; @@ -933,24 +1065,17 @@ got: ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); - goto really_out; -fail: - ext4_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); - fail_drop: - dquot_drop(inode); - inode->i_flags |= S_NOQUOTA; clear_nlink(inode); unlock_new_inode(inode); +out: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); @@ -1009,17 +1134,17 @@ iget_failed: inode = NULL; bad_orphan: ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", + printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_NOTICE "inode=%p\n", inode); + printk(KERN_WARNING "inode=%p\n", inode); if (inode) { - printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + printk(KERN_WARNING "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); - printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); - printk(KERN_NOTICE "max_ino=%lu\n", max_ino); - printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + printk(KERN_WARNING "max_ino=%lu\n", max_ino); + printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; @@ -1054,7 +1179,8 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) if (!bitmap_bh) continue; - x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; @@ -1128,7 +1254,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) goto out; - handle = ext4_journal_start_sb(sb, 1); + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; |
