diff options
Diffstat (limited to 'fs/ext4/ialloc.c')
| -rw-r--r-- | fs/ext4/ialloc.c | 743 |
1 files changed, 370 insertions, 373 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index eb9097aec6f..5b87fc36aab 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -70,28 +70,49 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { + struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); - J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_error(sb, "Checksum bad for group %u", block_group); - ext4_free_blks_set(sb, gdp, 0); - ext4_free_inodes_set(sb, gdp, 0); - ext4_itable_unused_set(sb, gdp, 0); - memset(bh->b_data, 0xff, sb->s_blocksize); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); return EXT4_INODES_PER_GROUP(sb); } +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. @@ -104,6 +125,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) @@ -118,12 +141,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) return NULL; } if (bitmap_uptodate(bh)) - return bh; + goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); - return bh; + goto verify; } ext4_lock_group(sb, block_group); @@ -131,6 +154,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); + set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; @@ -144,22 +168,45 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) */ set_bitmap_uptodate(bh); unlock_buffer(bh); - return bh; + goto verify; } /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. + * submit the buffer_head for reading */ - set_bitmap_uptodate(bh); - if (bh_submit_read(bh) < 0) { + trace_ext4_load_inode_bitmap(sb, block_group); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error(sb, "Cannot read inode bitmap - " - "block_group = %u, inode_bitmap = %llu", - block_group, bitmap_blk); + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); return NULL; } + +verify: + ext4_lock_group(sb, block_group); + if (!buffer_verified(bh) && + !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + EXT4_INODES_PER_GROUP(sb) / 8)) { + ext4_unlock_group(sb, block_group); + put_bh(bh); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, bitmap_blk); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + return NULL; + } + ext4_unlock_group(sb, block_group); + set_buffer_verified(bh); return bh; } @@ -192,20 +239,22 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; - if (atomic_read(&inode->i_count) > 1) { - printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); + if (!sb) { + printk(KERN_ERR "EXT4-fs: %s:%d: inode on " + "nonexistent device\n", __func__, __LINE__); return; } - if (inode->i_nlink) { - printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", - inode->i_nlink); + if (atomic_read(&inode->i_count) > 1) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + __func__, __LINE__, inode->i_ino, + atomic_read(&inode->i_count)); return; } - if (!sb) { - printk(KERN_ERR "ext4_free_inode: inode on " - "nonexistent device\n"); + if (inode->i_nlink) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } sbi = EXT4_SB(sb); @@ -236,7 +285,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) + /* Don't bother if the inode bitmap is corrupt. */ + grp = ext4_get_group_info(sb, block_group); + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) goto error_return; BUFFER_TRACE(bitmap_bh, "get_write_access"); @@ -251,7 +302,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) fatal = ext4_journal_get_write_access(handle, bh2); } ext4_lock_group(sb, block_group); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); if (fatal || !cleared) { ext4_unlock_group(sb, block_group); goto out; @@ -264,7 +315,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ext4_used_dirs_set(sb, gdp, count); percpu_counter_dec(&sbi->s_dirs_counter); } - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); percpu_counter_inc(&sbi->s_freeinodes_counter); @@ -283,130 +336,25 @@ out: err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; - ext4_mark_super_dirty(sb); - } else + } else { ext4_error(sb, "bit already cleared for inode %lu", ino); + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + } error_return: brelse(bitmap_bh); ext4_std_error(sb, fatal); } -/* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. - */ -static int find_group_dir(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) -{ - ext4_group_t ngroups = ext4_get_groups_count(sb); - unsigned int freei, avefreei; - struct ext4_group_desc *desc, *best_desc = NULL; - ext4_group_t group; - int ret = -1; - - freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; - - for (group = 0; group < ngroups; group++) { - desc = ext4_get_group_desc(sb, group, NULL); - if (!desc || !ext4_free_inodes_count(sb, desc)) - continue; - if (ext4_free_inodes_count(sb, desc) < avefreei) - continue; - if (!best_desc || - (ext4_free_blks_count(sb, desc) > - ext4_free_blks_count(sb, best_desc))) { - *best_group = group; - best_desc = desc; - ret = 0; - } - } - return ret; -} - -#define free_block_ratio 10 - -static int find_group_flex(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_desc *desc; - struct flex_groups *flex_group = sbi->s_flex_groups; - ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); - ext4_group_t ngroups = ext4_get_groups_count(sb); - int flex_size = ext4_flex_bg_size(sbi); - ext4_group_t best_flex = parent_fbg_group; - int blocks_per_flex = sbi->s_blocks_per_group * flex_size; - int flexbg_free_blocks; - int flex_freeb_ratio; - ext4_group_t n_fbg_groups; - ext4_group_t i; - - n_fbg_groups = (ngroups + flex_size - 1) >> - sbi->s_log_groups_per_flex; - -find_close_to_parent: - flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); - flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - if (atomic_read(&flex_group[best_flex].free_inodes) && - flex_freeb_ratio > free_block_ratio) - goto found_flexbg; - - if (best_flex && best_flex == parent_fbg_group) { - best_flex--; - goto find_close_to_parent; - } - - for (i = 0; i < n_fbg_groups; i++) { - if (i == parent_fbg_group || i == parent_fbg_group - 1) - continue; - - flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); - flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - - if (flex_freeb_ratio > free_block_ratio && - (atomic_read(&flex_group[i].free_inodes))) { - best_flex = i; - goto found_flexbg; - } - - if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) || - ((atomic_read(&flex_group[i].free_blocks) > - atomic_read(&flex_group[best_flex].free_blocks)) && - atomic_read(&flex_group[i].free_inodes))) - best_flex = i; - } - - if (!atomic_read(&flex_group[best_flex].free_inodes) || - !atomic_read(&flex_group[best_flex].free_blocks)) - return -1; - -found_flexbg: - for (i = best_flex * flex_size; i < ngroups && - i < (best_flex + 1) * flex_size; i++) { - desc = ext4_get_group_desc(sb, i, NULL); - if (ext4_free_inodes_count(sb, desc)) { - *best_group = i; - goto out; - } - } - - return -1; -out: - return 0; -} - struct orlov_stats { + __u64 free_clusters; __u32 free_inodes; - __u32 free_blocks; __u32 used_dirs; }; @@ -423,7 +371,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_blocks = atomic_read(&flex_group[g].free_blocks); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } @@ -431,11 +379,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, desc = ext4_get_group_desc(sb, g, NULL); if (desc) { stats->free_inodes = ext4_free_inodes_count(sb, desc); - stats->free_blocks = ext4_free_blks_count(sb, desc); + stats->free_clusters = ext4_free_group_clusters(sb, desc); stats->used_dirs = ext4_used_dirs_count(sb, desc); } else { stats->free_inodes = 0; - stats->free_blocks = 0; + stats->free_clusters = 0; stats->used_dirs = 0; } } @@ -462,18 +410,18 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, */ static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode, + ext4_group_t *group, umode_t mode, const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); - unsigned int freei, avefreei; - ext4_fsblk_t freeb, avefreeb; + unsigned int freei, avefreei, grp_free; + ext4_fsblk_t freeb, avefreec; unsigned int ndirs; int max_dirs, min_inodes; - ext4_grpblk_t min_blocks; + ext4_grpblk_t min_clusters; ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; @@ -489,9 +437,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; - freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - avefreeb = freeb; - do_div(avefreeb, ngroups); + freeb = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + avefreec = freeb; + do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && @@ -506,7 +455,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4fs_dirhash(qstr->name, qstr->len, &hinfo); grp = hinfo.hash; } else - get_random_bytes(&grp, sizeof(grp)); + grp = prandom_u32(); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { g = (parent_group + i) % ngroups; @@ -517,7 +466,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, continue; if (stats.free_inodes < avefreei) continue; - if (stats.free_blocks < avefreeb) + if (stats.free_clusters < avefreec) continue; grp = g; ret = 0; @@ -555,7 +504,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, min_inodes = avefreei - inodes_per_group*flex_size / 4; if (min_inodes < 1) min_inodes = 1; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4; + min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; /* * Start looking in the flex group where we last allocated an @@ -574,7 +523,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, continue; if (stats.free_inodes < min_inodes) continue; - if (stats.free_blocks < min_blocks) + if (stats.free_clusters < min_clusters) continue; goto found_flex_bg; } @@ -587,10 +536,12 @@ fallback_retry: for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_inodes_count(sb, desc) >= avefreei) { - *group = grp; - return 0; + if (desc) { + grp_free = ext4_free_inodes_count(sb, desc); + if (grp_free && grp_free >= avefreei) { + *group = grp; + return 0; + } } } @@ -607,7 +558,7 @@ fallback_retry: } static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group, int mode) + ext4_group_t *group, umode_t mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); @@ -649,7 +600,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group + flex_size; if (*group > ngroups) *group = 0; - return find_group_orlov(sb, parent, group, mode, 0); + return find_group_orlov(sb, parent, group, mode, NULL); } /* @@ -658,7 +609,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_blks_count(sb, desc)) + ext4_free_group_clusters(sb, desc)) return 0; /* @@ -682,7 +633,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); if (desc && ext4_free_inodes_count(sb, desc) && - ext4_free_blks_count(sb, desc)) + ext4_free_group_clusters(sb, desc)) return 0; } @@ -703,91 +654,48 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* - * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's ext4_group_lock - * and clear the uninit flag. The inode bitmap update - * and group desc uninit flag clear should be done - * after holding ext4_group_lock so that ext4_read_inode_bitmap - * doesn't race with the ext4_claim_inode + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) */ -static int ext4_claim_inode(struct super_block *sb, - struct buffer_head *inode_bitmap_bh, - unsigned long ino, ext4_group_t group, int mode) -{ - int free = 0, retval = 0, count; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_group_info *grp = ext4_get_group_info(sb, group); - struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); +#define RECENTCY_MIN 5 +#define RECENTCY_DIRTY 30 - /* - * We have to be sure that new inode allocation does not race with - * inode table initialization, because otherwise we may end up - * allocating and writing new inode right before sb_issue_zeroout - * takes place and overwriting our new inode with zeroes. So we - * take alloc_sem to prevent it. - */ - down_read(&grp->alloc_sem); - ext4_lock_group(sb, group); - if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { - /* not a free inode */ - retval = 1; - goto err_ret; - } - ino++; - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || - ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_unlock_group(sb, group); - up_read(&grp->alloc_sem); - ext4_error(sb, "reserved inode or inode > inodes count - " - "block_group = %u, inode=%lu", group, - ino + group * EXT4_INODES_PER_GROUP(sb)); - return 1; - } - /* If we didn't allocate from within the initialized part of the inode - * table then we need to initialize up to this inode. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - /* When marking the block group with - * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unused even though - * mke2fs could have initialized the same for us. - * Instead we calculated the value below - */ - - free = 0; - } else { - free = EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp); - } +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + unsigned long dtime, now; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0, recentcy = RECENTCY_MIN; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (unlikely(!bh) || !buffer_uptodate(bh)) /* - * Check the relative inode number against the last used - * relative inode number in this group. if it is greater - * we need to update the bg_itable_unused count - * + * If the block is not in the buffer cache, then it + * must have been written out. */ - if (ino > free) - ext4_itable_unused_set(sb, gdp, - (EXT4_INODES_PER_GROUP(sb) - ino)); - } - count = ext4_free_inodes_count(sb, gdp) - 1; - ext4_free_inodes_set(sb, gdp, count); - if (S_ISDIR(mode)) { - count = ext4_used_dirs_count(sb, gdp) + 1; - ext4_used_dirs_set(sb, gdp, count); - if (sbi->s_log_groups_per_flex) { - ext4_group_t f = ext4_flex_group(sbi, group); + goto out; - atomic_inc(&sbi->s_flex_groups[f].used_dirs); - } - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); -err_ret: - ext4_unlock_group(sb, group); - up_read(&grp->alloc_sem); - return retval; + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + dtime = le32_to_cpu(raw_inode->i_dtime); + now = get_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && (dtime < now) && (now < dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; } /* @@ -800,8 +708,10 @@ err_ret: * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, - const struct qstr *qstr, __u32 goal) +struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, int handle_type, + unsigned int line_no, int nblocks) { struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; @@ -815,9 +725,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, int ret2, err = 0; struct inode *ret; ext4_group_t i; - int free = 0; - static int once = 1; ext4_group_t flex_group; + struct ext4_group_info *grp; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) @@ -832,6 +741,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, ei = EXT4_I(inode); sbi = EXT4_SB(sb); + /* + * Initalize owners and quota early so that we don't have to account + * for quota initialization worst case in standard inode creating + * transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; + } else + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + if (!goal) goal = sbi->s_inode_goal; @@ -842,26 +768,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, goto got_group; } - if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { - ret2 = find_group_flex(sb, dir, &group); - if (ret2 == -1) { - ret2 = find_group_other(sb, dir, &group, mode); - if (ret2 == 0 && once) { - once = 0; - printk(KERN_NOTICE "ext4: find_group_flex " - "failed, fallback succeeded dir %lu\n", - dir->i_ino); - } - } - goto got_group; - } - - if (S_ISDIR(mode)) { - if (test_opt(sb, OLDALLOC)) - ret2 = find_group_dir(sb, dir, &group); - else - ret2 = find_group_orlov(sb, dir, &group, mode, qstr); - } else + if (S_ISDIR(mode)) + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + else ret2 = find_group_other(sb, dir, &group, mode); got_group: @@ -870,65 +779,87 @@ got_group: if (ret2 == -1) goto out; + /* + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. + */ for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) - goto fail; + goto out; + + /* + * Check free inodes count before loading bitmap. + */ + if (ext4_free_inodes_count(sb, gdp) == 0) { + if (++group == ngroups) + group = 0; + continue; + } + + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (++group == ngroups) + group = 0; + continue; + } brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!inode_bitmap_bh) - goto fail; + /* Skip groups with suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (++group == ngroups) + group = 0; + continue; + } repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) inode_bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); - - if (ino < EXT4_INODES_PER_GROUP(sb)) { - - BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, - inode_bitmap_bh); - if (err) - goto fail; - - BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, - group_desc_bh); - if (err) - goto fail; - if (!ext4_claim_inode(sb, inode_bitmap_bh, - ino, group, mode)) { - /* we won it */ - BUFFER_TRACE(inode_bitmap_bh, - "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, - NULL, - inode_bitmap_bh); - if (err) - goto fail; - /* zero bit is inode number 1*/ - ino++; - goto got; + if (ino >= EXT4_INODES_PER_GROUP(sb)) + goto next_group; + if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + continue; + } + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, ino)) { + ino++; + goto next_inode; + } + if (!handle) { + BUG_ON(nblocks <= 0); + handle = __ext4_journal_start_sb(dir->i_sb, line_no, + handle_type, nblocks, + 0); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_std_error(sb, err); + goto out; } - /* we lost it */ - ext4_handle_release_buffer(handle, inode_bitmap_bh); - ext4_handle_release_buffer(handle, group_desc_bh); - - if (++ino < EXT4_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; } - - /* - * This case is possible in concurrent environment. It is very - * rare. We cannot repeat the find_group_xxx() call because - * that will simply return the same blockgroup, because the - * group descriptor metadata has not yet been updated. - * So we just go onto the next blockgroup. - */ + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! */ +next_inode: + if (ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; +next_group: if (++group == ngroups) group = 0; } @@ -936,8 +867,22 @@ repeat_in_this_group: goto out; got: + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + /* We may have to initialize the block bitmap if it isn't already */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && + if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; @@ -946,54 +891,90 @@ got: err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { brelse(block_bitmap_bh); - goto fail; + ext4_std_error(sb, err); + goto out; } - free = 0; - ext4_lock_group(sb, group); + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - free = ext4_free_blocks_after_init(sb, group, gdp); gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, free); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, - gdp); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + ext4_block_bitmap_csum_set(sb, group, gdp, + block_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); - /* Don't need to dirty bitmap block if we didn't change it */ - if (free) { - BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); - err = ext4_handle_dirty_metadata(handle, - NULL, block_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; } + } - brelse(block_bitmap_bh); - if (err) - goto fail; + /* Update the relevant bg descriptor fields */ + if (ext4_has_group_desc_csum(sb)) { + int free; + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + free = 0; + } + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + up_read(&grp->alloc_sem); + } else { + ext4_lock_group(sb, group); } + + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); + if (S_ISDIR(mode)) { + ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); - if (err) - goto fail; + if (err) { + ext4_std_error(sb, err); + goto out; + } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - ext4_mark_super_dirty(sb); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - if (test_opt(sb, GRPID)) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = dir->i_gid; - } else - inode_init_owner(inode, dir, mode); - inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; @@ -1004,11 +985,7 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - /* - * Don't inherit extent flag from directory, amongst others. We set - * extent flag on newly created directory and file only if -o extent - * mount option is specified - */ + /* Don't inherit extent flag from directory, amongst others. */ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_file_acl = 0; @@ -1020,20 +997,41 @@ got: if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { - err = -EINVAL; - goto fail_drop; + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + goto out; } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); + /* Precompute checksum seed for inode metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + ei->i_inline_off = 0; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = inode; - dquot_initialize(inode); err = dquot_alloc_inode(inode); if (err) goto fail_drop; @@ -1042,7 +1040,7 @@ got: if (err) goto fail_free_drop; - err = ext4_init_security(handle, inode, dir); + err = ext4_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; @@ -1054,6 +1052,11 @@ got: } } + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + ei->i_datasync_tid = handle->h_transaction->t_tid; + } + err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); @@ -1062,24 +1065,17 @@ got: ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); - goto really_out; -fail: - ext4_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); - fail_drop: + clear_nlink(inode); + unlock_new_inode(inode); +out: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; - unlock_new_inode(inode); iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); @@ -1138,17 +1134,17 @@ iget_failed: inode = NULL; bad_orphan: ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", + printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_NOTICE "inode=%p\n", inode); + printk(KERN_WARNING "inode=%p\n", inode); if (inode) { - printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + printk(KERN_WARNING "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); - printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); - printk(KERN_NOTICE "max_ino=%lu\n", max_ino); - printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); + printk(KERN_WARNING "max_ino=%lu\n", max_ino); + printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; @@ -1183,7 +1179,8 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) if (!bitmap_bh) continue; - x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; @@ -1227,9 +1224,9 @@ unsigned long ext4_count_dirs(struct super_block * sb) * where it is called from on active part of filesystem is ext4lazyinit * thread, so we do not need any special locks, however we have to prevent * inode allocation from the current group, so we take alloc_sem lock, to - * block ext4_claim_inode until we are finished. + * block ext4_new_inode() until we are finished. */ -extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, +int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -1257,7 +1254,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) goto out; - handle = ext4_journal_start_sb(sb, 1); + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -1275,13 +1272,13 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, sbi->s_inodes_per_block); if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { - ext4_error(sb, "Something is wrong with group %u\n" - "Used itable blocks: %d" - "itable unused count: %u\n", + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; - goto out; + goto err_out; } blk = ext4_inode_table(sb, gdp) + used_blks; @@ -1312,7 +1309,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, skip_zeroout: ext4_lock_group(sb, group); gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); + ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, |
