Diffstat (limited to 'fs/ext4/ialloc.c')
-rw-r--r--	fs/ext4/ialloc.c	745
1 files changed, 371 insertions, 374 deletions
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1ce240a23eb..5b87fc36aab 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -70,28 +70,49 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
 				       ext4_group_t block_group,
 				       struct ext4_group_desc *gdp)
 {
+	struct ext4_group_info *grp;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-
 	J_ASSERT_BH(bh, buffer_locked(bh));
 
 	/* If checksum is bad mark all blocks and inodes use to prevent
 	 * allocation, essentially implementing a per-group read-only flag. */
-	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
 		ext4_error(sb, "Checksum bad for group %u", block_group);
-		ext4_free_blks_set(sb, gdp, 0);
-		ext4_free_inodes_set(sb, gdp, 0);
-		ext4_itable_unused_set(sb, gdp, 0);
-		memset(bh->b_data, 0xff, sb->s_blocksize);
+		grp = ext4_get_group_info(sb, block_group);
+		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+			percpu_counter_sub(&sbi->s_freeclusters_counter,
+					   grp->bb_free);
+		set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, gdp);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
+		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
 		return 0;
 	}
 
 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
 	ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 			bh->b_data);
+	ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
+				   EXT4_INODES_PER_GROUP(sb) / 8);
+	ext4_group_desc_csum_set(sb, block_group, gdp);
 
 	return EXT4_INODES_PER_GROUP(sb);
 }
 
+void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+		set_bitmap_uptodate(bh);
+	}
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
 /*
  * Read the inode allocation bitmap for a given block_group, reading
  * into the specified slot in the superblock's bitmap cache.
@@ -104,6 +125,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	struct ext4_group_desc *desc;
 	struct buffer_head *bh = NULL;
 	ext4_fsblk_t bitmap_blk;
+	struct ext4_group_info *grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
 	if (!desc)
@@ -118,12 +141,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		return NULL;
 	}
 	if (bitmap_uptodate(bh))
-		return bh;
+		goto verify;
 
 	lock_buffer(bh);
 	if (bitmap_uptodate(bh)) {
 		unlock_buffer(bh);
-		return bh;
+		goto verify;
 	}
 
 	ext4_lock_group(sb, block_group);
@@ -131,6 +154,7 @@
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
+		set_buffer_verified(bh);
 		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
 		return bh;
@@ -144,22 +168,45 @@
 		 */
 		set_bitmap_uptodate(bh);
 		unlock_buffer(bh);
-		return bh;
+		goto verify;
 	}
 	/*
-	 * submit the buffer_head for read. We can
-	 * safely mark the bitmap as uptodate now.
-	 * We do it here so the bitmap uptodate bit
-	 * get set with buffer lock held.
+	 * submit the buffer_head for reading
 	 */
-	set_bitmap_uptodate(bh);
-	if (bh_submit_read(bh) < 0) {
+	trace_ext4_load_inode_bitmap(sb, block_group);
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ | REQ_META | REQ_PRIO, bh);
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
 		ext4_error(sb, "Cannot read inode bitmap - "
-			    "block_group = %u, inode_bitmap = %llu",
-			    block_group, bitmap_blk);
+			   "block_group = %u, inode_bitmap = %llu",
+			   block_group, bitmap_blk);
 		return NULL;
 	}
+
+verify:
+	ext4_lock_group(sb, block_group);
+	if (!buffer_verified(bh) &&
+	    !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+					   EXT4_INODES_PER_GROUP(sb) / 8)) {
+		ext4_unlock_group(sb, block_group);
+		put_bh(bh);
+		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+			   "inode_bitmap = %llu", block_group, bitmap_blk);
+		grp = ext4_get_group_info(sb, block_group);
+		if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, desc);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
+		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+		return NULL;
+	}
+	ext4_unlock_group(sb, block_group);
+	set_buffer_verified(bh);
 	return bh;
 }
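[Editor's note] The verify path above is where a bad inode-bitmap checksum now lands: instead of wedging the whole filesystem, the group is flagged corrupt and its free-inode count is removed from the global counter exactly once, so the allocator simply routes around it. Below is a minimal userspace sketch of that idempotent quarantine pattern; the names group_info, GRP_IBITMAP_CORRUPT and fs_free_inodes are illustrative stand-ins, not the kernel's.

#include <stdio.h>

#define GRP_IBITMAP_CORRUPT (1UL << 0)

struct group_info {
	unsigned long state;	/* corruption flags, cf. grp->bb_state */
	long free_inodes;	/* this group's contribution to the fs total */
};

static long fs_free_inodes = 1000;	/* stand-in for s_freeinodes_counter */

static void mark_ibitmap_corrupt(struct group_info *grp)
{
	/* Subtract only on the first corruption event; the flag makes
	 * repeated calls (free path, read path) idempotent.  In ext4 the
	 * test and the set_bit run under the group lock / atomically. */
	if (!(grp->state & GRP_IBITMAP_CORRUPT))
		fs_free_inodes -= grp->free_inodes;
	grp->state |= GRP_IBITMAP_CORRUPT;
}

int main(void)
{
	struct group_info grp = { 0, 128 };

	mark_ibitmap_corrupt(&grp);
	mark_ibitmap_corrupt(&grp);	/* must not double-subtract */
	printf("fs free inodes: %ld\n", fs_free_inodes);	/* 872 */
	return 0;
}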
@@ -192,20 +239,22 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err, count, cleared;
+	struct ext4_group_info *grp;
 
-	if (atomic_read(&inode->i_count) > 1) {
-		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
-		       atomic_read(&inode->i_count));
+	if (!sb) {
+		printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
+		       "nonexistent device\n", __func__, __LINE__);
 		return;
 	}
-	if (inode->i_nlink) {
-		printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
-		       inode->i_nlink);
+	if (atomic_read(&inode->i_count) > 1) {
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
+			 __func__, __LINE__, inode->i_ino,
+			 atomic_read(&inode->i_count));
 		return;
 	}
-	if (!sb) {
-		printk(KERN_ERR "ext4_free_inode: inode on "
-		       "nonexistent device\n");
+	if (inode->i_nlink) {
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
+			 __func__, __LINE__, inode->i_ino, inode->i_nlink);
 		return;
 	}
 	sbi = EXT4_SB(sb);
@@ -236,7 +285,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
 	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
 	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
-	if (!bitmap_bh)
+	/* Don't bother if the inode bitmap is corrupt. */
+	grp = ext4_get_group_info(sb, block_group);
+	if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
 		goto error_return;
 
 	BUFFER_TRACE(bitmap_bh, "get_write_access");
@@ -251,7 +302,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		fatal = ext4_journal_get_write_access(handle, bh2);
 	}
 	ext4_lock_group(sb, block_group);
-	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+	cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
 	if (fatal || !cleared) {
 		ext4_unlock_group(sb, block_group);
 		goto out;
@@ -264,7 +315,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		ext4_used_dirs_set(sb, gdp, count);
 		percpu_counter_dec(&sbi->s_dirs_counter);
 	}
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+	ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
+				   EXT4_INODES_PER_GROUP(sb) / 8);
+	ext4_group_desc_csum_set(sb, block_group, gdp);
 	ext4_unlock_group(sb, block_group);
 
 	percpu_counter_inc(&sbi->s_freeinodes_counter);
@@ -283,130 +336,25 @@ out:
 		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 		if (!fatal)
 			fatal = err;
-		ext4_mark_super_dirty(sb);
-	} else
+	} else {
 		ext4_error(sb, "bit already cleared for inode %lu", ino);
+		if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			int count;
+			count = ext4_free_inodes_count(sb, gdp);
+			percpu_counter_sub(&sbi->s_freeinodes_counter,
+					   count);
+		}
+		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+	}
 
error_return:
 	brelse(bitmap_bh);
 	ext4_std_error(sb, fatal);
 }
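[Editor's note] The switch from ext4_clear_bit() to ext4_test_and_clear_bit() above is what lets the free path tell a normal free from a double free: the return value says whether the bit was actually set before being cleared, and a zero return takes the "bit already cleared" branch that now also quarantines the group. A sketch of those semantics using GCC's __atomic builtins (an analogy only; the kernel uses its own bitops):

#include <stdio.h>

/* Clear bit nr in the word array and return its previous value. */
static int test_and_clear_bit(int nr, unsigned long *addr)
{
	int bits = 8 * sizeof(unsigned long);
	unsigned long mask = 1UL << (nr % bits);
	unsigned long old = __atomic_fetch_and(&addr[nr / bits], ~mask,
					       __ATOMIC_SEQ_CST);
	return (old & mask) != 0;
}

int main(void)
{
	unsigned long bitmap[1] = { 1UL << 5 };	/* one inode in use */

	printf("first free:  cleared=%d\n", test_and_clear_bit(5, bitmap));
	printf("second free: cleared=%d\n", test_and_clear_bit(5, bitmap));
	/* The second call returns 0: the corruption path in the hunk. */
	return 0;
}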
 
-/*
- * There are two policies for allocating an inode.  If the new inode is
- * a directory, then a forward search is made for a block group with both
- * free space and a low directory-to-inode ratio; if that fails, then of
- * the groups with above-average free space, that group with the fewest
- * directories already is chosen.
- *
- * For other inodes, search forward from the parent directory\'s block
- * group to find a free inode.
- */
-static int find_group_dir(struct super_block *sb, struct inode *parent,
-				ext4_group_t *best_group)
-{
-	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	unsigned int freei, avefreei;
-	struct ext4_group_desc *desc, *best_desc = NULL;
-	ext4_group_t group;
-	int ret = -1;
-
-	freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
-	avefreei = freei / ngroups;
-
-	for (group = 0; group < ngroups; group++) {
-		desc = ext4_get_group_desc(sb, group, NULL);
-		if (!desc || !ext4_free_inodes_count(sb, desc))
-			continue;
-		if (ext4_free_inodes_count(sb, desc) < avefreei)
-			continue;
-		if (!best_desc ||
-		    (ext4_free_blks_count(sb, desc) >
-		     ext4_free_blks_count(sb, best_desc))) {
-			*best_group = group;
-			best_desc = desc;
-			ret = 0;
-		}
-	}
-	return ret;
-}
-
-#define free_block_ratio 10
-
-static int find_group_flex(struct super_block *sb, struct inode *parent,
-			   ext4_group_t *best_group)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_group_desc *desc;
-	struct flex_groups *flex_group = sbi->s_flex_groups;
-	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
-	ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
-	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	int flex_size = ext4_flex_bg_size(sbi);
-	ext4_group_t best_flex = parent_fbg_group;
-	int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
-	int flexbg_free_blocks;
-	int flex_freeb_ratio;
-	ext4_group_t n_fbg_groups;
-	ext4_group_t i;
-
-	n_fbg_groups = (ngroups + flex_size - 1) >>
-		sbi->s_log_groups_per_flex;
-
-find_close_to_parent:
-	flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
-	flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-	if (atomic_read(&flex_group[best_flex].free_inodes) &&
-	    flex_freeb_ratio > free_block_ratio)
-		goto found_flexbg;
-
-	if (best_flex && best_flex == parent_fbg_group) {
-		best_flex--;
-		goto find_close_to_parent;
-	}
-
-	for (i = 0; i < n_fbg_groups; i++) {
-		if (i == parent_fbg_group || i == parent_fbg_group - 1)
-			continue;
-
-		flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
-		flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-
-		if (flex_freeb_ratio > free_block_ratio &&
-		    (atomic_read(&flex_group[i].free_inodes))) {
-			best_flex = i;
-			goto found_flexbg;
-		}
-
-		if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
-		    ((atomic_read(&flex_group[i].free_blocks) >
-		      atomic_read(&flex_group[best_flex].free_blocks)) &&
-		     atomic_read(&flex_group[i].free_inodes)))
-			best_flex = i;
-	}
-
-	if (!atomic_read(&flex_group[best_flex].free_inodes) ||
-	    !atomic_read(&flex_group[best_flex].free_blocks))
-		return -1;
-
-found_flexbg:
-	for (i = best_flex * flex_size; i < ngroups &&
-		     i < (best_flex + 1) * flex_size; i++) {
-		desc = ext4_get_group_desc(sb, i, NULL);
-		if (ext4_free_inodes_count(sb, desc)) {
-			*best_group = i;
-			goto out;
-		}
-	}
-
-	return -1;
-out:
-	return 0;
-}
-
 struct orlov_stats {
+	__u64 free_clusters;
 	__u32 free_inodes;
-	__u32 free_blocks;
 	__u32 used_dirs;
 };
 
@@ -423,7 +371,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 
 	if (flex_size > 1) {
 		stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
-		stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
+		stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
 		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
 		return;
 	}
 
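[Editor's note] orlov_stats above widens free_blocks (__u32) into free_clusters (__u64) because with bigalloc the allocator accounts in clusters of 2^N blocks, and flex-group totals read via atomic64_read can exceed 32 bits once converted back to blocks. A small sketch of the cluster-to-block arithmetic that macros such as EXT4_C2B() perform; the cluster ratio here is a made-up example value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned cluster_bits = 4;		/* 2^4 = 16 blocks per cluster */
	uint64_t free_clusters = 300000000;	/* per-flex-group total */

	/* clusters -> blocks is a left shift by the cluster ratio; the
	 * result here (4.8e9) no longer fits in 32 bits, hence __u64. */
	uint64_t free_blocks = free_clusters << cluster_bits;

	printf("%llu clusters = %llu blocks\n",
	       (unsigned long long)free_clusters,
	       (unsigned long long)free_blocks);
	return 0;
}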
@@ -431,11 +379,11 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 	desc = ext4_get_group_desc(sb, g, NULL);
 	if (desc) {
 		stats->free_inodes = ext4_free_inodes_count(sb, desc);
-		stats->free_blocks = ext4_free_blks_count(sb, desc);
+		stats->free_clusters = ext4_free_group_clusters(sb, desc);
 		stats->used_dirs = ext4_used_dirs_count(sb, desc);
 	} else {
 		stats->free_inodes = 0;
-		stats->free_blocks = 0;
+		stats->free_clusters = 0;
 		stats->used_dirs = 0;
 	}
 }
 
@@ -462,18 +410,18 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
  */
 
 static int find_group_orlov(struct super_block *sb, struct inode *parent,
-			    ext4_group_t *group, int mode,
+			    ext4_group_t *group, umode_t mode,
 			    const struct qstr *qstr)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
 	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
-	unsigned int freei, avefreei;
-	ext4_fsblk_t freeb, avefreeb;
+	unsigned int freei, avefreei, grp_free;
+	ext4_fsblk_t freeb, avefreec;
 	unsigned int ndirs;
 	int max_dirs, min_inodes;
-	ext4_grpblk_t min_blocks;
+	ext4_grpblk_t min_clusters;
 	ext4_group_t i, grp, g, ngroups;
 	struct ext4_group_desc *desc;
 	struct orlov_stats stats;
@@ -489,9 +437,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 
 	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
 	avefreei = freei / ngroups;
-	freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-	avefreeb = freeb;
-	do_div(avefreeb, ngroups);
+	freeb = EXT4_C2B(sbi,
+		percpu_counter_read_positive(&sbi->s_freeclusters_counter));
+	avefreec = freeb;
+	do_div(avefreec, ngroups);
 	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
 
 	if (S_ISDIR(mode) &&
@@ -506,7 +455,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 			ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
 			grp = hinfo.hash;
 		} else
-			get_random_bytes(&grp, sizeof(grp));
+			grp = prandom_u32();
 		parent_group = (unsigned)grp % ngroups;
 		for (i = 0; i < ngroups; i++) {
 			g = (parent_group + i) % ngroups;
@@ -517,7 +466,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 				continue;
 			if (stats.free_inodes < avefreei)
 				continue;
-			if (stats.free_blocks < avefreeb)
+			if (stats.free_clusters < avefreec)
 				continue;
 			grp = g;
 			ret = 0;
@@ -555,7 +504,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	min_inodes = avefreei - inodes_per_group*flex_size / 4;
 	if (min_inodes < 1)
 		min_inodes = 1;
-	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+	min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
 
 	/*
 	 * Start looking in the flex group where we last allocated an
@@ -574,7 +523,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 			continue;
 		if (stats.free_inodes < min_inodes)
 			continue;
-		if (stats.free_blocks < min_blocks)
+		if (stats.free_clusters < min_clusters)
 			continue;
 		goto found_flex_bg;
 	}
 
@@ -587,10 +536,12 @@ fallback_retry:
 	for (i = 0; i < ngroups; i++) {
 		grp = (parent_group + i) % ngroups;
 		desc = ext4_get_group_desc(sb, grp, NULL);
-		if (desc && ext4_free_inodes_count(sb, desc) &&
-		    ext4_free_inodes_count(sb, desc) >= avefreei) {
-			*group = grp;
-			return 0;
+		if (desc) {
+			grp_free = ext4_free_inodes_count(sb, desc);
+			if (grp_free && grp_free >= avefreei) {
+				*group = grp;
+				return 0;
+			}
 		}
 	}
 
@@ -607,7 +558,7 @@ fallback_retry:
 }
 
 static int find_group_other(struct super_block *sb, struct inode *parent,
-			    ext4_group_t *group, int mode)
+			    ext4_group_t *group, umode_t mode)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
@@ -649,7 +600,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 		*group = parent_group + flex_size;
 		if (*group > ngroups)
 			*group = 0;
-		return find_group_orlov(sb, parent, group, mode, 0);
+		return find_group_orlov(sb, parent, group, mode, NULL);
 	}
 
 	/*
@@ -658,7 +609,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 	*group = parent_group;
 	desc = ext4_get_group_desc(sb, *group, NULL);
 	if (desc && ext4_free_inodes_count(sb, desc) &&
-			ext4_free_blks_count(sb, desc))
+	    ext4_free_group_clusters(sb, desc))
 		return 0;
 
 	/*
@@ -682,7 +633,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 			*group -= ngroups;
 		desc = ext4_get_group_desc(sb, *group, NULL);
 		if (desc && ext4_free_inodes_count(sb, desc) &&
-				ext4_free_blks_count(sb, desc))
+		    ext4_free_group_clusters(sb, desc))
 			return 0;
 	}
 
@@ -703,91 +654,48 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 }
 
 /*
- * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's ext4_group_lock
- * and clear the uninit flag. The inode bitmap update
- * and group desc uninit flag clear should be done
- * after holding ext4_group_lock so that ext4_read_inode_bitmap
- * doesn't race with the ext4_claim_inode
+ * In no journal mode, if an inode has recently been deleted, we want
+ * to avoid reusing it until we're reasonably sure the inode table
+ * block has been written back to disk.  (Yes, these values are
+ * somewhat arbitrary...)
  */
-static int ext4_claim_inode(struct super_block *sb,
-			struct buffer_head *inode_bitmap_bh,
-			unsigned long ino, ext4_group_t group, int mode)
-{
-	int free = 0, retval = 0, count;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
-	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+#define RECENTCY_MIN	5
+#define RECENTCY_DIRTY	30
 
-	/*
-	 * We have to be sure that new inode allocation does not race with
-	 * inode table initialization, because otherwise we may end up
-	 * allocating and writing new inode right before sb_issue_zeroout
-	 * takes place and overwriting our new inode with zeroes. So we
-	 * take alloc_sem to prevent it.
-	 */
-	down_read(&grp->alloc_sem);
-	ext4_lock_group(sb, group);
-	if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
-		/* not a free inode */
-		retval = 1;
-		goto err_ret;
-	}
-	ino++;
-	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
-			ino > EXT4_INODES_PER_GROUP(sb)) {
-		ext4_unlock_group(sb, group);
-		up_read(&grp->alloc_sem);
-		ext4_error(sb, "reserved inode or inode > inodes count - "
-			   "block_group = %u, inode=%lu", group,
-			   ino + group * EXT4_INODES_PER_GROUP(sb));
-		return 1;
-	}
-	/* If we didn't allocate from within the initialized part of the inode
-	 * table then we need to initialize up to this inode. */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-
-		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-			/* When marking the block group with
-			 * ~EXT4_BG_INODE_UNINIT we don't want to depend
-			 * on the value of bg_itable_unused even though
-			 * mke2fs could have initialized the same for us.
-			 * Instead we calculated the value below
-			 */
-
-			free = 0;
-		} else {
-			free = EXT4_INODES_PER_GROUP(sb) -
-				ext4_itable_unused_count(sb, gdp);
-		}
+static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
+{
+	struct ext4_group_desc	*gdp;
+	struct ext4_inode	*raw_inode;
+	struct buffer_head	*bh;
+	unsigned long		dtime, now;
+	int	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	int	offset, ret = 0, recentcy = RECENTCY_MIN;
+
+	gdp = ext4_get_group_desc(sb, group, NULL);
+	if (unlikely(!gdp))
+		return 0;
+	bh = sb_getblk(sb, ext4_inode_table(sb, gdp) +
+		       (ino / inodes_per_block));
+	if (unlikely(!bh) || !buffer_uptodate(bh))
 		/*
-		 * Check the relative inode number against the last used
-		 * relative inode number in this group. if it is greater
-		 * we need to  update the bg_itable_unused count
-		 *
+		 * If the block is not in the buffer cache, then it
+		 * must have been written out.
 		 */
-		if (ino > free)
-			ext4_itable_unused_set(sb, gdp,
-					(EXT4_INODES_PER_GROUP(sb) - ino));
-	}
-	count = ext4_free_inodes_count(sb, gdp) - 1;
-	ext4_free_inodes_set(sb, gdp, count);
-	if (S_ISDIR(mode)) {
-		count = ext4_used_dirs_count(sb, gdp) + 1;
-		ext4_used_dirs_set(sb, gdp, count);
-		if (sbi->s_log_groups_per_flex) {
-			ext4_group_t f = ext4_flex_group(sbi, group);
+		goto out;
 
-			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
-		}
-	}
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-err_ret:
-	ext4_unlock_group(sb, group);
-	up_read(&grp->alloc_sem);
-	return retval;
+	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
+	raw_inode = (struct ext4_inode *) (bh->b_data + offset);
+	dtime = le32_to_cpu(raw_inode->i_dtime);
+	now = get_seconds();
+	if (buffer_dirty(bh))
+		recentcy += RECENTCY_DIRTY;
+
+	if (dtime && (dtime < now) && (now < dtime + recentcy))
+		ret = 1;
+out:
+	brelse(bh);
+	return ret;
 }
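[Editor's note] A compilable distillation of the recently_deleted() helper added above: in no-journal mode an inode is considered unsafe to reuse while its deletion time falls inside a small recency window, and the window widens when the inode-table block is still dirty in memory (so it has certainly not been written back). The constants mirror RECENTCY_MIN/RECENTCY_DIRTY; the buffer-cache lookup is reduced to a flag:

#include <stdio.h>
#include <time.h>

#define RECENTCY_MIN	5	/* seconds */
#define RECENTCY_DIRTY	30	/* extra margin while the block is dirty */

static int recently_deleted(unsigned long dtime, int itable_block_dirty)
{
	unsigned long now = (unsigned long)time(NULL);
	int recentcy = RECENTCY_MIN;

	if (itable_block_dirty)
		recentcy += RECENTCY_DIRTY;

	/* dtime == 0 means never deleted; the dtime < now test also
	 * ignores deletion stamps from a clock that ran backwards. */
	return dtime && dtime < now && now < dtime + recentcy;
}

int main(void)
{
	unsigned long now = (unsigned long)time(NULL);

	printf("2s ago, clean:  %d\n", recently_deleted(now - 2, 0));	/* 1 */
	printf("10s ago, clean: %d\n", recently_deleted(now - 10, 0));	/* 0 */
	printf("10s ago, dirty: %d\n", recently_deleted(now - 10, 1));	/* 1 */
	return 0;
}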
 
 /*
@@ -800,8 +708,10 @@ err_ret:
  * For other inodes, search forward from the parent directory's block
  * group to find a free inode.
  */
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
-			     const struct qstr *qstr, __u32 goal)
+struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
+			       umode_t mode, const struct qstr *qstr,
+			       __u32 goal, uid_t *owner, int handle_type,
+			       unsigned int line_no, int nblocks)
 {
 	struct super_block *sb;
 	struct buffer_head *inode_bitmap_bh = NULL;
@@ -815,9 +725,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
 	int ret2, err = 0;
 	struct inode *ret;
 	ext4_group_t i;
-	int free = 0;
-	static int once = 1;
 	ext4_group_t flex_group;
+	struct ext4_group_info *grp;
 
 	/* Cannot create files in a deleted directory */
 	if (!dir || !dir->i_nlink)
@@ -832,6 +741,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
 	ei = EXT4_I(inode);
 	sbi = EXT4_SB(sb);
 
+	/*
+	 * Initalize owners and quota early so that we don't have to account
+	 * for quota initialization worst case in standard inode creating
+	 * transaction
+	 */
+	if (owner) {
+		inode->i_mode = mode;
+		i_uid_write(inode, owner[0]);
+		i_gid_write(inode, owner[1]);
+	} else if (test_opt(sb, GRPID)) {
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = dir->i_gid;
+	} else
+		inode_init_owner(inode, dir, mode);
+	dquot_initialize(inode);
+
 	if (!goal)
 		goal = sbi->s_inode_goal;
 
@@ -842,26 +768,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
 		goto got_group;
 	}
 
-	if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
-		ret2 = find_group_flex(sb, dir, &group);
-		if (ret2 == -1) {
-			ret2 = find_group_other(sb, dir, &group, mode);
-			if (ret2 == 0 && once) {
-				once = 0;
-				printk(KERN_NOTICE "ext4: find_group_flex "
-				       "failed, fallback succeeded dir %lu\n",
-				       dir->i_ino);
-			}
-		}
-		goto got_group;
-	}
-
-	if (S_ISDIR(mode)) {
-		if (test_opt(sb, OLDALLOC))
-			ret2 = find_group_dir(sb, dir, &group);
-		else
-			ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
-	} else
+	if (S_ISDIR(mode))
+		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
+	else
 		ret2 = find_group_other(sb, dir, &group, mode);
 
got_group:
@@ -870,65 +779,87 @@ got_group:
 	if (ret2 == -1)
 		goto out;
 
+	/*
+	 * Normally we will only go through one pass of this loop,
+	 * unless we get unlucky and it turns out the group we selected
+	 * had its last inode grabbed by someone else.
+	 */
 	for (i = 0; i < ngroups; i++, ino = 0) {
 		err = -EIO;
 
 		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
 		if (!gdp)
-			goto fail;
+			goto out;
+
+		/*
+		 * Check free inodes count before loading bitmap.
+		 */
+		if (ext4_free_inodes_count(sb, gdp) == 0) {
+			if (++group == ngroups)
+				group = 0;
+			continue;
+		}
+
+		grp = ext4_get_group_info(sb, group);
+		/* Skip groups with already-known suspicious inode tables */
+		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+			if (++group == ngroups)
+				group = 0;
+			continue;
+		}
 
 		brelse(inode_bitmap_bh);
 		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
-		if (!inode_bitmap_bh)
-			goto fail;
+		/* Skip groups with suspicious inode tables */
+		if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
+			if (++group == ngroups)
+				group = 0;
+			continue;
+		}
 
repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
 					      inode_bitmap_bh->b_data,
 					      EXT4_INODES_PER_GROUP(sb), ino);
-
-		if (ino < EXT4_INODES_PER_GROUP(sb)) {
-
-			BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle,
-							    inode_bitmap_bh);
-			if (err)
-				goto fail;
-
-			BUFFER_TRACE(group_desc_bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle,
-								group_desc_bh);
-			if (err)
-				goto fail;
-			if (!ext4_claim_inode(sb, inode_bitmap_bh,
-						ino, group, mode)) {
-				/* we won it */
-				BUFFER_TRACE(inode_bitmap_bh,
-					"call ext4_handle_dirty_metadata");
-				err = ext4_handle_dirty_metadata(handle,
-								 NULL,
-							inode_bitmap_bh);
-				if (err)
-					goto fail;
-				/* zero bit is inode number 1*/
-				ino++;
-				goto got;
+		if (ino >= EXT4_INODES_PER_GROUP(sb))
+			goto next_group;
+		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+			ext4_error(sb, "reserved inode found cleared - "
+				   "inode=%lu", ino + 1);
+			continue;
+		}
+		if ((EXT4_SB(sb)->s_journal == NULL) &&
+		    recently_deleted(sb, group, ino)) {
+			ino++;
+			goto next_inode;
+		}
+		if (!handle) {
+			BUG_ON(nblocks <= 0);
+			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
+							 handle_type, nblocks,
+							 0);
+			if (IS_ERR(handle)) {
+				err = PTR_ERR(handle);
+				ext4_std_error(sb, err);
+				goto out;
 			}
-			/* we lost it */
-			ext4_handle_release_buffer(handle, inode_bitmap_bh);
-			ext4_handle_release_buffer(handle, group_desc_bh);
-
-			if (++ino < EXT4_INODES_PER_GROUP(sb))
-				goto repeat_in_this_group;
 		}
-
-		/*
-		 * This case is possible in concurrent environment.  It is very
-		 * rare.  We cannot repeat the find_group_xxx() call because
-		 * that will simply return the same blockgroup, because the
-		 * group descriptor metadata has not yet been updated.
-		 * So we just go onto the next blockgroup.
-		 */
+		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+		if (err) {
+			ext4_std_error(sb, err);
+			goto out;
+		}
+		ext4_lock_group(sb, group);
+		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+		ext4_unlock_group(sb, group);
+		ino++;		/* the inode bitmap is zero-based */
+		if (!ret2)
+			goto got; /* we grabbed the inode! */
+next_inode:
+		if (ino < EXT4_INODES_PER_GROUP(sb))
+			goto repeat_in_this_group;
+next_group:
 		if (++group == ngroups)
 			group = 0;
 	}
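[Editor's note] The rewritten loop above drops ext4_claim_inode() in favor of a bare test-and-set under the group lock: scan for a zero bit, try to claim it, and on losing the race resume scanning at the next bit rather than re-running the group chooser. A single-threaded sketch of that control flow; the lock and find_next_zero_bit are simplified stand-ins, not the kernel primitives:

#include <stdio.h>

#define NBITS 16	/* inodes per group in this toy example */

static int find_next_zero_bit(unsigned int map, int start)
{
	for (int i = start; i < NBITS; i++)
		if (!(map & (1u << i)))
			return i;
	return NBITS;	/* exhausted: caller moves to the next group */
}

static int claim_inode(unsigned int *map)
{
	int ino = 0;

	while ((ino = find_next_zero_bit(*map, ino)) < NBITS) {
		/* In ext4 this test-and-set runs under ext4_lock_group(),
		 * so a concurrent winner is detected, not corrupted over. */
		int was_set = (*map >> ino) & 1;

		*map |= 1u << ino;
		if (!was_set)
			return ino + 1;	/* the inode bitmap is zero-based */
		ino++;			/* lost the race: keep scanning */
	}
	return -1;
}

int main(void)
{
	unsigned int bitmap = 0x0b;	/* inodes 1, 2 and 4 already in use */

	printf("claimed inode %d\n", claim_inode(&bitmap));	/* 3 */
	printf("claimed inode %d\n", claim_inode(&bitmap));	/* 5 */
	return 0;
}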
@@ -936,8 +867,22 @@ repeat_in_this_group:
 	goto out;
 
got:
+	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
+
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, group_desc_bh);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
+
 	/* We may have to initialize the block bitmap if it isn't already */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
+	if (ext4_has_group_desc_csum(sb) &&
 	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		struct buffer_head *block_bitmap_bh;
 
@@ -946,54 +891,90 @@ got:
 		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
 		if (err) {
 			brelse(block_bitmap_bh);
-			goto fail;
+			ext4_std_error(sb, err);
+			goto out;
 		}
 
-		free = 0;
-		ext4_lock_group(sb, group);
+		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+		err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
+
 		/* recheck and clear flag under lock if we still need to */
+		ext4_lock_group(sb, group);
 		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-			free = ext4_free_blocks_after_init(sb, group, gdp);
 			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-			ext4_free_blks_set(sb, gdp, free);
-			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
-								gdp);
+			ext4_free_group_clusters_set(sb, gdp,
+				ext4_free_clusters_after_init(sb, group, gdp));
+			ext4_block_bitmap_csum_set(sb, group, gdp,
+						   block_bitmap_bh);
+			ext4_group_desc_csum_set(sb, group, gdp);
 		}
 		ext4_unlock_group(sb, group);
+		brelse(block_bitmap_bh);
 
-		/* Don't need to dirty bitmap block if we didn't change it */
-		if (free) {
-			BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
-			err = ext4_handle_dirty_metadata(handle,
-							NULL, block_bitmap_bh);
+		if (err) {
+			ext4_std_error(sb, err);
+			goto out;
 		}
+	}
 
-		brelse(block_bitmap_bh);
-		if (err)
-			goto fail;
+	/* Update the relevant bg descriptor fields */
+	if (ext4_has_group_desc_csum(sb)) {
+		int free;
+		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+		ext4_lock_group(sb, group); /* while we modify the bg desc */
+		free = EXT4_INODES_PER_GROUP(sb) -
+			ext4_itable_unused_count(sb, gdp);
+		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+			free = 0;
+		}
+		/*
+		 * Check the relative inode number against the last used
+		 * relative inode number in this group. if it is greater
+		 * we need to update the bg_itable_unused count
+		 */
+		if (ino > free)
+			ext4_itable_unused_set(sb, gdp,
+					(EXT4_INODES_PER_GROUP(sb) - ino));
+		up_read(&grp->alloc_sem);
+	} else {
+		ext4_lock_group(sb, group);
 	}
+
+	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+	if (S_ISDIR(mode)) {
+		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+		if (sbi->s_log_groups_per_flex) {
+			ext4_group_t f = ext4_flex_group(sbi, group);
+
+			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+		}
+	}
+	if (ext4_has_group_desc_csum(sb)) {
+		ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
+					   EXT4_INODES_PER_GROUP(sb) / 8);
+		ext4_group_desc_csum_set(sb, group, gdp);
+	}
+	ext4_unlock_group(sb, group);
+
 	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
-	if (err)
-		goto fail;
+	if (err) {
+		ext4_std_error(sb, err);
+		goto out;
+	}
 
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
-	ext4_mark_super_dirty(sb);
 
 	if (sbi->s_log_groups_per_flex) {
 		flex_group = ext4_flex_group(sbi, group);
 		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
 	}
-	if (test_opt(sb, GRPID)) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = dir->i_gid;
-	} else
-		inode_init_owner(inode, dir, mode);
-
 	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
 	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = 0;
@@ -1004,11 +985,7 @@ got:
 	ei->i_dir_start_lookup = 0;
 	ei->i_disksize = 0;
 
-	/*
-	 * Don't inherit extent flag from directory, amongst others. We set
-	 * extent flag on newly created directory and file only if -o extent
-	 * mount option is specified
-	 */
+	/* Don't inherit extent flag from directory, amongst others. */
 	ei->i_flags =
 		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
 	ei->i_file_acl = 0;
@@ -1020,20 +997,41 @@ got:
 	if (IS_DIRSYNC(inode))
 		ext4_handle_sync(handle);
 	if (insert_inode_locked(inode) < 0) {
-		err = -EINVAL;
-		goto fail_drop;
+		/*
+		 * Likely a bitmap corruption causing inode to be allocated
+		 * twice.
+		 */
+		err = -EIO;
+		ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
+			   inode->i_ino);
+		goto out;
 	}
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
 
-	ei->i_state_flags = 0;
+	/* Precompute checksum seed for inode metadata */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+		__u32 csum;
+		__le32 inum = cpu_to_le32(inode->i_ino);
+		__le32 gen = cpu_to_le32(inode->i_generation);
+		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+				   sizeof(inum));
+		ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+					      sizeof(gen));
+	}
+
+	ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
 	ext4_set_inode_state(inode, EXT4_STATE_NEW);
 
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
+	ei->i_inline_off = 0;
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
 	ret = inode;
-	dquot_initialize(inode);
 	err = dquot_alloc_inode(inode);
 	if (err)
 		goto fail_drop;
@@ -1042,7 +1040,7 @@ got:
 	if (err)
 		goto fail_free_drop;
 
-	err = ext4_init_security(handle, inode, dir);
+	err = ext4_init_security(handle, inode, dir, qstr);
 	if (err)
 		goto fail_free_drop;
 
@@ -1054,6 +1052,11 @@ got:
 		}
 	}
 
+	if (ext4_handle_valid(handle)) {
+		ei->i_sync_tid = handle->h_transaction->t_tid;
+		ei->i_datasync_tid = handle->h_transaction->t_tid;
+	}
+
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_std_error(sb, err);
@@ -1062,24 +1065,17 @@ got:
 	ext4_debug("allocating inode %lu\n", inode->i_ino);
 	trace_ext4_allocate_inode(inode, dir, mode);
-	goto really_out;
-fail:
-	ext4_std_error(sb, err);
-out:
-	iput(inode);
-	ret = ERR_PTR(err);
-really_out:
 	brelse(inode_bitmap_bh);
 	return ret;
 
fail_free_drop:
 	dquot_free_inode(inode);
-
fail_drop:
+	clear_nlink(inode);
+	unlock_new_inode(inode);
+out:
 	dquot_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
-	inode->i_nlink = 0;
-	unlock_new_inode(inode);
 	iput(inode);
 	brelse(inode_bitmap_bh);
 	return ERR_PTR(err);
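[Editor's note] The metadata_csum hunk above derives a per-inode checksum seed by chaining CRC32c from the per-filesystem seed over the little-endian inode number and then the generation, so per-inode checksums never have to rehash the filesystem UUID. A self-contained sketch of that chaining with a bitwise CRC32c; the kernel goes through ext4_chksum() and its crc32c library, and the fs seed below is a made-up value:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Bitwise CRC32c (Castagnoli, reflected polynomial 0x82F63B78),
 * seeded in the style of the kernel's crc32c(seed, buf, len). */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	uint32_t fs_seed = 0xdeadbeef;	/* per-fs seed; ext4 derives it from the UUID */
	uint32_t inum = 12;		/* inode number (stored little-endian on disk) */
	uint32_t gen = 1;		/* i_generation */

	/* Chain: fs seed -> + inode number -> + generation */
	uint32_t csum = crc32c(fs_seed, &inum, sizeof(inum));
	uint32_t i_csum_seed = crc32c(csum, &gen, sizeof(gen));

	printf("i_csum_seed = 0x%08x\n", i_csum_seed);
	return 0;
}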
@@ -1138,17 +1134,17 @@ iget_failed:
 	inode = NULL;
bad_orphan:
 	ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
-	printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+	printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n",
 	       bit, (unsigned long long)bitmap_bh->b_blocknr,
 	       ext4_test_bit(bit, bitmap_bh->b_data));
-	printk(KERN_NOTICE "inode=%p\n", inode);
+	printk(KERN_WARNING "inode=%p\n", inode);
 	if (inode) {
-		printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+		printk(KERN_WARNING "is_bad_inode(inode)=%d\n",
 		       is_bad_inode(inode));
-		printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+		printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n",
 		       NEXT_ORPHAN(inode));
-		printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
-		printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
+		printk(KERN_WARNING "max_ino=%lu\n", max_ino);
+		printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink);
 		/* Avoid freeing blocks if we got a bad deleted inode */
 		if (inode->i_nlink == 0)
 			inode->i_blocks = 0;
@@ -1183,7 +1179,8 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 		if (!bitmap_bh)
 			continue;
 
-		x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
+		x = ext4_count_free(bitmap_bh->b_data,
+				    EXT4_INODES_PER_GROUP(sb) / 8);
 		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
 			(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
 		bitmap_count += x;
@@ -1227,9 +1224,9 @@ unsigned long ext4_count_dirs(struct super_block * sb)
  * where it is called from on active part of filesystem is ext4lazyinit
  * thread, so we do not need any special locks, however we have to prevent
  * inode allocation from the current group, so we take alloc_sem lock, to
- * block ext4_claim_inode until we are finished.
+ * block ext4_new_inode() until we are finished.
  */
-extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
+int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 				 int barrier)
 {
 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
@@ -1257,7 +1254,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
 		goto out;
 
-	handle = ext4_journal_start_sb(sb, 1);
+	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		goto out;
@@ -1275,13 +1272,13 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 			    sbi->s_inodes_per_block);
 
 	if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
-		ext4_error(sb, "Something is wrong with group %u\n"
-			   "Used itable blocks: %d"
-			   "itable unused count: %u\n",
+		ext4_error(sb, "Something is wrong with group %u: "
+			   "used itable blocks: %d; "
+			   "itable unused count: %u",
 			   group, used_blks,
 			   ext4_itable_unused_count(sb, gdp));
 		ret = 1;
-		goto out;
+		goto err_out;
 	}
 
 	blk = ext4_inode_table(sb, gdp) + used_blks;
@@ -1312,7 +1309,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
skip_zeroout:
 	ext4_lock_group(sb, group);
 	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+	ext4_group_desc_csum_set(sb, group, gdp);
 	ext4_unlock_group(sb, group);
 
 	BUFFER_TRACE(group_desc_bh,
