diff options
Diffstat (limited to 'fs/xfs/xfs_ialloc.c')
| -rw-r--r-- | fs/xfs/xfs_ialloc.c | 813 | 
1 files changed, 654 insertions, 159 deletions
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index ccf2fb14396..5960e5593fe 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -17,29 +17,30 @@   */  #include "xfs.h"  #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h"  #include "xfs_bit.h" -#include "xfs_log.h"  #include "xfs_inum.h" -#include "xfs_trans.h"  #include "xfs_sb.h"  #include "xfs_ag.h"  #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h"  #include "xfs_inode.h"  #include "xfs_btree.h"  #include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h"  #include "xfs_alloc.h"  #include "xfs_rtalloc.h"  #include "xfs_error.h"  #include "xfs_bmap.h"  #include "xfs_cksum.h" +#include "xfs_trans.h"  #include "xfs_buf_item.h"  #include "xfs_icreate_item.h"  #include "xfs_icache.h" +#include "xfs_dinode.h" +#include "xfs_trace.h"  /* @@ -51,7 +52,7 @@ xfs_ialloc_cluster_alignment(  {  	if (xfs_sb_version_hasalign(&args->mp->m_sb) &&  	    args->mp->m_sb.sb_inoalignmt >= -	     XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp))) +	     XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))  		return args->mp->m_sb.sb_inoalignmt;  	return 1;  } @@ -111,6 +112,66 @@ xfs_inobt_get_rec(  }  /* + * Insert a single inobt record. Cursor must already point to desired location. + */ +STATIC int +xfs_inobt_insert_rec( +	struct xfs_btree_cur	*cur, +	__int32_t		freecount, +	xfs_inofree_t		free, +	int			*stat) +{ +	cur->bc_rec.i.ir_freecount = freecount; +	cur->bc_rec.i.ir_free = free; +	return xfs_btree_insert(cur, stat); +} + +/* + * Insert records describing a newly allocated inode chunk into the inobt. + */ +STATIC int +xfs_inobt_insert( +	struct xfs_mount	*mp, +	struct xfs_trans	*tp, +	struct xfs_buf		*agbp, +	xfs_agino_t		newino, +	xfs_agino_t		newlen, +	xfs_btnum_t		btnum) +{ +	struct xfs_btree_cur	*cur; +	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp); +	xfs_agnumber_t		agno = be32_to_cpu(agi->agi_seqno); +	xfs_agino_t		thisino; +	int			i; +	int			error; + +	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + +	for (thisino = newino; +	     thisino < newino + newlen; +	     thisino += XFS_INODES_PER_CHUNK) { +		error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i); +		if (error) { +			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); +			return error; +		} +		ASSERT(i == 0); + +		error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, +					     XFS_INOBT_ALL_FREE, &i); +		if (error) { +			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); +			return error; +		} +		ASSERT(i == 1); +	} + +	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + +	return 0; +} + +/*   * Verify that the number of free inodes in the AGI is correct.   */  #ifdef DEBUG @@ -169,27 +230,20 @@ xfs_ialloc_inode_init(  {  	struct xfs_buf		*fbuf;  	struct xfs_dinode	*free; -	int			blks_per_cluster, nbufs, ninodes; +	int			nbufs, blks_per_cluster, inodes_per_cluster;  	int			version;  	int			i, j;  	xfs_daddr_t		d;  	xfs_ino_t		ino = 0;  	/* -	 * Loop over the new block(s), filling in the inodes. -	 * For small block sizes, manipulate the inodes in buffers -	 * which are multiples of the blocks size. +	 * Loop over the new block(s), filling in the inodes.  For small block +	 * sizes, manipulate the inodes in buffers  which are multiples of the +	 * blocks size.  	 */ -	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { -		blks_per_cluster = 1; -		nbufs = length; -		ninodes = mp->m_sb.sb_inopblock; -	} else { -		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / -				   mp->m_sb.sb_blocksize; -		nbufs = length / blks_per_cluster; -		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; -	} +	blks_per_cluster = xfs_icluster_size_fsb(mp); +	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; +	nbufs = length / blks_per_cluster;  	/*  	 * Figure out what version number to use in the inodes we create.  If @@ -224,12 +278,10 @@ xfs_ialloc_inode_init(  		 * they track in the AIL as if they were physically logged.  		 */  		if (tp) -			xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp), +			xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,  					mp->m_sb.sb_inodesize, length, gen); -	} else if (xfs_sb_version_hasnlink(&mp->m_sb)) +	} else  		version = 2; -	else -		version = 1;  	for (j = 0; j < nbufs; j++) {  		/* @@ -245,7 +297,7 @@ xfs_ialloc_inode_init(  		/* Initialize the inode buffers and log them appropriately. */  		fbuf->b_ops = &xfs_inode_buf_ops;  		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); -		for (i = 0; i < ninodes; i++) { +		for (i = 0; i < inodes_per_cluster; i++) {  			int	ioffset = i << mp->m_sb.sb_inodelog;  			uint	isize = xfs_dinode_size(version); @@ -309,13 +361,10 @@ xfs_ialloc_ag_alloc(  {  	xfs_agi_t	*agi;		/* allocation group header */  	xfs_alloc_arg_t	args;		/* allocation argument structure */ -	xfs_btree_cur_t	*cur;		/* inode btree cursor */  	xfs_agnumber_t	agno;  	int		error; -	int		i;  	xfs_agino_t	newino;		/* new first inode's number */  	xfs_agino_t	newlen;		/* new number of inodes */ -	xfs_agino_t	thisino;	/* current inode number, for loop */  	int		isaligned = 0;	/* inode allocation at stripe unit */  					/* boundary */  	struct xfs_perag *pag; @@ -328,11 +377,11 @@ xfs_ialloc_ag_alloc(  	 * Locking will ensure that we don't have two callers in here  	 * at one time.  	 */ -	newlen = XFS_IALLOC_INODES(args.mp); +	newlen = args.mp->m_ialloc_inos;  	if (args.mp->m_maxicount &&  	    args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)  		return XFS_ERROR(ENOSPC); -	args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); +	args.minlen = args.maxlen = args.mp->m_ialloc_blks;  	/*  	 * First try to allocate inodes contiguous with the last-allocated  	 * chunk of inodes.  If the filesystem is striped, this will fill @@ -342,7 +391,7 @@ xfs_ialloc_ag_alloc(  	newino = be32_to_cpu(agi->agi_newino);  	agno = be32_to_cpu(agi->agi_seqno);  	args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + -			XFS_IALLOC_BLOCKS(args.mp); +		     args.mp->m_ialloc_blks;  	if (likely(newino != NULLAGINO &&  		  (args.agbno < be32_to_cpu(agi->agi_length)))) {  		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); @@ -369,6 +418,18 @@ xfs_ialloc_ag_alloc(  		args.minleft = args.mp->m_in_maxlevels - 1;  		if ((error = xfs_alloc_vextent(&args)))  			return error; + +		/* +		 * This request might have dirtied the transaction if the AG can +		 * satisfy the request, but the exact block was not available. +		 * If the allocation did fail, subsequent requests will relax +		 * the exact agbno requirement and increase the alignment +		 * instead. It is critical that the total size of the request +		 * (len + alignment + slop) does not increase from this point +		 * on, so reset minalignslop to ensure it is not included in +		 * subsequent requests. +		 */ +		args.minalignslop = 0;  	} else  		args.fsbno = NULLFSBLOCK; @@ -453,29 +514,19 @@ xfs_ialloc_ag_alloc(  	agi->agi_newino = cpu_to_be32(newino);  	/* -	 * Insert records describing the new inode chunk into the btree. +	 * Insert records describing the new inode chunk into the btrees.  	 */ -	cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno); -	for (thisino = newino; -	     thisino < newino + newlen; -	     thisino += XFS_INODES_PER_CHUNK) { -		cur->bc_rec.i.ir_startino = thisino; -		cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK; -		cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE; -		error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i); -		if (error) { -			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); -			return error; -		} -		ASSERT(i == 0); -		error = xfs_btree_insert(cur, &i); -		if (error) { -			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); +	error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, +				 XFS_BTNUM_INO); +	if (error) +		return error; + +	if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { +		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, +					 XFS_BTNUM_FINO); +		if (error)  			return error; -		} -		ASSERT(i == 1);  	} -	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);  	/*  	 * Log allocation group header fields  	 */ @@ -584,7 +635,7 @@ xfs_ialloc_ag_select(  		 * Is there enough free space for the file plus a block of  		 * inodes? (if we need to allocate some)?  		 */ -		ineed = XFS_IALLOC_BLOCKS(mp); +		ineed = mp->m_ialloc_blks;  		longest = pag->pagf_longest;  		if (!longest)  			longest = pag->pagf_flcount > 0; @@ -669,13 +720,10 @@ xfs_ialloc_get_rec(  }  /* - * Allocate an inode. - * - * The caller selected an AG for us, and made sure that free inodes are - * available. + * Allocate an inode using the inobt-only algorithm.   */  STATIC int -xfs_dialloc_ag( +xfs_dialloc_ag_inobt(  	struct xfs_trans	*tp,  	struct xfs_buf		*agbp,  	xfs_ino_t		parent, @@ -701,7 +749,7 @@ xfs_dialloc_ag(  	ASSERT(pag->pagi_freecount > 0);   restart_pagno: -	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); +	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);  	/*  	 * If pagino is 0 (this is the root inode allocation) use newino.  	 * This must work because we've just allocated some. @@ -934,6 +982,294 @@ error0:  }  /* + * Use the free inode btree to allocate an inode based on distance from the + * parent. Note that the provided cursor may be deleted and replaced. + */ +STATIC int +xfs_dialloc_ag_finobt_near( +	xfs_agino_t			pagino, +	struct xfs_btree_cur		**ocur, +	struct xfs_inobt_rec_incore	*rec) +{ +	struct xfs_btree_cur		*lcur = *ocur;	/* left search cursor */ +	struct xfs_btree_cur		*rcur;	/* right search cursor */ +	struct xfs_inobt_rec_incore	rrec; +	int				error; +	int				i, j; + +	error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i); +	if (error) +		return error; + +	if (i == 1) { +		error = xfs_inobt_get_rec(lcur, rec, &i); +		if (error) +			return error; +		XFS_WANT_CORRUPTED_RETURN(i == 1); + +		/* +		 * See if we've landed in the parent inode record. The finobt +		 * only tracks chunks with at least one free inode, so record +		 * existence is enough. +		 */ +		if (pagino >= rec->ir_startino && +		    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) +			return 0; +	} + +	error = xfs_btree_dup_cursor(lcur, &rcur); +	if (error) +		return error; + +	error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j); +	if (error) +		goto error_rcur; +	if (j == 1) { +		error = xfs_inobt_get_rec(rcur, &rrec, &j); +		if (error) +			goto error_rcur; +		XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); +	} + +	XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); +	if (i == 1 && j == 1) { +		/* +		 * Both the left and right records are valid. Choose the closer +		 * inode chunk to the target. +		 */ +		if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) > +		    (rrec.ir_startino - pagino)) { +			*rec = rrec; +			xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); +			*ocur = rcur; +		} else { +			xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); +		} +	} else if (j == 1) { +		/* only the right record is valid */ +		*rec = rrec; +		xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); +		*ocur = rcur; +	} else if (i == 1) { +		/* only the left record is valid */ +		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); +	} + +	return 0; + +error_rcur: +	xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR); +	return error; +} + +/* + * Use the free inode btree to find a free inode based on a newino hint. If + * the hint is NULL, find the first free inode in the AG. + */ +STATIC int +xfs_dialloc_ag_finobt_newino( +	struct xfs_agi			*agi, +	struct xfs_btree_cur		*cur, +	struct xfs_inobt_rec_incore	*rec) +{ +	int error; +	int i; + +	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { +		error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ, +					 &i); +		if (error) +			return error; +		if (i == 1) { +			error = xfs_inobt_get_rec(cur, rec, &i); +			if (error) +				return error; +			XFS_WANT_CORRUPTED_RETURN(i == 1); + +			return 0; +		} +	} + +	/* +	 * Find the first inode available in the AG. +	 */ +	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); +	if (error) +		return error; +	XFS_WANT_CORRUPTED_RETURN(i == 1); + +	error = xfs_inobt_get_rec(cur, rec, &i); +	if (error) +		return error; +	XFS_WANT_CORRUPTED_RETURN(i == 1); + +	return 0; +} + +/* + * Update the inobt based on a modification made to the finobt. Also ensure that + * the records from both trees are equivalent post-modification. + */ +STATIC int +xfs_dialloc_ag_update_inobt( +	struct xfs_btree_cur		*cur,	/* inobt cursor */ +	struct xfs_inobt_rec_incore	*frec,	/* finobt record */ +	int				offset) /* inode offset */ +{ +	struct xfs_inobt_rec_incore	rec; +	int				error; +	int				i; + +	error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); +	if (error) +		return error; +	XFS_WANT_CORRUPTED_RETURN(i == 1); + +	error = xfs_inobt_get_rec(cur, &rec, &i); +	if (error) +		return error; +	XFS_WANT_CORRUPTED_RETURN(i == 1); +	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % +				   XFS_INODES_PER_CHUNK) == 0); + +	rec.ir_free &= ~XFS_INOBT_MASK(offset); +	rec.ir_freecount--; + +	XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && +				  (rec.ir_freecount == frec->ir_freecount)); + +	error = xfs_inobt_update(cur, &rec); +	if (error) +		return error; + +	return 0; +} + +/* + * Allocate an inode using the free inode btree, if available. Otherwise, fall + * back to the inobt search algorithm. + * + * The caller selected an AG for us, and made sure that free inodes are + * available. + */ +STATIC int +xfs_dialloc_ag( +	struct xfs_trans	*tp, +	struct xfs_buf		*agbp, +	xfs_ino_t		parent, +	xfs_ino_t		*inop) +{ +	struct xfs_mount		*mp = tp->t_mountp; +	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp); +	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno); +	xfs_agnumber_t			pagno = XFS_INO_TO_AGNO(mp, parent); +	xfs_agino_t			pagino = XFS_INO_TO_AGINO(mp, parent); +	struct xfs_perag		*pag; +	struct xfs_btree_cur		*cur;	/* finobt cursor */ +	struct xfs_btree_cur		*icur;	/* inobt cursor */ +	struct xfs_inobt_rec_incore	rec; +	xfs_ino_t			ino; +	int				error; +	int				offset; +	int				i; + +	if (!xfs_sb_version_hasfinobt(&mp->m_sb)) +		return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); + +	pag = xfs_perag_get(mp, agno); + +	/* +	 * If pagino is 0 (this is the root inode allocation) use newino. +	 * This must work because we've just allocated some. +	 */ +	if (!pagino) +		pagino = be32_to_cpu(agi->agi_newino); + +	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + +	error = xfs_check_agi_freecount(cur, agi); +	if (error) +		goto error_cur; + +	/* +	 * The search algorithm depends on whether we're in the same AG as the +	 * parent. If so, find the closest available inode to the parent. If +	 * not, consider the agi hint or find the first free inode in the AG. +	 */ +	if (agno == pagno) +		error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); +	else +		error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); +	if (error) +		goto error_cur; + +	offset = xfs_lowbit64(rec.ir_free); +	ASSERT(offset >= 0); +	ASSERT(offset < XFS_INODES_PER_CHUNK); +	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % +				   XFS_INODES_PER_CHUNK) == 0); +	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + +	/* +	 * Modify or remove the finobt record. +	 */ +	rec.ir_free &= ~XFS_INOBT_MASK(offset); +	rec.ir_freecount--; +	if (rec.ir_freecount) +		error = xfs_inobt_update(cur, &rec); +	else +		error = xfs_btree_delete(cur, &i); +	if (error) +		goto error_cur; + +	/* +	 * The finobt has now been updated appropriately. We haven't updated the +	 * agi and superblock yet, so we can create an inobt cursor and validate +	 * the original freecount. If all is well, make the equivalent update to +	 * the inobt using the finobt record and offset information. +	 */ +	icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + +	error = xfs_check_agi_freecount(icur, agi); +	if (error) +		goto error_icur; + +	error = xfs_dialloc_ag_update_inobt(icur, &rec, offset); +	if (error) +		goto error_icur; + +	/* +	 * Both trees have now been updated. We must update the perag and +	 * superblock before we can check the freecount for each btree. +	 */ +	be32_add_cpu(&agi->agi_freecount, -1); +	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); +	pag->pagi_freecount--; + +	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + +	error = xfs_check_agi_freecount(icur, agi); +	if (error) +		goto error_icur; +	error = xfs_check_agi_freecount(cur, agi); +	if (error) +		goto error_icur; + +	xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); +	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); +	xfs_perag_put(pag); +	*inop = ino; +	return 0; + +error_icur: +	xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); +error_cur: +	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); +	xfs_perag_put(pag); +	return error; +} + +/*   * Allocate an inode on disk.   *   * Mode is used to tell whether the new inode will need space, and whether it @@ -998,7 +1334,7 @@ xfs_dialloc(  	 * inode.  	 */  	if (mp->m_maxicount && -	    mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { +	    mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {  		noroom = 1;  		okalloc = 0;  	} @@ -1092,78 +1428,34 @@ out_error:  	return XFS_ERROR(error);  } -/* - * Free disk inode.  Carefully avoids touching the incore inode, all - * manipulations incore are the caller's responsibility. - * The on-disk inode is not changed by this operation, only the - * btree (free inode mask) is changed. - */ -int -xfs_difree( -	xfs_trans_t	*tp,		/* transaction pointer */ -	xfs_ino_t	inode,		/* inode to be freed */ -	xfs_bmap_free_t	*flist,		/* extents to free */ -	int		*delete,	/* set if inode cluster was deleted */ -	xfs_ino_t	*first_ino)	/* first inode in deleted cluster */ +STATIC int +xfs_difree_inobt( +	struct xfs_mount		*mp, +	struct xfs_trans		*tp, +	struct xfs_buf			*agbp, +	xfs_agino_t			agino, +	struct xfs_bmap_free		*flist, +	int				*deleted, +	xfs_ino_t			*first_ino, +	struct xfs_inobt_rec_incore	*orec)  { -	/* REFERENCED */ -	xfs_agblock_t	agbno;	/* block number containing inode */ -	xfs_buf_t	*agbp;	/* buffer containing allocation group header */ -	xfs_agino_t	agino;	/* inode number relative to allocation group */ -	xfs_agnumber_t	agno;	/* allocation group number */ -	xfs_agi_t	*agi;	/* allocation group header */ -	xfs_btree_cur_t	*cur;	/* inode btree cursor */ -	int		error;	/* error return value */ -	int		i;	/* result code */ -	int		ilen;	/* inodes in an inode cluster */ -	xfs_mount_t	*mp;	/* mount structure for filesystem */ -	int		off;	/* offset of inode in inode chunk */ -	xfs_inobt_rec_incore_t rec;	/* btree record */ -	struct xfs_perag *pag; +	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp); +	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno); +	struct xfs_perag		*pag; +	struct xfs_btree_cur		*cur; +	struct xfs_inobt_rec_incore	rec; +	int				ilen; +	int				error; +	int				i; +	int				off; -	mp = tp->t_mountp; - -	/* -	 * Break up inode number into its components. -	 */ -	agno = XFS_INO_TO_AGNO(mp, inode); -	if (agno >= mp->m_sb.sb_agcount)  { -		xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", -			__func__, agno, mp->m_sb.sb_agcount); -		ASSERT(0); -		return XFS_ERROR(EINVAL); -	} -	agino = XFS_INO_TO_AGINO(mp, inode); -	if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  { -		xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", -			__func__, (unsigned long long)inode, -			(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); -		ASSERT(0); -		return XFS_ERROR(EINVAL); -	} -	agbno = XFS_AGINO_TO_AGBNO(mp, agino); -	if (agbno >= mp->m_sb.sb_agblocks)  { -		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", -			__func__, agbno, mp->m_sb.sb_agblocks); -		ASSERT(0); -		return XFS_ERROR(EINVAL); -	} -	/* -	 * Get the allocation group header. -	 */ -	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); -	if (error) { -		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", -			__func__, error); -		return error; -	} -	agi = XFS_BUF_TO_AGI(agbp);  	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); -	ASSERT(agbno < be32_to_cpu(agi->agi_length)); +	ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length)); +  	/*  	 * Initialize the cursor.  	 */ -	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); +	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);  	error = xfs_check_agi_freecount(cur, agi);  	if (error) @@ -1201,9 +1493,9 @@ xfs_difree(  	 * When an inode cluster is free, it becomes eligible for removal  	 */  	if (!(mp->m_flags & XFS_MOUNT_IKEEP) && -	    (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { +	    (rec.ir_freecount == mp->m_ialloc_inos)) { -		*delete = 1; +		*deleted = 1;  		*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);  		/* @@ -1211,7 +1503,7 @@ xfs_difree(  		 * AGI and Superblock inode counts, and mark the disk space  		 * to be freed when the transaction is committed.  		 */ -		ilen = XFS_IALLOC_INODES(mp); +		ilen = mp->m_ialloc_inos;  		be32_add_cpu(&agi->agi_count, -ilen);  		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));  		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1227,11 +1519,11 @@ xfs_difree(  			goto error0;  		} -		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, -				agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), -				XFS_IALLOC_BLOCKS(mp), flist, mp); +		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, +				  XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), +				  mp->m_ialloc_blks, flist, mp);  	} else { -		*delete = 0; +		*deleted = 0;  		error = xfs_inobt_update(cur, &rec);  		if (error) { @@ -1255,6 +1547,7 @@ xfs_difree(  	if (error)  		goto error0; +	*orec = rec;  	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);  	return 0; @@ -1263,6 +1556,182 @@ error0:  	return error;  } +/* + * Free an inode in the free inode btree. + */ +STATIC int +xfs_difree_finobt( +	struct xfs_mount		*mp, +	struct xfs_trans		*tp, +	struct xfs_buf			*agbp, +	xfs_agino_t			agino, +	struct xfs_inobt_rec_incore	*ibtrec) /* inobt record */ +{ +	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp); +	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno); +	struct xfs_btree_cur		*cur; +	struct xfs_inobt_rec_incore	rec; +	int				offset = agino - ibtrec->ir_startino; +	int				error; +	int				i; + +	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + +	error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); +	if (error) +		goto error; +	if (i == 0) { +		/* +		 * If the record does not exist in the finobt, we must have just +		 * freed an inode in a previously fully allocated chunk. If not, +		 * something is out of sync. +		 */ +		XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); + +		error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, +					     ibtrec->ir_free, &i); +		if (error) +			goto error; +		ASSERT(i == 1); + +		goto out; +	} + +	/* +	 * Read and update the existing record. We could just copy the ibtrec +	 * across here, but that would defeat the purpose of having redundant +	 * metadata. By making the modifications independently, we can catch +	 * corruptions that we wouldn't see if we just copied from one record +	 * to another. +	 */ +	error = xfs_inobt_get_rec(cur, &rec, &i); +	if (error) +		goto error; +	XFS_WANT_CORRUPTED_GOTO(i == 1, error); + +	rec.ir_free |= XFS_INOBT_MASK(offset); +	rec.ir_freecount++; + +	XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && +				(rec.ir_freecount == ibtrec->ir_freecount), +				error); + +	/* +	 * The content of inobt records should always match between the inobt +	 * and finobt. The lifecycle of records in the finobt is different from +	 * the inobt in that the finobt only tracks records with at least one +	 * free inode. Hence, if all of the inodes are free and we aren't +	 * keeping inode chunks permanently on disk, remove the record. +	 * Otherwise, update the record with the new information. +	 */ +	if (rec.ir_freecount == mp->m_ialloc_inos && +	    !(mp->m_flags & XFS_MOUNT_IKEEP)) { +		error = xfs_btree_delete(cur, &i); +		if (error) +			goto error; +		ASSERT(i == 1); +	} else { +		error = xfs_inobt_update(cur, &rec); +		if (error) +			goto error; +	} + +out: +	error = xfs_check_agi_freecount(cur, agi); +	if (error) +		goto error; + +	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); +	return 0; + +error: +	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); +	return error; +} + +/* + * Free disk inode.  Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int +xfs_difree( +	struct xfs_trans	*tp,		/* transaction pointer */ +	xfs_ino_t		inode,		/* inode to be freed */ +	struct xfs_bmap_free	*flist,		/* extents to free */ +	int			*deleted,/* set if inode cluster was deleted */ +	xfs_ino_t		*first_ino)/* first inode in deleted cluster */ +{ +	/* REFERENCED */ +	xfs_agblock_t		agbno;	/* block number containing inode */ +	struct xfs_buf		*agbp;	/* buffer for allocation group header */ +	xfs_agino_t		agino;	/* allocation group inode number */ +	xfs_agnumber_t		agno;	/* allocation group number */ +	int			error;	/* error return value */ +	struct xfs_mount	*mp;	/* mount structure for filesystem */ +	struct xfs_inobt_rec_incore rec;/* btree record */ + +	mp = tp->t_mountp; + +	/* +	 * Break up inode number into its components. +	 */ +	agno = XFS_INO_TO_AGNO(mp, inode); +	if (agno >= mp->m_sb.sb_agcount)  { +		xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", +			__func__, agno, mp->m_sb.sb_agcount); +		ASSERT(0); +		return XFS_ERROR(EINVAL); +	} +	agino = XFS_INO_TO_AGINO(mp, inode); +	if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  { +		xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", +			__func__, (unsigned long long)inode, +			(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); +		ASSERT(0); +		return XFS_ERROR(EINVAL); +	} +	agbno = XFS_AGINO_TO_AGBNO(mp, agino); +	if (agbno >= mp->m_sb.sb_agblocks)  { +		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", +			__func__, agbno, mp->m_sb.sb_agblocks); +		ASSERT(0); +		return XFS_ERROR(EINVAL); +	} +	/* +	 * Get the allocation group header. +	 */ +	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); +	if (error) { +		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", +			__func__, error); +		return error; +	} + +	/* +	 * Fix up the inode allocation btree. +	 */ +	error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, +				 &rec); +	if (error) +		goto error0; + +	/* +	 * Fix up the free inode btree. +	 */ +	if (xfs_sb_version_hasfinobt(&mp->m_sb)) { +		error = xfs_difree_finobt(mp, tp, agbp, agino, &rec); +		if (error) +			goto error0; +	} + +	return 0; + +error0: +	return error; +} +  STATIC int  xfs_imap_lookup(  	struct xfs_mount	*mp, @@ -1294,7 +1763,7 @@ xfs_imap_lookup(  	 * we have a record, we need to ensure it contains the inode number  	 * we are looking up.  	 */ -	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); +	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);  	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);  	if (!error) {  		if (i) @@ -1310,7 +1779,7 @@ xfs_imap_lookup(  	/* check that the returned record contains the required inode */  	if (rec.ir_startino > agino || -	    rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) +	    rec.ir_startino + mp->m_ialloc_inos <= agino)  		return EINVAL;  	/* for untrusted inodes check it is allocated first */ @@ -1383,7 +1852,7 @@ xfs_imap(  		return XFS_ERROR(EINVAL);  	} -	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; +	blks_per_cluster = xfs_icluster_size_fsb(mp);  	/*  	 * For bulkstat and handle lookups, we have an untrusted inode number @@ -1404,7 +1873,7 @@ xfs_imap(  	 * If the inode cluster size is the same as the blocksize or  	 * smaller we get to the buffer by simple arithmetics.  	 */ -	if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) { +	if (blks_per_cluster == 1) {  		offset = XFS_INO_TO_OFFSET(mp, ino);  		ASSERT(offset < mp->m_sb.sb_inopblock); @@ -1482,7 +1951,16 @@ xfs_ialloc_compute_maxlevels(  }  /* - * Log specified fields for the ag hdr (inode section) + * Log specified fields for the ag hdr (inode section). The growth of the agi + * structure over time requires that we interpret the buffer as two logical + * regions delineated by the end of the unlinked list. This is due to the size + * of the hash table and its location in the middle of the agi. + * + * For example, a request to log a field before agi_unlinked and a field after + * agi_unlinked could cause us to log the entire hash table and use an excessive + * amount of log space. To avoid this behavior, log the region up through + * agi_unlinked in one call and the region after agi_unlinked through the end of + * the structure in another.   */  void  xfs_ialloc_log_agi( @@ -1505,6 +1983,8 @@ xfs_ialloc_log_agi(  		offsetof(xfs_agi_t, agi_newino),  		offsetof(xfs_agi_t, agi_dirino),  		offsetof(xfs_agi_t, agi_unlinked), +		offsetof(xfs_agi_t, agi_free_root), +		offsetof(xfs_agi_t, agi_free_level),  		sizeof(xfs_agi_t)  	};  #ifdef DEBUG @@ -1513,15 +1993,30 @@ xfs_ialloc_log_agi(  	agi = XFS_BUF_TO_AGI(bp);  	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));  #endif + +	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); +  	/* -	 * Compute byte offsets for the first and last fields. +	 * Compute byte offsets for the first and last fields in the first +	 * region and log the agi buffer. This only logs up through +	 * agi_unlinked.  	 */ -	xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last); +	if (fields & XFS_AGI_ALL_BITS_R1) { +		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1, +				  &first, &last); +		xfs_trans_log_buf(tp, bp, first, last); +	} +  	/* -	 * Log the allocation group inode header buffer. +	 * Mask off the bits in the first region and calculate the first and +	 * last field offsets for any bits in the second region.  	 */ -	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); -	xfs_trans_log_buf(tp, bp, first, last); +	fields &= ~XFS_AGI_ALL_BITS_R1; +	if (fields) { +		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2, +				  &first, &last); +		xfs_trans_log_buf(tp, bp, first, last); +	}  }  #ifdef DEBUG @@ -1574,18 +2069,17 @@ xfs_agi_read_verify(  	struct xfs_buf	*bp)  {  	struct xfs_mount *mp = bp->b_target->bt_mount; -	int		agi_ok = 1; - -	if (xfs_sb_version_hascrc(&mp->m_sb)) -		agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), -					  offsetof(struct xfs_agi, agi_crc)); -	agi_ok = agi_ok && xfs_agi_verify(bp); -	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, -			XFS_RANDOM_IALLOC_READ_AGI))) { -		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); +	if (xfs_sb_version_hascrc(&mp->m_sb) && +	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) +		xfs_buf_ioerror(bp, EFSBADCRC); +	else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, +				XFS_ERRTAG_IALLOC_READ_AGI, +				XFS_RANDOM_IALLOC_READ_AGI))  		xfs_buf_ioerror(bp, EFSCORRUPTED); -	} + +	if (bp->b_error) +		xfs_verifier_error(bp);  }  static void @@ -1596,8 +2090,8 @@ xfs_agi_write_verify(  	struct xfs_buf_log_item	*bip = bp->b_fspriv;  	if (!xfs_agi_verify(bp)) { -		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);  		xfs_buf_ioerror(bp, EFSCORRUPTED); +		xfs_verifier_error(bp);  		return;  	} @@ -1606,8 +2100,7 @@ xfs_agi_write_verify(  	if (bip)  		XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); -	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), -			 offsetof(struct xfs_agi, agi_crc)); +	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);  }  const struct xfs_buf_ops xfs_agi_buf_ops = { @@ -1627,15 +2120,15 @@ xfs_read_agi(  {  	int			error; -	ASSERT(agno != NULLAGNUMBER); +	trace_xfs_read_agi(mp, agno); +	ASSERT(agno != NULLAGNUMBER);  	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,  			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),  			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);  	if (error)  		return error; -	ASSERT(!xfs_buf_geterror(*bpp));  	xfs_buf_set_ref(*bpp, XFS_AGI_REF);  	return 0;  } @@ -1651,6 +2144,8 @@ xfs_ialloc_read_agi(  	struct xfs_perag	*pag;	/* per allocation group data */  	int			error; +	trace_xfs_ialloc_read_agi(mp, agno); +  	error = xfs_read_agi(mp, tp, agno, bpp);  	if (error)  		return error;  | 
