about summary refs log tree commit diff
path: root/fs/xfs/xfs_ialloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_ialloc.c')
-rw-r--r-- fs/xfs/xfs_ialloc.c 2545
1 files changed, 1650 insertions, 895 deletions
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index db9d5fa600a..5960e5593fe 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -17,96 +17,337 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_icreate_item.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_trace.h"
+
/*
- * Log specified fields for the inode given by bp and off.
+ * Allocation group level functions.
*/
-STATIC void
-xfs_ialloc_log_di(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *bp, /* inode buffer */
- int off, /* index of inode in buffer */
- int fields) /* bitmask of fields to log */
+static inline int
+xfs_ialloc_cluster_alignment(
+ xfs_alloc_arg_t *args)
{
- int first; /* first byte number */
- int ioffset; /* off in bytes */
- int last; /* last byte number */
- xfs_mount_t *mp; /* mount point structure */
- static const short offsets[] = { /* field offsets */
- /* keep in sync with bits */
- offsetof(xfs_dinode_core_t, di_magic),
- offsetof(xfs_dinode_core_t, di_mode),
- offsetof(xfs_dinode_core_t, di_version),
- offsetof(xfs_dinode_core_t, di_format),
- offsetof(xfs_dinode_core_t, di_onlink),
- offsetof(xfs_dinode_core_t, di_uid),
- offsetof(xfs_dinode_core_t, di_gid),
- offsetof(xfs_dinode_core_t, di_nlink),
- offsetof(xfs_dinode_core_t, di_projid),
- offsetof(xfs_dinode_core_t, di_pad),
- offsetof(xfs_dinode_core_t, di_atime),
- offsetof(xfs_dinode_core_t, di_mtime),
- offsetof(xfs_dinode_core_t, di_ctime),
- offsetof(xfs_dinode_core_t, di_size),
- offsetof(xfs_dinode_core_t, di_nblocks),
- offsetof(xfs_dinode_core_t, di_extsize),
- offsetof(xfs_dinode_core_t, di_nextents),
- offsetof(xfs_dinode_core_t, di_anextents),
- offsetof(xfs_dinode_core_t, di_forkoff),
- offsetof(xfs_dinode_core_t, di_aformat),
- offsetof(xfs_dinode_core_t, di_dmevmask),
- offsetof(xfs_dinode_core_t, di_dmstate),
- offsetof(xfs_dinode_core_t, di_flags),
- offsetof(xfs_dinode_core_t, di_gen),
- offsetof(xfs_dinode_t, di_next_unlinked),
- offsetof(xfs_dinode_t, di_u),
- offsetof(xfs_dinode_t, di_a),
- sizeof(xfs_dinode_t)
- };
+ if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
+ args->mp->m_sb.sb_inoalignmt >=
+ XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
+ return args->mp->m_sb.sb_inoalignmt;
+ return 1;
+}
+/*
+ * Lookup a record by ino in the btree given by cur.
+ *
+ * The cursor's search record (bc_rec.i) is seeded with ino as the start inode
+ * and with the free count and free mask zeroed, then the search is handed to
+ * xfs_btree_lookup() using the direction given in dir. The return value and
+ * *stat are whatever xfs_btree_lookup() produces.
+ */
+int /* error */
+xfs_inobt_lookup(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t ino, /* starting inode of chunk */
+ xfs_lookup_t dir, /* <=, >=, == */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_freecount = 0;
+ cur->bc_rec.i.ir_free = 0;
+ return xfs_btree_lookup(cur, dir, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ *
+ * The in-core record is converted field by field into a big-endian copy on
+ * the stack (32-bit start inode and free count, 64-bit free mask) and that
+ * copy is what gets passed to xfs_btree_update(). irec itself is only read.
+ */
+STATIC int /* error */
+xfs_inobt_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_inobt_rec_incore_t *irec) /* btree record */
+{
+ union xfs_btree_rec rec;
+
+ rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
+ rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
+ rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ *
+ * irec is filled in (converted from big-endian to CPU byte order) only when
+ * xfs_btree_get_rec() reports no error and sets *stat to 1; in every other
+ * case irec is left untouched. The return value is the error code from
+ * xfs_btree_get_rec(), so callers must check both the return value and *stat
+ * before trusting the contents of irec.
+ */
+int /* error */
+xfs_inobt_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_inobt_rec_incore_t *irec, /* btree record */
+ int *stat) /* output: success/failure */
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (!error && *stat == 1) {
+ irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
+ irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
+ irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
+ }
+ return error;
+}
+
+/*
+ * Insert a single inobt record. Cursor must already point to desired location.
+ *
+ * Only the free count and free mask of the cursor's record (bc_rec.i) are set
+ * here; ir_startino is not written by this function, so it must already hold
+ * the intended start inode (xfs_inobt_insert() gets this by calling
+ * xfs_inobt_lookup() first, which stores it). The return value and *stat come
+ * straight from xfs_btree_insert().
+ */
+STATIC int
+xfs_inobt_insert_rec(
+ struct xfs_btree_cur *cur,
+ __int32_t freecount,
+ xfs_inofree_t free,
+ int *stat)
+{
+ cur->bc_rec.i.ir_freecount = freecount;
+ cur->bc_rec.i.ir_free = free;
+ return xfs_btree_insert(cur, stat);
+}
+
+/*
+ * Insert records describing a newly allocated inode chunk into the inobt.
+ *
+ * The btree to operate on is selected by btnum; the AG number is read from
+ * the AGI header held in agbp. The range [newino, newino + newlen) is walked
+ * in steps of XFS_INODES_PER_CHUNK. For each step an exact-match lookup is
+ * done first (a record is asserted not to exist yet), then a record with
+ * XFS_INODES_PER_CHUNK free inodes and the XFS_INOBT_ALL_FREE mask is
+ * inserted (asserted to succeed).
+ *
+ * Returns 0 after all records are inserted, or the first error from the
+ * lookup or insert. The cursor created here is always deleted before
+ * returning: with XFS_BTREE_ERROR on failure, XFS_BTREE_NOERROR on success.
+ */
+STATIC int
+xfs_inobt_insert(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t newino,
+ xfs_agino_t newlen,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agino_t thisino;
+ int i;
+ int error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+ for (thisino = newino;
+ thisino < newino + newlen;
+ thisino += XFS_INODES_PER_CHUNK) {
+ error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 0);
+
+ error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+ XFS_INOBT_ALL_FREE, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 1);
+ }
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ return 0;
+}
+
+/*
+ * Verify that the number of free inodes in the AGI is correct.
+ *
+ * Only compiled as a function when DEBUG is defined; otherwise
+ * xfs_check_agi_freecount() is a macro that evaluates to 0.
+ *
+ * The check is performed only when the cursor reports a single-level btree
+ * (bc_nlevels == 1). In that case the cursor is positioned at the first
+ * record with a start inode >= 0 and then advanced at level 0 one record at
+ * a time, summing ir_freecount, until a get or increment reports no record.
+ * Unless the filesystem is in a forced shutdown, the sum is asserted to
+ * equal agi_freecount from the AGI header.
+ *
+ * Returns 0, or the first error from the lookup, record fetch or increment.
+ * Note that the cursor position is changed by the walk.
+ */
+#ifdef DEBUG
+STATIC int
+xfs_check_agi_freecount(
+ struct xfs_btree_cur *cur,
+ struct xfs_agi *agi)
+{
+ if (cur->bc_nlevels == 1) {
+ xfs_inobt_rec_incore_t rec;
+ int freecount = 0;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+
+ do {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+
+ if (i) {
+ freecount += rec.ir_freecount;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ return error;
+ }
+ } while (i == 1);
+
+ if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
+ ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
+ }
+ return 0;
+}
+#else
+#define xfs_check_agi_freecount(cur, agi) 0
+#endif
+
+/*
+ * Initialise a new set of inodes. When called without a transaction context
+ * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
+ * than logging them (which in a transaction context puts them into the AIL
+ * for writeback rather than the xfsbufd queue).
+ */
+int
+xfs_ialloc_inode_init(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct list_head *buffer_list,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t length,
+ unsigned int gen)
+{
+ struct xfs_buf *fbuf;
+ struct xfs_dinode *free;
+ int nbufs, blks_per_cluster, inodes_per_cluster;
+ int version;
+ int i, j;
+ xfs_daddr_t d;
+ xfs_ino_t ino = 0;
- ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
- ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
- mp = tp->t_mountp;
/*
- * Get the inode-relative first and last bytes for these fields
+ * Loop over the new block(s), filling in the inodes. For small block
+ * sizes, manipulate the inodes in buffers which are multiples of the
+ * block size.
*/
- xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+ nbufs = length / blks_per_cluster;
+
/*
- * Convert to buffer offsets and log it.
+ * Figure out what version number to use in the inodes we create. If
+ * the superblock version has caught up to the one that supports the new
+ * inode format, then use the new inode version. Otherwise use the old
+ * version so that old kernels will continue to be able to use the file
+ * system.
+ *
+ * For v3 inodes, we also need to write the inode number into the inode,
+ * so calculate the first inode number of the chunk here as
+ * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
+ * across multiple filesystem blocks (such as a cluster) and so cannot
+ * be used in the cluster buffer loop below.
+ *
+ * Further, because we are writing the inode directly into the buffer
+ * and calculating a CRC on the entire inode, we have to log the entire
+ * inode so that the entire range the CRC covers is present in the log.
+ * That means for v3 inode we log the entire buffer rather than just the
+ * inode cores.
*/
- ioffset = off << mp->m_sb.sb_inodelog;
- first += ioffset;
- last += ioffset;
- xfs_trans_log_buf(tp, bp, first, last);
-}
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ version = 3;
+ ino = XFS_AGINO_TO_INO(mp, agno,
+ XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
-/*
- * Allocation group level functions.
- */
+ /*
+ * log the initialisation that is about to take place as a
+ * logical operation. This means the transaction does not
+ * need to log the physical changes to the inode buffers as log
+ * recovery will know what initialisation is actually needed.
+ * Hence we only need to log the buffers as "ordered" buffers so
+ * they track in the AIL as if they were physically logged.
+ */
+ if (tp)
+ xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
+ mp->m_sb.sb_inodesize, length, gen);
+ } else
+ version = 2;
+
+ for (j = 0; j < nbufs; j++) {
+ /*
+ * Get the block.
+ */
+ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
+ fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize * blks_per_cluster,
+ XBF_UNMAPPED);
+ if (!fbuf)
+ return ENOMEM;
+
+ /* Initialize the inode buffers and log them appropriately. */
+ fbuf->b_ops = &xfs_inode_buf_ops;
+ xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
+ for (i = 0; i < inodes_per_cluster; i++) {
+ int ioffset = i << mp->m_sb.sb_inodelog;
+ uint isize = xfs_dinode_size(version);
+
+ free = xfs_make_iptr(mp, fbuf, i);
+ free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ free->di_version = version;
+ free->di_gen = cpu_to_be32(gen);
+ free->di_next_unlinked = cpu_to_be32(NULLAGINO);
+
+ if (version == 3) {
+ free->di_ino = cpu_to_be64(ino);
+ ino++;
+ uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+ xfs_dinode_calc_crc(mp, free);
+ } else if (tp) {
+ /* just log the inode core */
+ xfs_trans_log_buf(tp, fbuf, ioffset,
+ ioffset + isize - 1);
+ }
+ }
+
+ if (tp) {
+ /*
+ * Mark the buffer as an inode allocation buffer so it
+ * sticks in AIL at the point of this allocation
+ * transaction. This ensures that they are on disk before
+ * the tail of the log can be moved past this
+ * transaction (i.e. by preventing relogging from moving
+ * it forward in the log).
+ */
+ xfs_trans_inode_alloc_buf(tp, fbuf);
+ if (version == 3) {
+ /*
+ * Mark the buffer as ordered so that they are
+ * not physically logged in the transaction but
+ * still tracked in the AIL as part of the
+ * transaction and pin the log appropriately.
+ */
+ xfs_trans_ordered_buf(tp, fbuf);
+ xfs_trans_log_buf(tp, fbuf, 0,
+ BBTOB(fbuf->b_length) - 1);
+ }
+ } else {
+ fbuf->b_flags |= XBF_DONE;
+ xfs_buf_delwri_queue(fbuf, buffer_list);
+ xfs_buf_relse(fbuf);
+ }
+ }
+ return 0;
+}
/*
* Allocate new inodes in the allocation group specified by agbp.
@@ -120,24 +361,15 @@ xfs_ialloc_ag_alloc(
{
xfs_agi_t *agi; /* allocation group header */
xfs_alloc_arg_t args; /* allocation argument structure */
- int blks_per_cluster; /* fs blocks per inode cluster */
- xfs_btree_cur_t *cur; /* inode btree cursor */
- xfs_daddr_t d; /* disk addr of buffer */
xfs_agnumber_t agno;
int error;
- xfs_buf_t *fbuf; /* new free inodes' buffer */
- xfs_dinode_t *free; /* new free inode structure */
- int i; /* inode counter */
- int j; /* block counter */
- int nbufs; /* num bufs of new inodes */
xfs_agino_t newino; /* new first inode's number */
xfs_agino_t newlen; /* new number of inodes */
- int ninodes; /* num inodes per buf */
- xfs_agino_t thisino; /* current inode number, for loop */
- int version; /* inode version number to use */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
+ struct xfs_perag *pag;
+ memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = tp->t_mountp;
@@ -145,35 +377,59 @@ xfs_ialloc_ag_alloc(
* Locking will ensure that we don't have two callers in here
* at one time.
*/
- newlen = XFS_IALLOC_INODES(args.mp);
+ newlen = args.mp->m_ialloc_inos;
if (args.mp->m_maxicount &&
args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
return XFS_ERROR(ENOSPC);
- args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
+ args.minlen = args.maxlen = args.mp->m_ialloc_blks;
/*
* First try to allocate inodes contiguous with the last-allocated
* chunk of inodes. If the filesystem is striped, this will fill
* an entire stripe unit with inodes.
- */
+ */
agi = XFS_BUF_TO_AGI(agbp);
newino = be32_to_cpu(agi->agi_newino);
+ agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
- XFS_IALLOC_BLOCKS(args.mp);
+ args.mp->m_ialloc_blks;
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
args.type = XFS_ALLOCTYPE_THIS_BNO;
- args.mod = args.total = args.wasdel = args.isfl =
- args.userdata = args.minalignslop = 0;
args.prod = 1;
- args.alignment = 1;
+
/*
- * Allow space for the inode btree to split.
+ * We need to take into account alignment here to ensure that
+ * we don't modify the free list if we fail to have an exact
+ * block. If we don't have an exact match, and every other
+ * allocation attempt fails, we'll end up cancelling
+ * a dirty transaction and shutting down.
+ *
+ * For an exact allocation, alignment must be 1,
+ * however we need to take cluster alignment into account when
+ * fixing up the freelist. Use the minalignslop field to
+ * indicate that extra blocks might be required for alignment,
+ * but not to use them in the actual exact allocation.
*/
- args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
+ args.alignment = 1;
+ args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
+
+ /* Allow space for the inode btree to split. */
+ args.minleft = args.mp->m_in_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
+
+ /*
+ * This request might have dirtied the transaction if the AG can
+ * satisfy the request, but the exact block was not available.
+ * If the allocation did fail, subsequent requests will relax
+ * the exact agbno requirement and increase the alignment
+ * instead. It is critical that the total size of the request
+ * (len + alignment + slop) does not increase from this point
+ * on, so reset minalignslop to ensure it is not included in
+ * subsequent requests.
+ */
+ args.minalignslop = 0;
} else
args.fsbno = NULLFSBLOCK;
@@ -191,32 +447,24 @@ xfs_ialloc_ag_alloc(
ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
args.alignment = args.mp->m_dalign;
isaligned = 1;
- } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
- args.mp->m_sb.sb_inoalignmt >=
- XFS_B_TO_FSBT(args.mp,
- XFS_INODE_CLUSTER_SIZE(args.mp)))
- args.alignment = args.mp->m_sb.sb_inoalignmt;
- else
- args.alignment = 1;
+ } else
+ args.alignment = xfs_ialloc_cluster_alignment(&args);
/*
* Need to figure out where to allocate the inode blocks.
* Ideally they should be spaced out through the a.g.
* For now, just allocate blocks up front.
*/
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
/*
* Allocate a fixed-size extent of inodes.
*/
args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.mod = args.total = args.wasdel = args.isfl =
- args.userdata = args.minalignslop = 0;
args.prod = 1;
/*
* Allow space for the inode btree to split.
*/
- args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
+ args.minleft = args.mp->m_in_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -228,14 +476,8 @@ xfs_ialloc_ag_alloc(
if (isaligned && args.fsbno == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
- args.fsbno = XFS_AGB_TO_FSB(args.mp,
- be32_to_cpu(agi->agi_seqno), args.agbno);
- if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
- args.mp->m_sb.sb_inoalignmt >=
- XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
- args.alignment = args.mp->m_sb.sb_inoalignmt;
- else
- args.alignment = 1;
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+ args.alignment = xfs_ialloc_cluster_alignment(&args);
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -245,90 +487,46 @@ xfs_ialloc_ag_alloc(
return 0;
}
ASSERT(args.len == args.minlen);
+
/*
- * Convert the results.
- */
- newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
- /*
- * Loop over the new block(s), filling in the inodes.
- * For small block sizes, manipulate the inodes in buffers
- * which are multiples of the blocks size.
+ * Stamp and write the inode buffers.
+ *
+ * Seed the new inode cluster with a random generation number. This
+ * prevents short-term reuse of generation numbers if a chunk is
+ * freed and then immediately reallocated. We use random numbers
+ * rather than a linear progression to prevent the next generation
+ * number from being easily guessable.
*/
- if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
- blks_per_cluster = 1;
- nbufs = (int)args.len;
- ninodes = args.mp->m_sb.sb_inopblock;
- } else {
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
- args.mp->m_sb.sb_blocksize;
- nbufs = (int)args.len / blks_per_cluster;
- ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
- }
+ error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno,
+ args.len, prandom_u32());
+
+ if (error)
+ return error;
/*
- * Figure out what version number to use in the inodes we create.
- * If the superblock version has caught up to the one that supports
- * the new inode format, then use the new inode version. Otherwise
- * use the old version so that old kernels will continue to be
- * able to use the file system.
+ * Convert the results.
*/
- if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb))
- version = XFS_DINODE_VERSION_2;
- else
- version = XFS_DINODE_VERSION_1;
-
- for (j = 0; j < nbufs; j++) {
- /*
- * Get the block.
- */
- d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
- args.agbno + (j * blks_per_cluster));
- fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
- args.mp->m_bsize * blks_per_cluster,
- XFS_BUF_LOCK);
- ASSERT(fbuf);
- ASSERT(!XFS_BUF_GETERROR(fbuf));
- /*
- * Set initial values for the inodes in this buffer.
- */
- xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
- for (i = 0; i < ninodes; i++) {
- free = XFS_MAKE_IPTR(args.mp, fbuf, i);
- free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
- free->di_core.di_version = version;
- free->di_next_unlinked = cpu_to_be32(NULLAGINO);
- xfs_ialloc_log_di(tp, fbuf, i,
- XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
- }
- xfs_trans_inode_alloc_buf(tp, fbuf);
- }
+ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
be32_add_cpu(&agi->agi_count, newlen);
be32_add_cpu(&agi->agi_freecount, newlen);
- agno = be32_to_cpu(agi->agi_seqno);
- down_read(&args.mp->m_peraglock);
- args.mp->m_perag[agno].pagi_freecount += newlen;
- up_read(&args.mp->m_peraglock);
+ pag = xfs_perag_get(args.mp, agno);
+ pag->pagi_freecount += newlen;
+ xfs_perag_put(pag);
agi->agi_newino = cpu_to_be32(newino);
+
/*
- * Insert records describing the new inode chunk into the btree.
+ * Insert records describing the new inode chunk into the btrees.
*/
- cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno,
- XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
- for (thisino = newino;
- thisino < newino + newlen;
- thisino += XFS_INODES_PER_CHUNK) {
- if ((error = xfs_inobt_lookup_eq(cur, thisino,
- XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
- }
- ASSERT(i == 0);
- if ((error = xfs_inobt_insert(cur, &i))) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_FINO);
+ if (error)
return error;
- }
- ASSERT(i == 1);
}
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
/*
* Log allocation group header fields
*/
@@ -343,7 +541,7 @@ xfs_ialloc_ag_alloc(
return 0;
}
-STATIC_INLINE xfs_agnumber_t
+STATIC xfs_agnumber_t
xfs_ialloc_next_ag(
xfs_mount_t *mp)
{
@@ -351,7 +549,7 @@ xfs_ialloc_next_ag(
spin_lock(&mp->m_agirotor_lock);
agno = mp->m_agirotor;
- if (++mp->m_agirotor == mp->m_maxagi)
+ if (++mp->m_agirotor >= mp->m_maxagi)
mp->m_agirotor = 0;
spin_unlock(&mp->m_agirotor_lock);
@@ -360,16 +558,15 @@ xfs_ialloc_next_ag(
/*
* Select an allocation group to look for a free inode in, based on the parent
- * inode and then mode. Return the allocation group buffer.
+ * inode and the mode. Return the allocation group buffer.
*/
-STATIC xfs_buf_t * /* allocation group buffer */
+STATIC xfs_agnumber_t
xfs_ialloc_ag_select(
xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t parent, /* parent directory inode number */
- mode_t mode, /* bits set to indicate file type */
+ umode_t mode, /* bits set to indicate file type */
int okalloc) /* ok to allocate more space */
{
- xfs_buf_t *agbp; /* allocation group header buffer */
xfs_agnumber_t agcount; /* number of ag's in the filesystem */
xfs_agnumber_t agno; /* current ag number */
int flags; /* alloc buffer locking flags */
@@ -379,6 +576,7 @@ xfs_ialloc_ag_select(
int needspace; /* file mode implies space allocated */
xfs_perag_t *pag; /* per allocation group data */
xfs_agnumber_t pagno; /* parent (starting) ag number */
+ int error;
/*
* Files of these types need at least one block if length > 0
@@ -394,7 +592,9 @@ xfs_ialloc_ag_select(
if (pagno >= agcount)
pagno = 0;
}
+
ASSERT(pagno < agcount);
+
/*
* Loop through allocation groups, looking for one with a little
* free space in it. Note we don't look for free inodes, exactly.
@@ -404,637 +604,876 @@ xfs_ialloc_ag_select(
*/
agno = pagno;
flags = XFS_ALLOC_FLAG_TRYLOCK;
- down_read(&mp->m_peraglock);
for (;;) {
- pag = &mp->m_perag[agno];
+ pag = xfs_perag_get(mp, agno);
+ if (!pag->pagi_inodeok) {
+ xfs_ialloc_next_ag(mp);
+ goto nextag;
+ }
+
if (!pag->pagi_init) {
- if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
- agbp = NULL;
+ error = xfs_ialloc_pagi_init(mp, tp, agno);
+ if (error)
goto nextag;
- }
- } else
- agbp = NULL;
+ }
- if (!pag->pagi_inodeok) {
- xfs_ialloc_next_ag(mp);
- goto unlock_nextag;
+ if (pag->pagi_freecount) {
+ xfs_perag_put(pag);
+ return agno;
}
- /*
- * Is there enough free space for the file plus a block
- * of inodes (if we need to allocate some)?
- */
- ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
- if (ineed && !pag->pagf_init) {
- if (agbp == NULL &&
- xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
- agbp = NULL;
+ if (!okalloc)
+ goto nextag;
+
+ if (!pag->pagf_init) {
+ error = xfs_alloc_pagf_init(mp, tp, agno, flags);
+ if (error)
goto nextag;
- }
- (void)xfs_alloc_pagf_init(mp, tp, agno, flags);
}
- if (!ineed || pag->pagf_init) {
- if (ineed && !(longest = pag->pagf_longest))
- longest = pag->pagf_flcount > 0;
- if (!ineed ||
- (pag->pagf_freeblks >= needspace + ineed &&
- longest >= ineed &&
- okalloc)) {
- if (agbp == NULL &&
- xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
- agbp = NULL;
- goto nextag;
- }
- up_read(&mp->m_peraglock);
- return agbp;
- }
+
+ /*
+ * Is there enough free space for the file plus a block of
+ * inodes (if we need to allocate some)?
+ */
+ ineed = mp->m_ialloc_blks;
+ longest = pag->pagf_longest;
+ if (!longest)
+ longest = pag->pagf_flcount > 0;
+
+ if (pag->pagf_freeblks >= needspace + ineed &&
+ longest >= ineed) {
+ xfs_perag_put(pag);
+ return agno;
}
-unlock_nextag:
- if (agbp)
- xfs_trans_brelse(tp, agbp);
nextag:
+ xfs_perag_put(pag);
/*
* No point in iterating over the rest, if we're shutting
* down.
*/
- if (XFS_FORCED_SHUTDOWN(mp)) {
- up_read(&mp->m_peraglock);
- return NULL;
- }
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return NULLAGNUMBER;
agno++;
if (agno >= agcount)
agno = 0;
if (agno == pagno) {
- if (flags == 0) {
- up_read(&mp->m_peraglock);
- return NULL;
- }
+ if (flags == 0)
+ return NULLAGNUMBER;
flags = 0;
}
}
}
/*
- * Visible inode allocation functions.
- */
-
-/*
- * Allocate an inode on disk.
- * Mode is used to tell whether the new inode will need space, and whether
- * it is a directory.
- *
- * The arguments IO_agbp and alloc_done are defined to work within
- * the constraint of one allocation per transaction.
- * xfs_dialloc() is designed to be called twice if it has to do an
- * allocation to make more free inodes. On the first call,
- * IO_agbp should be set to NULL. If an inode is available,
- * i.e., xfs_dialloc() did not need to do an allocation, an inode
- * number is returned. In this case, IO_agbp would be set to the
- * current ag_buf and alloc_done set to false.
- * If an allocation needed to be done, xfs_dialloc would return
- * the current ag_buf in IO_agbp and set alloc_done to true.
- * The caller should then commit the current transaction, allocate a new
- * transaction, and call xfs_dialloc() again, passing in the previous
- * value of IO_agbp. IO_agbp should be held across the transactions.
- * Since the agbp is locked across the two calls, the second call is
- * guaranteed to have a free inode available.
- *
- * Once we successfully pick an inode its number is returned and the
- * on-disk data structures are updated. The inode itself is not read
- * in, since doing so would break ordering constraints with xfs_reclaim.
+ * Try to retrieve the next record to the left/right from the current one.
*/
-int
-xfs_dialloc(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t parent, /* parent inode (directory) */
- mode_t mode, /* mode bits for new inode */
- int okalloc, /* ok to allocate more space */
- xfs_buf_t **IO_agbp, /* in/out ag header's buffer */
- boolean_t *alloc_done, /* true if we needed to replenish
- inode freelist */
- xfs_ino_t *inop) /* inode number allocated */
+STATIC int
+xfs_ialloc_next_rec(
+ struct xfs_btree_cur *cur,
+ xfs_inobt_rec_incore_t *rec,
+ int *done,
+ int left)
{
- xfs_agnumber_t agcount; /* number of allocation groups */
- xfs_buf_t *agbp; /* allocation group header's buffer */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_agi_t *agi; /* allocation group header structure */
- xfs_btree_cur_t *cur; /* inode allocation btree cursor */
- int error; /* error return value */
- int i; /* result code */
- int ialloced; /* inode allocation status */
- int noroom = 0; /* no space for inode blk allocation */
- xfs_ino_t ino; /* fs-relative inode to be returned */
- /* REFERENCED */
- int j; /* result code */
- xfs_mount_t *mp; /* file system mount structure */
- int offset; /* index of inode in chunk */
- xfs_agino_t pagino; /* parent's a.g. relative inode # */
- xfs_agnumber_t pagno; /* parent's allocation group number */
- xfs_inobt_rec_incore_t rec; /* inode allocation record */
- xfs_agnumber_t tagno; /* testing allocation group number */
- xfs_btree_cur_t *tcur; /* temp cursor */
- xfs_inobt_rec_incore_t trec; /* temp inode allocation record */
-
-
- if (*IO_agbp == NULL) {
- /*
- * We do not have an agbp, so select an initial allocation
- * group for inode allocation.
- */
- agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
- /*
- * Couldn't find an allocation group satisfying the
- * criteria, give up.
- */
- if (!agbp) {
- *inop = NULLFSINO;
- return 0;
- }
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
- } else {
- /*
- * Continue where we left off before. In this case, we
- * know that the allocation group has free inodes.
- */
- agbp = *IO_agbp;
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
- ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
- }
- mp = tp->t_mountp;
- agcount = mp->m_sb.sb_agcount;
- agno = be32_to_cpu(agi->agi_seqno);
- tagno = agno;
- pagno = XFS_INO_TO_AGNO(mp, parent);
- pagino = XFS_INO_TO_AGINO(mp, parent);
+ int error;
+ int i;
- /*
- * If we have already hit the ceiling of inode blocks then clear
- * okalloc so we scan all available agi structures for a free
- * inode.
- */
+ if (left)
+ error = xfs_btree_decrement(cur, 0, &i);
+ else
+ error = xfs_btree_increment(cur, 0, &i);
- if (mp->m_maxicount &&
- mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
- noroom = 1;
- okalloc = 0;
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
}
- /*
- * Loop until we find an allocation group that either has free inodes
- * or in which we can allocate some inodes. Iterate through the
- * allocation groups upward, wrapping at the end.
- */
- *alloc_done = B_FALSE;
- while (!agi->agi_freecount) {
- /*
- * Don't do anything if we're not supposed to allocate
- * any blocks, just go on to the next ag.
- */
- if (okalloc) {
- /*
- * Try to allocate some new inodes in the allocation
- * group.
- */
- if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
- xfs_trans_brelse(tp, agbp);
- if (error == ENOSPC) {
- *inop = NULLFSINO;
- return 0;
- } else
- return error;
- }
- if (ialloced) {
- /*
- * We successfully allocated some inodes, return
- * the current context to the caller so that it
- * can commit the current transaction and call
- * us again where we left off.
- */
- ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
- *alloc_done = B_TRUE;
- *IO_agbp = agbp;
- *inop = NULLFSINO;
- return 0;
- }
- }
- /*
- * If it failed, give up on this ag.
- */
- xfs_trans_brelse(tp, agbp);
- /*
- * Go on to the next ag: get its ag header.
- */
-nextag:
- if (++tagno == agcount)
- tagno = 0;
- if (tagno == agno) {
- *inop = NULLFSINO;
- return noroom ? ENOSPC : 0;
- }
- down_read(&mp->m_peraglock);
- if (mp->m_perag[tagno].pagi_inodeok == 0) {
- up_read(&mp->m_peraglock);
- goto nextag;
- }
- error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
- up_read(&mp->m_peraglock);
+ return 0;
+}
+
+STATIC int
+xfs_ialloc_get_rec(
+ struct xfs_btree_cur *cur,
+ xfs_agino_t agino,
+ xfs_inobt_rec_incore_t *rec,
+ int *done)
+{
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ *done = !i;
+ if (i) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
if (error)
- goto nextag;
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
}
- /*
- * Here with an allocation group that has a free inode.
- * Reset agno since we may have chosen a new ag in the
- * loop above.
- */
- agno = tagno;
- *IO_agbp = NULL;
- cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno),
- XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+
+ return 0;
+}
+
+/*
+ * Allocate an inode using the inobt-only algorithm.
+ *
+ * @agbp is the AGI buffer of the AG to allocate from and @parent the parent
+ * inode number. On success the chosen inode number is stored in *inop and 0
+ * is returned; otherwise the error is returned. The perag reference taken
+ * here is dropped on both paths.
+ *
+ * Search order: the parent's own chunk, then its left/right neighbours (only
+ * when the parent lives in this AG, bounded by searchdistance), then the
+ * chunk named by agi_newino, then a scan of the inobt from its first record.
+ * The free inode inside the chosen record is picked with
+ * xfs_lowbit64(ir_free); its bit is cleared and the record, AGI, perag and
+ * superblock (XFS_TRANS_SB_IFREE) free counts are each decremented by one.
+ */
+STATIC int
+xfs_dialloc_ag_inobt(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur, *tcur;
+ struct xfs_inobt_rec_incore rec, trec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i, j;
+
+ pag = xfs_perag_get(mp, agno);
+
+ ASSERT(pag->pagi_init);
+ ASSERT(pag->pagi_inodeok);
+ ASSERT(pag->pagi_freecount > 0);
+
+ restart_pagno:
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
*/
if (!pagino)
pagino = be32_to_cpu(agi->agi_newino);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- goto error0;
- } while (i == 1);
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
/*
- * If in the same a.g. as the parent, try to get near the parent.
+ * If in the same AG as the parent, try to get near the parent.
*/
if (pagno == agno) {
- if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
+ int doneleft; /* done, to the left */
+ int doneright; /* done, to the right */
+ /* pass budget for the left/right search loop below */
+ int searchdistance = 10;
+
+ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_inobt_get_rec(cur, &rec, &j);
+ if (error)
goto error0;
- if (i != 0 &&
- (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
- j == 1 &&
- rec.ir_freecount > 0) {
+ XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
+
+ if (rec.ir_freecount > 0) {
/*
* Found a free inode in the same chunk
- * as parent, done.
+ * as the parent, done.
*/
+ goto alloc_inode;
}
+
+
+ /*
+ * In the same AG as parent, but parent's chunk is full.
+ */
+
+ /* duplicate the cursor, search left & right simultaneously */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
/*
- * In the same a.g. as parent, but parent's chunk is full.
+ * Skip to last blocks looked up if same parent inode.
*/
- else {
- int doneleft; /* done, to the left */
- int doneright; /* done, to the right */
+ if (pagino != NULLAGINO &&
+ pag->pagl_pagino == pagino &&
+ pag->pagl_leftrec != NULLAGINO &&
+ pag->pagl_rightrec != NULLAGINO) {
+ error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+ &trec, &doneleft);
+ if (error)
+ goto error1;
+ error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+ &rec, &doneright);
if (error)
- goto error0;
- ASSERT(i == 1);
- ASSERT(j == 1);
- /*
- * Duplicate the cursor, search left & right
- * simultaneously.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- goto error0;
- /*
- * Search left with tcur, back up 1 record.
- */
- if ((error = xfs_inobt_decrement(tcur, 0, &i)))
goto error1;
- doneleft = !i;
- if (!doneleft) {
- if ((error = xfs_inobt_get_rec(tcur,
- &trec.ir_startino,
- &trec.ir_freecount,
- &trec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
- }
- /*
- * Search right with cur, go forward 1 record.
- */
- if ((error = xfs_inobt_increment(cur, 0, &i)))
+ } else {
+ /* search left with tcur, back up 1 record */
+ error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+ if (error)
goto error1;
- doneright = !i;
- if (!doneright) {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
- }
- /*
- * Loop until we find the closest inode chunk
- * with a free one.
- */
- while (!doneleft || !doneright) {
- int useleft; /* using left inode
- chunk this time */
+ /* search right with cur, go forward 1 record. */
+ error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+ if (error)
+ goto error1;
+ }
+
+ /*
+ * Loop until we find an inode chunk with a free inode.
+ *
+ * NOTE(review): xfs_ialloc_get_rec() leaves its record untouched
+ * when nothing is found, so trec/rec may still be unwritten when
+ * their startino is cached in pagl_leftrec/pagl_rightrec below
+ * - confirm that caching such a value is harmless.
+ */
+ while (!doneleft || !doneright) {
+ int useleft; /* using left inode chunk this time */
+
+ if (!--searchdistance) {
/*
- * Figure out which block is closer,
- * if both are valid.
- */
- if (!doneleft && !doneright)
- useleft =
- pagino -
- (trec.ir_startino +
- XFS_INODES_PER_CHUNK - 1) <
- rec.ir_startino - pagino;
- else
- useleft = !doneleft;
- /*
- * If checking the left, does it have
- * free inodes?
- */
- if (useleft && trec.ir_freecount) {
- /*
- * Yes, set it up as the chunk to use.
- */
- rec = trec;
- xfs_btree_del_cursor(cur,
- XFS_BTREE_NOERROR);
- cur = tcur;
- break;
- }
- /*
- * If checking the right, does it have
- * free inodes?
- */
- if (!useleft && rec.ir_freecount) {
- /*
- * Yes, it's already set up.
- */
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- break;
- }
- /*
- * If used the left, get another one
- * further left.
- */
- if (useleft) {
- if ((error = xfs_inobt_decrement(tcur, 0,
- &i)))
- goto error1;
- doneleft = !i;
- if (!doneleft) {
- if ((error = xfs_inobt_get_rec(
- tcur,
- &trec.ir_startino,
- &trec.ir_freecount,
- &trec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1,
- error1);
- }
- }
- /*
- * If used the right, get another one
- * further right.
+ * Not in range - save last search
+ * location and allocate a new inode
*/
- else {
- if ((error = xfs_inobt_increment(cur, 0,
- &i)))
- goto error1;
- doneright = !i;
- if (!doneright) {
- if ((error = xfs_inobt_get_rec(
- cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error1;
- XFS_WANT_CORRUPTED_GOTO(i == 1,
- error1);
- }
- }
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto newino;
+ }
+
+ /* figure out the closer block if both are valid. */
+ if (!doneleft && !doneright) {
+ useleft = pagino -
+ (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+ rec.ir_startino - pagino;
+ } else {
+ useleft = !doneleft;
+ }
+
+ /* free inodes to the left? */
+ if (useleft && trec.ir_freecount) {
+ rec = trec;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ cur = tcur;
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* free inodes to the right? */
+ if (!useleft && rec.ir_freecount) {
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+ goto alloc_inode;
+ }
+
+ /* get next record to check */
+ if (useleft) {
+ error = xfs_ialloc_next_rec(tcur, &trec,
+ &doneleft, 1);
+ } else {
+ error = xfs_ialloc_next_rec(cur, &rec,
+ &doneright, 0);
}
- ASSERT(!doneleft || !doneright);
+ if (error)
+ goto error1;
}
+
+ /*
+ * We've reached the end of the btree. Because
+ * we are only searching a small chunk of the
+ * btree each search, there are obviously free
+ * inodes closer to the parent inode than we
+ * are now. Restart the search again.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
}
+
/*
- * In a different a.g. from the parent.
+ * In a different AG from the parent.
* See if the most recently allocated block has any free.
*/
- else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
- if ((error = xfs_inobt_lookup_eq(cur,
- be32_to_cpu(agi->agi_newino), 0, 0, &i)))
+newino:
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+ XFS_LOOKUP_EQ, &i);
+ if (error)
goto error0;
- if (i == 1 &&
- (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
- j == 1 &&
- rec.ir_freecount > 0) {
- /*
- * The last chunk allocated in the group still has
- * a free inode.
- */
- }
- /*
- * None left in the last group, search the whole a.g.
- */
- else {
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, &rec, &j);
if (error)
goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- ASSERT(i == 1);
- for (;;) {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free,
- &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (rec.ir_freecount > 0)
- break;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ if (j == 1 && rec.ir_freecount > 0) {
+ /*
+ * The last chunk allocated in the group
+ * still has a free inode.
+ */
+ goto alloc_inode;
}
}
}
- offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
+
+ /*
+ * None left in the last group, search the whole AG
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ /* walk forward until a record with a free inode turns up */
+ for (;;) {
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ if (rec.ir_freecount > 0)
+ break;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+
+alloc_inode:
+ /* here cur points at rec, and rec has at least one free inode */
+ offset = xfs_lowbit64(rec.ir_free);
ASSERT(offset >= 0);
ASSERT(offset < XFS_INODES_PER_CHUNK);
ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
XFS_INODES_PER_CHUNK) == 0);
ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
- XFS_INOBT_CLR_FREE(&rec, offset);
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
rec.ir_freecount--;
- if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
- rec.ir_free)))
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
goto error0;
be32_add_cpu(&agi->agi_freecount, -1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
- down_read(&mp->m_peraglock);
- mp->m_perag[tagno].pagi_freecount--;
- up_read(&mp->m_peraglock);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
+ pag->pagi_freecount--;
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- goto error0;
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+ xfs_perag_put(pag);
*inop = ino;
return 0;
error1:
+ /* drop the duplicated search cursor, then fall through */
xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
error0:
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_perag_put(pag);
return error;
}
/*
- * Free disk inode. Carefully avoids touching the incore inode, all
- * manipulations incore are the caller's responsibility.
- * The on-disk inode is not changed by this operation, only the
- * btree (free inode mask) is changed.
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ *
+ * @pagino is the parent's AG-relative inode number and *ocur the finobt
+ * cursor to search with. On success 0 is returned, *rec holds the chosen
+ * record and *ocur is the cursor positioned at it: either the original
+ * cursor (result of the LE lookup) or a duplicate positioned by the GE
+ * lookup, in which case the original cursor has been deleted.
+ * On error the duplicate (if any) is deleted and *ocur is left unchanged.
*/
-int
-xfs_difree(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t inode, /* inode to be freed */
- xfs_bmap_free_t *flist, /* extents to free */
- int *delete, /* set if inode cluster was deleted */
- xfs_ino_t *first_ino) /* first inode in deleted cluster */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+ xfs_agino_t pagino,
+ struct xfs_btree_cur **ocur,
+ struct xfs_inobt_rec_incore *rec)
{
- /* REFERENCED */
- xfs_agblock_t agbno; /* block number containing inode */
- xfs_buf_t *agbp; /* buffer containing allocation group header */
- xfs_agino_t agino; /* inode number relative to allocation group */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_agi_t *agi; /* allocation group header */
- xfs_btree_cur_t *cur; /* inode btree cursor */
- int error; /* error return value */
- int i; /* result code */
- int ilen; /* inodes in an inode cluster */
- xfs_mount_t *mp; /* mount structure for filesystem */
- int off; /* offset of inode in inode chunk */
- xfs_inobt_rec_incore_t rec; /* btree record */
+ struct xfs_btree_cur *lcur = *ocur; /* left search cursor */
+ struct xfs_btree_cur *rcur; /* right search cursor */
+ struct xfs_inobt_rec_incore rrec;
+ int error;
+ int i, j;
- mp = tp->t_mountp;
+ /* i records whether a left (<= pagino) record exists */
+ error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ return error;
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(lcur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ /*
+ * See if we've landed in the parent inode record. The finobt
+ * only tracks chunks with at least one free inode, so record
+ * existence is enough.
+ */
+ if (pagino >= rec->ir_startino &&
+ pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+ return 0;
+ }
+
+ error = xfs_btree_dup_cursor(lcur, &rcur);
+ if (error)
+ return error;
+
+ /* j records whether a right (>= pagino) record exists */
+ error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+ if (error)
+ goto error_rcur;
+ if (j == 1) {
+ error = xfs_inobt_get_rec(rcur, &rrec, &j);
+ if (error)
+ goto error_rcur;
+ XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
+ }
+
+ /* at least one of the two lookups must have found a record */
+ XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
+ if (i == 1 && j == 1) {
+ /*
+ * Both the left and right records are valid. Choose the closer
+ * inode chunk to the target.
+ *
+ * NOTE(review): the left-hand distance is computed as
+ * pagino - ir_startino + (XFS_INODES_PER_CHUNK - 1), whereas
+ * xfs_dialloc_ag_inobt() uses
+ * pagino - (ir_startino + XFS_INODES_PER_CHUNK - 1)
+ * - confirm which form is intended.
+ */
+ if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+ (rrec.ir_startino - pagino)) {
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else {
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+ } else if (j == 1) {
+ /* only the right record is valid */
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else if (i == 1) {
+ /* only the left record is valid */
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+
+ return 0;
+
+error_rcur:
+ xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ *
+ * @agi is the AGI header the hint is read from, @cur the finobt cursor to
+ * search with, and @rec receives the record that was found. Returns 0 on
+ * success or the error from the lookup / record read.
+ *
+ * agi_newino is a big-endian field (it is compared against
+ * cpu_to_be32(NULLAGINO) below), so it must be converted with be32_to_cpu()
+ * before it is used as a lookup key, as xfs_dialloc_ag_inobt() does.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+	struct xfs_agi			*agi,
+	struct xfs_btree_cur		*cur,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	int error;
+	int i;
+
+	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
+		if (error)
+			return error;
+		if (i == 1) {
+			error = xfs_inobt_get_rec(cur, rec, &i);
+			if (error)
+				return error;
+			XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+			return 0;
+		}
+		/* hint chunk is not in the finobt: fall back to a scan */
+	}
 	/*
-	 * Break up inode number into its components.
+	 * Find the first inode available in the AG.
 	 */
-	agno = XFS_INO_TO_AGNO(mp, inode);
-	if (agno >= mp->m_sb.sb_agcount) {
-		cmn_err(CE_WARN,
-			"xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.",
-			agno, mp->m_sb.sb_agcount, mp->m_fsname);
-		ASSERT(0);
-		return XFS_ERROR(EINVAL);
+	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+	error = xfs_inobt_get_rec(cur, rec, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+	return 0;
+}
+
+/*
+ * Mirror in the inobt an allocation that was already applied to the finobt.
+ *
+ * The inobt record with the same start inode as @frec is looked up, the bit
+ * for @offset is cleared from its free mask and its free count is dropped by
+ * one. The result must then match @frec exactly before it is written back;
+ * any mismatch is treated as corruption.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+	struct xfs_btree_cur		*cur,	/* inobt cursor */
+	struct xfs_inobt_rec_incore	*frec,	/* finobt record */
+	int				offset) /* inode offset */
+{
+	struct xfs_inobt_rec_incore	rec;
+	xfs_agino_t			startino = frec->ir_startino;
+	int				error;
+	int				i;
+
+	/* locate the chunk in the inobt */
+	error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+	/* pull in its current contents */
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+
+	/* apply the same allocation the finobt record already reflects */
+	rec.ir_freecount--;
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
+
+	/* both trees must now agree on this chunk */
+	XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
+				  (rec.ir_freecount == frec->ir_freecount));
+
+	return xfs_inobt_update(cur, &rec);
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ *
+ * @agbp is the AGI buffer of that AG and @parent the parent inode number.
+ * On success the chosen inode number is stored in *inop and 0 is returned;
+ * otherwise the error is returned.
+ *
+ * Order of operations: the finobt record is updated (or deleted once its
+ * free count reaches zero), the matching inobt record is updated through
+ * xfs_dialloc_ag_update_inobt(), and only then are the AGI, perag and
+ * superblock (XFS_TRANS_SB_IFREE) free counts decremented.
+ */
+STATIC int
+xfs_dialloc_ag(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur; /* finobt cursor */
+ struct xfs_btree_cur *icur; /* inobt cursor */
+ struct xfs_inobt_rec_incore rec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i;
+
+ /* no finobt on this filesystem: use the inobt-only allocator */
+ if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+
+ pag = xfs_perag_get(mp, agno);
+
+ /*
+ * If pagino is 0 (this is the root inode allocation) use newino.
+ * This must work because we've just allocated some.
+ */
+ if (!pagino)
+ pagino = be32_to_cpu(agi->agi_newino);
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The search algorithm depends on whether we're in the same AG as the
+ * parent. If so, find the closest available inode to the parent. If
+ * not, consider the agi hint or find the first free inode in the AG.
+ */
+ if (agno == pagno)
+ error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+ else
+ error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+ if (error)
+ goto error_cur;
+
+ /* pick the inode within the chunk from the record's free mask */
+ offset = xfs_lowbit64(rec.ir_free);
+ ASSERT(offset >= 0);
+ ASSERT(offset < XFS_INODES_PER_CHUNK);
+ ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+ ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+
+ /*
+ * Modify or remove the finobt record.
+ */
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+ if (rec.ir_freecount)
+ error = xfs_inobt_update(cur, &rec);
+ else
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The finobt has now been updated appropriately. We haven't updated the
+ * agi and superblock yet, so we can create an inobt cursor and validate
+ * the original freecount. If all is well, make the equivalent update to
+ * the inobt using the finobt record and offset information.
+ */
+ icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(icur, agi);
+ if (error)
+ goto error_icur;
+
+ error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+ if (error)
+ goto error_icur;
+
+ /*
+ * Both trees have now been updated. We must update the perag and
+ * superblock before we can check the freecount for each btree.
+ */
+ be32_add_cpu(&agi->agi_freecount, -1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount--;
+
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+ error = xfs_check_agi_freecount(icur, agi);
+ if (error)
+ goto error_icur;
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error_icur;
+
+ xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_perag_put(pag);
+ *inop = ino;
+ return 0;
+
+ /* error_icur falls through so the finobt cursor is released as well */
+error_icur:
+ xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Allocate an inode on disk.
+ *
+ * Mode is used to tell whether the new inode will need space, and whether it
+ * is a directory.
+ *
+ * This function is designed to be called twice if it has to do an allocation
+ * to make more free inodes. On the first call, *IO_agbp should be set to NULL.
+ * If an inode is available without having to perform an allocation, an inode
+ * number is returned. In this case, *IO_agbp is set to NULL. If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
+ * new transaction, and call xfs_dialloc() again, passing in the previous value
+ * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI
+ * buffer is locked across the two calls, the second call is guaranteed to have
+ * a free inode available.
+ *
+ * Once we successfully pick an inode its number is returned and the on-disk
+ * data structures are updated. The inode itself is not read in, since doing so
+ * would break ordering constraints with xfs_reclaim.
+ *
+ * @okalloc gates the call to xfs_ialloc_ag_alloc(): when it is zero only AGs
+ * that already have free inodes are considered. *inop is set to NULLFSINO
+ * with a 0 return when no inode was handed out on this call (no AG selected,
+ * new inodes were just allocated, or xfs_ialloc_ag_alloc() hit ENOSPC).
+ * ENOSPC is returned when every AG was scanned without success and the
+ * m_maxicount ceiling had been reached.
+ */
+int
+xfs_dialloc(
+ struct xfs_trans *tp,
+ xfs_ino_t parent,
+ umode_t mode,
+ int okalloc,
+ struct xfs_buf **IO_agbp,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agbp;
+ xfs_agnumber_t agno;
+ int error;
+ int ialloced;
+ int noroom = 0;
+ xfs_agnumber_t start_agno;
+ struct xfs_perag *pag;
+
+ if (*IO_agbp) {
+ /*
+ * If the caller passes in a pointer to the AGI buffer,
+ * continue where we left off before. In this case, we
+ * know that the allocation group has free inodes.
+ */
+ agbp = *IO_agbp;
+ goto out_alloc;
}
- agino = XFS_INO_TO_AGINO(mp, inode);
- if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
- cmn_err(CE_WARN,
- "xfs_difree: inode != XFS_AGINO_TO_INO() "
- "(%llu != %llu) on %s. Returning EINVAL.",
- (unsigned long long)inode,
- (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
- mp->m_fsname);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
+
+ /*
+ * We do not have an agbp, so select an initial allocation
+ * group for inode allocation.
+ */
+ start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+ if (start_agno == NULLAGNUMBER) {
+ *inop = NULLFSINO;
+ return 0;
}
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (agbno >= mp->m_sb.sb_agblocks) {
- cmn_err(CE_WARN,
- "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.",
- agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
+
+ /*
+ * If we have already hit the ceiling of inode blocks then clear
+ * okalloc so we scan all available agi structures for a free
+ * inode.
+ */
+ if (mp->m_maxicount &&
+ mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
+ noroom = 1;
+ okalloc = 0;
}
+
/*
- * Get the allocation group header.
+ * Loop until we find an allocation group that either has free inodes
+ * or in which we can allocate some inodes. Iterate through the
+ * allocation groups upward, wrapping at the end.
*/
- down_read(&mp->m_peraglock);
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
- up_read(&mp->m_peraglock);
- if (error) {
- cmn_err(CE_WARN,
- "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
- return error;
+ agno = start_agno;
+ for (;;) {
+ /* this reference is dropped on every exit from the iteration */
+ pag = xfs_perag_get(mp, agno);
+ if (!pag->pagi_inodeok) {
+ xfs_ialloc_next_ag(mp);
+ goto nextag;
+ }
+
+ if (!pag->pagi_init) {
+ error = xfs_ialloc_pagi_init(mp, tp, agno);
+ if (error)
+ goto out_error;
+ }
+
+ /*
+ * Do a first racy fast path check if this AG is usable.
+ */
+ if (!pag->pagi_freecount && !okalloc)
+ goto nextag;
+
+ /*
+ * Then read in the AGI buffer and recheck with the AGI buffer
+ * lock held.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error)
+ goto out_error;
+
+ if (pag->pagi_freecount) {
+ xfs_perag_put(pag);
+ goto out_alloc;
+ }
+
+ if (!okalloc)
+ goto nextag_relse_buffer;
+
+
+ error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
+ if (error) {
+ xfs_trans_brelse(tp, agbp);
+
+ if (error != ENOSPC)
+ goto out_error;
+
+ /* ENOSPC is reported as "no inode" (NULLFSINO), not as an error */
+ xfs_perag_put(pag);
+ *inop = NULLFSINO;
+ return 0;
+ }
+
+ if (ialloced) {
+ /*
+ * We successfully allocated some inodes, return
+ * the current context to the caller so that it
+ * can commit the current transaction and call
+ * us again where we left off.
+ */
+ ASSERT(pag->pagi_freecount > 0);
+ xfs_perag_put(pag);
+
+ *IO_agbp = agbp;
+ *inop = NULLFSINO;
+ return 0;
+ }
+
+nextag_relse_buffer:
+ xfs_trans_brelse(tp, agbp);
+nextag:
+ xfs_perag_put(pag);
+ /* advance to the next AG, wrapping; stop once back at the start */
+ if (++agno == mp->m_sb.sb_agcount)
+ agno = 0;
+ if (agno == start_agno) {
+ *inop = NULLFSINO;
+ return noroom ? ENOSPC : 0;
+ }
}
- agi = XFS_BUF_TO_AGI(agbp);
- ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
- ASSERT(agbno < be32_to_cpu(agi->agi_length));
+
+out_alloc:
+ /* agbp is handed to xfs_dialloc_ag(); the caller gets NULL back */
+ *IO_agbp = NULL;
+ return xfs_dialloc_ag(tp, agbp, parent, inop);
+out_error:
+ xfs_perag_put(pag);
+ return XFS_ERROR(error);
+}
+
+STATIC int
+xfs_difree_inobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t agino,
+ struct xfs_bmap_free *flist,
+ int *deleted,
+ xfs_ino_t *first_ino,
+ struct xfs_inobt_rec_incore *orec)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int ilen;
+ int error;
+ int i;
+ int off;
+
+ ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+ ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
/*
* Initialize the cursor.
*/
- cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
- (xfs_inode_t *)0, 0);
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
- &rec.ir_freecount, &rec.ir_free, &i)))
- goto error0;
- if (i) {
- freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- goto error0;
- }
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
/*
* Look for the entry describing this inode.
*/
- if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
- cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
+ xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+ __func__, error);
goto error0;
}
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
- &rec.ir_free, &i))) {
- cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+ __func__, error);
goto error0;
}
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1043,20 +1482,20 @@ xfs_difree(
*/
off = agino - rec.ir_startino;
ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
- ASSERT(!XFS_INOBT_IS_FREE(&rec, off));
+ ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
/*
* Mark the inode free & increment the count.
*/
- XFS_INOBT_SET_FREE(&rec, off);
+ rec.ir_free |= XFS_INOBT_MASK(off);
rec.ir_freecount++;
/*
* When an inode cluster is free, it becomes eligible for removal
*/
if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
- (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
+ (rec.ir_freecount == mp->m_ialloc_inos)) {
- *delete = 1;
+ *deleted = 1;
*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
/*
@@ -1064,67 +1503,51 @@ xfs_difree(
* AGI and Superblock inode counts, and mark the disk space
* to be freed when the transaction is committed.
*/
- ilen = XFS_IALLOC_INODES(mp);
+ ilen = mp->m_ialloc_inos;
be32_add_cpu(&agi->agi_count, -ilen);
be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
- down_read(&mp->m_peraglock);
- mp->m_perag[agno].pagi_freecount -= ilen - 1;
- up_read(&mp->m_peraglock);
+ pag = xfs_perag_get(mp, agno);
+ pag->pagi_freecount -= ilen - 1;
+ xfs_perag_put(pag);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
- if ((error = xfs_inobt_delete(cur, &i))) {
- cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n",
- error, mp->m_fsname);
+ if ((error = xfs_btree_delete(cur, &i))) {
+ xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+ __func__, error);
goto error0;
}
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp,
- agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)),
- XFS_IALLOC_BLOCKS(mp), flist, mp);
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+ XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
+ mp->m_ialloc_blks, flist, mp);
} else {
- *delete = 0;
+ *deleted = 0;
- if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
- cmn_err(CE_WARN,
- "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ error = xfs_inobt_update(cur, &rec);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+ __func__, error);
goto error0;
}
+
/*
* Change the inode free counts and log the ag/sb changes.
*/
be32_add_cpu(&agi->agi_freecount, 1);
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
- down_read(&mp->m_peraglock);
- mp->m_perag[agno].pagi_freecount++;
- up_read(&mp->m_peraglock);
+ pag = xfs_perag_get(mp, agno);
+ pag->pagi_freecount++;
+ xfs_perag_put(pag);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
}
-#ifdef DEBUG
- if (cur->bc_nlevels == 1) {
- int freecount = 0;
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error0;
- if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
- goto error0;
- do {
- if ((error = xfs_inobt_get_rec(cur,
- &rec.ir_startino,
- &rec.ir_freecount,
- &rec.ir_free, &i)))
- goto error0;
- if (i) {
- freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- goto error0;
- }
- } while (i == 1);
- ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
- }
-#endif
+ *orec = rec;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return 0;
@@ -1134,36 +1557,264 @@ error0:
}
/*
- * Return the location of the inode in bno/off, for mapping it into a buffer.
+ * Free an inode in the free inode btree.
+ */
+STATIC int
+xfs_difree_finobt(
+ struct xfs_mount *mp, /* mount structure for filesystem */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer holding the AGI header */
+ xfs_agino_t agino, /* AG-relative number of the freed inode */
+ struct xfs_inobt_rec_incore *ibtrec) /* inobt record, already updated for this free */
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int offset = agino - ibtrec->ir_startino; /* index of the inode within its chunk record */
+ int error;
+ int i;
+
+ /* All btree work below is done on the free inode btree (XFS_BTNUM_FINO). */
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+ error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ if (i == 0) {
+ /*
+ * If the record does not exist in the finobt, we must have just
+ * freed an inode in a previously fully allocated chunk. If not,
+ * something is out of sync.
+ */
+ XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
+
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+ ibtrec->ir_free, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+
+ goto out;
+ }
+
+ /*
+ * Read and update the existing record. We could just copy the ibtrec
+ * across here, but that would defeat the purpose of having redundant
+ * metadata. By making the modifications independently, we can catch
+ * corruptions that we wouldn't see if we just copied from one record
+ * to another.
+ */
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+
+ rec.ir_free |= XFS_INOBT_MASK(offset);
+ rec.ir_freecount++;
+
+ /* The finobt record, updated on its own, must now agree with the inobt record. */
+ XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
+ (rec.ir_freecount == ibtrec->ir_freecount),
+ error);
+
+ /*
+ * The content of inobt records should always match between the inobt
+ * and finobt. The lifecycle of records in the finobt is different from
+ * the inobt in that the finobt only tracks records with at least one
+ * free inode. Hence, if all of the inodes are free and we aren't
+ * keeping inode chunks permanently on disk, remove the record.
+ * Otherwise, update the record with the new information.
+ */
+ if (rec.ir_freecount == mp->m_ialloc_inos &&
+ !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+ } else {
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ goto error;
+ }
+
+out:
+ /* NOTE(review): presumably cross-checks the AGI free count against the btree; confirm against xfs_check_agi_freecount(). */
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+error:
+ /* Failure exit: release the cursor in error mode and hand the error back. */
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Free disk inode. Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
*/
-/*ARGSUSED*/
int
-xfs_dilocate(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
+xfs_difree(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t inode, /* inode to be freed */
+ struct xfs_bmap_free *flist, /* extents to free */
+ int *deleted,/* set if inode cluster was deleted */
+ xfs_ino_t *first_ino)/* first inode in deleted cluster */
+{
+ /* REFERENCED */
+ xfs_agblock_t agbno; /* block number containing inode */
+ struct xfs_buf *agbp; /* buffer for allocation group header */
+ xfs_agino_t agino; /* allocation group inode number */
+ xfs_agnumber_t agno; /* allocation group number */
+ int error; /* error return value */
+ struct xfs_mount *mp; /* mount structure for filesystem */
+ struct xfs_inobt_rec_incore rec;/* btree record */
+
+ mp = tp->t_mountp;
+
+ /*
+ * Break up inode number into its components.
+ * Each component is validated as it is derived; a failed check logs
+ * a warning, trips ASSERT(0) and returns EINVAL.
+ */
+ agno = XFS_INO_TO_AGNO(mp, inode);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+ __func__, agno, mp->m_sb.sb_agcount);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agino = XFS_INO_TO_AGINO(mp, inode);
+ if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+ __func__, (unsigned long long)inode,
+ (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+ __func__, agbno, mp->m_sb.sb_agblocks);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ /*
+ * Get the allocation group header.
+ * NOTE(review): agbp is not released on any path in this function;
+ * presumably it stays attached to the transaction - confirm.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ /*
+ * Fix up the inode allocation btree.
+ * The resulting record is handed back in rec for the finobt update.
+ */
+ error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
+ &rec);
+ if (error)
+ goto error0;
+
+ /*
+ * Fix up the free inode btree.
+ * Only done when xfs_sb_version_hasfinobt() reports the feature.
+ */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+ if (error)
+ goto error0;
+ }
+
+ return 0;
+
+error0:
+ /* No local cleanup is done here; the error is returned as is. */
+ return error;
+}
+
+/*
+ * Find the inode chunk that contains @agino by consulting the inode btree.
+ *
+ * Reads the AGI for @agno, looks up the inobt record at or below @agino
+ * (XFS_LOOKUP_LE) and checks that the record really covers @agino. When
+ * XFS_IGET_UNTRUSTED is set in @flags the inode must additionally not be
+ * marked free in the record.
+ *
+ * On success *@chunk_agbno is the AG block of the chunk's first inode and
+ * *@offset_agbno is the distance of @agbno from that block. Returns 0 on
+ * success, EINVAL when no matching allocated inode is found, or the error
+ * reported by the AGI read or the btree routines. The AGI buffer and the
+ * cursor are always released before returning.
+ */
+STATIC int
+xfs_imap_lookup(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		*chunk_agbno,
+	xfs_agblock_t		*offset_agbno,
+	int			flags)
+{
+	struct xfs_inobt_rec_incore rec;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	int			error;
+	int			i;
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	if (error) {
+		xfs_alert(mp,
+			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+			__func__, error, agno);
+		return error;
+	}
+
+	/*
+	 * Lookup the inode record for the given agino. If the record cannot be
+	 * found, then it's an invalid inode number and we should abort. Once
+	 * we have a record, we need to ensure it contains the inode number
+	 * we are looking up.
+	 */
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+	if (!error) {
+		if (i)
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (!error && i == 0)
+			error = EINVAL;
+	}
+
+	/*
+	 * Tell the cursor teardown how the lookup really ended, as the other
+	 * error paths in this file do, instead of always claiming success.
+	 */
+	xfs_trans_brelse(tp, agbp);
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	if (error)
+		return error;
+
+	/* check that the returned record contains the required inode */
+	if (rec.ir_startino > agino ||
+	    rec.ir_startino + mp->m_ialloc_inos <= agino)
+		return EINVAL;
+
+	/* for untrusted inodes check it is allocated first */
+	if ((flags & XFS_IGET_UNTRUSTED) &&
+	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+		return EINVAL;
+
+	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+	*offset_agbno = agbno - *chunk_agbno;
+	return 0;
+}
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ */
+int
+xfs_imap(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t ino, /* inode to locate */
- xfs_fsblock_t *bno, /* output: block containing inode */
- int *len, /* output: num blocks in inode cluster */
- int *off, /* output: index in block of inode */
- uint flags) /* flags concerning inode lookup */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags) /* flags for inode btree lookup */
{
xfs_agblock_t agbno; /* block number of inode in the alloc group */
- xfs_buf_t *agbp; /* agi buffer */
xfs_agino_t agino; /* inode number within alloc group */
xfs_agnumber_t agno; /* allocation group number */
int blks_per_cluster; /* num blocks per inode cluster */
xfs_agblock_t chunk_agbno; /* first block in inode chunk */
- xfs_agino_t chunk_agino; /* first agino in inode chunk */
- __int32_t chunk_cnt; /* count of free inodes in chunk */
- xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
xfs_agblock_t cluster_agbno; /* first block in inode cluster */
- xfs_btree_cur_t *cur; /* inode btree cursor */
int error; /* error code */
- int i; /* temp state */
int offset; /* index of inode in its buffer */
- int offset_agbno; /* blks from chunk start to inode */
+ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
ASSERT(ino != NULLFSINO);
+
/*
* Split up the inode number into its parts.
*/
@@ -1173,112 +1824,107 @@ xfs_dilocate(
if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
#ifdef DEBUG
- /* no diagnostics for bulkstat, ino comes from userspace */
- if (flags & XFS_IMAP_BULKSTAT)
+ /*
+ * Don't output diagnostic information for untrusted inodes
+ * as they can be invalid without implying corruption.
+ */
+ if (flags & XFS_IGET_UNTRUSTED)
return XFS_ERROR(EINVAL);
if (agno >= mp->m_sb.sb_agcount) {
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_dilocate: agno (%d) >= "
- "mp->m_sb.sb_agcount (%d)",
- agno, mp->m_sb.sb_agcount);
+ xfs_alert(mp,
+ "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+ __func__, agno, mp->m_sb.sb_agcount);
}
if (agbno >= mp->m_sb.sb_agblocks) {
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_dilocate: agbno (0x%llx) >= "
- "mp->m_sb.sb_agblocks (0x%lx)",
- (unsigned long long) agbno,
- (unsigned long) mp->m_sb.sb_agblocks);
+ xfs_alert(mp,
+ "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+ __func__, (unsigned long long)agbno,
+ (unsigned long)mp->m_sb.sb_agblocks);
}
if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_dilocate: ino (0x%llx) != "
- "XFS_AGINO_TO_INO(mp, agno, agino) "
- "(0x%llx)",
- ino, XFS_AGINO_TO_INO(mp, agno, agino));
+ xfs_alert(mp,
+ "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+ __func__, ino,
+ XFS_AGINO_TO_INO(mp, agno, agino));
}
xfs_stack_trace();
#endif /* DEBUG */
return XFS_ERROR(EINVAL);
}
- if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) ||
- !(flags & XFS_IMAP_LOOKUP)) {
- offset = XFS_INO_TO_OFFSET(mp, ino);
- ASSERT(offset < mp->m_sb.sb_inopblock);
- *bno = XFS_AGB_TO_FSB(mp, agno, agbno);
- *off = offset;
- *len = 1;
- return 0;
+
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+
+ /*
+ * For bulkstat and handle lookups, we have an untrusted inode number
+ * that we have to verify is valid. We cannot do this just by reading
+ * the inode buffer as it may have been unlinked and removed leaving
+ * inodes in stale state on disk. Hence we have to do a btree lookup
+ * in all cases where an untrusted inode number is passed.
+ */
+ if (flags & XFS_IGET_UNTRUSTED) {
+ error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
+ if (error)
+ return error;
+ goto out_map;
}
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
- if (*bno != NULLFSBLOCK) {
+
+ /*
+ * If the inode cluster size is the same as the blocksize or
+ * smaller we get to the buffer by simple arithmetic.
+ */
+ if (blks_per_cluster == 1) {
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
- cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno);
- *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
- offset;
- *len = blks_per_cluster;
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, 1);
+ imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
return 0;
}
+
+ /*
+ * If the inode chunks are aligned then use simple maths to
+ * find the location. Otherwise we have to do a btree
+ * lookup to find the location.
+ */
if (mp->m_inoalign_mask) {
offset_agbno = agbno & mp->m_inoalign_mask;
chunk_agbno = agbno - offset_agbno;
} else {
- down_read(&mp->m_peraglock);
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
- up_read(&mp->m_peraglock);
- if (error) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
- "xfs_ialloc_read_agi() returned "
- "error %d, agno %d",
- error, agno);
-#endif /* DEBUG */
- return error;
- }
- cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
- (xfs_inode_t *)0, 0);
- if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
- "xfs_inobt_lookup_le() failed");
-#endif /* DEBUG */
- goto error0;
- }
- if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
- &chunk_free, &i))) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
- "xfs_inobt_get_rec() failed");
-#endif /* DEBUG */
- goto error0;
- }
- if (i == 0) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
- "xfs_inobt_get_rec() failed");
-#endif /* DEBUG */
- error = XFS_ERROR(EINVAL);
- }
- xfs_trans_brelse(tp, agbp);
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+ &chunk_agbno, &offset_agbno, flags);
if (error)
return error;
- chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
- offset_agbno = agbno - chunk_agbno;
}
+
+out_map:
ASSERT(agbno >= chunk_agbno);
cluster_agbno = chunk_agbno +
((offset_agbno / blks_per_cluster) * blks_per_cluster);
offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
XFS_INO_TO_OFFSET(mp, ino);
- *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno);
- *off = offset;
- *len = blks_per_cluster;
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+
+ /*
+ * If the inode number maps to a block outside the bounds
+ * of the file system then return an error rather than calling
+ * read_buf and panicking when we get an error from the
+ * driver.
+ */
+ if ((imap->im_blkno + imap->im_len) >
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+ xfs_alert(mp,
+ "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+ __func__, (unsigned long long) imap->im_blkno,
+ (unsigned long long) imap->im_len,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+ return XFS_ERROR(EINVAL);
+ }
return 0;
-error0:
- xfs_trans_brelse(tp, agbp);
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
}
/*
@@ -1305,7 +1951,16 @@ xfs_ialloc_compute_maxlevels(
}
/*
- * Log specified fields for the ag hdr (inode section)
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
*/
void
xfs_ialloc_log_agi(
@@ -1328,88 +1983,188 @@ xfs_ialloc_log_agi(
offsetof(xfs_agi_t, agi_newino),
offsetof(xfs_agi_t, agi_dirino),
offsetof(xfs_agi_t, agi_unlinked),
+ offsetof(xfs_agi_t, agi_free_root),
+ offsetof(xfs_agi_t, agi_free_level),
sizeof(xfs_agi_t)
};
#ifdef DEBUG
xfs_agi_t *agi; /* allocation group header */
agi = XFS_BUF_TO_AGI(bp);
- ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC);
+ ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+#endif
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
+
+ /*
+ * Compute byte offsets for the first and last fields in the first
+ * region and log the agi buffer. This only logs up through
+ * agi_unlinked.
+ */
+ if (fields & XFS_AGI_ALL_BITS_R1) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+
+ /*
+ * Mask off the bits in the first region and calculate the first and
+ * last field offsets for any bits in the second region.
+ */
+ fields &= ~XFS_AGI_ALL_BITS_R1;
+ if (fields) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+}
+
+#ifdef DEBUG
+/*
+ * Debug-only sanity check: assert that every bucket of the AGI unlinked
+ * hash table holds a non-zero value.
+ */
+STATIC void
+xfs_check_agi_unlinked(
+	struct xfs_agi		*agi)
+{
+	int			bucket = 0;
+
+	while (bucket < XFS_AGI_UNLINKED_BUCKETS) {
+		ASSERT(agi->agi_unlinked[bucket]);
+		bucket++;
+	}
+}
+#else
+#define xfs_check_agi_unlinked(agi)
#endif
+
+/*
+ * Structural checks shared by the AGI read and write verifiers.
+ * Returns true only if every check below passes.
+ */
+static bool
+xfs_agi_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+
+ /* With CRCs enabled, the AGI's UUID must match the superblock's. */
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+ return false;
/*
- * Compute byte offsets for the first and last fields.
+ * Validate the magic number of the agi block.
*/
- xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
+ if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+ return false;
+ if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
+ return false;
+
/*
- * Log the allocation group inode header buffer.
+ * During growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
*/
- xfs_trans_log_buf(tp, bp, first, last);
+ if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
+ return false;
+
+ xfs_check_agi_unlinked(agi);
+ return true;
+}
+
+/*
+ * Read verifier for AGI buffers. The checksum is tested first (only on
+ * CRC-enabled filesystems); the structural checks in xfs_agi_verify() run
+ * only when the checksum test did not fail. Any failure is recorded on the
+ * buffer and then reported through xfs_verifier_error().
+ */
+static void
+xfs_agi_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ /* NOTE(review): XFS_TEST_ERROR presumably also allows error injection via the IALLOC_READ_AGI tag - confirm. */
+ else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
+ XFS_ERRTAG_IALLOC_READ_AGI,
+ XFS_RANDOM_IALLOC_READ_AGI))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
+/*
+ * Write verifier for AGI buffers.
+ *
+ * A buffer that fails xfs_agi_verify() is flagged EFSCORRUPTED and reported.
+ * Otherwise, on CRC-enabled filesystems, the AGI is stamped with the LSN of
+ * the attached buffer log item (when there is one) and its checksum is
+ * recomputed.
+ */
+static void
+xfs_agi_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mount = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*log_item = bp->b_fspriv;
+
+	if (xfs_agi_verify(bp)) {
+		if (xfs_sb_version_hascrc(&mount->m_sb)) {
+			if (log_item != NULL)
+				XFS_BUF_TO_AGI(bp)->agi_lsn =
+					cpu_to_be64(log_item->bli_item.li_lsn);
+			xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
+		}
+		return;
+	}
+
+	xfs_buf_ioerror(bp, EFSCORRUPTED);
+	xfs_verifier_error(bp);
+}
+
+/* Verifier callbacks for AGI buffers; xfs_read_agi() passes these to the buffer read. */
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+ .verify_read = xfs_agi_read_verify,
+ .verify_write = xfs_agi_write_verify,
+};
+
/*
* Read in the allocation group header (inode allocation section)
*/
int
-xfs_ialloc_read_agi(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_buf_t **bpp) /* allocation group hdr buf */
+xfs_read_agi(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_buf **bpp) /* allocation group hdr buf */
{
- xfs_agi_t *agi; /* allocation group header */
- int agi_ok; /* agi is consistent */
- xfs_buf_t *bp; /* allocation group hdr buf */
- xfs_perag_t *pag; /* per allocation group data */
- int error;
+ int error;
+
+ trace_xfs_read_agi(mp, agno);
ASSERT(agno != NULLAGNUMBER);
- error = xfs_trans_read_buf(
- mp, tp, mp->m_ddev_targp,
+ /* The AGI verifier ops are handed to the read, replacing the inline checks removed below. */
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
if (error)
return error;
- ASSERT(bp && !XFS_BUF_GETERROR(bp));
- /*
- * Validate the magic number of the agi block.
- */
- agi = XFS_BUF_TO_AGI(bp);
- agi_ok =
- be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
- XFS_RANDOM_IALLOC_READ_AGI))) {
- XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW,
- mp, agi);
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- pag = &mp->m_perag[agno];
+ /* On success *bpp holds the AGI buffer; its reference value is set to XFS_AGI_REF. */
+ xfs_buf_set_ref(*bpp, XFS_AGI_REF);
+ return 0;
+}
+
+/*
+ * Read the AGI via xfs_read_agi() and, the first time it is read for this
+ * AG, initialise the in-core per-AG inode counters from it. Returns 0 with
+ * the buffer in *bpp, or the error from xfs_read_agi().
+ */
+int
+xfs_ialloc_read_agi(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_buf **bpp) /* allocation group hdr buf */
+{
+ struct xfs_agi *agi; /* allocation group header */
+ struct xfs_perag *pag; /* per allocation group data */
+ int error;
+
+ trace_xfs_ialloc_read_agi(mp, agno);
+
+ error = xfs_read_agi(mp, tp, agno, bpp);
+ if (error)
+ return error;
+
+ agi = XFS_BUF_TO_AGI(*bpp);
+ pag = xfs_perag_get(mp, agno);
+ /* Counters not yet initialised for this AG: seed them from the on-disk AGI. */
if (!pag->pagi_init) {
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
pag->pagi_count = be32_to_cpu(agi->agi_count);
pag->pagi_init = 1;
- } else {
- /*
- * It's possible for these to be out of sync if
- * we are in the middle of a forced shutdown.
- */
- ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
}
-#ifdef DEBUG
- {
- int i;
-
- for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
- ASSERT(agi->agi_unlinked[i]);
- }
-#endif
-
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
- *bpp = bp;
+ /*
+ * It's possible for these to be out of sync if
+ * we are in the middle of a forced shutdown.
+ */
+ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+ XFS_FORCED_SHUTDOWN(mp));
+ xfs_perag_put(pag);
return 0;
}