aboutsummaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_alloc.c')
-rw-r--r--fs/xfs/xfs_alloc.c896
1 files changed, 242 insertions, 654 deletions
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index ce84ffd0264..d43813267a8 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -17,24 +17,27 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
+#include "xfs_extent_busy.h"
#include "xfs_error.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_log.h"
+struct workqueue_struct *xfs_alloc_wq;
#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
@@ -46,8 +49,6 @@ STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
-STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
- xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
/*
* Lookup the record equal to [bno, len] in the btree given by cur.
@@ -68,7 +69,7 @@ xfs_alloc_lookup_eq(
* Lookup the first record greater than or equal to [bno, len]
* in the btree given by cur.
*/
-STATIC int /* error */
+int /* error */
xfs_alloc_lookup_ge(
struct xfs_btree_cur *cur, /* btree cursor */
xfs_agblock_t bno, /* starting block of extent */
@@ -151,7 +152,7 @@ xfs_alloc_compute_aligned(
xfs_extlen_t len;
/* Trim busy sections out of found extent */
- xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
+ xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
if (args->alignment > 1 && len >= args->minlen) {
xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
@@ -174,6 +175,7 @@ xfs_alloc_compute_diff(
xfs_agblock_t wantbno, /* target starting block */
xfs_extlen_t wantlen, /* target length */
xfs_extlen_t alignment, /* target alignment */
+ char userdata, /* are we allocating data? */
xfs_agblock_t freebno, /* freespace's starting block */
xfs_extlen_t freelen, /* freespace's length */
xfs_agblock_t *newbnop) /* result: best start block from free */
@@ -188,7 +190,14 @@ xfs_alloc_compute_diff(
ASSERT(freelen >= wantlen);
freeend = freebno + freelen;
wantend = wantbno + wantlen;
- if (freebno >= wantbno) {
+ /*
+ * We want to allocate from the start of a free extent if it is past
+ * the desired block or if we are allocating user data and the free
+ * extent is before desired block. The second case is there to allow
+ * for contiguous allocation from the remaining free space if the file
+ * grows in the short term.
+ */
+ if (freebno >= wantbno || (userdata && freeend < wantend)) {
if ((newbno1 = roundup(freebno, alignment)) >= freeend)
newbno1 = NULLAGBLOCK;
} else if (freeend >= wantend && alignment > 1) {
@@ -248,16 +257,14 @@ xfs_alloc_fix_len(
k = rlen % args->prod;
if (k == args->mod)
return;
- if (k > args->mod) {
- if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen)
- return;
- } else {
- if ((int)(rlen = rlen - args->prod - (args->mod - k)) <
- (int)args->minlen)
- return;
- }
- ASSERT(rlen >= args->minlen);
- ASSERT(rlen <= args->maxlen);
+ if (k > args->mod)
+ rlen = rlen - (k - args->mod);
+ else
+ rlen = rlen - args->prod + (args->mod - k);
+ if ((int)rlen < (int)args->minlen)
+ return;
+ ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+ ASSERT(rlen % args->prod == args->mod);
args->len = rlen;
}
@@ -431,6 +438,87 @@ xfs_alloc_fixup_trees(
return 0;
}
+static bool
+xfs_agfl_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ int i;
+
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+ return false;
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+ return false;
+
+ for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+ if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
+ be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+ return false;
+ }
+ return true;
+}
+
+static void
+xfs_agfl_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ /*
+ * There is no verification of non-crc AGFLs because mkfs does not
+ * initialise the AGFL to zero or NULL. Hence the only valid part of the
+ * AGFL is what the AGF says is active. We can't get to the AGF, so we
+ * can't verify just those entries are valid.
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_agfl_verify(bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+static void
+xfs_agfl_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ /* no verification of non-crc AGFLs */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (!xfs_agfl_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (bip)
+ XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .verify_read = xfs_agfl_read_verify,
+ .verify_write = xfs_agfl_write_verify,
+};
+
/*
* Read in the allocation group free block array.
*/
@@ -448,10 +536,9 @@ xfs_alloc_read_agfl(
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(bp));
xfs_buf_set_ref(bp, XFS_AGFL_REF);
*bpp = bp;
return 0;
@@ -535,7 +622,7 @@ xfs_alloc_ag_vextent(
if (error)
return error;
- ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
+ ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
args->agbno, args->len));
}
@@ -602,7 +689,7 @@ xfs_alloc_ag_vextent_exact(
/*
* Check for overlapping busy extents.
*/
- xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
+ xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
/*
* Give up if the start of the extent is busy, or the freespace isn't
@@ -719,7 +806,8 @@ xfs_alloc_find_best_extent(
xfs_alloc_fix_len(args);
sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, *sbnoa,
+ args->alignment,
+ args->userdata, *sbnoa,
*slena, &new);
/*
@@ -783,13 +871,13 @@ xfs_alloc_ag_vextent_near(
xfs_agblock_t ltnew; /* useful start bno of left side */
xfs_extlen_t rlen; /* length of returned extent */
int forced = 0;
-#if defined(DEBUG) && defined(__KERNEL__)
+#ifdef DEBUG
/*
* Randomly don't execute the first algorithm.
*/
int dofirst; /* set to do first algorithm */
- dofirst = random32() & 1;
+ dofirst = prandom_u32() & 1;
#endif
restart:
@@ -843,8 +931,8 @@ restart:
xfs_extlen_t blen=0;
xfs_agblock_t bnew=0;
-#if defined(DEBUG) && defined(__KERNEL__)
- if (!dofirst)
+#ifdef DEBUG
+ if (dofirst)
break;
#endif
/*
@@ -890,7 +978,8 @@ restart:
if (args->len < blen)
continue;
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbnoa, ltlena, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
if (ltnew != NULLAGBLOCK &&
(args->len > blen || ltdiff < bdiff)) {
bdiff = ltdiff;
@@ -1042,7 +1131,8 @@ restart:
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbnoa, ltlena, &ltnew);
+ args->alignment, args->userdata, ltbnoa,
+ ltlena, &ltnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_lt, &bno_cur_gt,
@@ -1058,7 +1148,8 @@ restart:
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, gtbnoa, gtlena, &gtnew);
+ args->alignment, args->userdata, gtbnoa,
+ gtlena, &gtnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_gt, &bno_cur_lt,
@@ -1075,12 +1166,13 @@ restart:
* If we couldn't get anything, give up.
*/
if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
if (!forced++) {
trace_xfs_alloc_near_busy(args);
xfs_log_force(args->mp, XFS_LOG_SYNC);
goto restart;
}
-
trace_xfs_alloc_size_neither(args);
args->agbno = NULLAGBLOCK;
return 0;
@@ -1116,7 +1208,7 @@ restart:
}
rlen = args->len;
(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
- ltbnoa, ltlena, &ltnew);
+ args->userdata, ltbnoa, ltlena, &ltnew);
ASSERT(ltnew >= ltbno);
ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
@@ -1390,7 +1482,7 @@ xfs_alloc_ag_vextent_small(
if (error)
goto error0;
if (fbno != NULLAGBLOCK) {
- xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
+ xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
args->userdata);
if (args->userdata) {
@@ -1866,12 +1958,11 @@ xfs_alloc_fix_freelist(
/*
* Initialize the args structure.
*/
+ memset(&targs, 0, sizeof(targs));
targs.tp = tp;
targs.mp = mp;
targs.agbp = agbp;
targs.agno = args->agno;
- targs.mod = targs.minleft = targs.wasdel = targs.userdata =
- targs.minalignslop = 0;
targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
targs.type = XFS_ALLOCTYPE_THIS_AG;
targs.pag = pag;
@@ -1929,18 +2020,18 @@ xfs_alloc_get_freelist(
int btreeblk) /* destination is a AGF btree */
{
xfs_agf_t *agf; /* a.g. freespace structure */
- xfs_agfl_t *agfl; /* a.g. freelist structure */
xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
xfs_agblock_t bno; /* block number returned */
+ __be32 *agfl_bno;
int error;
int logflags;
- xfs_mount_t *mp; /* mount structure */
+ xfs_mount_t *mp = tp->t_mountp;
xfs_perag_t *pag; /* per allocation group data */
- agf = XFS_BUF_TO_AGF(agbp);
/*
* Freelist is empty, give up.
*/
+ agf = XFS_BUF_TO_AGF(agbp);
if (!agf->agf_flcount) {
*bnop = NULLAGBLOCK;
return 0;
@@ -1948,15 +2039,17 @@ xfs_alloc_get_freelist(
/*
* Read the array of free blocks.
*/
- mp = tp->t_mountp;
- if ((error = xfs_alloc_read_agfl(mp, tp,
- be32_to_cpu(agf->agf_seqno), &agflbp)))
+ error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
+ &agflbp);
+ if (error)
return error;
- agfl = XFS_BUF_TO_AGFL(agflbp);
+
+
/*
* Get the block number and update the data structures.
*/
- bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
be32_add_cpu(&agf->agf_flfirst, 1);
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
@@ -2005,11 +2098,14 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_freeblks),
offsetof(xfs_agf_t, agf_longest),
offsetof(xfs_agf_t, agf_btreeblks),
+ offsetof(xfs_agf_t, agf_uuid),
sizeof(xfs_agf_t)
};
trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
+
xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
}
@@ -2046,12 +2142,13 @@ xfs_alloc_put_freelist(
int btreeblk) /* block came from a AGF btree */
{
xfs_agf_t *agf; /* a.g. freespace structure */
- xfs_agfl_t *agfl; /* a.g. free block array */
__be32 *blockp;/* pointer to array entry */
int error;
int logflags;
xfs_mount_t *mp; /* mount structure */
xfs_perag_t *pag; /* per allocation group data */
+ __be32 *agfl_bno;
+ int startoff;
agf = XFS_BUF_TO_AGF(agbp);
mp = tp->t_mountp;
@@ -2059,7 +2156,6 @@ xfs_alloc_put_freelist(
if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
be32_to_cpu(agf->agf_seqno), &agflbp)))
return error;
- agfl = XFS_BUF_TO_AGFL(agflbp);
be32_add_cpu(&agf->agf_fllast, 1);
if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
agf->agf_fllast = 0;
@@ -2080,16 +2176,101 @@ xfs_alloc_put_freelist(
xfs_alloc_log_agf(tp, agbp, logflags);
ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
- blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
*blockp = cpu_to_be32(bno);
+ startoff = (char *)blockp - (char *)agflbp->b_addr;
+
xfs_alloc_log_agf(tp, agbp, logflags);
- xfs_trans_log_buf(tp, agflbp,
- (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
- (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
- sizeof(xfs_agblock_t) - 1));
+
+ xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
+ xfs_trans_log_buf(tp, agflbp, startoff,
+ startoff + sizeof(xfs_agblock_t) - 1);
return 0;
}
+static bool
+xfs_agf_verify(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+ {
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+ return false;
+
+ if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+ XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+ be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
+ return false;
+
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
+ return false;
+
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
+ return false;
+
+ return true;;
+
+}
+
+static void
+xfs_agf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+ XFS_ERRTAG_ALLOC_READ_AGF,
+ XFS_RANDOM_ALLOC_READ_AGF))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+static void
+xfs_agf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_buf_log_item *bip = bp->b_fspriv;
+
+ if (!xfs_agf_verify(mp, bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return;
+
+ if (bip)
+ XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+ xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .verify_read = xfs_agf_read_verify,
+ .verify_write = xfs_agf_write_verify,
+};
+
/*
* Read in the allocation group header (free/alloc section).
*/
@@ -2101,44 +2282,21 @@ xfs_read_agf(
int flags, /* XFS_BUF_ */
struct xfs_buf **bpp) /* buffer for the ag freelist header */
{
- struct xfs_agf *agf; /* ag freelist header */
- int agf_ok; /* set if agf is consistent */
int error;
+ trace_xfs_read_agf(mp, agno);
+
ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), flags, bpp);
+ XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
if (error)
return error;
if (!*bpp)
return 0;
ASSERT(!(*bpp)->b_error);
- agf = XFS_BUF_TO_AGF(*bpp);
-
- /*
- * Validate the magic number of the agf block.
- */
- agf_ok =
- agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
- be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
- be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_seqno) == agno;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
- agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
- be32_to_cpu(agf->agf_length);
- if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
- XFS_RANDOM_ALLOC_READ_AGF))) {
- XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
- XFS_ERRLEVEL_LOW, mp, agf);
- xfs_trans_brelse(tp, *bpp);
- return XFS_ERROR(EFSCORRUPTED);
- }
xfs_buf_set_ref(*bpp, XFS_AGF_REF);
return 0;
}
@@ -2158,8 +2316,9 @@ xfs_alloc_read_agf(
struct xfs_perag *pag; /* per allocation group data */
int error;
- ASSERT(agno != NULLAGNUMBER);
+ trace_xfs_alloc_read_agf(mp, agno);
+ ASSERT(agno != NULLAGNUMBER);
error = xfs_read_agf(mp, tp, agno,
(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
bpp);
@@ -2464,579 +2623,8 @@ xfs_free_extent(
error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
if (!error)
- xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
+ xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0);
error0:
xfs_perag_put(args.pag);
return error;
}
-
-void
-xfs_alloc_busy_insert(
- struct xfs_trans *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- unsigned int flags)
-{
- struct xfs_busy_extent *new;
- struct xfs_busy_extent *busyp;
- struct xfs_perag *pag;
- struct rb_node **rbp;
- struct rb_node *parent = NULL;
-
- new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
- if (!new) {
- /*
- * No Memory! Since it is now not possible to track the free
- * block, make this a synchronous transaction to insure that
- * the block is not reused before this transaction commits.
- */
- trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
- xfs_trans_set_sync(tp);
- return;
- }
-
- new->agno = agno;
- new->bno = bno;
- new->length = len;
- INIT_LIST_HEAD(&new->list);
- new->flags = flags;
-
- /* trace before insert to be able to see failed inserts */
- trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
-
- pag = xfs_perag_get(tp->t_mountp, new->agno);
- spin_lock(&pag->pagb_lock);
- rbp = &pag->pagb_tree.rb_node;
- while (*rbp) {
- parent = *rbp;
- busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
-
- if (new->bno < busyp->bno) {
- rbp = &(*rbp)->rb_left;
- ASSERT(new->bno + new->length <= busyp->bno);
- } else if (new->bno > busyp->bno) {
- rbp = &(*rbp)->rb_right;
- ASSERT(bno >= busyp->bno + busyp->length);
- } else {
- ASSERT(0);
- }
- }
-
- rb_link_node(&new->rb_node, parent, rbp);
- rb_insert_color(&new->rb_node, &pag->pagb_tree);
-
- list_add(&new->list, &tp->t_busy);
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
-}
-
-/*
- * Search for a busy extent within the range of the extent we are about to
- * allocate. You need to be holding the busy extent tree lock when calling
- * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
- * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
- * match. This is done so that a non-zero return indicates an overlap that
- * will require a synchronous transaction, but it can still be
- * used to distinguish between a partial or exact match.
- */
-int
-xfs_alloc_busy_search(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
-{
- struct xfs_perag *pag;
- struct rb_node *rbp;
- struct xfs_busy_extent *busyp;
- int match = 0;
-
- pag = xfs_perag_get(mp, agno);
- spin_lock(&pag->pagb_lock);
-
- rbp = pag->pagb_tree.rb_node;
-
- /* find closest start bno overlap */
- while (rbp) {
- busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
- if (bno < busyp->bno) {
- /* may overlap, but exact start block is lower */
- if (bno + len > busyp->bno)
- match = -1;
- rbp = rbp->rb_left;
- } else if (bno > busyp->bno) {
- /* may overlap, but exact start block is higher */
- if (bno < busyp->bno + busyp->length)
- match = -1;
- rbp = rbp->rb_right;
- } else {
- /* bno matches busyp, length determines exact match */
- match = (busyp->length == len) ? 1 : -1;
- break;
- }
- }
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
- return match;
-}
-
-/*
- * The found free extent [fbno, fend] overlaps part or all of the given busy
- * extent. If the overlap covers the beginning, the end, or all of the busy
- * extent, the overlapping portion can be made unbusy and used for the
- * allocation. We can't split a busy extent because we can't modify a
- * transaction/CIL context busy list, but we can update an entries block
- * number or length.
- *
- * Returns true if the extent can safely be reused, or false if the search
- * needs to be restarted.
- */
-STATIC bool
-xfs_alloc_busy_update_extent(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
- struct xfs_busy_extent *busyp,
- xfs_agblock_t fbno,
- xfs_extlen_t flen,
- bool userdata)
-{
- xfs_agblock_t fend = fbno + flen;
- xfs_agblock_t bbno = busyp->bno;
- xfs_agblock_t bend = bbno + busyp->length;
-
- /*
- * This extent is currently being discarded. Give the thread
- * performing the discard a chance to mark the extent unbusy
- * and retry.
- */
- if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
- spin_unlock(&pag->pagb_lock);
- delay(1);
- spin_lock(&pag->pagb_lock);
- return false;
- }
-
- /*
- * If there is a busy extent overlapping a user allocation, we have
- * no choice but to force the log and retry the search.
- *
- * Fortunately this does not happen during normal operation, but
- * only if the filesystem is very low on space and has to dip into
- * the AGFL for normal allocations.
- */
- if (userdata)
- goto out_force_log;
-
- if (bbno < fbno && bend > fend) {
- /*
- * Case 1:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +---------+
- * fbno fend
- */
-
- /*
- * We would have to split the busy extent to be able to track
- * it correct, which we cannot do because we would have to
- * modify the list of busy extents attached to the transaction
- * or CIL context, which is immutable.
- *
- * Force out the log to clear the busy extent and retry the
- * search.
- */
- goto out_force_log;
- } else if (bbno >= fbno && bend <= fend) {
- /*
- * Case 2:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------+
- * fbno fend
- *
- * Case 3:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Case 4:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Case 5:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------------------------+
- * fbno fend
- *
- */
-
- /*
- * The busy extent is fully covered by the extent we are
- * allocating, and can simply be removed from the rbtree.
- * However we cannot remove it from the immutable list
- * tracking busy extents in the transaction or CIL context,
- * so set the length to zero to mark it invalid.
- *
- * We also need to restart the busy extent search from the
- * tree root, because erasing the node can rearrange the
- * tree topology.
- */
- rb_erase(&busyp->rb_node, &pag->pagb_tree);
- busyp->length = 0;
- return false;
- } else if (fend < bend) {
- /*
- * Case 6:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +---------+
- * fbno fend
- *
- * Case 7:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +------------------+
- * fbno fend
- *
- */
- busyp->bno = fend;
- } else if (bbno < fbno) {
- /*
- * Case 8:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-------------+
- * fbno fend
- *
- * Case 9:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +----------------------+
- * fbno fend
- */
- busyp->length = fbno - busyp->bno;
- } else {
- ASSERT(0);
- }
-
- trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
- return true;
-
-out_force_log:
- spin_unlock(&pag->pagb_lock);
- xfs_log_force(mp, XFS_LOG_SYNC);
- trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
- spin_lock(&pag->pagb_lock);
- return false;
-}
-
-
-/*
- * For a given extent [fbno, flen], make sure we can reuse it safely.
- */
-void
-xfs_alloc_busy_reuse(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- xfs_agblock_t fbno,
- xfs_extlen_t flen,
- bool userdata)
-{
- struct xfs_perag *pag;
- struct rb_node *rbp;
-
- ASSERT(flen > 0);
-
- pag = xfs_perag_get(mp, agno);
- spin_lock(&pag->pagb_lock);
-restart:
- rbp = pag->pagb_tree.rb_node;
- while (rbp) {
- struct xfs_busy_extent *busyp =
- rb_entry(rbp, struct xfs_busy_extent, rb_node);
- xfs_agblock_t bbno = busyp->bno;
- xfs_agblock_t bend = bbno + busyp->length;
-
- if (fbno + flen <= bbno) {
- rbp = rbp->rb_left;
- continue;
- } else if (fbno >= bend) {
- rbp = rbp->rb_right;
- continue;
- }
-
- if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
- userdata))
- goto restart;
- }
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
-}
-
-/*
- * For a given extent [fbno, flen], search the busy extent list to find a
- * subset of the extent that is not busy. If *rlen is smaller than
- * args->minlen no suitable extent could be found, and the higher level
- * code needs to force out the log and retry the allocation.
- */
-STATIC void
-xfs_alloc_busy_trim(
- struct xfs_alloc_arg *args,
- xfs_agblock_t bno,
- xfs_extlen_t len,
- xfs_agblock_t *rbno,
- xfs_extlen_t *rlen)
-{
- xfs_agblock_t fbno;
- xfs_extlen_t flen;
- struct rb_node *rbp;
-
- ASSERT(len > 0);
-
- spin_lock(&args->pag->pagb_lock);
-restart:
- fbno = bno;
- flen = len;
- rbp = args->pag->pagb_tree.rb_node;
- while (rbp && flen >= args->minlen) {
- struct xfs_busy_extent *busyp =
- rb_entry(rbp, struct xfs_busy_extent, rb_node);
- xfs_agblock_t fend = fbno + flen;
- xfs_agblock_t bbno = busyp->bno;
- xfs_agblock_t bend = bbno + busyp->length;
-
- if (fend <= bbno) {
- rbp = rbp->rb_left;
- continue;
- } else if (fbno >= bend) {
- rbp = rbp->rb_right;
- continue;
- }
-
- /*
- * If this is a metadata allocation, try to reuse the busy
- * extent instead of trimming the allocation.
- */
- if (!args->userdata &&
- !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
- if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
- busyp, fbno, flen,
- false))
- goto restart;
- continue;
- }
-
- if (bbno <= fbno) {
- /* start overlap */
-
- /*
- * Case 1:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +---------+
- * fbno fend
- *
- * Case 2:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-------------+
- * fbno fend
- *
- * Case 3:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-------------+
- * fbno fend
- *
- * Case 4:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------+
- * fbno fend
- *
- * No unbusy region in extent, return failure.
- */
- if (fend <= bend)
- goto fail;
-
- /*
- * Case 5:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +----------------------+
- * fbno fend
- *
- * Case 6:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Needs to be trimmed to:
- * +-------+
- * fbno fend
- */
- fbno = bend;
- } else if (bend >= fend) {
- /* end overlap */
-
- /*
- * Case 7:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +------------------+
- * fbno fend
- *
- * Case 8:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +--------------------------+
- * fbno fend
- *
- * Needs to be trimmed to:
- * +-------+
- * fbno fend
- */
- fend = bbno;
- } else {
- /* middle overlap */
-
- /*
- * Case 9:
- * bbno bend
- * +BBBBBBBBBBBBBBBBB+
- * +-----------------------------------+
- * fbno fend
- *
- * Can be trimmed to:
- * +-------+ OR +-------+
- * fbno fend fbno fend
- *
- * Backward allocation leads to significant
- * fragmentation of directories, which degrades
- * directory performance, therefore we always want to
- * choose the option that produces forward allocation
- * patterns.
- * Preferring the lower bno extent will make the next
- * request use "fend" as the start of the next
- * allocation; if the segment is no longer busy at
- * that point, we'll get a contiguous allocation, but
- * even if it is still busy, we will get a forward
- * allocation.
- * We try to avoid choosing the segment at "bend",
- * because that can lead to the next allocation
- * taking the segment at "fbno", which would be a
- * backward allocation. We only use the segment at
- * "fbno" if it is much larger than the current
- * requested size, because in that case there's a
- * good chance subsequent allocations will be
- * contiguous.
- */
- if (bbno - fbno >= args->maxlen) {
- /* left candidate fits perfect */
- fend = bbno;
- } else if (fend - bend >= args->maxlen * 4) {
- /* right candidate has enough free space */
- fbno = bend;
- } else if (bbno - fbno >= args->minlen) {
- /* left candidate fits minimum requirement */
- fend = bbno;
- } else {
- goto fail;
- }
- }
-
- flen = fend - fbno;
- }
- spin_unlock(&args->pag->pagb_lock);
-
- if (fbno != bno || flen != len) {
- trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
- fbno, flen);
- }
- *rbno = fbno;
- *rlen = flen;
- return;
-fail:
- /*
- * Return a zero extent length as failure indications. All callers
- * re-check if the trimmed extent satisfies the minlen requirement.
- */
- spin_unlock(&args->pag->pagb_lock);
- trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
- *rbno = fbno;
- *rlen = 0;
-}
-
-static void
-xfs_alloc_busy_clear_one(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
- struct xfs_busy_extent *busyp)
-{
- if (busyp->length) {
- trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
- busyp->length);
- rb_erase(&busyp->rb_node, &pag->pagb_tree);
- }
-
- list_del_init(&busyp->list);
- kmem_free(busyp);
-}
-
-/*
- * Remove all extents on the passed in list from the busy extents tree.
- * If do_discard is set skip extents that need to be discarded, and mark
- * these as undergoing a discard operation instead.
- */
-void
-xfs_alloc_busy_clear(
- struct xfs_mount *mp,
- struct list_head *list,
- bool do_discard)
-{
- struct xfs_busy_extent *busyp, *n;
- struct xfs_perag *pag = NULL;
- xfs_agnumber_t agno = NULLAGNUMBER;
-
- list_for_each_entry_safe(busyp, n, list, list) {
- if (busyp->agno != agno) {
- if (pag) {
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
- }
- pag = xfs_perag_get(mp, busyp->agno);
- spin_lock(&pag->pagb_lock);
- agno = busyp->agno;
- }
-
- if (do_discard && busyp->length &&
- !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
- busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
- else
- xfs_alloc_busy_clear_one(mp, pag, busyp);
- }
-
- if (pag) {
- spin_unlock(&pag->pagb_lock);
- xfs_perag_put(pag);
- }
-}
-
-/*
- * Callback for list_sort to sort busy extents by the AG they reside in.
- */
-int
-xfs_busy_extent_ag_cmp(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- return container_of(a, struct xfs_busy_extent, list)->agno -
- container_of(b, struct xfs_busy_extent, list)->agno;
-}