aboutsummaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_btree.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_btree.c')
-rw-r--r--fs/xfs/xfs_btree.c341
1 files changed, 270 insertions, 71 deletions
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 0903960410a..cf893bc1e37 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -17,24 +17,23 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_alloc.h"
/*
* Cursor allocation zone.
@@ -45,9 +44,10 @@ kmem_zone_t *xfs_btree_cur_zone;
* Btree magic numbers.
*/
static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
- { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC },
+ { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+ XFS_FIBT_MAGIC },
{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
- XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC }
+ XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
};
#define xfs_btree_magic(cur) \
xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
@@ -236,8 +236,7 @@ xfs_btree_lblock_calc_crc(
return;
if (bip)
block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_BTREE_LBLOCK_CRC_OFF);
+ xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
}
bool
@@ -245,8 +244,8 @@ xfs_btree_lblock_verify_crc(
struct xfs_buf *bp)
{
if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
- return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_BTREE_LBLOCK_CRC_OFF);
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+
return true;
}
@@ -269,8 +268,7 @@ xfs_btree_sblock_calc_crc(
return;
if (bip)
block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_BTREE_SBLOCK_CRC_OFF);
+ xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
}
bool
@@ -278,8 +276,8 @@ xfs_btree_sblock_verify_crc(
struct xfs_buf *bp)
{
if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
- return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_BTREE_SBLOCK_CRC_OFF);
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+
return true;
}
@@ -510,7 +508,7 @@ xfs_btree_ptr_addr(
}
/*
- * Get a the root block which is stored in the inode.
+ * Get the root block which is stored in the inode.
*
* For now this btree implementation assumes the btree root is always
* stored in the if_broot field of an inode fork.
@@ -556,14 +554,11 @@ xfs_btree_get_bufl(
xfs_fsblock_t fsbno, /* file system block number */
uint lock) /* lock flags for get_buf */
{
- xfs_buf_t *bp; /* buffer pointer (return value) */
xfs_daddr_t d; /* real disk block address */
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
- ASSERT(!xfs_buf_geterror(bp));
- return bp;
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
}
/*
@@ -578,15 +573,12 @@ xfs_btree_get_bufs(
xfs_agblock_t agbno, /* allocation group block number */
uint lock) /* lock flags for get_buf */
{
- xfs_buf_t *bp; /* buffer pointer (return value) */
xfs_daddr_t d; /* real disk block address */
ASSERT(agno != NULLAGNUMBER);
ASSERT(agbno != NULLAGBLOCK);
d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
- ASSERT(!xfs_buf_geterror(bp));
- return bp;
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
}
/*
@@ -726,7 +718,6 @@ xfs_btree_read_bufl(
mp->m_bsize, lock, &bp, ops);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(bp));
if (bp)
xfs_buf_set_ref(bp, refval);
*bpp = bp;
@@ -855,6 +846,41 @@ xfs_btree_readahead(
return xfs_btree_readahead_sblock(cur, lr, block);
}
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+
+ return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+ } else {
+ ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+ ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+
+ return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+ be32_to_cpu(ptr->s));
+ }
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ xfs_extlen_t count)
+{
+ xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
+ xfs_btree_ptr_to_daddr(cur, ptr),
+ cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
/*
* Set the buffer for level "lev" in the cursor to bp, releasing
* any previous buffer.
@@ -978,6 +1004,7 @@ xfs_btree_init_block_int(
buf->bb_u.l.bb_owner = cpu_to_be64(owner);
uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
buf->bb_u.l.bb_pad = 0;
+ buf->bb_u.l.bb_lsn = 0;
}
} else {
/* owner is a 32 bit value on short blocks */
@@ -989,6 +1016,7 @@ xfs_btree_init_block_int(
buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+ buf->bb_u.s.bb_lsn = 0;
}
}
}
@@ -1071,24 +1099,6 @@ xfs_btree_buf_to_ptr(
}
}
-STATIC xfs_daddr_t
-xfs_btree_ptr_to_daddr(
- struct xfs_btree_cur *cur,
- union xfs_btree_ptr *ptr)
-{
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
- ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
-
- return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
- } else {
- ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
- ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
-
- return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
- be32_to_cpu(ptr->s));
- }
-}
-
STATIC void
xfs_btree_set_refs(
struct xfs_btree_cur *cur,
@@ -1100,6 +1110,7 @@ xfs_btree_set_refs(
xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
break;
case XFS_BTNUM_INO:
+ case XFS_BTNUM_FINO:
xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
break;
case XFS_BTNUM_BMAP:
@@ -1144,7 +1155,6 @@ STATIC int
xfs_btree_read_buf_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr,
- int level,
int flags,
struct xfs_btree_block **block,
struct xfs_buf **bpp)
@@ -1163,7 +1173,6 @@ xfs_btree_read_buf_block(
if (error)
return error;
- ASSERT(!xfs_buf_geterror(*bpp));
xfs_btree_set_refs(cur, *bpp);
*block = XFS_BUF_TO_BLOCK(*bpp);
return 0;
@@ -1502,8 +1511,8 @@ xfs_btree_increment(
union xfs_btree_ptr *ptrp;
ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
- error = xfs_btree_read_buf_block(cur, ptrp, --lev,
- 0, &block, &bp);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
@@ -1601,8 +1610,8 @@ xfs_btree_decrement(
union xfs_btree_ptr *ptrp;
ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
- error = xfs_btree_read_buf_block(cur, ptrp, --lev,
- 0, &block, &bp);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
@@ -1652,7 +1661,7 @@ xfs_btree_lookup_get_block(
return 0;
}
- error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
+ error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
if (error)
return error;
@@ -1684,7 +1693,7 @@ xfs_lookup_get_search_key(
/*
* Lookup the record. The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
+ * stat is set to 0 if can't find any such record, 1 for success.
*/
int /* error */
xfs_btree_lookup(
@@ -2003,7 +2012,7 @@ xfs_btree_lshift(
goto out0;
/* Set up the left neighbor as "left". */
- error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
@@ -2187,7 +2196,7 @@ xfs_btree_rshift(
goto out0;
/* Set up the right neighbor as "right". */
- error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
@@ -2315,7 +2324,7 @@ error1:
* record (to be inserted into parent).
*/
STATIC int /* error */
-xfs_btree_split(
+__xfs_btree_split(
struct xfs_btree_cur *cur,
int level,
union xfs_btree_ptr *ptrp,
@@ -2357,7 +2366,7 @@ xfs_btree_split(
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2455,7 +2464,7 @@ xfs_btree_split(
* point back to right instead of to left.
*/
if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
- error = xfs_btree_read_buf_block(cur, &rrptr, level,
+ error = xfs_btree_read_buf_block(cur, &rrptr,
0, &rrblock, &rrbp);
if (error)
goto error0;
@@ -2495,6 +2504,85 @@ error0:
return error;
}
+struct xfs_btree_split_args {
+ struct xfs_btree_cur *cur;
+ int level;
+ union xfs_btree_ptr *ptrp;
+ union xfs_btree_key *key;
+ struct xfs_btree_cur **curp;
+ int *stat; /* success/failure */
+ int result;
+ bool kswapd; /* allocation in kswapd context */
+ struct completion *done;
+ struct work_struct work;
+};
+
+/*
+ * Stack switching interfaces for allocation
+ */
+static void
+xfs_btree_split_worker(
+ struct work_struct *work)
+{
+ struct xfs_btree_split_args *args = container_of(work,
+ struct xfs_btree_split_args, work);
+ unsigned long pflags;
+ unsigned long new_pflags = PF_FSTRANS;
+
+ /*
+ * we are in a transaction context here, but may also be doing work
+ * in kswapd context, and hence we may need to inherit that state
+ * temporarily to ensure that we don't block waiting for memory reclaim
+ * in any way.
+ */
+ if (args->kswapd)
+ new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+
+ current_set_flags_nested(&pflags, new_pflags);
+
+ args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
+ args->key, args->curp, args->stat);
+ complete(args->done);
+
+ current_restore_flags_nested(&pflags, new_pflags);
+}
+
+/*
+ * BMBT split requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. For the other
+ * btree types, just call directly to avoid the context switch overhead here.
+ */
+STATIC int /* error */
+xfs_btree_split(
+ struct xfs_btree_cur *cur,
+ int level,
+ union xfs_btree_ptr *ptrp,
+ union xfs_btree_key *key,
+ struct xfs_btree_cur **curp,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_split_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ if (cur->bc_btnum != XFS_BTNUM_BMAP)
+ return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+
+ args.cur = cur;
+ args.level = level;
+ args.ptrp = ptrp;
+ args.key = key;
+ args.curp = curp;
+ args.stat = stat;
+ args.done = &done;
+ args.kswapd = current_is_kswapd();
+ INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+ queue_work(xfs_alloc_wq, &args.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&args.work);
+ return args.result;
+}
+
+
/*
* Copy the old inode root contents into a real block and make the
* broot point to it.
@@ -2530,7 +2618,7 @@ xfs_btree_new_iroot(
pp = xfs_btree_ptr_addr(cur, 1, block);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
if (error)
goto error0;
if (*stat == 0) {
@@ -2634,7 +2722,7 @@ xfs_btree_new_root(
cur->bc_ops->init_ptr_from_cur(cur, &rptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2669,8 +2757,7 @@ xfs_btree_new_root(
lbp = bp;
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
left = block;
- error = xfs_btree_read_buf_block(cur, &rptr,
- cur->bc_nlevels - 1, 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
bp = rbp;
@@ -2681,8 +2768,7 @@ xfs_btree_new_root(
xfs_btree_buf_to_ptr(cur, rbp, &rptr);
right = block;
xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
- error = xfs_btree_read_buf_block(cur, &lptr,
- cur->bc_nlevels - 1, 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
bp = lbp;
@@ -2756,7 +2842,6 @@ xfs_btree_make_block_unfull(
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
/* A root block that can be made bigger. */
-
xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
} else {
/* A root block that needs replacing */
@@ -3635,8 +3720,7 @@ xfs_btree_delrec(
rptr = cptr;
right = block;
rbp = bp;
- error = xfs_btree_read_buf_block(cur, &lptr, level,
- 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
@@ -3653,8 +3737,7 @@ xfs_btree_delrec(
lptr = cptr;
left = block;
lbp = bp;
- error = xfs_btree_read_buf_block(cur, &rptr, level,
- 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
@@ -3726,8 +3809,7 @@ xfs_btree_delrec(
/* If there is a right sibling, point it to the remaining block. */
xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
if (!xfs_btree_ptr_is_null(cur, &cptr)) {
- error = xfs_btree_read_buf_block(cur, &cptr, level,
- 0, &rrblock, &rrbp);
+ error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
if (error)
goto error0;
xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
@@ -3868,3 +3950,120 @@ xfs_btree_get_rec(
*stat = 1;
return 0;
}
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers were are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space that we can fit in the log....
+ *
+ * We do the btree walk in the most optimal manner possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction. If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+static int
+xfs_btree_block_change_owner(
+ struct xfs_btree_cur *cur,
+ int level,
+ __uint64_t new_owner,
+ struct list_head *buffer_list)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_ptr rptr;
+
+ /* do right sibling readahead */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+ /* modify the owner */
+ block = xfs_btree_get_block(cur, level, &bp);
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+ else
+ block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+
+ /*
+ * If the block is a root block hosted in an inode, we might not have a
+ * buffer pointer here and we shouldn't attempt to log the change as the
+ * information is already held in the inode and discarded when the root
+ * block is formatted into the on-disk inode fork. We still change it,
+ * though, so everything is consistent in memory.
+ */
+ if (bp) {
+ if (cur->bc_tp) {
+ xfs_trans_ordered_buf(cur->bc_tp, bp);
+ xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+ } else {
+ xfs_buf_delwri_queue(bp, buffer_list);
+ }
+ } else {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
+ }
+
+ /* now read rh sibling block for next iteration */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ return ENOENT;
+
+ return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+int
+xfs_btree_change_owner(
+ struct xfs_btree_cur *cur,
+ __uint64_t new_owner,
+ struct list_head *buffer_list)
+{
+ union xfs_btree_ptr lptr;
+ int level;
+ struct xfs_btree_block *block = NULL;
+ int error = 0;
+
+ cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+ /* for each level */
+ for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+ /* grab the left hand block */
+ error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+ if (error)
+ return error;
+
+ /* readahead the left most block for the next level down */
+ if (level > 0) {
+ union xfs_btree_ptr *ptr;
+
+ ptr = xfs_btree_ptr_addr(cur, 1, block);
+ xfs_btree_readahead_ptr(cur, ptr, 1);
+
+ /* save for the next iteration of the loop */
+ lptr = *ptr;
+ }
+
+ /* for each buffer in the level */
+ do {
+ error = xfs_btree_block_change_owner(cur, level,
+ new_owner,
+ buffer_list);
+ } while (!error);
+
+ if (error != ENOENT)
+ return error;
+ }
+
+ return 0;
+}