aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorAlex Elder <aelder@sgi.com>2010-05-24 11:57:36 -0500
committerAlex Elder <aelder@sgi.com>2010-05-24 11:57:36 -0500
commit88e88374ee4958786397a57f684de6f1fc5e0242 (patch)
tree750fe86ece5d65e597223eb07c5ce7cf5b3749a0 /fs
parent7e125f7b9cbfce4101191b8076d606c517a73066 (diff)
parentccf7c23fc129e75ef60e6f59f60a485b7a056598 (diff)
Merge branch 'delayed-logging-for-2.6.35' into for-linus
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c12
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h83
-rw-r--r--fs/xfs/quota/xfs_dquot.c6
-rw-r--r--fs/xfs/xfs_ag.h24
-rw-r--r--fs/xfs/xfs_alloc.c357
-rw-r--r--fs/xfs/xfs_alloc.h7
-rw-r--r--fs/xfs/xfs_alloc_btree.c2
-rw-r--r--fs/xfs/xfs_buf_item.c166
-rw-r--r--fs/xfs/xfs_buf_item.h18
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_log.c120
-rw-r--r--fs/xfs/xfs_log.h14
-rw-r--r--fs/xfs/xfs_log_cil.c725
-rw-r--r--fs/xfs/xfs_log_priv.h118
-rw-r--r--fs/xfs/xfs_log_recover.c46
-rw-r--r--fs/xfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_trans.c144
-rw-r--r--fs/xfs/xfs_trans.h44
-rw-r--r--fs/xfs/xfs_trans_buf.c46
-rw-r--r--fs/xfs/xfs_trans_item.c114
-rw-r--r--fs/xfs/xfs_trans_priv.h15
-rw-r--r--fs/xfs/xfs_types.h2
26 files changed, 1566 insertions, 513 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8b..c8fb13f83b3 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
xfs_itable.o \
xfs_dfrag.o \
xfs_log.o \
+ xfs_log_cil.o \
xfs_log_recover.o \
xfs_mount.o \
xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f01de3c55c4..649ade8ef59 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
#include "xfs_sb.h"
#include "xfs_inum.h"
+#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
* Note that this in no way locks the underlying pages, so it is only
* useful for synchronizing concurrent use of buffer objects, not for
* synchronizing independent access to the underlying pages.
+ *
+ * If we come across a stale, pinned, locked buffer, we know that we
+ * are being asked to lock a buffer that has been reallocated. Because
+ * it is pinned, we know that the log has not been pushed to disk and
+ * hence it will still be locked. Rather than sleeping until someone
+ * else pushes the log, push it ourselves before trying to get the lock.
*/
void
xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
{
trace_xfs_buf_lock(bp, _RET_IP_);
+ if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+ xfs_log_force(bp->b_mount, 0);
if (atomic_read(&bp->b_io_remaining))
blk_run_address_space(bp->b_target->bt_mapping);
down(&bp->b_sema);
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index e31bf21fe5d..9ac8aea9152 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
#include "xfs_dmapi.h"
#include "xfs_sb.h"
#include "xfs_inum.h"
+#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_quota.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f24dbe5efde..f2d1718c916 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */
+#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */
/*
* Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DMAPI;
} else if (!strcmp(this_char, MNTOPT_DMI)) {
mp->m_flags |= XFS_MOUNT_DMAPI;
+ } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
+ mp->m_flags |= XFS_MOUNT_DELAYLOG;
+ cmn_err(CE_WARN,
+ "Enabling EXPERIMENTAL delayed logging feature "
+ "- use at your own risk.\n");
+ } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
+ mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
} else if (!strcmp(this_char, "ihashsize")) {
cmn_err(CE_WARN,
"XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
{ XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
{ XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
+ { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
@@ -1755,7 +1765,7 @@ xfs_init_zones(void)
* but it is much faster.
*/
xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
- (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+ (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
NBWORD) * sizeof(int))), "xfs_buf_item");
if (!xfs_buf_item_zone)
goto out_destroy_trans_zone;
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 8a319cfd290..ff6bc797baf 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
);
+#define XFS_BUSY_SYNC \
+ { 0, "async" }, \
+ { 1, "sync" }
+
TRACE_EVENT(xfs_alloc_busy,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
- xfs_extlen_t len, int slot),
- TP_ARGS(mp, agno, agbno, len, slot),
+ TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len, int sync),
+ TP_ARGS(trans, agno, agbno, len, sync),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __field(struct xfs_trans *, tp)
+ __field(int, tid)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
- __field(int, slot)
+ __field(int, sync)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
+ __entry->dev = trans->t_mountp->m_super->s_dev;
+ __entry->tp = trans;
+ __entry->tid = trans->t_ticket->t_tid;
__entry->agno = agno;
__entry->agbno = agbno;
__entry->len = len;
- __entry->slot = slot;
+ __entry->sync = sync;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
+ TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tp,
+ __entry->tid,
__entry->agno,
__entry->agbno,
__entry->len,
- __entry->slot)
+ __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
);
-#define XFS_BUSY_STATES \
- { 0, "found" }, \
- { 1, "missing" }
-
TRACE_EVENT(xfs_alloc_unbusy,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- int slot, int found),
- TP_ARGS(mp, agno, slot, found),
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
- __field(int, slot)
- __field(int, found)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
- __entry->slot = slot;
- __entry->found = found;
+ __entry->agbno = agbno;
+ __entry->len = len;
),
- TP_printk("dev %d:%d agno %u slot %d %s",
+ TP_printk("dev %d:%d agno %u agbno %u len %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
- __entry->slot,
- __print_symbolic(__entry->found, XFS_BUSY_STATES))
+ __entry->agbno,
+ __entry->len)
);
+#define XFS_BUSY_STATES \
+ { 0, "missing" }, \
+ { 1, "found" }
+
TRACE_EVENT(xfs_alloc_busysearch,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
- xfs_extlen_t len, xfs_lsn_t lsn),
- TP_ARGS(mp, agno, agbno, len, lsn),
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len, int found),
+ TP_ARGS(mp, agno, agbno, len, found),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
- __field(xfs_lsn_t, lsn)
+ __field(int, found)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
__entry->agbno = agbno;
__entry->len = len;
- __entry->lsn = lsn;
+ __entry->found = found;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
+ TP_printk("dev %d:%d agno %u agbno %u len %u %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
__entry->len,
+ __print_symbolic(__entry->found, XFS_BUSY_STATES))
+);
+
+TRACE_EVENT(xfs_trans_commit_lsn,
+ TP_PROTO(struct xfs_trans *trans),
+ TP_ARGS(trans),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(struct xfs_trans *, tp)
+ __field(xfs_lsn_t, lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = trans->t_mountp->m_super->s_dev;
+ __entry->tp = trans;
+ __entry->lsn = trans->t_commit_lsn;
+ ),
+ TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tp,
__entry->lsn)
);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index b89ec5df012..585e7633dfc 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk(
for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
xfs_qm_dqinit_core(curid, type, d);
xfs_trans_dquot_buf(tp, bp,
- (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
- ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
- XFS_BLI_GDQUOT_BUF)));
+ (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
+ ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
+ XFS_BLF_GDQUOT_BUF)));
xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c..401f364ad36 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
} xfs_agfl_t;
/*
- * Busy block/extent entry. Used in perag to mark blocks that have been freed
- * but whose transactions aren't committed to disk yet.
+ * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Note that we use the transaction ID to record the transaction, not the
+ * transaction structure itself. See xfs_alloc_busy_insert() for details.
*/
-typedef struct xfs_perag_busy {
- xfs_agblock_t busy_start;
- xfs_extlen_t busy_length;
- struct xfs_trans *busy_tp; /* transaction that did the free */
-} xfs_perag_busy_t;
+struct xfs_busy_extent {
+ struct rb_node rb_node; /* ag by-bno indexed search tree */
+ struct list_head list; /* transaction busy extent list */
+ xfs_agnumber_t agno;
+ xfs_agblock_t bno;
+ xfs_extlen_t length;
+ xlog_tid_t tid; /* transaction that created this */
+};
/*
* Per-ag incore structure, copies of information in agf and agi,
@@ -216,7 +222,8 @@ typedef struct xfs_perag {
xfs_agino_t pagl_leftrec;
xfs_agino_t pagl_rightrec;
#ifdef __KERNEL__
- spinlock_t pagb_lock; /* lock for pagb_list */
+ spinlock_t pagb_lock; /* lock for pagb_tree */
+ struct rb_root pagb_tree; /* ordered tree of busy extents */
atomic_t pagf_fstrms; /* # of filestreams active in this AG */
@@ -226,7 +233,6 @@ typedef struct xfs_perag {
int pag_ici_reclaimable; /* reclaimable inodes */
#endif
int pagb_count; /* pagb slots in use */
- xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
} xfs_perag_t;
/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb256..a7fbe8a99b1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len);
+static int
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len);
/*
* Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
be32_to_cpu(agf->agf_length));
xfs_alloc_log_agf(args->tp, args->agbp,
XFS_AGF_FREEBLKS);
- /* search the busylist for these blocks */
- xfs_alloc_search_busy(args->tp, args->agno,
- args->agbno, args->len);
+ /*
+ * Search the busylist for these blocks and mark the
+ * transaction as synchronous if blocks are found. This
+ * avoids the need to block due to a synchronous log
+ * force to ensure correct ordering as the synchronous
+ * transaction will guarantee that for us.
+ */
+ if (xfs_alloc_busy_search(args->mp, args->agno,
+ args->agbno, args->len))
+ xfs_trans_set_sync(args->tp);
}
if (!args->isfl)
xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
* when the iclog commits to disk. If a busy block is allocated,
* the iclog is pushed up to the LSN that freed the block.
*/
- xfs_alloc_mark_busy(tp, agno, bno, len);
+ xfs_alloc_busy_insert(tp, agno, bno, len);
return 0;
error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
*bnop = bno;
/*
- * As blocks are freed, they are added to the per-ag busy list
- * and remain there until the freeing transaction is committed to
- * disk. Now that we have allocated blocks, this list must be
- * searched to see if a block is being reused. If one is, then
- * the freeing transaction must be pushed to disk NOW by forcing
- * to disk all iclogs up that transaction's LSN.
+ * As blocks are freed, they are added to the per-ag busy list and
+ * remain there until the freeing transaction is committed to disk.
+ * Now that we have allocated blocks, this list must be searched to see
+ * if a block is being reused. If one is, then the freeing transaction
+ * must be pushed to disk before this transaction.
+ *
+ * We do this by setting the current transaction to a sync transaction
+ * which guarantees that the freeing transaction is on disk before this
+ * transaction. This is done instead of a synchronous log force here so
+ * that we don't sit and wait with the AGF locked in the transaction
+ * during the log force.
*/
- xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+ if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
+ xfs_trans_set_sync(tp);
return 0;
}
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
spin_lock_init(&pag->pagb_lock);
pag->pagb_count = 0;
- memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
+ pag->pagb_tree = RB_ROOT;
pag->pagf_init = 1;
}
#ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
* list is reused, the transaction that freed it must be forced to disk
* before continuing to use the block.
*
- * xfs_alloc_mark_busy - add to the per-ag busy list
- * xfs_alloc_clear_busy - remove an item from the per-ag busy list
+ * xfs_alloc_busy_insert - add to the per-ag busy list
+ * xfs_alloc_busy_clear - remove an item from the per-ag busy list
+ * xfs_alloc_busy_search - search for a busy extent
+ */
+
+/*
+ * Insert a new extent into the busy tree.
+ *
+ * The busy extent tree is indexed by the start block of the busy extent.
+ * there can be multiple overlapping ranges in the busy extent tree but only
+ * ever one entry at a given start block. The reason for this is that
+ * multi-block extents can be freed, then smaller chunks of that extent
+ * allocated and freed again before the first transaction commit is on disk.
+ * If the exact same start block is freed a second time, we have to wait for
+ * that busy extent to pass out of the tree before the new extent is inserted.
+ * There are two main cases we have to handle here.
+ *
+ * The first case is a transaction that triggers a "free - allocate - free"
+ * cycle. This can occur during btree manipulations as a btree block is freed
+ * to the freelist, then allocated from the free list, then freed again. In
+ * this case, the second extxpnet free is what triggers the duplicate and as
+ * such the transaction IDs should match. Because the extent was allocated in
+ * this transaction, the transaction must be marked as synchronous. This is
+ * true for all cases where the free/alloc/free occurs in the one transaction,
+ * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
+ * This serves to catch violations of the second case quite effectively.
+ *
+ * The second case is where the free/alloc/free occur in different
+ * transactions. In this case, the thread freeing the extent the second time
+ * can't mark the extent busy immediately because it is already tracked in a
+ * transaction that may be committing. When the log commit for the existing
+ * busy extent completes, the busy extent will be removed from the tree. If we
+ * allow the second busy insert to continue using that busy extent structure,
+ * it can be freed before this transaction is safely in the log. Hence our
+ * only option in this case is to force the log to remove the existing busy
+ * extent from the list before we insert the new one with the current
+ * transaction ID.
+ *
+ * The problem we are trying to avoid in the free-alloc-free in separate
+ * transactions is most easily described with a timeline:
+ *
+ * Thread 1 Thread 2 Thread 3 xfslogd
+ * xact alloc
+ * free X
+ * mark busy
+ * commit xact
+ * free xact
+ * xact alloc
+ * alloc X
+ * busy search
+ * mark xact sync
+ * commit xact
+ * free xact
+ * force log
+ * checkpoint starts
+ * ....
+ * xact alloc
+ * free X
+ * mark busy
+ * finds match
+ * *** KABOOM! ***
+ * ....
+ * log IO completes
+ * unbusy X
+ * checkpoint completes
+ *
+ * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
+ * the checkpoint completes, and the busy extent it matched will have been
+ * removed from the tree when it is woken. Hence it can then continue safely.
+ *
+ * However, to ensure this matching process is robust, we need to use the
+ * transaction ID for identifying transaction, as delayed logging results in
+ * the busy extent and transaction lifecycles being different. i.e. the busy
+ * extent is active for a lot longer than the transaction. Hence the
+ * transaction structure can be freed and reallocated, then mark the same
+ * extent busy again in the new transaction. In this case the new transaction
+ * will have a different tid but can have the same address, and hence we need
+ * to check against the tid.
+ *
+ * Future: for delayed logging, we could avoid the log force if the extent was
+ * first freed in the current checkpoint sequence. This, however, requires the
+ * ability to pin the current checkpoint in memory until this transaction
+ * commits to ensure that both the original free and the current one combine
+ * logically into the one checkpoint. If the checkpoint sequences are
+ * different, however, we still need to wait on a log force.
*/
void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
+xfs_alloc_busy_insert(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
{
- xfs_perag_busy_t *bsy;
+ struct xfs_busy_extent *new;
+ struct xfs_busy_extent *busyp;
struct xfs_perag *pag;
- int n;
+ struct rb_node **rbp;
+ struct rb_node *parent;
+ int match;
- pag = xfs_perag_get(tp->t_mountp, agno);
- spin_lock(&pag->pagb_lock);
- /* search pagb_list for an open slot */
- for (bsy = pag->pagb_list, n = 0;
- n < XFS_PAGB_NUM_SLOTS;
- bsy++, n++) {
- if (bsy->busy_tp == NULL) {
- break;
- }
+ new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
+ if (!new) {
+ /*
+ * No Memory! Since it is now not possible to track the free
+ * block, make this a synchronous transaction to insure that
+ * the block is not reused before this transaction commits.
+ */
+ trace_xfs_alloc_busy(tp, agno, bno, len, 1);
+ xfs_trans_set_sync(tp);
+ return;
}
- trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
+ new->agno = agno;
+ new->bno = bno;
+ new->length = len;
+ new->tid = xfs_log_get_trans_ident(tp);
- if (n < XFS_PAGB_NUM_SLOTS) {
- bsy = &pag->pagb_list[n];
- pag->pagb_count++;
- bsy->busy_start = bno;
- bsy->busy_length = len;
- bsy->busy_tp = tp;
- xfs_trans_add_busy(tp, agno, n);
- } else {
+ INIT_LIST_HEAD(&new->list);
+
+ /* trace before insert to be able to see failed inserts */
+ trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+
+ pag = xfs_perag_get(tp->t_mountp, new->agno);
+restart:
+ spin_lock(&pag->pagb_lock);
+ rbp = &pag->pagb_tree.rb_node;
+ parent = NULL;
+ busyp = NULL;
+ match = 0;
+ while (*rbp && match >= 0) {
+ parent = *rbp;
+ busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
+
+ if (new->bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ rbp = &(*rbp)->rb_left;
+ if (new->bno + new->length > busyp->bno)
+ match = busyp->tid == new->tid ? 1 : -1;
+ } else if (new->bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ rbp = &(*rbp)->rb_right;
+ if (bno < busyp->bno + busyp->length)
+ match = busyp->tid == new->tid ? 1 : -1;
+ } else {
+ match = busyp->tid == new->tid ? 1 : -1;
+ break;
+ }
+ }
+ if (match < 0) {
+ /* overlap marked busy in different transaction */
+ spin_unlock(&pag->pagb_lock);
+ xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
+ goto restart;
+ }
+ if (match > 0) {
/*
- * The busy list is full! Since it is now not possible to
- * track the free block, make this a synchronous transaction
- * to insure that the block is not reused before this
- * transaction commits.
+ * overlap marked busy in same transaction. Update if exact
+ * start block match, otherwise combine the busy extents into
+ * a single range.
*/
- xfs_trans_set_sync(tp);
- }
+ if (busyp->bno == new->bno) {
+ busyp->length = max(busyp->length, new->length);
+ spin_unlock(&pag->pagb_lock);
+ ASSERT(tp->t_flags & XFS_TRANS_SYNC);
+ xfs_perag_put(pag);
+ kmem_free(new);
+ return;
+ }
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ new->length = max(busyp->bno + busyp->length,
+ new->bno + new->length) -
+ min(busyp->bno, new->bno);
+ new->bno = min(busyp->bno, new->bno);
+ } else
+ busyp = NULL;
+ rb_link_node(&new->rb_node, parent, rbp);
+ rb_insert_color(&new->rb_node, &pag->pagb_tree);
+
+ list_add(&new->list, &tp->t_busy);
spin_unlock(&pag->pagb_lock);
xfs_perag_put(pag);
+ kmem_free(busyp);
}
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- int idx)
+/*
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate. You need to be holding the busy extent tree lock when calling
+ * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
+ * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
+ * match. This is done so that a non-zero return indicates an overlap that
+ * will require a synchronous transaction, but it can still be
+ * used to distinguish between a partial or exact match.
+ */
+static int
+xfs_alloc_busy_search(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
{
struct xfs_perag *pag;
- xfs_perag_busy_t *list;
+ struct rb_node *rbp;
+ struct xfs_busy_extent *busyp;
+ int match = 0;
- ASSERT(idx < XFS_PAGB_NUM_SLOTS);
- pag = xfs_perag_get(tp->t_mountp, agno);
+ pag = xfs_perag_get(mp, agno);
spin_lock(&pag->pagb_lock);
- list = pag->pagb_list;
- trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp);
-
- if (list[idx].busy_tp == tp) {
- list[idx].busy_tp = NULL;
- pag->pagb_count--;
+ rbp = pag->pagb_tree.rb_node;
+
+ /* find closest start bno overlap */
+ while (rbp) {
+ busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
+ if (bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ if (bno + len > busyp->bno)
+ match = -1;
+ rbp = rbp->rb_left;
+ } else if (bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ if (bno < busyp->bno + busyp->length)
+ match = -1;
+ rbp = rbp->rb_right;
+ } else {
+ /* bno matches busyp, length determines exact match */
+ match = (busyp->length == len) ? 1 : -1;
+ break;
+ }
}
-
spin_unlock(&pag->pagb_lock);
+ trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
xfs_perag_put(pag);
+ return match;
}
-
-/*
- * If we find the extent in the busy list, force the log out to get the
- * extent out of the busy list so the caller can use it straight away.
- */
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
+void
+xfs_alloc_busy_clear(
+ struct xfs_mount *mp,
+ struct xfs_busy_extent *busyp)
{
struct xfs_perag *pag;
- xfs_perag_busy_t *bsy;
- xfs_agblock_t uend, bend;
- xfs_lsn_t lsn = 0;
- int cnt;
- pag = xfs_perag_get(tp->t_mountp, agno);
- spin_lock(&pag->pagb_lock);
- cnt = pag->pagb_count;
+ trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
+ busyp->length);
- /*
- * search pagb_list for this slot, skipping open slots. We have to
- * search the entire array as there may be multiple overlaps and
- * we have to get the most recent LSN for the log force to push out
- * all the transactions that span the range.
- */
- uend = bno + len - 1;
- for (cnt = 0; cnt < pag->pagb_count; cnt++) {
- bsy = &pag->pagb_list[cnt];
- if (!bsy->busy_tp)
- continue;
+ ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
+ busyp->length) == 1);
- bend = bsy->busy_start + bsy->busy_length - 1;
- if (bno > bend || uend < bsy->busy_start)
- continue;
+ list_del_init(&busyp->list);
- /* (start1,length1) within (start2, length2) */
- if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
- lsn = bsy->busy_tp->t_commit_lsn;
- }
+ pag = xfs_perag_get(mp, busyp->agno);
+ spin_lock(&pag->pagb_lock);
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
spin_unlock(&pag->pagb_lock);
xfs_perag_put(pag);
- trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
- /*
- * If a block was found, force the log through the LSN of the
- * transaction that freed the block
- */
- if (lsn)
- xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
+ kmem_free(busyp);
}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa3978..6d05199b667 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
struct xfs_mount;
struct xfs_perag;
struct xfs_trans;
+struct xfs_busy_extent;
/*
* Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
#ifdef __KERNEL__
void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
+xfs_alloc_busy_insert(xfs_trans_t *tp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
xfs_extlen_t len);
void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
- xfs_agnumber_t ag,
- int idx);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1..83f49421875 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
* disk. If a busy block is allocated, the iclog is pushed up to the
* LSN that freed the block.
*/
- xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+ xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
return 0;
}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 240340a4727..02a80984aa0 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
nbytes = last - first + 1;
bfset(bip->bli_logged, first, nbytes);
for (x = 0; x < nbytes; x++) {
- chunk_num = byte >> XFS_BLI_SHIFT;
+ chunk_num = byte >> XFS_BLF_SHIFT;
word_num = chunk_num >> BIT_TO_WORD_SHIFT;
bit_num = chunk_num & (NBWORD - 1);
wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
* cancel flag in it.
*/
trace_xfs_buf_item_size_stale(bip);
- ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+ ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
return 1;
}
@@ -197,9 +197,9 @@ xfs_buf_item_size(
} else if (next_bit != last_bit + 1) {
last_bit = next_bit;
nvecs++;
- } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
- (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
- XFS_BLI_CHUNK)) {
+ } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
+ (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
+ XFS_BLF_CHUNK)) {
last_bit = next_bit;
nvecs++;
} else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
vecp++;
nvecs = 1;