diff options
author | Alex Elder <aelder@sgi.com> | 2010-05-24 11:57:36 -0500 |
---|---|---|
committer | Alex Elder <aelder@sgi.com> | 2010-05-24 11:57:36 -0500 |
commit | 88e88374ee4958786397a57f684de6f1fc5e0242 (patch) | |
tree | 750fe86ece5d65e597223eb07c5ce7cf5b3749a0 /fs | |
parent | 7e125f7b9cbfce4101191b8076d606c517a73066 (diff) | |
parent | ccf7c23fc129e75ef60e6f59f60a485b7a056598 (diff) |
Merge branch 'delayed-logging-for-2.6.35' into for-linus
Diffstat (limited to 'fs')
-rw-r--r-- | fs/xfs/Makefile | 1 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.c | 9 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_quotaops.c | 1 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_super.c | 12 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_trace.h | 83 | ||||
-rw-r--r-- | fs/xfs/quota/xfs_dquot.c | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_ag.h | 24 | ||||
-rw-r--r-- | fs/xfs/xfs_alloc.c | 357 | ||||
-rw-r--r-- | fs/xfs/xfs_alloc.h | 7 | ||||
-rw-r--r-- | fs/xfs/xfs_alloc_btree.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_buf_item.c | 166 | ||||
-rw-r--r-- | fs/xfs/xfs_buf_item.h | 18 | ||||
-rw-r--r-- | fs/xfs/xfs_error.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_log.c | 120 | ||||
-rw-r--r-- | fs/xfs/xfs_log.h | 14 | ||||
-rw-r--r-- | fs/xfs/xfs_log_cil.c | 725 | ||||
-rw-r--r-- | fs/xfs/xfs_log_priv.h | 118 | ||||
-rw-r--r-- | fs/xfs/xfs_log_recover.c | 46 | ||||
-rw-r--r-- | fs/xfs/xfs_log_recover.h | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.h | 1 | ||||
-rw-r--r-- | fs/xfs/xfs_trans.c | 144 | ||||
-rw-r--r-- | fs/xfs/xfs_trans.h | 44 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_buf.c | 46 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_item.c | 114 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_priv.h | 15 | ||||
-rw-r--r-- | fs/xfs/xfs_types.h | 2 |
26 files changed, 1566 insertions, 513 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index b4769e40e8b..c8fb13f83b3 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \ xfs_itable.o \ xfs_dfrag.o \ xfs_log.o \ + xfs_log_cil.o \ xfs_log_recover.o \ xfs_mount.o \ xfs_mru_cache.o \ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index f01de3c55c4..649ade8ef59 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -37,6 +37,7 @@ #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -850,6 +851,12 @@ xfs_buf_lock_value( * Note that this in no way locks the underlying pages, so it is only * useful for synchronizing concurrent use of buffer objects, not for * synchronizing independent access to the underlying pages. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. */ void xfs_buf_lock( @@ -857,6 +864,8 @@ xfs_buf_lock( { trace_xfs_buf_lock(bp, _RET_IP_); + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_mount, 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); down(&bp->b_sema); diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index e31bf21fe5d..9ac8aea9152 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -19,6 +19,7 @@ #include "xfs_dmapi.h" #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index f24dbe5efde..f2d1718c916 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ /* * Table driven mount option parser. @@ -374,6 +376,13 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, MNTOPT_DMI)) { mp->m_flags |= XFS_MOUNT_DMAPI; + } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { + mp->m_flags |= XFS_MOUNT_DELAYLOG; + cmn_err(CE_WARN, + "Enabling EXPERIMENTAL delayed logging feature " + "- use at your own risk.\n"); + } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { + mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { cmn_err(CE_WARN, "XFS: ihashsize no longer used, option is deprecated."); @@ -535,6 +544,7 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -1755,7 +1765,7 @@ xfs_init_zones(void) * but it is much faster. */ xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / + (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) * sizeof(int))), "xfs_buf_item"); if (!xfs_buf_item_zone) goto out_destroy_trans_zone; diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 8a319cfd290..ff6bc797baf 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap, ); +#define XFS_BUSY_SYNC \ + { 0, "async" }, \ + { 1, "sync" } + TRACE_EVENT(xfs_alloc_busy, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, int slot), - TP_ARGS(mp, agno, agbno, len, slot), + TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int sync), + TP_ARGS(trans, agno, agbno, len, sync), TP_STRUCT__entry( __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(int, tid) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, slot) + __field(int, sync) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->tid = trans->t_ticket->t_tid; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->slot = slot; + __entry->sync = sync; ), - TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", + TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->tid, __entry->agno, __entry->agbno, __entry->len, - __entry->slot) + __print_symbolic(__entry->sync, XFS_BUSY_SYNC)) ); -#define XFS_BUSY_STATES \ - { 0, "found" }, \ - { 1, "missing" } - TRACE_EVENT(xfs_alloc_unbusy, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int slot, int found), - TP_ARGS(mp, agno, slot, found), + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(int, slot) - __field(int, found) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->slot = slot; - __entry->found = found; + __entry->agbno = agbno; + __entry->len = len; ), - TP_printk("dev %d:%d agno %u slot %d %s", + TP_printk("dev %d:%d agno %u agbno %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->slot, - __print_symbolic(__entry->found, XFS_BUSY_STATES)) + __entry->agbno, + __entry->len) ); +#define XFS_BUSY_STATES \ + { 0, "missing" }, \ + { 1, "found" } + TRACE_EVENT(xfs_alloc_busysearch, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, xfs_lsn_t lsn), - TP_ARGS(mp, agno, agbno, len, lsn), + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int found), + TP_ARGS(mp, agno, agbno, len, found), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(xfs_lsn_t, lsn) + __field(int, found) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->lsn = lsn; + __entry->found = found; ), - TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", + TP_printk("dev %d:%d agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, + __print_symbolic(__entry->found, XFS_BUSY_STATES)) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, __entry->lsn) ); diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index b89ec5df012..585e7633dfc 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk( for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) xfs_qm_dqinit_core(curid, type, d); xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : - XFS_BLI_GDQUOT_BUF))); + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index abb8222b88c..401f364ad36 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -175,14 +175,20 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Busy block/extent entry. Used in perag to mark blocks that have been freed - * but whose transactions aren't committed to disk yet. + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_alloc_busy_insert() for details. */ -typedef struct xfs_perag_busy { - xfs_agblock_t busy_start; - xfs_extlen_t busy_length; - struct xfs_trans *busy_tp; /* transaction that did the free */ -} xfs_perag_busy_t; +struct xfs_busy_extent { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + xlog_tid_t tid; /* transaction that created this */ +}; /* * Per-ag incore structure, copies of information in agf and agi, @@ -216,7 +222,8 @@ typedef struct xfs_perag { xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; #ifdef __KERNEL__ - spinlock_t pagb_lock; /* lock for pagb_list */ + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ atomic_t pagf_fstrms; /* # of filestreams active in this AG */ @@ -226,7 +233,6 @@ typedef struct xfs_perag { int pag_ici_reclaimable; /* reclaimable inodes */ #endif int pagb_count; /* pagb slots in use */ - xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ } xfs_perag_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 94cddbfb256..a7fbe8a99b1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -46,11 +46,9 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +static int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); /* * Prototypes for per-ag allocation routines @@ -540,9 +538,16 @@ xfs_alloc_ag_vextent( be32_to_cpu(agf->agf_length)); xfs_alloc_log_agf(args->tp, args->agbp, XFS_AGF_FREEBLKS); - /* search the busylist for these blocks */ - xfs_alloc_search_busy(args->tp, args->agno, - args->agbno, args->len); + /* + * Search the busylist for these blocks and mark the + * transaction as synchronous if blocks are found. This + * avoids the need to block due to a synchronous log + * force to ensure correct ordering as the synchronous + * transaction will guarantee that for us. + */ + if (xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)) + xfs_trans_set_sync(args->tp); } if (!args->isfl) xfs_trans_mod_sb(args->tp, @@ -1693,7 +1698,7 @@ xfs_free_ag_extent( * when the iclog commits to disk. If a busy block is allocated, * the iclog is pushed up to the LSN that freed the block. */ - xfs_alloc_mark_busy(tp, agno, bno, len); + xfs_alloc_busy_insert(tp, agno, bno, len); return 0; error0: @@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist( *bnop = bno; /* - * As blocks are freed, they are added to the per-ag busy list - * and remain there until the freeing transaction is committed to - * disk. Now that we have allocated blocks, this list must be - * searched to see if a block is being reused. If one is, then - * the freeing transaction must be pushed to disk NOW by forcing - * to disk all iclogs up that transaction's LSN. + * As blocks are freed, they are added to the per-ag busy list and + * remain there until the freeing transaction is committed to disk. + * Now that we have allocated blocks, this list must be searched to see + * if a block is being reused. If one is, then the freeing transaction + * must be pushed to disk before this transaction. + * + * We do this by setting the current transaction to a sync transaction + * which guarantees that the freeing transaction is on disk before this + * transaction. This is done instead of a synchronous log force here so + * that we don't sit and wait with the AGF locked in the transaction + * during the log force. */ - xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); + if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) + xfs_trans_set_sync(tp); return 0; } @@ -2201,7 +2212,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; - memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); + pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; } #ifdef DEBUG @@ -2479,127 +2490,263 @@ error0: * list is reused, the transaction that freed it must be forced to disk * before continuing to use the block. * - * xfs_alloc_mark_busy - add to the per-ag busy list - * xfs_alloc_clear_busy - remove an item from the per-ag busy list + * xfs_alloc_busy_insert - add to the per-ag busy list + * xfs_alloc_busy_clear - remove an item from the per-ag busy list + * xfs_alloc_busy_search - search for a busy extent + */ + +/* + * Insert a new extent into the busy tree. + * + * The busy extent tree is indexed by the start block of the busy extent. + * there can be multiple overlapping ranges in the busy extent tree but only + * ever one entry at a given start block. The reason for this is that + * multi-block extents can be freed, then smaller chunks of that extent + * allocated and freed again before the first transaction commit is on disk. + * If the exact same start block is freed a second time, we have to wait for + * that busy extent to pass out of the tree before the new extent is inserted. + * There are two main cases we have to handle here. + * + * The first case is a transaction that triggers a "free - allocate - free" + * cycle. This can occur during btree manipulations as a btree block is freed + * to the freelist, then allocated from the free list, then freed again. In + * this case, the second extxpnet free is what triggers the duplicate and as + * such the transaction IDs should match. Because the extent was allocated in + * this transaction, the transaction must be marked as synchronous. This is + * true for all cases where the free/alloc/free occurs in the one transaction, + * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. + * This serves to catch violations of the second case quite effectively. + * + * The second case is where the free/alloc/free occur in different + * transactions. In this case, the thread freeing the extent the second time + * can't mark the extent busy immediately because it is already tracked in a + * transaction that may be committing. When the log commit for the existing + * busy extent completes, the busy extent will be removed from the tree. If we + * allow the second busy insert to continue using that busy extent structure, + * it can be freed before this transaction is safely in the log. Hence our + * only option in this case is to force the log to remove the existing busy + * extent from the list before we insert the new one with the current + * transaction ID. + * + * The problem we are trying to avoid in the free-alloc-free in separate + * transactions is most easily described with a timeline: + * + * Thread 1 Thread 2 Thread 3 xfslogd + * xact alloc + * free X + * mark busy + * commit xact + * free xact + * xact alloc + * alloc X + * busy search + * mark xact sync + * commit xact + * free xact + * force log + * checkpoint starts + * .... + * xact alloc + * free X + * mark busy + * finds match + * *** KABOOM! *** + * .... + * log IO completes + * unbusy X + * checkpoint completes + * + * By issuing a log force in thread 3 @ "KABOOM", the thread will block until + * the checkpoint completes, and the busy extent it matched will have been + * removed from the tree when it is woken. Hence it can then continue safely. + * + * However, to ensure this matching process is robust, we need to use the + * transaction ID for identifying transaction, as delayed logging results in + * the busy extent and transaction lifecycles being different. i.e. the busy + * extent is active for a lot longer than the transaction. Hence the + * transaction structure can be freed and reallocated, then mark the same + * extent busy again in the new transaction. In this case the new transaction + * will have a different tid but can have the same address, and hence we need + * to check against the tid. + * + * Future: for delayed logging, we could avoid the log force if the extent was + * first freed in the current checkpoint sequence. This, however, requires the + * ability to pin the current checkpoint in memory until this transaction + * commits to ensure that both the original free and the current one combine + * logically into the one checkpoint. If the checkpoint sequences are + * different, however, we still need to wait on a log force. */ void -xfs_alloc_mark_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +xfs_alloc_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { - xfs_perag_busy_t *bsy; + struct xfs_busy_extent *new; + struct xfs_busy_extent *busyp; struct xfs_perag *pag; - int n; + struct rb_node **rbp; + struct rb_node *parent; + int match; - pag = xfs_perag_get(tp->t_mountp, agno); - spin_lock(&pag->pagb_lock); - /* search pagb_list for an open slot */ - for (bsy = pag->pagb_list, n = 0; - n < XFS_PAGB_NUM_SLOTS; - bsy++, n++) { - if (bsy->busy_tp == NULL) { - break; - } + new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. + */ + trace_xfs_alloc_busy(tp, agno, bno, len, 1); + xfs_trans_set_sync(tp); + return; } - trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); + new->agno = agno; + new->bno = bno; + new->length = len; + new->tid = xfs_log_get_trans_ident(tp); - if (n < XFS_PAGB_NUM_SLOTS) { - bsy = &pag->pagb_list[n]; - pag->pagb_count++; - bsy->busy_start = bno; - bsy->busy_length = len; - bsy->busy_tp = tp; - xfs_trans_add_busy(tp, agno, n); - } else { + INIT_LIST_HEAD(&new->list); + + /* trace before insert to be able to see failed inserts */ + trace_xfs_alloc_busy(tp, agno, bno, len, 0); + + pag = xfs_perag_get(tp->t_mountp, new->agno); +restart: + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + parent = NULL; + busyp = NULL; + match = 0; + while (*rbp && match >= 0) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); + + if (new->bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + rbp = &(*rbp)->rb_left; + if (new->bno + new->length > busyp->bno) + match = busyp->tid == new->tid ? 1 : -1; + } else if (new->bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + rbp = &(*rbp)->rb_right; + if (bno < busyp->bno + busyp->length) + match = busyp->tid == new->tid ? 1 : -1; + } else { + match = busyp->tid == new->tid ? 1 : -1; + break; + } + } + if (match < 0) { + /* overlap marked busy in different transaction */ + spin_unlock(&pag->pagb_lock); + xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); + goto restart; + } + if (match > 0) { /* - * The busy list is full! Since it is now not possible to - * track the free block, make this a synchronous transaction - * to insure that the block is not reused before this - * transaction commits. + * overlap marked busy in same transaction. Update if exact + * start block match, otherwise combine the busy extents into + * a single range. */ - xfs_trans_set_sync(tp); - } + if (busyp->bno == new->bno) { + busyp->length = max(busyp->length, new->length); + spin_unlock(&pag->pagb_lock); + ASSERT(tp->t_flags & XFS_TRANS_SYNC); + xfs_perag_put(pag); + kmem_free(new); + return; + } + rb_erase(&busyp->rb_node, &pag->pagb_tree); + new->length = max(busyp->bno + busyp->length, + new->bno + new->length) - + min(busyp->bno, new->bno); + new->bno = min(busyp->bno, new->bno); + } else + busyp = NULL; + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); + kmem_free(busyp); } -void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - int idx) +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +static int +xfs_alloc_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { struct xfs_perag *pag; - xfs_perag_busy_t *list; + struct rb_node *rbp; + struct xfs_busy_extent *busyp; + int match = 0; - ASSERT(idx < XFS_PAGB_NUM_SLOTS); - pag = xfs_perag_get(tp->t_mountp, agno); + pag = xfs_perag_get(mp, agno); spin_lock(&pag->pagb_lock); - list = pag->pagb_list; - trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); - - if (list[idx].busy_tp == tp) { - list[idx].busy_tp = NULL; - pag->pagb_count--; + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } } - spin_unlock(&pag->pagb_lock); + trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); xfs_perag_put(pag); + return match; } - -/* - * If we find the extent in the busy list, force the log out to get the - * extent out of the busy list so the caller can use it straight away. - */ -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct xfs_busy_extent *busyp) { struct xfs_perag *pag; - xfs_perag_busy_t *bsy; - xfs_agblock_t uend, bend; - xfs_lsn_t lsn = 0; - int cnt; - pag = xfs_perag_get(tp->t_mountp, agno); - spin_lock(&pag->pagb_lock); - cnt = pag->pagb_count; + trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, + busyp->length); - /* - * search pagb_list for this slot, skipping open slots. We have to - * search the entire array as there may be multiple overlaps and - * we have to get the most recent LSN for the log force to push out - * all the transactions that span the range. - */ - uend = bno + len - 1; - for (cnt = 0; cnt < pag->pagb_count; cnt++) { - bsy = &pag->pagb_list[cnt]; - if (!bsy->busy_tp) - continue; + ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, + busyp->length) == 1); - bend = bsy->busy_start + bsy->busy_length - 1; - if (bno > bend || uend < bsy->busy_start) - continue; + list_del_init(&busyp->list); - /* (start1,length1) within (start2, length2) */ - if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) - lsn = bsy->busy_tp->t_commit_lsn; - } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + rb_erase(&busyp->rb_node, &pag->pagb_tree); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); - trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); - /* - * If a block was found, force the log through the LSN of the - * transaction that freed the block - */ - if (lsn) - xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC); + kmem_free(busyp); } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 599bffa3978..6d05199b667 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -22,6 +22,7 @@ struct xfs_buf; struct xfs_mount; struct xfs_perag; struct xfs_trans; +struct xfs_busy_extent; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. @@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, #ifdef __KERNEL__ void -xfs_alloc_mark_busy(xfs_trans_t *tp, +xfs_alloc_busy_insert(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len); void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - int idx); +xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b726e10d2c1..83f49421875 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -134,7 +134,7 @@ xfs_allocbt_free_block( * disk. If a busy block is allocated, the iclog is pushed up to the * LSN that freed the block. */ - xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 240340a4727..02a80984aa0 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -64,7 +64,7 @@ xfs_buf_item_log_debug( nbytes = last - first + 1; bfset(bip->bli_logged, first, nbytes); for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLI_SHIFT; + chunk_num = byte >> XFS_BLF_SHIFT; word_num = chunk_num >> BIT_TO_WORD_SHIFT; bit_num = chunk_num & (NBWORD - 1); wordp = &(bip->bli_format.blf_data_map[word_num]); @@ -166,7 +166,7 @@ xfs_buf_item_size( * cancel flag in it. */ trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); return 1; } @@ -197,9 +197,9 @@ xfs_buf_item_size( } else if (next_bit != last_bit + 1) { last_bit = next_bit; nvecs++; - } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != - (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + - XFS_BLI_CHUNK)) { + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { last_bit = next_bit; nvecs++; } else { @@ -254,6 +254,20 @@ xfs_buf_item_format( vecp++; nvecs = 1; |