diff options
author | David S. Miller <davem@davemloft.net> | 2010-05-31 05:46:45 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-05-31 05:46:45 -0700 |
commit | 64960848abd18d0bcde3f53ffa7ed0b631e6b25d (patch) | |
tree | 8424a1c550a98ce09f127425fde9b7b5f2f5027a /fs/xfs | |
parent | 2903037400a26e7c0cc93ab75a7d62abfacdf485 (diff) | |
parent | 67a3e12b05e055c0415c556a315a3d3eb637e29e (diff) |
Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
Diffstat (limited to 'fs/xfs')
55 files changed, 3437 insertions, 2592 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index b4769e40e8b..c8fb13f83b3 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \ xfs_itable.o \ xfs_dfrag.o \ xfs_log.o \ + xfs_log_cil.o \ xfs_log_recover.o \ xfs_mount.o \ xfs_mru_cache.o \ diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index a7bc925c4d6..9f769b5b38f 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, return error; } -struct xattr_handler xfs_xattr_acl_access_handler = { +const struct xattr_handler xfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = xfs_xattr_acl_get, .set = xfs_xattr_acl_set, }; -struct xattr_handler xfs_xattr_acl_default_handler = { +const struct xattr_handler xfs_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = xfs_xattr_acl_get, diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 0f8b9968a80..089eaca860b 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -45,6 +45,15 @@ #include <linux/pagevec.h> #include <linux/writeback.h> +/* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + IO_READ, /* mapping for a read */ + IO_DELAY, /* mapping covers delalloc region */ + IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */ + IO_NEW /* just allocated */ +}; /* * Prime number of hash buckets since address is used as the key. @@ -103,8 +112,9 @@ xfs_count_page_state( STATIC struct block_device * xfs_find_bdev_for_inode( - struct xfs_inode *ip) + struct inode *inode) { + struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; if (XFS_IS_REALTIME_INODE(ip)) @@ -183,7 +193,7 @@ xfs_setfilesize( xfs_fsize_t isize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - ASSERT(ioend->io_type != IOMAP_READ); + ASSERT(ioend->io_type != IO_READ); if (unlikely(ioend->io_error)) return 0; @@ -214,7 +224,7 @@ xfs_finish_ioend( if (atomic_dec_and_test(&ioend->io_remaining)) { struct workqueue_struct *wq; - wq = (ioend->io_type == IOMAP_UNWRITTEN) ? + wq = (ioend->io_type == IO_UNWRITTEN) ? xfsconvertd_workqueue : xfsdatad_workqueue; queue_work(wq, &ioend->io_work); if (wait) @@ -237,7 +247,7 @@ xfs_end_io( * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. */ - if (ioend->io_type == IOMAP_UNWRITTEN && + if (ioend->io_type == IO_UNWRITTEN && likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { error = xfs_iomap_write_unwritten(ip, ioend->io_offset, @@ -250,7 +260,7 @@ xfs_end_io( * We might have to update the on-disk file size after extending * writes. */ - if (ioend->io_type != IOMAP_READ) { + if (ioend->io_type != IO_READ) { error = xfs_setfilesize(ioend); ASSERT(!error || error == EAGAIN); } @@ -309,21 +319,25 @@ xfs_map_blocks( struct inode *inode, loff_t offset, ssize_t count, - xfs_iomap_t *mapp, + struct xfs_bmbt_irec *imap, int flags) { int nmaps = 1; + int new = 0; - return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps); + return -xfs_iomap(XFS_I(inode), offset, count, flags, imap, &nmaps, &new); } STATIC int -xfs_iomap_valid( - xfs_iomap_t *iomapp, - loff_t offset) +xfs_imap_valid( + struct inode *inode, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { - return offset >= iomapp->iomap_offset && - offset < iomapp->iomap_offset + iomapp->iomap_bsize; + offset >>= inode->i_blkbits; + + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; } /* @@ -554,19 +568,23 @@ xfs_add_to_ioend( STATIC void xfs_map_buffer( + struct inode *inode, struct buffer_head *bh, - xfs_iomap_t *mp, - xfs_off_t offset, - uint block_bits) + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); + xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); - ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + - ((offset - mp->iomap_offset) >> block_bits); + bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); - ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); bh->b_blocknr = bn; set_buffer_mapped(bh); @@ -574,17 +592,17 @@ xfs_map_buffer( STATIC void xfs_map_at_offset( + struct inode *inode, struct buffer_head *bh, - loff_t offset, - int block_bits, - xfs_iomap_t *iomapp) + struct xfs_bmbt_irec *imap, + xfs_off_t offset) { - ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); lock_buffer(bh); - xfs_map_buffer(bh, iomapp, offset, block_bits); - bh->b_bdev = iomapp->iomap_target->bt_bdev; + xfs_map_buffer(inode, bh, imap, offset); + bh->b_bdev = xfs_find_bdev_for_inode(inode); set_buffer_mapped(bh); clear_buffer_delay(bh); clear_buffer_unwritten(bh); @@ -713,11 +731,11 @@ xfs_is_delayed_page( bh = head = page_buffers(page); do { if (buffer_unwritten(bh)) - acceptable = (type == IOMAP_UNWRITTEN); + acceptable = (type == IO_UNWRITTEN); else if (buffer_delay(bh)) - acceptable = (type == IOMAP_DELAY); + acceptable = (type == IO_DELAY); else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == IOMAP_NEW); + acceptable = (type == IO_NEW); else break; } while ((bh = bh->b_this_page) != head); @@ -740,7 +758,7 @@ xfs_convert_page( struct inode *inode, struct page *page, loff_t tindex, - xfs_iomap_t *mp, + struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, @@ -750,7 +768,6 @@ xfs_convert_page( xfs_off_t end_offset; unsigned long p_offset; unsigned int type; - int bbits = inode->i_blkbits; int len, page_dirty; int count = 0, done = 0, uptodate = 1; xfs_off_t offset = page_offset(page); @@ -802,19 +819,19 @@ xfs_convert_page( if (buffer_unwritten(bh) || buffer_delay(bh)) { if (buffer_unwritten(bh)) - type = IOMAP_UNWRITTEN; + type = IO_UNWRITTEN; else - type = IOMAP_DELAY; + type = IO_DELAY; - if (!xfs_iomap_valid(mp, offset)) { + if (!xfs_imap_valid(inode, imap, offset)) { done = 1; continue; } - ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); - xfs_map_at_offset(bh, offset, bbits, mp); + xfs_map_at_offset(inode, bh, imap, offset); if (startio) { xfs_add_to_ioend(inode, bh, offset, type, ioendp, done); @@ -826,7 +843,7 @@ xfs_convert_page( page_dirty--; count++; } else { - type = IOMAP_NEW; + type = IO_NEW; if (buffer_mapped(bh) && all_bh && startio) { lock_buffer(bh); xfs_add_to_ioend(inode, bh, offset, @@ -866,7 +883,7 @@ STATIC void xfs_cluster_write( struct inode *inode, pgoff_t tindex, - xfs_iomap_t *iomapp, + struct xfs_bmbt_irec *imap, xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, @@ -885,7 +902,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - iomapp, ioendp, wbc, startio, all_bh); + imap, ioendp, wbc, startio, all_bh); if (done) break; } @@ -930,7 +947,7 @@ xfs_aops_discard_page( loff_t offset = page_offset(page); ssize_t len = 1 << inode->i_blkbits; - if (!xfs_is_delayed_page(page, IOMAP_DELAY)) + if (!xfs_is_delayed_page(page, IO_DELAY)) goto out_invalidate; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) @@ -1042,15 +1059,15 @@ xfs_page_state_convert( int unmapped) /* also implies page uptodate */ { struct buffer_head *bh, *head; - xfs_iomap_t iomap; + struct xfs_bmbt_irec imap; xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; unsigned long p_offset = 0; unsigned int type; __uint64_t end_offset; - pgoff_t end_index, last_index, tlast; + pgoff_t end_index, last_index; ssize_t size, len; - int flags, err, iomap_valid = 0, uptodate = 1; + int flags, err, imap_valid = 0, uptodate = 1; int page_dirty, count = 0; int trylock = 0; int all_bh = unmapped; @@ -1097,7 +1114,7 @@ xfs_page_state_convert( bh = head = page_buffers(page); offset = page_offset(page); flags = BMAPI_READ; - type = IOMAP_NEW; + type = IO_NEW; /* TODO: cleanup count and page_dirty */ @@ -1111,12 +1128,12 @@ xfs_page_state_convert( * the iomap is actually still valid, but the ioend * isn't. shouldn't happen too often. */ - iomap_valid = 0; + imap_valid = 0; continue; } - if (iomap_valid) - iomap_valid = xfs_iomap_valid(&iomap, offset); + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); /* * First case, map an unwritten extent and prepare for @@ -1137,20 +1154,20 @@ xfs_page_state_convert( * Make sure we don't use a read-only iomap */ if (flags == BMAPI_READ) - iomap_valid = 0; + imap_valid = 0; if (buffer_unwritten(bh)) { - type = IOMAP_UNWRITTEN; + type = IO_UNWRITTEN; flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { - type = IOMAP_DELAY; + type = IO_DELAY; flags = BMAPI_ALLOCATE | trylock; } else { - type = IOMAP_NEW; + type = IO_NEW; flags = BMAPI_WRITE | BMAPI_MMAP; } - if (!iomap_valid) { + if (!imap_valid) { /* * if we didn't have a valid mapping then we * need to ensure that we put the new mapping @@ -1160,7 +1177,7 @@ xfs_page_state_convert( * for unwritten extent conversion. */ new_ioend = 1; - if (type == IOMAP_NEW) { + if (type == IO_NEW) { size = xfs_probe_cluster(inode, page, bh, head, 0); } else { @@ -1168,14 +1185,14 @@ xfs_page_state_convert( } err = xfs_map_blocks(inode, offset, size, - &iomap, flags); + &imap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + imap_valid = xfs_imap_valid(inode, &imap, + offset); } - if (iomap_valid) { - xfs_map_at_offset(bh, offset, - inode->i_blkbits, &iomap); + if (imap_valid) { + xfs_map_at_offset(inode, bh, &imap, offset); if (startio) { xfs_add_to_ioend(inode, bh, offset, type, &ioend, @@ -1194,40 +1211,41 @@ xfs_page_state_convert( * That means it must already have extents allocated * underneath it. Map the extent by reading it. */ - if (!iomap_valid || flags != BMAPI_READ) { + if (!imap_valid || flags != BMAPI_READ) { flags = BMAPI_READ; size = xfs_probe_cluster(inode, page, bh, head, 1); err = xfs_map_blocks(inode, offset, size, - &iomap, flags); + &imap, flags); if (err) goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); + imap_valid = xfs_imap_valid(inode, &imap, + offset); } /* - * We set the type to IOMAP_NEW in case we are doing a + * We set the type to IO_NEW in case we are doing a * small write at EOF that is extending the file but * without needing an allocation. We need to update the * file size on I/O completion in this case so it is * the same case as having just allocated a new extent * that we are writing into for the first time. */ - type = IOMAP_NEW; + type = IO_NEW; if (trylock_buffer(bh)) { ASSERT(buffer_mapped(bh)); - if (iomap_valid) + if (imap_valid) all_bh = 1; xfs_add_to_ioend(inode, bh, offset, type, - &ioend, !iomap_valid); + &ioend, !imap_valid); page_dirty--; count++; } else { - iomap_valid = 0; + imap_valid = 0; } } else if ((buffer_uptodate(bh) || PageUptodate(page)) && (unmapped || startio)) { - iomap_valid = 0; + imap_valid = 0; } if (!iohead) @@ -1241,12 +1259,23 @@ xfs_page_state_convert( if (startio) xfs_start_page_writeback(page, 1, count); - if (ioend && iomap_valid) { - offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> - PAGE_CACHE_SHIFT; - tlast = min_t(pgoff_t, offset, last_index); - xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, - wbc, startio, all_bh, tlast); + if (ioend && imap_valid) { + xfs_off_t end_index; + + end_index = imap.br_startoff + imap.br_blockcount; + + /* to bytes */ + end_index <<= inode->i_blkbits; + + /* to pages */ + end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; + + /* check against file size */ + if (end_index > last_index) + end_index = last_index; + + xfs_cluster_write(inode, page->index + 1, &imap, &ioend, + wbc, startio, all_bh, end_index); } if (iohead) @@ -1448,10 +1477,11 @@ __xfs_get_blocks( int direct, bmapi_flags_t flags) { - xfs_iomap_t iomap; + struct xfs_bmbt_irec imap; xfs_off_t offset; ssize_t size; - int niomap = 1; + int nimap = 1; + int new = 0; int error; offset = (xfs_off_t)iblock << inode->i_blkbits; @@ -1462,22 +1492,21 @@ __xfs_get_blocks( return 0; error = xfs_iomap(XFS_I(inode), offset, size, - create ? flags : BMAPI_READ, &iomap, &niomap); + create ? flags : BMAPI_READ, &imap, &nimap, &new); if (error) return -error; - if (niomap == 0) + if (nimap == 0) return 0; - if (iomap.iomap_bn != IOMAP_DADDR_NULL) { + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK) { /* * For unwritten extents do not report a disk address on * the read case (treat as if we're reading into a hole). */ - if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { - xfs_map_buffer(bh_result, &iomap, offset, - inode->i_blkbits); - } - if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) { + if (create || !ISUNWRITTEN(&imap)) + xfs_map_buffer(inode, bh_result, &imap, offset); + if (create && ISUNWRITTEN(&imap)) { if (direct) bh_result->b_private = inode; set_buffer_unwritten(bh_result); @@ -1488,7 +1517,7 @@ __xfs_get_blocks( * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ - bh_result->b_bdev = iomap.iomap_target->bt_bdev; + bh_result->b_bdev = xfs_find_bdev_for_inode(inode); /* * If we previously allocated a block out beyond eof and we are now @@ -1502,10 +1531,10 @@ __xfs_get_blocks( if (create && ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || (offset >= i_size_read(inode)) || - (iomap.iomap_flags & (IOMAP_NEW|IOMAP_UNWRITTEN)))) + (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (iomap.iomap_flags & IOMAP_DELAY) { + if (imap.br_startblock == DELAYSTARTBLOCK) { BUG_ON(direct); if (create) { set_buffer_uptodate(bh_result); @@ -1514,11 +1543,23 @@ __xfs_get_blocks( } } + /* + * If this is O_DIRECT or the mpage code calling tell them how large + * the mapping is, so that we can avoid repeated get_blocks calls. + */ if (direct || size > (1 << inode->i_blkbits)) { - ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); - offset = min_t(xfs_off_t, - iomap.iomap_bsize - iomap.iomap_delta, size); - bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); + xfs_off_t mapping_size; + + mapping_size = imap.br_startoff + imap.br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; } return 0; @@ -1576,7 +1617,7 @@ xfs_end_io_direct( */ ioend->io_offset = offset; ioend->io_size = size; - if (ioend->io_type == IOMAP_READ) { + if (ioend->io_type == IO_READ) { xfs_finish_ioend(ioend, 0); } else if (private && size > 0) { xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); @@ -1587,7 +1628,7 @@ xfs_end_io_direct( * didn't map an unwritten extent so switch it's completion * handler. */ - ioend->io_type = IOMAP_NEW; + ioend->io_type = IO_NEW; xfs_finish_ioend(ioend, 0); } @@ -1612,10 +1653,10 @@ xfs_vm_direct_IO( struct block_device *bdev; ssize_t ret; - bdev = xfs_find_bdev_for_inode(XFS_I(inode)); + bdev = xfs_find_bdev_for_inode(inode); iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? - IOMAP_UNWRITTEN : IOMAP_READ); + IO_UNWRITTEN : IO_READ); ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, offset, nr_segs, diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 44c2b0ef9a4..649ade8ef59 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -37,6 +37,7 @@ #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" @@ -850,6 +851,12 @@ xfs_buf_lock_value( * Note that this in no way locks the underlying pages, so it is only * useful for synchronizing concurrent use of buffer objects, not for * synchronizing independent access to the underlying pages. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. */ void xfs_buf_lock( @@ -857,6 +864,8 @@ xfs_buf_lock( { trace_xfs_buf_lock(bp, _RET_IP_); + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_mount, 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); down(&bp->b_sema); @@ -1007,25 +1016,20 @@ xfs_bwrite( struct xfs_mount *mp, struct xfs_buf *bp) { - int iowait = (bp->b_flags & XBF_ASYNC) == 0; - int error = 0; + int error; bp->b_strat = xfs_bdstrat_cb; bp->b_mount = mp; bp->b_flags |= XBF_WRITE; - if (!iowait) - bp->b_flags |= _XBF_RUN_QUEUES; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ); xfs_buf_delwri_dequeue(bp); xfs_buf_iostrategy(bp); - if (iowait) { - error = xfs_buf_iowait(bp); - if (error) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); - } - + error = xfs_buf_iowait(bp); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); return error; } @@ -1614,7 +1618,8 @@ xfs_mapping_buftarg( STATIC int xfs_alloc_delwrite_queue( - xfs_buftarg_t *btp) + xfs_buftarg_t *btp, + const char *fsname) { int error = 0; @@ -1622,7 +1627,7 @@ xfs_alloc_delwrite_queue( INIT_LIST_HEAD(&btp->bt_delwrite_queue); spin_lock_init(&btp->bt_delwrite_lock); btp->bt_flags = 0; - btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); + btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); if (IS_ERR(btp->bt_task)) { error = PTR_ERR(btp->bt_task); goto out_error; @@ -1635,7 +1640,8 @@ out_error: xfs_buftarg_t * xfs_alloc_buftarg( struct block_device *bdev, - int external) + int external, + const char *fsname) { xfs_buftarg_t *btp; @@ -1647,7 +1653,7 @@ xfs_alloc_buftarg( goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; - if (xfs_alloc_delwrite_queue(btp)) + if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; xfs_alloc_bufhash(btp, external); return btp; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 386e7361e50..5fbecefa5df 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -390,7 +390,7 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) /* * Handling of buftargs. */ -extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); +extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 42dd3bcfba6..257a56b127c 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -100,10 +100,10 @@ xfs_iozero( STATIC int xfs_file_fsync( struct file *file, - struct dentry *dentry, int datasync) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); struct xfs_trans *tp; int error = 0; int log_flushed = 0; @@ -115,6 +115,8 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); + xfs_ioend_wait(ip); + /* * We always need to make sure that the required inode state is safe on * disk. The inode might be clean but we still might need to force the @@ -138,8 +140,8 @@ xfs_file_fsync( * might gets cleared when the inode gets written out via the AIL * or xfs_iflush_cluster. */ - if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || - ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && + if (((inode->i_state & I_DIRTY_DATASYNC) || + ((inode->i_state & I_DIRTY_SYNC) && !datasync)) && ip->i_update_core) { /* * Kick off a transaction to log the inode core to get the @@ -866,7 +868,7 @@ write_retry: mutex_lock(&inode->i_mutex); xfs_ilock(ip, iolock); - error2 = -xfs_file_fsync(file, file->f_path.dentry, + error2 = -xfs_file_fsync(file, (file->f_flags & __O_SYNC) ? 0 : 1); if (!error) error = error2; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 7b26cc2fd28..699b60cbab9 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -527,6 +527,10 @@ xfs_attrmulti_by_handle( if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 593c05b4df8..9287135e9bf 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -420,6 +420,10 @@ xfs_compat_attrmulti_by_handle( sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) + return -E2BIG; + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index e65a7937f3a..9c8019c78c9 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,7 +673,10 @@ xfs_vn_fiemap( bm.bmv_length = BTOBB(length); /* We add one because in getbmap world count includes the header */ - bm.bmv_count = fieinfo->fi_extents_max + 1; + bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : + fieinfo->fi_extents_max + 1; + bm.bmv_count = min_t(__s32, bm.bmv_count, + (PAGE_SIZE * 16 / sizeof(struct getbmapx))); bm.bmv_iflags = BMV_IF_PREALLOC; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 1947514ce1a..9ac8aea9152 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -19,6 +19,7 @@ #include "xfs_dmapi.h" #include "xfs_sb.h" #include "xfs_inum.h" +#include "xfs_log.h" #include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" @@ -97,7 +98,7 @@ xfs_fs_set_xstate( } STATIC int -xfs_fs_get_xquota( +xfs_fs_get_dqblk( struct super_block *sb, int type, qid_t id, @@ -114,7 +115,7 @@ xfs_fs_get_xquota( } STATIC int -xfs_fs_set_xquota( +xfs_fs_set_dqblk( struct super_block *sb, int type, qid_t id, @@ -135,6 +136,6 @@ xfs_fs_set_xquota( const struct quotactl_ops xfs_quotactl_operations = { .get_xstate = xfs_fs_get_xstate, .set_xstate = xfs_fs_set_xstate, - .get_xquota = xfs_fs_get_xquota, - .set_xquota = xfs_fs_set_xquota, + .get_dqblk = xfs_fs_get_dqblk, + .set_dqblk = xfs_fs_set_dqblk, }; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 29f1edca76d..f2d1718c916 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool; #define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ /* * Table driven mount option parser. @@ -374,6 +376,13 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DMAPI; } else if (!strcmp(this_char, MNTOPT_DMI)) { mp->m_flags |= XFS_MOUNT_DMAPI; + } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { + mp->m_flags |= XFS_MOUNT_DELAYLOG; + cmn_err(CE_WARN, + "Enabling EXPERIMENTAL delayed logging feature " + "- use at your own risk.\n"); + } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { + mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { cmn_err(CE_WARN, "XFS: ihashsize no longer used, option is deprecated."); @@ -535,6 +544,7 @@ xfs_showargs( { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -725,7 +735,8 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); } STATIC void @@ -789,18 +800,18 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); + mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); + mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1); + mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -902,7 +913,8 @@ xfsaild_start( struct xfs_ail *ailp) { ailp->xa_target = 0; - ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild"); + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); if (IS_ERR(ailp->xa_task)) return -PTR_ERR(ailp->xa_task); return 0; @@ -1092,6 +1104,7 @@ xfs_fs_write_inode( * the code will only flush the inode if it isn't already * being flushed. */ + xfs_ioend_wait(ip); xfs_ilock(ip, XFS_ILOCK_SHARED); if (ip->i_update_core) { error = xfs_log_inode(ip); @@ -1752,7 +1765,7 @@ xfs_init_zones(void) * but it is much faster. */ xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / + (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) * sizeof(int))), "xfs_buf_item"); if (!xfs_buf_item_zone) goto out_destroy_trans_zone; diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 233d4b9881b..519618e9279 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; -extern struct xattr_handler *xfs_xattr_handlers[]; +extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a427c638d90..3884e20bc14 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -356,68 +356,23 @@ xfs_commit_dummy_trans( STATIC int xfs_sync_fsdata( - struct xfs_mount *mp, - int flags) + struct xfs_mount *mp) { struct xfs_buf *bp; - struct xfs_buf_log_item *bip; - int error = 0; /* - * If this is xfssyncd() then only sync the superblock if we can - * lock it without sleeping and it is not pinned. + * If the buffer is pinned then push on the log so we won't get stuck + * waiting in the write for someone, maybe ourselves, to flush the log. + * + * Even though we just pushed the log above, we did not have the + * superblock buffer locked at that point so it can become pinned in + * between there and here. */ - if (flags & SYNC_TRYLOCK) { - ASSERT(!(flags & SYNC_WAIT)); - - bp = xfs_getsb(mp, XBF_TRYLOCK); - if (!bp) - goto out; - - bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *); - if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp)) - goto out_brelse; - } else { - bp = xfs_getsb(mp, 0); - - /* - * If the buffer is pinned then push on the log so we won't - * get stuck waiting in the write for someone, maybe - * ourselves, to flush the log. - * - * Even though we just pushed the log above, we did not have - * the superblock buffer locked at that point so it can - * become pinned in between there and here. - */ - if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, 0); - } - - - if (flags & SYNC_WAIT) - XFS_BUF_UNASYNC(bp); - else - XFS_BUF_ASYNC(bp); - - error = xfs_bwrite(mp, bp); - if (error) - return error; - - /* - * If this is a data integrity sync make sure all pending buffers - * are flushed out for the log coverage check below. - */ - if (flags & SYNC_WAIT) - xfs_flush_buftarg(mp->m_ddev_targp, 1); - - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, flags); - return error; + bp = xfs_getsb(mp, 0); + if (XFS_BUF_ISPINNED(bp)) + xfs_log_force(mp, 0); - out_brelse: - xfs_buf_relse(bp); - out: - return error; + return xfs_bwrite(mp, bp); } /* @@ -441,7 +396,7 @@ int xfs_quiesce_data( struct xfs_mount *mp) { - int error; + int error, error2 = 0; /* push non-blocking */ xfs_sync_data(mp, 0); @@ -452,13 +407,20 @@ xfs_quiesce_data( xfs_qm_sync(mp, SYNC_WAIT); /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp, SYNC_WAIT); + error = xfs_sync_fsdata(mp); + + /* make sure all delwri buffers are written out */ + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + /* mark the log as covered if needed */ + if (xfs_log_need_covered(mp)) + error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) XFS_bflush(mp->m_rtdev_targp); - return error; + return error ? error : error2; } STATIC void @@ -581,9 +543,9 @@ xfs_flush_inodes( } /* - * Every sync period we need to unpin all items, reclaim inodes, sync - * quota and write out the superblock. We might need to cover the log - * to indicate it is idle. + * Every sync period we need to unpin all items, reclaim inodes and sync + * disk quotas. We might need to cover the log to indicate that the + * filesystem is idle. */ STATIC void xfs_sync_worker( @@ -597,7 +559,8 @@ xfs_sync_worker( xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); - error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); + if (xfs_log_need_covered(mp)) + error = xfs_commit_dummy_trans(mp, 0); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); @@ -660,7 +623,7 @@ xfs_syncd_init( mp->m_sync_work.w_syncer = xfs_sync_worker; mp->m_sync_work.w_mount = mp; mp->m_sync_work.w_completion = NULL; - mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); + mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); if (IS_ERR(mp->m_sync_task)) return -PTR_ERR(mp->m_sync_task); return 0; diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c index 5a107601e96..207fa77f63a 100644 --- a/fs/xfs/linux-2.6/xfs_trace.c +++ b/fs/xfs/linux-2.6/xfs_trace.c @@ -41,7 +41,6 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_attr.h" -#include "xfs_attr_sf.h" #include "xfs_attr_leaf.h" #include "xfs_log_priv.h" #include "xfs_buf_item.h" @@ -50,6 +49,9 @@ #include "xfs_aops.h" #include "quota/xfs_dquot_item.h" #include "quota/xfs_dquot.h" +#include "xfs_log_recover.h" +#include "xfs_buf_item.h" +#include "xfs_inode_item.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index fcaa62f0799..ff6bc797baf 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -32,6 +32,10 @@ struct xfs_da_node_entry; struct xfs_dquot; struct xlog_ticket; struct log; +struct xlog_recover; +struct xlog_recover_item; +struct xfs_buf_log_format; +struct xfs_inode_log_format; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -562,18 +566,21 @@ DECLARE_EVENT_CLASS(xfs_inode_class, __field(dev_t, dev) __field(xfs_ino_t, ino) __field(int, count) + __field(int, pincount) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->pincount = atomic_read(&ip->i_pincount); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, + __entry->pincount, (char *)__entry->caller_ip) ) @@ -583,6 +590,10 @@ DEFINE_EVENT(xfs_inode_class, name, \ TP_ARGS(ip, caller_ip)) DEFINE_INODE_EVENT(xfs_ihold); DEFINE_INODE_EVENT(xfs_irele); +DEFINE_INODE_EVENT(xfs_inode_pin); +DEFINE_INODE_EVENT(xfs_inode_unpin); +DEFINE_INODE_EVENT(xfs_inode_unpin_nowait); + /* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ DEFINE_INODE_EVENT(xfs_inode); #define xfs_itrace_entry(ip) \ @@ -642,8 +653,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \ TP_PROTO(struct xfs_dquot *dqp), \ TP_ARGS(dqp)) DEFINE_DQUOT_EVENT(xfs_dqadjust); -DEFINE_DQUOT_EVENT(xfs_dqshake_dirty); -DEFINE_DQUOT_EVENT(xfs_dqshake_unlink); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); @@ -658,7 +667,6 @@ DEFINE_DQUOT_EVENT(xfs_dqread_fail); DEFINE_DQUOT_EVENT(xfs_dqlookup_found); DEFINE_DQUOT_EVENT(xfs_dqlookup_want); DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist); -DEFINE_DQUOT_EVENT(xfs_dqlookup_move); DEFINE_DQUOT_EVENT(xfs_dqlookup_done); DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); @@ -1051,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap, ); +#define XFS_BUSY_SYNC \ + { 0, "async" }, \ + { 1, "sync" } + TRACE_EVENT(xfs_alloc_busy, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, int slot), - TP_ARGS(mp, agno, agbno, len, slot), + TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int sync), + TP_ARGS(trans, agno, agbno, len, sync), TP_STRUCT__entry( __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(int, tid) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, slot) + __field(int, sync) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->tid = trans->t_ticket->t_tid; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->slot = slot; + __entry->sync = sync; ), - TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", + TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->tid, __entry->agno, __entry->agbno, __entry->len, - __entry->slot) + __print_symbolic(__entry->sync, XFS_BUSY_SYNC)) ); -#define XFS_BUSY_STATES \ - { 0, "found" }, \ - { 1, "missing" } - TRACE_EVENT(xfs_alloc_unbusy, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int slot, int found), - TP_ARGS(mp, agno, slot, found), + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) - __field(int, slot) - __field(int, found) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; - __entry->slot = slot; - __entry->found = found; + __entry->agbno = agbno; + __entry->len = len; ), - TP_printk("dev %d:%d agno %u slot %d %s", + TP_printk("dev %d:%d agno %u agbno %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->slot, - __print_symbolic(__entry->found, XFS_BUSY_STATES)) + __entry->agbno, + __entry->len) ); +#define XFS_BUSY_STATES \ + { 0, "missing" }, \ + { 1, "found" } + TRACE_EVENT(xfs_alloc_busysearch, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, xfs_lsn_t lsn), - TP_ARGS(mp, agno, agbno, len, lsn), + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, int found), + TP_ARGS(mp, agno, agbno, len, found), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(xfs_lsn_t, lsn) + __field(int, found) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->lsn = lsn; + __entry->found = found; ), - TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", + TP_printk("dev %d:%d agno %u agbno %u len %u %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, + __print_symbolic(__entry->found, XFS_BUSY_STATES)) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, __entry->lsn) ); @@ -1495,6 +1532,140 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +DECLARE_EVENT_CLASS(xfs_log_recover_item_class, + TP_PROTO(struct log *log, struct xlog_recover *trans, + struct xlog_recover_item *item, int pass), + TP_ARGS(log, trans, item, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, item) + __field(xlog_tid_t, tid) + __field(int, type) + __field(int, pass) + __field(int, count) + __field(int, total) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->item = (unsigned long)item; + __entry->tid = trans->r_log_tid; + __entry->type = ITEM_TYPE(item); + __entry->pass = pass; + __entry->count = item->ri_cnt; + __entry->total = item->ri_total; + ), + TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " + "item region count/total %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->pass, + (void *)__entry->item, + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __entry->count, + __entry->total) +) + +#define DEFINE_LOG_RECOVER_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_item_class, name, \ + TP_PROTO(struct log *log, struct xlog_recover *trans, \ + struct xlog_recover_item *item, int pass), \ + TP_ARGS(log, trans, item, pass)) + +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); + +DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), + TP_ARGS(log, buf_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__int64_t, blkno) + __field(unsigned short, len) + __field(unsigned short, flags) + __field(unsigned short, size) + __field(unsigned int, map_size) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->blkno = buf_f->blf_blkno; + __entry->len = buf_f->blf_len; + __entry->flags = buf_f->blf_flags; + __entry->size = buf_f->blf_size; + __entry->map_size = buf_f->blf_map_size; + ), + TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + "map_size %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blkno, + __entry->len, + __entry->flags, + __entry->size, + __entry->map_size) +) + +#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \ + TP_ARGS(log, buf_f)) + +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); + +DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, size) + __field(int, fields) + __field(unsigned short, asize) + __field(unsigned short, dsize) + __field(__int64_t, blkno) + __field(int, len) + __field(int, boffset) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->ino = in_f->ilf_ino; + __entry->size = in_f->ilf_size; + __entry->fields = in_f->ilf_fields; + __entry->asize = in_f->ilf_asize; + __entry->dsize = in_f->ilf_dsize; + __entry->blkno = in_f->ilf_blkno; + __entry->len = in_f->ilf_len; + __entry->boffset = in_f->ilf_boffset; + ), + TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " + "dsize %d, blkno 0x%llx, len %d, boffset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->fields, + __entry->asize, + __entry->dsize, + __entry->blkno, + __entry->len, + __entry->boffset) +) +#define DEFINE_LOG_RECOVER_INO_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ + TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c index fa01b9daba6..87d3e03878c 100644 --- a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, (void *)value, size, xflags); } -static struct xattr_handler xfs_xattr_user_handler = { +static const struct xattr_handler xfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = 0, /* no flags implies user namespace */ .get = xfs_xattr_get, .set = xfs_xattr_set, }; -static struct xattr_handler xfs_xattr_trusted_handler = { +static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; -static struct xattr_handler xfs_xattr_security_handler = { +static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; -struct xattr_handler *xfs_xattr_handlers[] = { +const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 5f79dd78626..585e7633dfc 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -101,7 +101,7 @@ xfs_qm_dqinit( * No need to re-initialize these if this is a reclaimed dquot. */ if (brandnewdquot) { - dqp->dq_flnext = dqp->dq_flprev = dqp; + INIT_LIST_HEAD(&dqp->q_freelist); mutex_init(&dqp->q_qlock); init_waitqueue_head(&dqp->q_pinwait); @@ -119,20 +119,20 @@ xfs_qm_dqinit( * Only the q_core portion was zeroed in dqreclaim_one(). * So, we need to reset others. */ - dqp->q_nrefs = 0; - dqp->q_blkno = 0; - dqp->MPL_NEXT = dqp->HL_NEXT = NULL; - dqp->HL_PREVP = dqp->MPL_PREVP = NULL; - dqp->q_bufoffset = 0; - dqp->q_fileoffset = 0; - dqp->q_transp = NULL; - dqp->q_gdquot = NULL; - dqp->q_res_bcount = 0; - dqp->q_res_icount = 0; - dqp->q_res_rtbcount = 0; - atomic_set(&dqp->q_pincount, 0); - dqp->q_hash = NULL; - ASSERT(dqp->dq_flnext == dqp->dq_flprev); + dqp->q_nrefs = 0; + dqp->q_blkno = 0; + INIT_LIST_HEAD(&dqp->q_mplist); + INIT_LIST_HEAD(&dqp->q_hashlist); + dqp->q_bufoffset = 0; + dqp->q_fileoffset = 0; + dqp->q_transp = NULL; + dqp->q_gdquot = NULL; + dqp->q_res_bcount = 0; + dqp->q_res_icount = 0; + dqp->q_res_rtbcount = 0; + atomic_set(&dqp->q_pincount, 0); + dqp->q_hash = NULL; + ASSERT(list_empty(&dqp->q_freelist)); trace_xfs_dqreuse(dqp); } @@ -158,7 +158,7 @@ void xfs_qm_dqdestroy( xfs_dquot_t *dqp) { - ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(list_empty(&dqp->q_freelist)); mutex_destroy(&dqp->q_qlock); sv_destroy(&dqp->q_pinwait); @@ -252,7 +252,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_bcount) >= be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = cpu_to_be32(get_seconds() + - XFS_QI_BTIMELIMIT(mp)); + mp->m_quotainfo->qi_btimelimit); } else { d->d_bwarns = 0; } @@ -275,7 +275,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_icount) >= be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = cpu_to_be32(get_seconds() + - XFS_QI_ITIMELIMIT(mp)); + mp->m_quotainfo->qi_itimelimit); } else { d->d_iwarns = 0; } @@ -298,7 +298,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_rtbcount) >= be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = cpu_to_be32(get_seconds() + - XFS_QI_RTBTIMELIMIT(mp)); + mp->m_quotainfo->qi_rtbtimelimit); } else { d->d_rtbwarns = 0; } @@ -325,6 +325,7 @@ xfs_qm_init_dquot_blk( uint type, xfs_buf_t *bp) { + struct xfs_quotainfo *q = mp->m_quotainfo; xfs_dqblk_t *d; int curid, i; @@ -337,16 +338,16 @@ xfs_qm_init_dquot_blk( /* * ID of the first dquot in the block - id's are zero based. */ - curid = id - (id % XFS_QM_DQPERBLK(mp)); + curid = id - (id % q->qi_dqperchunk); ASSERT(curid >= 0); - memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); - for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) + memset(d, 0, BBTOB(q->qi_dqchunklen)); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) xfs_qm_dqinit_core(curid, type, d); xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : - XFS_BLI_GDQUOT_BUF))); - xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } @@ -419,7 +420,7 @@ xfs_qm_dqalloc( /* now we can just get the buffer (there's nothing to read yet) */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, - XFS_QI_DQCHUNKLEN(mp), + mp->m_quotainfo->qi_dqchunklen, 0); if (!bp || (error = XFS_BUF_GETERROR(bp))) goto error1; @@ -500,7 +501,8 @@ xfs_qm_dqtobp( */ if (dqp->q_blkno == (xfs_daddr_t) 0) { /* We use the id as an index */ - dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); + dqp->q_fileoffset = (xfs_fileoff_t)id / + mp->m_quotainfo->qi_dqperchunk; nmaps = 1; quotip = XFS_DQ_TO_QIP(dqp); xfs_ilock(quotip, XFS_ILOCK_SHARED); @@ -529,7 +531,7 @@ xfs_qm_dqtobp( /* * offset of dquot in the (fixed sized) dquot chunk. */ - dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * sizeof(xfs_dqblk_t); if (map.br_startblock == HOLESTARTBLOCK) { /* @@ -559,15 +561,13 @@ xfs_qm_dqtobp( * Read in the buffer, unless we've just done the allocation * (in which case we already have the buf). */ - if (! newdquot) { + if (!newdquot) { trace_xfs_dqtobp_read(dqp); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - dqp->q_blkno, - XFS_QI_DQCHUNKLEN(mp), - 0, &bp))) { - return (error); - } + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, &bp); if (error || !bp) return XFS_ERROR(error); } @@ -689,14 +689,14 @@ xfs_qm_idtodq( tp = NULL; if (flags & XFS_QMOPT_DQALLOC) { tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - if ((error = xfs_trans_reserve(tp, - XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + - 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT))) { + error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), + XFS_WRITE_LOG_RES(mp) + + BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + + 128, + 0, + XFS_TRANS_PERM_LOG_RES, + XFS_WRITE_LOG_COUNT); + if (error) { cancelflags = 0; goto error0; } @@ -751,7 +751,6 @@ xfs_qm_dqlookup( { xfs_dquot_t *dqp; uint flist_locked; - xfs_dquot_t *d; ASSERT(mutex_is_locked(&qh->qh_lock)); @@ -760,7 +759,7 @@ xfs_qm_dqlookup( /* * Traverse the hashchain looking for a match */ - for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { + list_for_each_entry(dqp, &qh->qh_list, q_hashlist) { /* * We already have the hashlock. We don't need the * dqlock to look at the id field of the dquot, since the @@ -772,12 +771,12 @@ xfs_qm_dqlookup( /* * All in core dquots must be on the dqlist of mp */ - ASSERT(dqp->MPL_PREVP != NULL); + ASSERT(!list_empty(&dqp->q_mplist)); xfs_dqlock(dqp); if (dqp->q_nrefs == 0) { - ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); - if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { + ASSERT(!list_empty(&dqp->q_freelist)); + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { trace_xfs_dqlookup_want(dqp); /* @@ -787,7 +786,7 @@ xfs_qm_dqlookup( */ dqp->dq_flags |= XFS_DQ_WANT; xfs_dqunlock(dqp); - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); xfs_dqlock(dqp); dqp->dq_flags &= ~(XFS_DQ_WANT); } @@ -802,46 +801,28 @@ xfs_qm_dqlookup( if (flist_locked) { if (dqp->q_nrefs != 0) { - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); flist_locked = B_FALSE; } else { - /* - * take it off the freelist - */ + /* take it off the freelist */ trace_xfs_dqlookup_freelist(dqp); - XQM_FREELIST_REMOVE(dqp); - /* xfs_qm_freelist_print(&(xfs_Gqm-> - qm_dqfreelist), - "after removal"); */ + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; } } - /* - * grab a reference - */ XFS_DQHOLD(dqp); if (flist_locked) - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); /* * move the dquot to the front of the hashchain */ ASSERT(mutex_is_locked(&qh->qh_lock)); - if (dqp->HL_PREVP != &qh->qh_next) { - trace_xfs_dqlookup_move(dqp); - if ((d = dqp->HL_NEXT)) - d->HL_PREVP = dqp->HL_PREVP; - *(dqp->HL_PREVP) = d; - d = qh->qh_next; - d->HL_PREVP = &dqp->HL_NEXT; - dqp->HL_NEXT = d; - dqp->HL_PREVP = &qh->qh_next; - qh->qh_next = dqp; - } + list_move(&dqp->q_hashlist, &qh->qh_list); trace_xfs_dqlookup_done(dqp); *O_dqpp = dqp; - ASSERT(mutex_is_locked(&qh->qh_lock)); - return (0); + return 0; } } @@ -975,16 +956,17 @@ xfs_qm_dqget( */ if (ip) { xfs_ilock(ip, XFS_ILOCK_EXCL); - if (! XFS_IS_DQTYPE_ON(mp, type)) { - /* inode stays locked on return */ - xfs_qm_dqdestroy(dqp); - return XFS_ERROR(ESRCH); - } + /* * A dquot could be attached to this inode by now, since * we had dropped the ilock. */ if (type == XFS_DQ_USER) { + if (!XFS_IS_UQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } if (ip->i_udquot) { xfs_qm_dqdestroy(dqp); dqp = ip->i_udquot; @@ -992,6 +974,11 @@ xfs_qm_dqget( goto dqret; } } else { + if (!XFS_IS_OQUOTA_ON(mp)) { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } if (ip->i_gdquot) { xfs_qm_dqdestroy(dqp); dqp = ip->i_gdquot; @@ -1033,13 +1020,14 @@ xfs_qm_dqget( */ ASSERT(mutex_is_locked(&h->qh_lock)); dqp->q_hash = h; - XQM_HASHLIST_INSERT(h, dqp); + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; /* * Attach this dquot to this filesystem's list of all dquots, * kept inside the mount structure in m_quotainfo field */ - xfs_qm_mplist_lock(mp); + mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); /* * We return a locked dquot to the caller, with a reference taken @@ -1047,9 +1035,9 @@ xfs_qm_dqget( xfs_dqlock(dqp); dqp->q_nrefs = 1; - XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); - - xfs_qm_mplist_unlock(mp); + list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist); + mp->m_quotainfo->qi_dquots++; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); mutex_unlock(&h->qh_lock); dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1086,10 +1074,10 @@ xfs_qm_dqput( * drop the dqlock and acquire the freelist and dqlock * in the right order; but try to get it out-of-order first */ - if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { + if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) { trace_xfs_dqput_wait(dqp); xfs_dqunlock(dqp); - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); xfs_dqlock(dqp); } @@ -1100,10 +1088,8 @@ xfs_qm_dqput( if (--dqp->q_nrefs == 0) { trace_xfs_dqput_free(dqp); - /* - * insert at end of the freelist. - */ - XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp); + list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); + xfs_Gqm->qm_dqfrlist_cnt++; /* * If we just added a udquot to the freelist, then @@ -1118,10 +1104,6 @@ xfs_qm_dqput( xfs_dqlock(gdqp); dqp->q_gdquot = NULL; } - - /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist), - "@@@@@++ Free list (after append) @@@@@+"); - */ } xfs_dqunlock(dqp); @@ -1133,7 +1115,7 @@ xfs_qm_dqput( break; dqp = gdqp; } - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); } /* @@ -1386,10 +1368,10 @@ int xfs_qm_dqpurge( xfs_dquot_t *dqp) { - xfs_dqhash_t *thishash; + xfs_dqhash_t *qh = dqp->q_hash; xfs_mount_t *mp = dqp->q_mount; - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock)); ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); xfs_dqlock(dqp); @@ -1407,7 +1389,7 @@ xfs_qm_dqpurge( return (1); } - ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(!list_empty(&dqp->q_freelist)); /* * If we're turning off quotas, we have to make sure that, for @@ -1452,14 +1434,16 @@ xfs_qm_dqpurge( ASSERT(XFS_FORCED_SHUTDOWN(mp) || !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); - thishash = dqp->q_hash; - XQM_HASHLIST_REMOVE(thishash, dqp); - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); + list_del_init(&dqp->q_hashlist); + qh->qh_version++; + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dqreclaims++; + mp->m_quotainfo->qi_dquots--; /* * XXX Move this to the front of the freelist, if we can get the * freelist lock. */ - ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); + ASSERT(!list_empty(&dqp->q_freelist)); dqp->q_mount = NULL; dqp->q_hash = NULL; @@ -1467,7 +1451,7 @@ xfs_qm_dqpurge( memset(&dqp->q_core, 0, sizeof(dqp->q_core)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - mutex_unlock(&thishash->qh_lock); + mutex_unlock(&qh->qh_lock); return (0); } @@ -1517,6 +1501,7 @@ void xfs_qm_dqflock_pushbuf_wait( xfs_dquot_t *dqp) { + xfs_mount_t *mp = dqp->q_mount; xfs_buf_t *bp; /* @@ -1525,14 +1510,14 @@ xfs_qm_dqflock_pushbuf_wait( * out immediately. We'll be able to acquire * the flush lock when the I/O completes. */ - bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, - XFS_QI_DQCHUNKLEN(dqp->q_mount), XBF_TRYLOCK); + bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); if (!bp) goto out_lock; if (XFS_BUF_ISDELAYWRITE(bp)) { if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(dqp->q_mount, 0); + xfs_log_force(mp, 0); xfs_buf_delwri_promote(bp); wake_up_process(bp->b_target->bt_task); } diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index a0f7da586d1..5da3a23b820 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -33,40 +33,23 @@ * The hash chain headers (hash buckets) */ typedef struct xfs_dqhash { - struct xfs_dquot *qh_next; + struct list_head qh_list; struct mutex qh_lock; uint qh_version; /* ever increasing version */ uint qh_nelems; /* number of dquots on the list */ } xfs_dqhash_t; -typedef struct xfs_dqlink { - struct xfs_dquot *ql_next; /* forward link */ - struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */ -} xfs_dqlink_t; - struct xfs_mount; struct xfs_trans; /* - * This is the marker which is designed to occupy the first few - * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers - * must come first. - * This serves as the marker ("sentinel") when we have to restart list - * iterations because of locking considerations. - */ -typedef struct xfs_dqmarker { - struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */ - struct xfs_dquot*dqm_flprev; - xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */ - xfs_dqlink_t dqm_hashlist; /* link to the hash chain */ - uint dqm_flags; /* various flags (XFS_DQ_*) */ -} xfs_dqmarker_t; - -/* * The incore dquot structure */ typedef struct xfs_dquot { - xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_freelist; /* global free list of dquots */ + struct list_head q_mplist; /* mount's list of dquots */ + struct list_head q_hashlist; /* gloabl hash list of dquots */ xfs_dqhash_t *q_hash; /* the hashchain header */ struct xfs_mount*q_mount; /* filesystem this relates to */ struct xfs_trans*q_transp; /* trans this belongs to currently */ @@ -87,13 +70,6 @@ typedef struct xfs_dquot { wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ } xfs_dquot_t; - -#define dq_flnext q_lists.dqm_flnext -#define dq_flprev q_lists.dqm_flprev -#define dq_mplist q_lists.dqm_mplist -#define dq_hashlist q_lists.dqm_hashlist -#define dq_flags q_lists.dqm_flags - /* * Lock hierarchy for q_qlock: * XFS_QLOCK_NORMAL is the implicit default, @@ -127,7 +103,6 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) } #define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) -#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 4e4ee9a5719..8d89a24ae32 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -107,8 +107,7 @@ xfs_qm_dquot_logitem_pin( /* ARGSUSED */ STATIC void xfs_qm_dquot_logitem_unpin( - xfs_dq_logitem_t *logitem, - int stale) + xfs_dq_logitem_t *logitem) { xfs_dquot_t *dqp = logitem->qli_dquot; @@ -123,7 +122,7 @@ xfs_qm_dquot_logitem_unpin_remove( xfs_dq_logitem_t *logitem, xfs_trans_t *tp) { - xfs_qm_dquot_logitem_unpin(logitem, 0); + xfs_qm_dquot_logitem_unpin(logitem); } /* @@ -228,7 +227,7 @@ xfs_qm_dquot_logitem_pushbuf( } mp = dqp->q_mount; bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, - XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); xfs_dqunlock(dqp); if (!bp) return; @@ -329,8 +328,7 @@ static struct xfs_item_ops xfs_dquot_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_dquot_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int)) - xfs_qm_dquot_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_qm_dquot_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*)) @@ -357,9 +355,8 @@ xfs_qm_dquot_logitem_init( xfs_dq_logitem_t *lp; lp = &dqp->q_logitem; - lp->qli_item.li_type = XFS_LI_DQUOT; - lp->qli_item.li_ops = &xfs_dquot_item_ops; - lp->qli_item.li_mountp = dqp->q_mount; + xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, + &xfs_dquot_item_ops); lp->qli_dquot = dqp; lp->qli_format.qlf_type = XFS_LI_DQUOT; lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); @@ -426,7 +423,7 @@ xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) */ /*ARGSUSED*/ STATIC void -xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) +xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf) { return; } @@ -537,8 +534,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t* ,int)) - xfs_qm_qoff_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) xfs_qm_qoff_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, @@ -559,8 +555,7 @@ static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int)) - xfs_qm_qoff_logitem_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) xfs_qm_qoff_logitem_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, @@ -586,11 +581,8 @@ xfs_qm_qoff_logitem_init( qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); - qf->qql_item.li_type = XFS_LI_QUOTAOFF; - if (start) - qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops; - else - qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops; + xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? + &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); qf->qql_item.li_mountp = mp; qf->qql_format.qf_type = XFS_LI_QUOTAOFF; qf->qql_format.qf_flags = flags; diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 417e61e3d9d..38e76414664 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -67,9 +67,6 @@ static cred_t xfs_zerocr; STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); -STATIC void xfs_qm_freelist_init(xfs_frlist_t *); -STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); - STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); STATIC int xfs_qm_shake(int, gfp_t); @@ -84,21 +81,25 @@ extern struct mutex qcheck_lock; #endif #ifdef QUOTADEBUG -#define XQM_LIST_PRINT(l, NXT, title) \ -{ \ - xfs_dquot_t *dqp; int i = 0; \ - cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ - for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ - cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ - "bcnt = %d, icnt = %d, refs = %d", \ - ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ - DQFLAGTO_TYPESTR(dqp), \ - (int) be64_to_cpu(dqp->q_core.d_bcount), \ - (int) be64_to_cpu(dqp->q_core.d_icount), \ - (int) dqp->q_nrefs); } \ +static void +xfs_qm_dquot_list_print( + struct xfs_mount *mp) +{ + xfs_dquot_t *dqp; + int i = 0; + + list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { + cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " + "bcnt = %lld, icnt = %lld, refs = %d", + i++, be32_to_cpu(dqp->q_core.d_id), + DQFLAGTO_TYPESTR(dqp), + (long long)be64_to_cpu(dqp->q_core.d_bcount), + (long long)be64_to_cpu(dqp->q_core.d_icount), + dqp->q_nrefs); + } } #else -#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) +static void xfs_qm_dquot_list_print(struct xfs_mount *mp) { } #endif /* @@ -144,7 +145,9 @@ xfs_Gqm_init(void) /* * Freelist of all dquots of all file systems */ - xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); + INIT_LIST_HEAD(&xqm->qm_dqfrlist); + xqm->qm_dqfrlist_cnt = 0; + mutex_init(&xqm->qm_dqfrlist_lock); /* * dquot zone. we register our own low-memory callback. @@ -189,6 +192,7 @@ STATIC void xfs_qm_destroy( struct xfs_qm *xqm) { + struct xfs_dquot *dqp, *n; int hsize, i; ASSERT(xqm != NULL); @@ -204,7 +208,21 @@ xfs_qm_destroy( xqm->qm_usr_dqhtable = NULL; xqm->qm_grp_dqhtable = NULL; xqm->qm_dqhashmask = 0; - xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); + + /* frlist cleanup */ + mutex_lock(&xqm->qm_dqfrlist_lock); + list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { + xfs_dqlock(dqp); +#ifdef QUOTADEBUG + cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); +#endif + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + xfs_qm_dqdestroy(dqp); + } + mutex_unlock(&xqm->qm_dqfrlist_lock); + mutex_destroy(&xqm->qm_dqfrlist_lock); #ifdef DEBUG mutex_destroy(&qcheck_lock); #endif @@ -256,7 +274,7 @@ STATIC void xfs_qm_rele_quotafs_ref( struct xfs_mount *mp) { - xfs_dquot_t *dqp, *nextdqp; + xfs_dquot_t *dqp, *n; ASSERT(xfs_Gqm); ASSERT(xfs_Gqm->qm_nrefs > 0); @@ -264,26 +282,24 @@ xfs_qm_rele_quotafs_ref( /* * Go thru the freelist and destroy all inactive dquots. */ - xfs_qm_freelist_lock(xfs_Gqm); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; - dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) { + list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) { xfs_dqlock(dqp); - nextdqp = dqp->dq_flnext; if (dqp->dq_flags & XFS_DQ_INACTIVE) { ASSERT(dqp->q_mount == NULL); ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_FREELIST_REMOVE(dqp); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; xfs_dqunlock(dqp); xfs_qm_dqdestroy(dqp); } else { xfs_dqunlock(dqp); } - dqp = nextdqp; } - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); /* * Destroy the entire XQM. If somebody mounts with quotaon, this'll @@ -305,7 +321,7 @@ xfs_qm_unmount( struct xfs_mount *mp) { if (mp->m_quotainfo) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); xfs_qm_destroy_quotainfo(mp); } } @@ -449,20 +465,21 @@ xfs_qm_unmount_quotas( */ STATIC int xfs_qm_dqflush_all( - xfs_mount_t *mp, - int sync_mode) + struct xfs_mount *mp, + int sync_mode) { - int recl; - xfs_dquot_t *dqp; - int niters; - int error; + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl; + struct xfs_dquot *dqp; + int niters; + int error; - if (mp->m_quotainfo == NULL) + if (!q) return 0; niters = 0; again: - xfs_qm_mplist_lock(mp); - FOREACH_DQUOT_IN_MP(dqp, mp) { + mutex_lock(&q->qi_dqlist_lock); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); if (! XFS_DQ_IS_DIRTY(dqp)) { xfs_dqunlock(dqp); @@ -470,7 +487,7 @@ again: } /* XXX a sentinel would be better */ - recl = XFS_QI_MPLRECLAIMS(mp); + recl = q->qi_dqreclaims; if (!xfs_dqflock_nowait(dqp)) { /* * If we can't grab the flush lock then check @@ -485,21 +502,21 @@ again: * Let go of the mplist lock. We don't want to hold it * across a disk write. */ - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); error = xfs_qm_dqflush(dqp, sync_mode); xfs_dqunlock(dqp); if (error) return error; - xfs_qm_mplist_lock(mp); - if (recl != XFS_QI_MPLRECLAIMS(mp)) { - xfs_qm_mplist_unlock(mp); + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { + mutex_unlock(&q->qi_dqlist_lock); /* XXX restart limit */ goto again; } } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); /* return ! busy */ return 0; } @@ -509,15 +526,15 @@ again: */ STATIC void xfs_qm_detach_gdquots( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_dquot_t *dqp, *gdqp; - int nrecl; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *gdqp; + int nrecl; again: - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); - dqp = XFS_QI_MPLNEXT(mp); - while (dqp) { + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { xfs_dqlock(dqp); if ((gdqp = dqp->q_gdquot)) { xfs_dqlock(gdqp); @@ -530,15 +547,14 @@ xfs_qm_detach_gdquots( * Can't hold the mplist lock across a dqput. * XXXmust convert to marker based iterations here. */ - nrecl = XFS_QI_MPLRECLAIMS(mp); - xfs_qm_mplist_unlock(mp); + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); xfs_qm_dqput(gdqp); - xfs_qm_mplist_lock(mp); - if (nrecl != XFS_QI_MPLRECLAIMS(mp)) + mutex_lock(&q->qi_dqlist_lock); + if (nrecl != q->qi_dqreclaims) goto again; } - dqp = dqp->MPL_NEXT; } } @@ -550,23 +566,23 @@ xfs_qm_detach_gdquots( */ STATIC int xfs_qm_dqpurge_int( - xfs_mount_t *mp, - uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ + struct xfs_mount *mp, + uint flags) { - xfs_dquot_t *dqp; - uint dqtype; - int nrecl; - xfs_dquot_t *nextdqp; - int nmisses; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_dquot *dqp, *n; + uint dqtype; + int nrecl; + int nmisses; - if (mp->m_quotainfo == NULL) + if (!q) return 0; dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * In the first pass through all incore dquots of this filesystem, @@ -578,28 +594,25 @@ xfs_qm_dqpurge_int( again: nmisses = 0; - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); /* * Try to get rid of all of the unwanted dquots. The idea is to * get them off mplist and hashlist, but leave them on freelist. */ - dqp = XFS_QI_MPLNEXT(mp); - while (dqp) { + list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) { /* * It's OK to look at the type without taking dqlock here. * We're holding the mplist lock here, and that's needed for * a dqreclaim. */ - if ((dqp->dq_flags & dqtype) == 0) { - dqp = dqp->MPL_NEXT; + if ((dqp->dq_flags & dqtype) == 0) continue; - } if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - nrecl = XFS_QI_MPLRECLAIMS(mp); - xfs_qm_mplist_unlock(mp); + nrecl = q->qi_dqreclaims; + mutex_unlock(&q->qi_dqlist_lock); mutex_lock(&dqp->q_hash->qh_lock); - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * XXXTheoretically, we can get into a very long @@ -607,7 +620,7 @@ xfs_qm_dqpurge_int( * No one can be adding dquots to the mplist at * this point, but somebody might be taking things off. */ - if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { + if (nrecl != q->qi_dqreclaims) { mutex_unlock(&dqp->q_hash->qh_lock); goto again; } @@ -617,11 +630,9 @@ xfs_qm_dqpurge_int( * Take the dquot off the mplist and hashlist. It may remain on * freelist in INACTIVE state. */ - nextdqp = dqp->MPL_NEXT; nmisses += xfs_qm_dqpurge(dqp); - dqp = nextdqp; } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); return nmisses; } @@ -921,12 +932,13 @@ xfs_qm_dqdetach( int xfs_qm_sync( - xfs_mount_t *mp, - int flags) + struct xfs_mount *mp, + int flags) { - int recl, restarts; - xfs_dquot_t *dqp; - int error; + struct xfs_quotainfo *q = mp->m_quotainfo; + int recl, restarts; + struct xfs_dquot *dqp; + int error; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; @@ -934,18 +946,19 @@ xfs_qm_sync( restarts = 0; again: - xfs_qm_mplist_lock(mp); + mutex_lock(&q->qi_dqlist_lock); /* * dqpurge_all() also takes the mplist lock and iterate thru all dquots * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared * when we have the mplist lock, we know that dquots will be consistent * as long as we have it locked. */ - if (! XFS_IS_QUOTA_ON(mp)) { - xfs_qm_mplist_unlock(mp); + if (!XFS_IS_QUOTA_ON(mp)) { + mutex_unlock(&q->qi_dqlist_lock); return 0; } - FOREACH_DQUOT_IN_MP(dqp, mp) { + ASSERT(mutex_is_locked(&q->qi_dqlist_lock)); + list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { /* * If this is vfs_sync calling, then skip the dquots that * don't 'seem' to be dirty. ie. don't acquire dqlock. @@ -969,7 +982,7 @@ xfs_qm_sync( } /* XXX a sentinel would be better */ - recl = XFS_QI_MPLRECLAIMS(mp); + recl = q->qi_dqreclaims; if (!xfs_dqflock_nowait(dqp)) { if (flags & SYNC_TRYLOCK) { xfs_dqunlock(dqp); @@ -989,7 +1002,7 @@ xfs_qm_sync( * Let go of the mplist lock. We don't want to hold it * across a disk write */ - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); error = xfs_qm_dqflush(dqp, flags); xfs_dqunlock(dqp); if (error && XFS_FORCED_SHUTDOWN(mp)) @@ -997,17 +1010,17 @@ xfs_qm_sync( else if (error) return error; - xfs_qm_mplist_lock(mp); - if (recl != XFS_QI_MPLRECLAIMS(mp)) { + mutex_lock(&q->qi_dqlist_lock); + if (recl != q->qi_dqreclaims) { if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) break; - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); goto again; } } - xfs_qm_mplist_unlock(mp); + mutex_unlock(&q->qi_dqlist_lock); return 0; } @@ -1052,8 +1065,9 @@ xfs_qm_init_quotainfo( return error; } - xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); - lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); + INIT_LIST_HEAD(&qinf->qi_dqlist); + mutex_init(&qinf->qi_dqlist_lock); + lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class); qinf->qi_dqreclaims = 0; @@ -1150,7 +1164,8 @@ xfs_qm_destroy_quotainfo( */ xfs_qm_rele_quotafs_ref(mp); - xfs_qm_list_destroy(&qi->qi_dqlist); + ASSERT(list_empty(&qi->qi_dqlist)); + mutex_destroy(&qi->qi_dqlist_lock); if (qi->qi_uquotaip) { IRELE(qi->qi_uquotaip); @@ -1177,7 +1192,7 @@ xfs_qm_list_init( int n) { mutex_init(&list->qh_lock); - list->qh_next = NULL; + INIT_LIST_HEAD(&list->qh_list); list->qh_version = 0; list->qh_nelems = 0; } @@ -1316,9 +1331,6 @@ xfs_qm_qino_alloc( */ spin_lock(&mp->m_sb_lock); if (flags & XFS_QMOPT_SBVERSION) { -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - unsigned oldv = mp->m_sb.sb_versionnum; -#endif ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == @@ -1331,11 +1343,6 @@ xfs_qm_qino_alloc( /* qflags will get updated _after_ quotacheck */ mp->m_sb.sb_qflags = 0; -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - cmn_err(CE_NOTE, - "Old superblock version %x, converting to %x.", - oldv, mp->m_sb.sb_versionnum); -#endif } if (flags & XFS_QMOPT_UQUOTA) mp->m_sb.sb_uquotino = (*ip)->i_ino; @@ -1371,10 +1378,10 @@ xfs_qm_reset_dqcounts( #ifdef DEBUG j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); do_div(j, sizeof(xfs_dqblk_t)); - ASSERT(XFS_QM_DQPERBLK(mp) == j); + ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); - for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { + for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to @@ -1429,7 +1436,7 @@ xfs_qm_dqiter_bufs( while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), - (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); + mp->m_quotainfo->qi_dqchunklen, 0, &bp); if (error) break; @@ -1439,7 +1446,7 @@ xfs_qm_dqiter_bufs( * goto the next block. */ bno++; - firstid += XFS_QM_DQPERBLK(mp); + firstid += mp->m_quotainfo->qi_dqperchunk; } return error; } @@ -1505,7 +1512,7 @@ xfs_qm_dqiterate( continue; firstid = (xfs_dqid_t) map[i].br_startoff * - XFS_QM_DQPERBLK(mp); + mp->m_quotainfo->qi_dqperchunk; /* * Do a read-ahead on the next extent. */ @@ -1516,7 +1523,7 @@ xfs_qm_dqiterate( while (rablkcnt--) { xfs_baread(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), - (int)XFS_QI_DQCHUNKLEN(mp)); + mp->m_quotainfo->qi_dqchunklen); rablkno++; } } @@ -1576,8 +1583,10 @@ xfs_qm_quotacheck_dqadjust( /* * Set default limits, adjust timers (since we changed usages) + * + * There are no timers for the default values set in the root dquot. */ - if (! XFS_IS_SUSER_DQUOT(dqp)) { + if (dqp->q_core.d_id) { xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); } @@ -1747,14 +1756,14 @@ xfs_qm_quotacheck( lastino = 0; flags = 0; - ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); + ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); /* * There should be no cached dquots. The (simplistic) quotacheck * algorithm doesn't like that. */ - ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); + ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); @@ -1763,15 +1772,19 @@ xfs_qm_quotacheck( * their counters to zero. We need a clean slate. * We don't log our changes till later. */ - if ((uip = XFS_QI_UQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) + uip = mp->m_quotainfo->qi_uquotaip; + if (uip) { + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); + if (error) goto error_return; flags |= XFS_UQUOTA_CHKD; } - if ((gip = XFS_QI_GQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) + gip = mp->m_quotainfo->qi_gquotaip; + if (gip) { + error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + if (error) goto error_return; flags |= XFS_OQUOTA_CHKD; } @@ -1804,7 +1817,7 @@ xfs_qm_quotacheck( * at this point (because we intentionally didn't in dqget_noattach). */ if (error) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); goto error_return; } @@ -1825,7 +1838,7 @@ xfs_qm_quotacheck( mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); mp->m_qflags |= flags; - XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); + xfs_qm_dquot_list_print(mp); error_return: if (error) { @@ -1920,59 +1933,53 @@ xfs_qm_init_quotainos( } } - XFS_QI_UQIP(mp) = uip; - XFS_QI_GQIP(mp) = gip; + mp->m_quotainfo->qi_uquotaip = uip; + mp->m_quotainfo->qi_gquotaip = gip; return 0; } + /* - * Traverse the freelist of dquots and attempt to reclaim a maximum of - * 'howmany' dquots. This operation races with dqlookup(), and attempts to - * favor the lookup function ... - * XXXsup merge this with qm_reclaim_one(). + * Just pop the least recently used dquot off the freelist and + * recycle it. The returned dquot is locked. */ -STATIC int -xfs_qm_shake_freelist( - int howmany) +STATIC xfs_dquot_t * +xfs_qm_dqreclaim_one(void) { - int nreclaimed; - xfs_dqhash_t *hash; - xfs_dquot_t *dqp, *nextdqp; + xfs_dquot_t *dqpout; + xfs_dquot_t *dqp; int restarts; - int nflushes; - - if (howmany <= 0) - return 0; - nreclaimed = 0; restarts = 0; - nflushes = 0; + dqpout = NULL; -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); -#endif - /* lock order is : hashchainlock, freelistlock, mplistlock */ - tryagain: - xfs_qm_freelist_lock(xfs_Gqm); + /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ +startagain: + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); - for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; - ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && - nreclaimed < howmany); ) { + list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { + struct xfs_mount *mp = dqp->q_mount; xfs_dqlock(dqp); /* * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. + * want to reclaim a dquot that lookup wants. We release the + * freelist lock and start over, so that lookup will grab + * both the dquot and the freelistlock. */ if (dqp->dq_flags & XFS_DQ_WANT) { + ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); + + trace_xfs_dqreclaim_want(dqp); + xfs_dqunlock(dqp); - xfs_qm_freelist_unlock(xfs_Gqm); + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return nreclaimed; + return NULL; XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto tryagain; + goto startagain; } /* @@ -1981,23 +1988,27 @@ xfs_qm_shake_freelist( * life easier. */ if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); + ASSERT(mp == NULL); ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); + ASSERT(list_empty(&dqp->q_hashlist)); + ASSERT(list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + xfs_dqunlock(dqp); + dqpout = dqp; XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - nextdqp = dqp->dq_flnext; - goto off_freelist; + break; } - ASSERT(dqp->MPL_PREVP); + ASSERT(dqp->q_hash); + ASSERT(!list_empty(&dqp->q_mplist)); + /* * Try to grab the flush lock. If this dquot is in the process of * getting flushed to disk, we don't want to reclaim it. */ if (!xfs_dqflock_nowait(dqp)) { xfs_dqunlock(dqp); - dqp = dqp->dq_flnext; continue; } @@ -2010,21 +2021,21 @@ xfs_qm_shake_freelist( if (XFS_DQ_IS_DIRTY(dqp)) { int error; - trace_xfs_dqshake_dirty(dqp); + trace_xfs_dqreclaim_dirty(dqp); /* * We flush it delayed write, so don't bother - * releasing the mplock. + * releasing the freelist lock. */ error = xfs_qm_dqflush(dqp, 0); if (error) { - xfs_fs_cmn_err(CE_WARN, dqp->q_mount, - "xfs_qm_dqflush_all: dquot %p flush failed", dqp); + xfs_fs_cmn_err(CE_WARN, mp, + "xfs_qm_dqreclaim: dquot %p flush failed", dqp); } xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - dqp = dqp->dq_flnext; continue; } + /* * We're trying to get the hashlock out of order. This races * with dqlookup; so, we giveup and goto the next dquot if @@ -2033,56 +2044,74 @@ xfs_qm_shake_freelist( * waiting for the freelist lock. */ if (!mutex_trylock(&dqp->q_hash->qh_lock)) { - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - dqp = dqp->dq_flnext; - continue; + restarts++; + goto dqfunlock; } + /* * This races with dquot allocation code as well as dqflush_all * and reclaim code. So, if we failed to grab the mplist lock, * giveup everything and start over. */ - hash = dqp->q_hash; - ASSERT(hash); - if (! xfs_qm_mplist_nowait(dqp->q_mount)) { - /* XXX put a sentinel so that we can come back here */ + if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) { + restarts++; + mutex_unlock(&dqp->q_hash->qh_lock); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - mutex_unlock(&hash->qh_lock); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return nreclaimed; - goto tryagain; + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS) + return NULL; + goto startagain; } - trace_xfs_dqshake_unlink(dqp); - -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n", - dqp, be32_to_cpu(dqp->q_core.d_id)); -#endif ASSERT(dqp->q_nrefs == 0); - nextdqp = dqp->dq_flnext; - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); - XQM_HASHLIST_REMOVE(hash, dqp); + list_del_init(&dqp->q_mplist); + mp->m_quotainfo->qi_dquots--; + mp->m_quotainfo->qi_dqreclaims++; + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + dqpout = dqp; + mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + mutex_unlock(&dqp->q_hash->qh_lock); +dqfunlock: xfs_dqfunlock(dqp); - xfs_qm_mplist_unlock(dqp->q_mount); - mutex_unlock(&hash->qh_lock); - - off_freelist: - XQM_FREELIST_REMOVE(dqp); xfs_dqunlock(dqp); - nreclaimed++; - XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); + if (dqpout) + break; + if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) + return NULL; + } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); + return dqpout; +} + +/* + * Traverse the freelist of dquots and attempt to reclaim a maximum of + * 'howmany' dquots. This operation races with dqlookup(), and attempts to + * favor the lookup function ... + */ +STATIC int +xfs_qm_shake_freelist( + int howmany) +{ + int nreclaimed = 0; + xfs_dquot_t *dqp; + + if (howmany <= 0) + return 0; + + while (nreclaimed < howmany) { + dqp = xfs_qm_dqreclaim_one(); + if (!dqp) + return nreclaimed; xfs_qm_dqdestroy(dqp); - dqp = nextdqp; + nreclaimed++; } - xfs_qm_freelist_unlock(xfs_Gqm); return nreclaimed; } - /* * The kmem_shake interface is invoked when memory is running low. */ @@ -2097,7 +2126,7 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) if (!xfs_Gqm) return 0; - nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ + nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */ /* incore dquots in all f/s's */ ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; @@ -2113,131 +2142,6 @@ xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) } -/* - * Just pop the least recently used dquot off the freelist and - * recycle it. The returned dquot is locked. - */ -STATIC xfs_dquot_t * -xfs_qm_dqreclaim_one(void) -{ - xfs_dquot_t *dqpout; - xfs_dquot_t *dqp; - int restarts; - int nflushes; - - restarts = 0; - dqpout = NULL; - nflushes = 0; - - /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ - startagain: - xfs_qm_freelist_lock(xfs_Gqm); - - FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) { - xfs_dqlock(dqp); - - /* - * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. We release the - * freelist lock and start over, so that lookup will grab - * both the dquot and the freelistlock. - */ - if (dqp->dq_flags & XFS_DQ_WANT) { - ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); - - trace_xfs_dqreclaim_want(dqp); - - xfs_dqunlock(dqp); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; - XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto startagain; - } - - /* - * If the dquot is inactive, we are assured that it is - * not on the mplist or the hashlist, and that makes our - * life easier. - */ - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - dqpout = dqp; - XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - break; - } - - ASSERT(dqp->q_hash); - ASSERT(dqp->MPL_PREVP); - - /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - int error; - - trace_xfs_dqreclaim_dirty(dqp); - - /* - * We flush it delayed write, so don't bother - * releasing the freelist lock. - */ - error = xfs_qm_dqflush(dqp, 0); - if (error) { - xfs_fs_cmn_err(CE_WARN, dqp->q_mount, - "xfs_qm_dqreclaim: dquot %p flush failed", dqp); - } - xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - continue; - } - - if (! xfs_qm_mplist_nowait(dqp->q_mount)) { - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - continue; - } - - if (!mutex_trylock(&dqp->q_hash->qh_lock)) - goto mplistunlock; - - trace_xfs_dqreclaim_unlink(dqp); - - ASSERT(dqp->q_nrefs == 0); - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); - XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); - XQM_FREELIST_REMOVE(dqp); - dqpout = dqp; - mutex_unlock(&dqp->q_hash->qh_lock); - mplistunlock: - xfs_qm_mplist_unlock(dqp->q_mount); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - if (dqpout) - break; - } - - xfs_qm_freelist_unlock(xfs_Gqm); - return dqpout; -} - - /*------------------------------------------------------------------*/ /* @@ -2662,66 +2566,3 @@ xfs_qm_vop_create_dqattach( } } -/* ------------- list stuff -----------------*/ -STATIC void -xfs_qm_freelist_init(xfs_frlist_t *ql) -{ - ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql; - mutex_init(&ql->qh_lock); - ql->qh_version = 0; - ql->qh_nelems = 0; -} - -STATIC void -xfs_qm_freelist_destroy(xfs_frlist_t *ql) -{ - xfs_dquot_t *dqp, *nextdqp; - - mutex_lock(&ql->qh_lock); - for (dqp = ql->qh_next; - dqp != (xfs_dquot_t *)ql; ) { - xfs_dqlock(dqp); - nextdqp = dqp->dq_flnext; -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); -#endif - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - dqp = nextdqp; - } - mutex_unlock(&ql->qh_lock); - mutex_destroy(&ql->qh_lock); - - ASSERT(ql->qh_nelems == 0); -} - -STATIC void -xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq) -{ - dq->dq_flnext = ql->qh_next; - dq->dq_flprev = (xfs_dquot_t *)ql; - ql->qh_next = dq; - dq->dq_flnext->dq_flprev = dq; - xfs_Gqm->qm_dqfreelist.qh_nelems++; - xfs_Gqm->qm_dqfreelist.qh_version++; -} - -void -xfs_qm_freelist_unlink(xfs_dquot_t *dq) -{ - xfs_dquot_t *next = dq->dq_flnext; - xfs_dquot_t *prev = dq->dq_flprev; - - next->dq_flprev = prev; - prev->dq_flnext = next; - dq->dq_flnext = dq->dq_flprev = dq; - xfs_Gqm->qm_dqfreelist.qh_nelems--; - xfs_Gqm->qm_dqfreelist.qh_version++; -} - -void -xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq) -{ - xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); -} diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 495564b8af3..c9446f1c726 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -72,17 +72,6 @@ extern kmem_zone_t *qm_dqtrxzone; #define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 typedef xfs_dqhash_t xfs_dqlist_t; -/* - * The freelist head. The first two fields match the first two in the - * xfs_dquot_t structure (in xfs_dqmarker_t) - */ -typedef struct xfs_frlist { - struct xfs_dquot *qh_next; - struct xfs_dquot *qh_prev; - struct mutex qh_lock; - uint qh_version; - uint qh_nelems; -} xfs_frlist_t; /* * Quota Manager (global) structure. Lives only in core. @@ -91,7 +80,9 @@ typedef struct xfs_qm { xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ - xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ + struct list_head qm_dqfrlist; /* freelist of dquots */ + struct mutex qm_dqfrlist_lock; + int qm_dqfrlist_cnt; atomic_t qm_totaldquots; /* total incore dquots */ uint qm_nrefs; /* file systems with quota on */ int qm_dqfree_ratio;/* ratio of free to inuse dquots */ @@ -106,7 +97,9 @@ typedef struct xfs_qm { typedef struct xfs_quotainfo { xfs_inode_t *qi_uquotaip; /* user quota inode */ xfs_inode_t *qi_gquotaip; /* group quota inode */ - xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ + struct list_head qi_dqlist; /* all dquots in filesys */ + struct mutex qi_dqlist_lock; + int qi_dquots; int qi_dqreclaims; /* a change here indicates a removal in the dqlist */ time_t qi_btimelimit; /* limit for blks timer */ @@ -175,10 +168,6 @@ extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); -/* list stuff */ -extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); -extern void xfs_qm_freelist_unlink(xfs_dquot_t *); - #ifdef DEBUG extern int xfs_qm_internalqcheck(xfs_mount_t *); #else diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c index 83e7ea3e25f..3d1fc79532e 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/quota/xfs_qm_stats.c @@ -55,7 +55,7 @@ static int xqm_proc_show(struct seq_file *m, void *v) ndquot, xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, - xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); + xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); return 0; } diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 50bee07d6b0..92b002f1805 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -79,6 +79,7 @@ xfs_qm_scall_quotaoff( xfs_mount_t *mp, uint flags) { + struct xfs_quotainfo *q = mp->m_quotainfo; uint dqtype; int error; uint inactivate_flags; @@ -102,11 +103,8 @@ xfs_qm_scall_quotaoff( * critical thing. * If quotaoff, then we must be dealing with the root filesystem. */ - ASSERT(mp->m_quotainfo); - if (mp->m_quotainfo) - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); - - ASSERT(mp->m_quotainfo); + ASSERT(q); + mutex_lock(&q->qi_quotaofflock); /* * If we're just turning off quota enforcement, change mp and go. @@ -117,7 +115,7 @@ xfs_qm_scall_quotaoff( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags = mp->m_qflags; spin_unlock(&mp->m_sb_lock); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&q->qi_quotaofflock); /* XXX what to do if error ? Revert back to old vals incore ? */ error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); @@ -150,10 +148,8 @@ xfs_qm_scall_quotaoff( * Nothing to do? Don't complain. This happens when we're just * turning off quota enforcement. */ - if ((mp->m_qflags & flags) == 0) { - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (0); - } + if ((mp->m_qflags & flags) == 0) + goto out_unlock; /* * Write the LI_QUOTAOFF log record, and do SB changes atomically, @@ -162,7 +158,7 @@ xfs_qm_scall_quotaoff( */ error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); if (error) - goto out_error; + goto out_unlock; /* * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct @@ -204,7 +200,7 @@ xfs_qm_scall_quotaoff( * So, if we couldn't purge all the dquots from the filesystem, * we can't get rid of the incore data structures. */ - while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) + while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype))) delay(10 * nculprits); /* @@ -222,7 +218,7 @@ xfs_qm_scall_quotaoff( if (error) { /* We're screwed now. Shutdown is the only option. */ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - goto out_error; + goto out_unlock; } /* @@ -230,27 +226,26 @@ xfs_qm_scall_quotaoff( */ if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&q->qi_quotaofflock); xfs_qm_destroy_quotainfo(mp); return (0); } /* - * Release our quotainode references, and vn_purge them, - * if we don't need them anymore. + * Release our quotainode references if we don't need them anymore. */ - if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { - IRELE(XFS_QI_UQIP(mp)); - XFS_QI_UQIP(mp) = NULL; + if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { + IRELE(q->qi_uquotaip); + q->qi_uquotaip = NULL; } - if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { - IRELE(XFS_QI_GQIP(mp)); - XFS_QI_GQIP(mp) = NULL; + if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) { + IRELE(q->qi_gquotaip); + q->qi_gquotaip = NULL; } -out_error: - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (error); +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; } int @@ -379,9 +374,9 @@ xfs_qm_scall_quotaon( /* * Switch on quota enforcement in core. */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); return (0); } @@ -392,11 +387,12 @@ xfs_qm_scall_quotaon( */ int xfs_qm_scall_getqstat( - xfs_mount_t *mp, - fs_quota_stat_t *out) + struct xfs_mount *mp, + struct fs_quota_stat *out) { - xfs_inode_t *uip, *gip; - boolean_t tempuqip, tempgqip; + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_inode *uip, *gip; + boolean_t tempuqip, tempgqip; uip = gip = NULL; tempuqip = tempgqip = B_FALSE; @@ -415,9 +411,9 @@ xfs_qm_scall_getqstat( out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; - if (mp->m_quotainfo) { - uip = mp->m_quotainfo->qi_uquotaip; - gip = mp->m_quotainfo->qi_gquotaip; + if (q) { + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; } if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, @@ -441,17 +437,20 @@ xfs_qm_scall_getqstat( if (tempgqip) IRELE(gip); } - if (mp->m_quotainfo) { - out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); - out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); - out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); - out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); - out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); - out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); + if (q) { + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; } - return (0); + return 0; } +#define XFS_DQ_MASK \ + (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) + /* * Adjust quota limits, and start/stop timers accordingly. */ @@ -462,15 +461,17 @@ xfs_qm_scall_setqlim( uint type, fs_disk_quota_t *newlim) { + struct xfs_quotainfo *q = mp->m_quotainfo; xfs_disk_dquot_t *ddq; xfs_dquot_t *dqp; xfs_trans_t *tp; int error; xfs_qcnt_t hard, soft; - if ((newlim->d_fieldmask & - (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) - return (0); + if (newlim->d_fieldmask & ~XFS_DQ_MASK) + return EINVAL; + if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) + return 0; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, @@ -485,7 +486,7 @@ xfs_qm_scall_setqlim( * a quotaoff from happening). (XXXThis doesn't currently happen * because we take the vfslock before calling xfs_qm_sysent). */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); + mutex_lock(&q->qi_quotaofflock); /* * Get the dquot (locked), and join it to the transaction. @@ -493,9 +494,8 @@ xfs_qm_scall_setqlim( */ if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { xfs_trans_cancel(tp, XFS_TRANS_ABORT); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); ASSERT(error != ENOENT); - return (error); + goto out_unlock; } xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -513,8 +513,8 @@ xfs_qm_scall_setqlim( ddq->d_blk_hardlimit = cpu_to_be64(hard); ddq->d_blk_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_bhardlimit = hard; - mp->m_quotainfo->qi_bsoftlimit = soft; + q->qi_bhardlimit = hard; + q->qi_bsoftlimit = soft; } } else { qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); @@ -529,8 +529,8 @@ xfs_qm_scall_setqlim( ddq->d_rtb_hardlimit = cpu_to_be64(hard); ddq->d_rtb_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_rtbhardlimit = hard; - mp->m_quotainfo->qi_rtbsoftlimit = soft; + q->qi_rtbhardlimit = hard; + q->qi_rtbsoftlimit = soft; } } else { qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); @@ -546,8 +546,8 @@ xfs_qm_scall_setqlim( ddq->d_ino_hardlimit = cpu_to_be64(hard); ddq->d_ino_softlimit = cpu_to_be64(soft); if (id == 0) { - mp->m_quotainfo->qi_ihardlimit = hard; - mp->m_quotainfo->qi_isoftlimit = soft; + q->qi_ihardlimit = hard; + q->qi_isoftlimit = soft; } } else { qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); @@ -572,23 +572,23 @@ xfs_qm_scall_setqlim( * for warnings. */ if (newlim->d_fieldmask & FS_DQ_BTIMER) { - mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; + q->qi_btimelimit = newlim->d_btimer; ddq->d_btimer = cpu_to_be32(newlim->d_btimer); } if (newlim->d_fieldmask & FS_DQ_ITIMER) { - mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; + q->qi_itimelimit = newlim->d_itimer; ddq->d_itimer = cpu_to_be32(newlim->d_itimer); } if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { - mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; + q->qi_rtbtimelimit = newlim->d_rtbtimer; ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); } if (newlim->d_fieldmask & FS_DQ_BWARNS) - mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; + q->qi_bwarnlimit = newlim->d_bwarns; if (newlim->d_fieldmask & FS_DQ_IWARNS) - mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; + q->qi_iwarnlimit = newlim->d_iwarns; if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; + q->qi_rtbwarnlimit = newlim->d_rtbwarns; } else { /* * If the user is now over quota, start the timelimit. @@ -605,8 +605,9 @@ xfs_qm_scall_setqlim( error = xfs_trans_commit(tp, 0); xfs_qm_dqprint(dqp); xfs_qm_dqrele(dqp); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); + out_unlock: + mutex_unlock(&q->qi_quotaofflock); return error; } @@ -853,7 +854,8 @@ xfs_dqrele_inode( int error; /* skip quota inodes */ - if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) { + if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || + ip == ip->i_mount->m_quotainfo->qi_gquotaip) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_gdquot == NULL); read_unlock(&pag->pag_ici_lock); @@ -931,7 +933,8 @@ struct mutex qcheck_lock; } typedef struct dqtest { - xfs_dqmarker_t q_lists; + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_hashlist; xfs_dqhash_t *q_hash; /* the hashchain header */ xfs_mount_t *q_mount; /* filesystem this relates to */ xfs_dqid_t d_id; /* user id or group id */ @@ -942,14 +945,9 @@ typedef struct dqtest { STATIC void xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) { - xfs_dquot_t *d; - if (((d) = (h)->qh_next)) - (d)->HL_PREVP = &((dqp)->HL_NEXT); - (dqp)->HL_NEXT = d; - (dqp)->HL_PREVP = &((h)->qh_next); - (h)->qh_next = (xfs_dquot_t *)dqp; - (h)->qh_version++; - (h)->qh_nelems++; + list_add(&dqp->q_hashlist, &h->qh_list); + h->qh_version++; + h->qh_nelems++; } STATIC void xfs_qm_dqtest_print( @@ -1061,9 +1059,7 @@ xfs_qm_internalqcheck_dqget( xfs_dqhash_t *h; h = DQTEST_HASH(mp, id, type); - for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; - d = (xfs_dqtest_t *) d->HL_NEXT) { - /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */ + list_for_each_entry(d, &h->qh_list, q_hashlist) { if (d->d_id == id && mp == d->q_mount) { *O_dq = d; return (0); @@ -1074,6 +1070,7 @@ xfs_qm_internalqcheck_dqget( d->d_id = id; d->q_mount = mp; d->q_hash = h; + INIT_LIST_HEAD(&d->q_hashlist); xfs_qm_hashinsert(h, d); *O_dq = d; return (0); @@ -1180,8 +1177,6 @@ xfs_qm_internalqcheck( xfs_ino_t lastino; int done, count; int i; - xfs_dqtest_t *d, *e; - xfs_dqhash_t *h1; int error; lastino = 0; @@ -1221,19 +1216,18 @@ xfs_qm_internalqcheck( } cmn_err(CE_DEBUG, "Checking results against system dquots"); for (i = 0; i < qmtest_hashmask; i++) { - h1 = &qmtest_udqtab[i]; - for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { + xfs_dqtest_t *d, *n; + xfs_dqhash_t *h; + + h = &qmtest_udqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { xfs_dqtest_cmp(d); - e = (xfs_dqtest_t *) d->HL_NEXT; kmem_free(d); - d = e; } - h1 = &qmtest_gdqtab[i]; - for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { + h = &qmtest_gdqtab[i]; + list_for_each_entry_safe(d, n, &h->qh_list, q_hashlist) { xfs_dqtest_cmp(d); - e = (xfs_dqtest_t *) d->HL_NEXT; kmem_free(d); - d = e; } } diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index 8286b2842b6..94a3d927d71 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -24,43 +24,6 @@ */ #define XFS_DQITER_MAP_SIZE 10 -/* Number of dquots that fit in to a dquot block */ -#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk) - -#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t)) - -#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims) -#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip) -#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip) -#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen) -#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit) -#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit) -#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit) -#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit) -#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit) -#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit) -#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) - -#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) -#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) -#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) - -#define xfs_qm_mplist_lock(mp) \ - mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define xfs_qm_mplist_nowait(mp) \ - mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define xfs_qm_mplist_unlock(mp) \ - mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock)) -#define XFS_QM_IS_MPLIST_LOCKED(mp) \ - mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock)) - -#define xfs_qm_freelist_lock(qm) \ - mutex_lock(&((qm)->qm_dqfreelist.qh_lock)) -#define xfs_qm_freelist_lock_nowait(qm) \ - mutex_trylock(&((qm)->qm_dqfreelist.qh_lock)) -#define xfs_qm_freelist_unlock(qm) \ - mutex_unlock(&((qm)->qm_dqfreelist.qh_lock)) - /* * Hash into a bucket in the dquot hash table, based on <mp, id>. */ @@ -72,9 +35,6 @@ XFS_DQ_HASHVAL(mp, id)) : \ (xfs_Gqm->qm_grp_dqhtable + \ XFS_DQ_HASHVAL(mp, id))) -#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \ - XFS_IS_UQUOTA_ON(mp) : \ - XFS_IS_OQUOTA_ON(mp)) #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ !dqp->q_core.d_blk_hardlimit && \ !dqp->q_core.d_blk_softlimit && \ @@ -86,68 +46,6 @@ !dqp->q_core.d_rtbcount && \ !dqp->q_core.d_icount) -#define HL_PREVP dq_hashlist.ql_prevp -#define HL_NEXT dq_hashlist.ql_next -#define MPL_PREVP dq_mplist.ql_prevp -#define MPL_NEXT dq_mplist.ql_next - - -#define _LIST_REMOVE(h, dqp, PVP, NXT) \ - { \ - xfs_dquot_t *d; \ - if (((d) = (dqp)->NXT)) \ - (d)->PVP = (dqp)->PVP; \ - *((dqp)->PVP) = d; \ - (dqp)->NXT = NULL; \ - (dqp)->PVP = NULL; \ - (h)->qh_version++; \ - (h)->qh_nelems--; \ - } - -#define _LIST_INSERT(h, dqp, PVP, NXT) \ - { \ - xfs_dquot_t *d; \ - if (((d) = (h)->qh_next)) \ - (d)->PVP = &((dqp)->NXT); \ - (dqp)->NXT = d; \ - (dqp)->PVP = &((h)->qh_next); \ - (h)->qh_next = dqp; \ - (h)->qh_version++; \ - (h)->qh_nelems++; \ - } - -#define FOREACH_DQUOT_IN_MP(dqp, mp) \ - for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT) - -#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \ -for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ - (dqp) = (dqp)->dq_flnext) - -#define XQM_HASHLIST_INSERT(h, dqp) \ - _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT) - -#define XQM_FREELIST_INSERT(h, dqp) \ - xfs_qm_freelist_append(h, dqp) - -#define XQM_MPLIST_INSERT(h, dqp) \ - _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT) - -#define XQM_HASHLIST_REMOVE(h, dqp) \ - _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT) -#define XQM_FREELIST_REMOVE(dqp) \ - xfs_qm_freelist_unlink(dqp) -#define XQM_MPLIST_REMOVE(h, dqp) \ - { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \ - XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; } - -#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp)) - -#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \ - (tp)->t_dqinfo->dqa_usrdquots : \ - (tp)->t_dqinfo->dqa_grpdquots) -#define XFS_IS_SUSER_DQUOT(dqp) \ - (!((dqp)->q_core.d_id)) - #define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index c3ab75cb1d9..061d827da33 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -59,12 +59,11 @@ xfs_trans_dqjoin( xfs_trans_t *tp, xfs_dquot_t *dqp) { - xfs_dq_logitem_t *lp; + xfs_dq_logitem_t *lp = &dqp->q_logitem; - ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp != tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); - lp = &dqp->q_logitem; + ASSERT(lp->qli_dquot == dqp); /* * Get a log_item_desc to point at the new item. @@ -96,7 +95,7 @@ xfs_trans_log_dquot( { xfs_log_item_desc_t *lidp; - ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp == tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); @@ -198,16 +197,16 @@ xfs_trans_get_dqtrx( int i; xfs_dqtrx_t *qa; - for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { - qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); + qa = XFS_QM_ISUDQ(dqp) ? + tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots; + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (qa[i].qt_dquot == NULL || - qa[i].qt_dquot == dqp) { - return (&qa[i]); - } + qa[i].qt_dquot == dqp) + return &qa[i]; } - return (NULL); + return NULL; } /* @@ -381,7 +380,7 @@ xfs_trans_apply_dquot_deltas( break; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp == tp); /* * adjust the actual number of blocks used @@ -639,7 +638,7 @@ xfs_trans_dqresv( softlimit = q->qi_bsoftlimit; timer = be32_to_cpu(dqp->q_core.d_btimer); warns = be16_to_cpu(dqp->q_core.d_bwarns); - warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; resbcountp = &dqp->q_res_bcount; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); @@ -651,7 +650,7 @@ xfs_trans_dqresv( softlimit = q->qi_rtbsoftlimit; timer = be32_to_cpu(dqp->q_core.d_rtbtimer); warns = be16_to_cpu(dqp->q_core.d_rtbwarns); - warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; resbcountp = &dqp->q_res_rtbcount; } @@ -691,7 +690,7 @@ xfs_trans_dqresv( count = be64_to_cpu(dqp->q_core.d_icount); timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); - warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); if (!hardlimit) hardlimit = q->qi_ihardlimit; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index d13eeba2c8f..0135e2a669d 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); -extern struct xattr_handler xfs_xattr_acl_access_handler; -extern struct xattr_handler xfs_xattr_acl_default_handler; +extern const struct xattr_handler xfs_xattr_acl_access_handler; +extern const struct xattr_handler xfs_xattr_acl_default_handler; #else # define xfs_check_acl NULL # define xfs_get_acl(inode, type) NULL diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index abb8222b88c..401f364ad36 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -175,14 +175,20 @@ typedef struct xfs_agfl { } xfs_agfl_t; /* - * Busy block/extent entry. Used in perag to mark blocks that have been freed - * but whose transactions aren't committed to disk yet. + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_alloc_busy_insert() for details. */ -typedef struct xfs_perag_busy { - xfs_agblock_t busy_start; - xfs_extlen_t busy_length; - struct xfs_trans *busy_tp; /* transaction that did the free */ -} xfs_perag_busy_t; +struct xfs_busy_extent { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + xlog_tid_t tid; /* transaction that created this */ +}; /* * Per-ag incore structure, copies of information in agf and agi, @@ -216,7 +222,8 @@ typedef struct xfs_perag { xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; #ifdef __KERNEL__ - spinlock_t pagb_lock; /* lock for pagb_list */ + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ atomic_t pagf_fstrms; /* # of filestreams active in this AG */ @@ -226,7 +233,6 @@ typedef struct xfs_perag { int pag_ici_reclaimable; /* reclaimable inodes */ #endif int pagb_count; /* pagb slots in use */ - xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ } xfs_perag_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 94cddbfb256..a7fbe8a99b1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -46,11 +46,9 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +static int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); /* * Prototypes for per-ag allocation routines @@ -540,9 +538,16 @@ xfs_alloc_ag_vextent( be32_to_cpu(agf->agf_length)); xfs_alloc_log_agf(args->tp, args->agbp, XFS_AGF_FREEBLKS); - /* search the busylist for these blocks */ - xfs_alloc_search_busy(args->tp, args->agno, - args->agbno, args->len); + /* + * Search the busylist for these blocks and mark the + * transaction as synchronous if blocks are found. This + * avoids the need to block due to a synchronous log + * force to ensure correct ordering as the synchronous + * transaction will guarantee that for us. + */ + if (xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)) + xfs_trans_set_sync(args->tp); } if (!args->isfl) xfs_trans_mod_sb(args->tp, @@ -1693,7 +1698,7 @@ xfs_free_ag_extent( * when the iclog commits to disk. If a busy block is allocated, * the iclog is pushed up to the LSN that freed the block. */ - xfs_alloc_mark_busy(tp, agno, bno, len); + xfs_alloc_busy_insert(tp, agno, bno, len); return 0; error0: @@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist( *bnop = bno; /* - * As blocks are freed, they are added to the per-ag busy list - * and remain there until the freeing transaction is committed to - * disk. Now that we have allocated blocks, this list must be - * searched to see if a block is being reused. If one is, then - * the freeing transaction must be pushed to disk NOW by forcing - * to disk all iclogs up that transaction's LSN. + * As blocks are freed, they are added to the per-ag busy list and + * remain there until the freeing transaction is committed to disk. + * Now that we have allocated blocks, this list must be searched to see + * if a block is being reused. If one is, then the freeing transaction + * must be pushed to disk before this transaction. + * + * We do this by setting the current transaction to a sync transaction + * which guarantees that the freeing transaction is on disk before this + * transaction. This is done instead of a synchronous log force here so + * that we don't sit and wait with the AGF locked in the transaction + * during the log force. */ - xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); + if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) + xfs_trans_set_sync(tp); return 0; } @@ -2201,7 +2212,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; - memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); + pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; } #ifdef DEBUG @@ -2479,127 +2490,263 @@ error0: * list is reused, the transaction that freed it must be forced to disk * before continuing to use the block. * - * xfs_alloc_mark_busy - add to the per-ag busy list - * xfs_alloc_clear_busy - remove an item from the per-ag busy list + * xfs_alloc_busy_insert - add to the per-ag busy list + * xfs_alloc_busy_clear - remove an item from the per-ag busy list + * xfs_alloc_busy_search - search for a busy extent + */ + +/* + * Insert a new extent into the busy tree. + * + * The busy extent tree is indexed by the start block of the busy extent. + * there can be multiple overlapping ranges in the busy extent tree but only + * ever one entry at a given start block. The reason for this is that + * multi-block extents can be freed, then smaller chunks of that extent + * allocated and freed again before the first transaction commit is on disk. + * If the exact same start block is freed a second time, we have to wait for + * that busy extent to pass out of the tree before the new extent is inserted. + * There are two main cases we have to handle here. + * + * The first case is a transaction that triggers a "free - allocate - free" + * cycle. This can occur during btree manipulations as a btree block is freed + * to the freelist, then allocated from the free list, then freed again. In + * this case, the second extxpnet free is what triggers the duplicate and as + * such the transaction IDs should match. Because the extent was allocated in + * this transaction, the transaction must be marked as synchronous. This is + * true for all cases where the free/alloc/free occurs in the one transaction, + * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. + * This serves to catch violations of the second case quite effectively. + * + * The second case is where the free/alloc/free occur in different + * transactions. In this case, the thread freeing the extent the second time + * can't mark the extent busy immediately because it is already tracked in a + * transaction that may be committing. When the log commit for the existing + * busy extent completes, the busy extent will be removed from the tree. If we + * allow the second busy insert to continue using that busy extent structure, + * it can be freed before this transaction is safely in the log. Hence our + * only option in this case is to force the log to remove the existing busy + * extent from the list before we insert the new one with the current + * transaction ID. + * + * The problem we are trying to avoid in the free-alloc-free in separate + * transactions is most easily described with a timeline: + * + * Thread 1 Thread 2 Thread 3 xfslogd + * xact alloc + * free X + * mark busy + * commit xact + * free xact + * xact alloc + * alloc X + * busy search + * mark xact sync + * commit xact + * free xact + * force log + * checkpoint starts + * .... + * xact alloc + * free X + * mark busy + * finds match + * *** KABOOM! *** + * .... + * log IO completes + * unbusy X + * checkpoint completes + * + * By issuing a log force in thread 3 @ "KABOOM", the thread will block until + * the checkpoint completes, and the busy extent it matched will have been + * removed from the tree when it is woken. Hence it can then continue safely. + * + * However, to ensure this matching process is robust, we need to use the + * transaction ID for identifying transaction, as delayed logging results in + * the busy extent and transaction lifecycles being different. i.e. the busy + * extent is active for a lot longer than the transaction. Hence the + * transaction structure can be freed and reallocated, then mark the same + * extent busy again in the new transaction. In this case the new transaction + * will have a different tid but can have the same address, and hence we need + * to check against the tid. + * + * Future: for delayed logging, we could avoid the log force if the extent was + * first freed in the current checkpoint sequence. This, however, requires the + * ability to pin the current checkpoint in memory until this transaction + * commits to ensure that both the original free and the current one combine + * logically into the one checkpoint. If the checkpoint sequences are + * different, however, we still need to wait on a log force. */ void -xfs_alloc_mark_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +xfs_alloc_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { - xfs_perag_busy_t *bsy; + struct xfs_busy_extent *new; + struct xfs_busy_extent *busyp; struct xfs_perag *pag; - int n; + struct rb_node **rbp; + struct rb_node *parent; + int match; - pag = xfs_perag_get(tp->t_mountp, agno); - spin_lock(&pag->pagb_lock); - /* search pagb_list for an open slot */ - for (bsy = pag->pagb_list, n = 0; - n < XFS_PAGB_NUM_SLOTS; - bsy++, n++) { - if (bsy->busy_tp == NULL) { - break; - } + new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. + */ + trace_xfs_alloc_busy(tp, agno, bno, len, 1); + xfs_trans_set_sync(tp); + return; } - trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); + new->agno = agno; + new->bno = bno; + new->length = len; + new->tid = xfs_log_get_trans_ident(tp); - if (n < XFS_PAGB_NUM_SLOTS) { - bsy = &pag->pagb_list[n]; - pag->pagb_count++; - bsy->busy_start = bno; - bsy->busy_length = len; - bsy->busy_tp = tp; - xfs_trans_add_busy(tp, agno, n); - } else { + INIT_LIST_HEAD(&new->list); + + /* trace before insert to be able to see failed inserts */ + trace_xfs_alloc_busy(tp, agno, bno, len, 0); + + pag = xfs_perag_get(tp->t_mountp, new->agno); +restart: + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + parent = NULL; + busyp = NULL; + match = 0; + while (*rbp && match >= 0) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); + + if (new->bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + rbp = &(*rbp)->rb_left; + if (new->bno + new->length > busyp->bno) + match = busyp->tid == new->tid ? 1 : -1; + } else if (new->bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + rbp = &(*rbp)->rb_right; + if (bno < busyp->bno + busyp->length) + match = busyp->tid == new->tid ? 1 : -1; + } else { + match = busyp->tid == new->tid ? 1 : -1; + break; + } + } + if (match < 0) { + /* overlap marked busy in different transaction */ + spin_unlock(&pag->pagb_lock); + xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); + goto restart; + } + if (match > 0) { /* - * The busy list is full! Since it is now not possible to - * track the free block, make this a synchronous transaction - * to insure that the block is not reused before this - * transaction commits. + * overlap marked busy in same transaction. Update if exact + * start block match, otherwise combine the busy extents into + * a single range. */ - xfs_trans_set_sync(tp); - } + if (busyp->bno == new->bno) { + busyp->length = max(busyp->length, new->length); + spin_unlock(&pag->pagb_lock); + ASSERT(tp->t_flags & XFS_TRANS_SYNC); + xfs_perag_put(pag); + kmem_free(new); + return; + } + rb_erase(&busyp->rb_node, &pag->pagb_tree); + new->length = max(busyp->bno + busyp->length, + new->bno + new->length) - + min(busyp->bno, new->bno); + new->bno = min(busyp->bno, new->bno); + } else + busyp = NULL; + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); + kmem_free(busyp); } -void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - int idx) +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +static int +xfs_alloc_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { struct xfs_perag *pag; - xfs_perag_busy_t *list; + struct rb_node *rbp; + struct xfs_busy_extent *busyp; + int match = 0; - ASSERT(idx < XFS_PAGB_NUM_SLOTS); - pag = xfs_perag_get(tp->t_mountp, agno); + pag = xfs_perag_get(mp, agno); spin_lock(&pag->pagb_lock); - list = pag->pagb_list; - trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); - - if (list[idx].busy_tp == tp) { - list[idx].busy_tp = NULL; - pag->pagb_count--; + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } } - spin_unlock(&pag->pagb_lock); + trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); xfs_perag_put(pag); + return match; } - -/* - * If we find the extent in the busy list, force the log out to get the - * extent out of the busy list so the caller can use it straight away. - */ -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct xfs_busy_extent *busyp) { struct xfs_perag *pag; - xfs_perag_busy_t *bsy; - xfs_agblock_t uend, bend; - xfs_lsn_t lsn = 0; - int cnt; - pag = xfs_perag_get(tp->t_mountp, agno); - spin_lock(&pag->pagb_lock); - cnt = pag->pagb_count; + trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, + busyp->length); - /* - * search pagb_list for this slot, skipping open slots. We have to - * search the entire array as there may be multiple overlaps and - * we have to get the most recent LSN for the log force to push out - * all the transactions that span the range. - */ - uend = bno + len - 1; - for (cnt = 0; cnt < pag->pagb_count; cnt++) { - bsy = &pag->pagb_list[cnt]; - if (!bsy->busy_tp) - continue; + ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, + busyp->length) == 1); - bend = bsy->busy_start + bsy->busy_length - 1; - if (bno > bend || uend < bsy->busy_start) - continue; + list_del_init(&busyp->list); - /* (start1,length1) within (start2, length2) */ - if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) - lsn = bsy->busy_tp->t_commit_lsn; - } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + rb_erase(&busyp->rb_node, &pag->pagb_tree); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); - trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); - /* - * If a block was found, force the log through the LSN of the - * transaction that freed the block - */ - if (lsn) - xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC); + kmem_free(busyp); } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 599bffa3978..6d05199b667 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -22,6 +22,7 @@ struct xfs_buf; struct xfs_mount; struct xfs_perag; struct xfs_trans; +struct xfs_busy_extent; /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. @@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, #ifdef __KERNEL__ void -xfs_alloc_mark_busy(xfs_trans_t *tp, +xfs_alloc_busy_insert(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len); void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - int idx); +xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b726e10d2c1..83f49421875 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -134,7 +134,7 @@ xfs_allocbt_free_block( * disk. If a busy block is allocated, the iclog is pushed up to the * LSN that freed the block. */ - xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); + xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 5c11e4d1701..99587ded043 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3829,7 +3829,7 @@ xfs_bmap_add_attrfork( } if ((error = xfs_bmap_finish(&tp, &flist, &committed))) goto error2; - error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); ASSERT(ip->i_df.if_ext_max == XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); return error; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f3c49e69eab..02a80984aa0 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -64,7 +64,7 @@ xfs_buf_item_log_debug( nbytes = last - first + 1; bfset(bip->bli_logged, first, nbytes); for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLI_SHIFT; + chunk_num = byte >> XFS_BLF_SHIFT; word_num = chunk_num >> BIT_TO_WORD_SHIFT; bit_num = chunk_num & (NBWORD - 1); wordp = &(bip->bli_format.blf_data_map[word_num]); @@ -166,7 +166,7 @@ xfs_buf_item_size( * cancel flag in it. */ trace_xfs_buf_item_size_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); return 1; } @@ -197,9 +197,9 @@ xfs_buf_item_size( } else if (next_bit != last_bit + 1) { last_bit = next_bit; nvecs++; - } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != - (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + - XFS_BLI_CHUNK)) { + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { last_bit = next_bit; nvecs++; } else { @@ -254,6 +254,20 @@ xfs_buf_item_format( vecp++; nvecs = 1; + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. We do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(&bip->bli_item))) + bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + if (bip->bli_flags & XFS_BLI_STALE) { /* * The buffer is stale, so all we need to log @@ -261,7 +275,7 @@ xfs_buf_item_format( * cancel flag in it. */ trace_xfs_buf_item_format_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); bip->bli_format.blf_size = nvecs; return; } @@ -294,28 +308,28 @@ xfs_buf_item_format( * keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; break; } else if (next_bit != last_bit + 1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; nbits = 1; - } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != - (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + - XFS_BLI_CHUNK)) { - buffer_offset = first_bit * XFS_BLI_CHUNK; + } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != + (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + + XFS_BLF_CHUNK)) { + buffer_offset = first_bit * XFS_BLF_CHUNK; vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; + vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; /* You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary @@ -341,10 +355,15 @@ xfs_buf_item_format( } /* - * This is called to pin the buffer associated with the buf log - * item in memory so it cannot be written out. Simply call bpin() - * on the buffer to do this. + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. Simply call bpin() on the buffer to do this. + * + * We also always take a reference to the buffer log item here so that the bli + * is held while the item is pinned in memory. This means that we can + * unconditionally drop the reference count a transaction holds when the + * transaction is completed. */ + STATIC void xfs_buf_item_pin( xfs_buf_log_item_t *bip) @@ -356,6 +375,7 @@ xfs_buf_item_pin( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); + atomic_inc(&bip->bli_refcount); trace_xfs_buf_item_pin(bip); xfs_bpin(bp); } @@ -372,12 +392,12 @@ xfs_buf_item_pin( */ STATIC void xfs_buf_item_unpin( - xfs_buf_log_item_t *bip, - int stale) + xfs_buf_log_item_t *bip) { struct xfs_ail *ailp; xfs_buf_t *bp; int freed; + int stale = bip->bli_flags & XFS_BLI_STALE; bp = bip->bli_buf; ASSERT(bp != NULL); @@ -393,7 +413,7 @@ xfs_buf_item_unpin( ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); trace_xfs_buf_item_unpin_stale(bip); /* @@ -428,40 +448,34 @@ xfs_buf_item_unpin_remove( xfs_buf_log_item_t *bip, xfs_trans_t *tp) { - xfs_buf_t *bp; - xfs_log_item_desc_t *lidp; - int stale = 0; - - bp = bip->bli_buf; - /* - * will xfs_buf_item_unpin() call xfs_buf_item_relse()? - */ + /* will xfs_buf_item_unpin() call xfs_buf_item_relse()? */ if ((atomic_read(&bip->bli_refcount) == 1) && (bip->bli_flags & XFS_BLI_STALE)) { + /* + * yes -- We can safely do some work here and then call + * buf_item_unpin to do the rest because we are + * are holding the buffer locked so no one else will be + * able to bump up the refcount. We have to remove the + * log item from the transaction as we are about to release + * our reference to the buffer. If we don't, the unlock that + * occurs later in the xfs_trans_uncommit() will try to + * reference the buffer which we no longer have a hold on. + */ + struct xfs_log_item_desc *lidp; + ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); trace_xfs_buf_item_unpin_stale(bip); - /* - * yes -- clear the xaction descriptor in-use flag - * and free the chunk if required. We can safely - * do some work here and then call buf_item_unpin - * to do the rest because if the if is true, then - * we are holding the buffer locked so no one else - * will be able to bump up the refcount. - */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip); - stale = lidp->lid_flags & XFS_LID_BUF_STALE; + lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)bip); xfs_trans_free_item(tp, lidp); + /* - * Since the transaction no longer refers to the buffer, - * the buffer should no longer refer to the transaction. + * Since the transaction no longer refers to the buffer, the + * buffer should no longer refer to the transaction. */ - XFS_BUF_SET_FSPRIVATE2(bp, NULL); + XFS_BUF_SET_FSPRIVATE2(bip->bli_buf, NULL); } - - xfs_buf_item_unpin(bip, stale); - - return; + xfs_buf_item_unpin(bip); } /* @@ -495,20 +509,23 @@ xfs_buf_item_trylock( } /* - * Release the buffer associated with the buf log item. - * If there is no dirty logged data associated with the - * buffer recorded in the buf log item, then free the - * buf log item and remove the reference to it in the - * buffer. + * Release the buffer associated with the buf log item. If there is no dirty + * logged data associated with the buffer recorded in the buf log item, then + * free the buf log item and remove the reference to it in the buffer. + * + * This call ignores the recursion count. It is only called when the buffer + * should REALLY be unlocked, regardless of the recursion count. * - * This call ignores the recursion count. It is only called - * when the buffer should REALLY be unlocked, regardless - * of the recursion count. + * We unconditionally drop the transaction's reference to the log item. If the + * item was logged, then another reference was taken when it was pinned, so we + * can safely drop the transaction reference now. This also allows us to avoid + * potential races with the unpin code freeing the bli by not referencing the + * bli after we've dropped the reference count. * - * If the XFS_BLI_HOLD flag is set in the buf log item, then - * free the log item if necessary but do not unlock the buffer. - * This is for support of xfs_trans_bhold(). Make sure the - * XFS_BLI_HOLD field is cleared if we don't free the item. + * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item + * if necessary but do not unlock the buffer. This is for support of + * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't + * free the item. */ STATIC void xfs_buf_item_unlock( @@ -520,73 +537,54 @@ xfs_buf_item_unlock( bp = bip->bli_buf; - /* - * Clear the buffer's association with this transaction. - */ + /* Clear the buffer's association with this transaction. */ XFS_BUF_SET_FSPRIVATE2(bp, NULL); /* - * If this is a transaction abort, don't return early. - * Instead, allow the brelse to happen. - * Normally it would be done for stale (cancelled) buffers - * at unpin time, but we'll never go through the pin/unpin - * cycle if we abort inside commit. + * If this is a transaction abort, don't return early. Instead, allow + * the brelse to happen. Normally it would be done for stale + * (cancelled) buffers at unpin time, but we'll never go through the + * pin/unpin cycle if we abort inside commit. */ aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; /* - * If the buf item is marked stale, then don't do anything. - * We'll unlock the buffer and free the buf item when the - * buffer is unpinned for the last time. + * Before possibly freeing the buf item, determine if we should + * release the buffer at the end of this routine. */ - if (bip->bli_flags & XFS_BLI_STALE) { - bip->bli_flags &= ~XFS_BLI_LOGGED; - trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - if (!aborted) - return; - } + hold = bip->bli_flags & XFS_BLI_HOLD; + + /* Clear the per transaction state. */ + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); /* - * Drop the transaction's reference to the log item if - * it was not logged as part of the transaction. Otherwise - * we'll drop the reference in xfs_buf_item_unpin() when - * the transaction is really through with the buffer. + * If the buf item is marked stale, then don't do anything. We'll + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. */ - if (!(bip->bli_flags & XFS_BLI_LOGGED)) { - atomic_dec(&bip->bli_refcount); - } else { - /* - * Clear the logged flag since this is per - * transaction state. - */ - bip->bli_flags &= ~XFS_BLI_LOGGED; + if (bip->bli_flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { + atomic_dec(&bip->bli_refcount); + return; + } } - /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. - */ - hold = bip->bli_flags & XFS_BLI_HOLD; trace_xfs_buf_item_unlock(bip); /* - * If the buf item isn't tracking any data, free it. - * Otherwise, if XFS_BLI_HOLD is set clear it. + * If the buf item isn't tracking any data, free it, otherwise drop the + * reference we hold to it. */ if (xfs_bitmap_empty(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size)) { + bip->bli_format.blf_map_size)) xfs_buf_item_relse(bp); - } else if (hold) { - bip->bli_flags &= ~XFS_BLI_HOLD; - } + else + atomic_dec(&bip->bli_refcount); - /* - * Release the buffer if XFS_BLI_HOLD was not set. - */ - if (!hold) { + if (!hold) xfs_buf_relse(bp); - } } /* @@ -675,7 +673,7 @@ static struct xfs_item_ops xfs_buf_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_buf_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_buf_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) xfs_buf_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, @@ -723,20 +721,17 @@ xfs_buf_item_init( } /* - * chunks is the number of XFS_BLI_CHUNK size pieces + * chunks is the number of XFS_BLF_CHUNK size pieces * the buffer can be divided into. Make sure not to * truncate any pieces. map_size is the size of the * bitmap needed to describe the chunks of the buffer. */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); + chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); - bip->bli_item.li_type = XFS_LI_BUF; - bip->bli_item.li_ops = &xfs_buf_item_ops; - bip->bli_item.li_mountp = mp; - bip->bli_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; @@ -799,8 +794,8 @@ xfs_buf_item_log( /* * Convert byte offsets to bit numbers. */ - first_bit = first >> XFS_BLI_SHIFT; - last_bit = last >> XFS_BLI_SHIFT; + first_bit = first >> XFS_BLF_SHIFT; + last_bit = last >> XFS_BLF_SHIFT; /* * Calculate the total number of bits to be set. diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 217f34af00c..f20bb472d58 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -26,7 +26,7 @@ extern kmem_zone_t *xfs_buf_item_zone; * have been logged. * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. */ -typedef struct xfs_buf_log_format_t { +typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ ushort blf_flags; /* misc state */ @@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format_t { * This flag indicates that the buffer contains on disk inodes * and requires special recovery handling. */ -#define XFS_BLI_INODE_BUF 0x1 +#define XFS_BLF_INODE_BUF 0x1 /* * This flag indicates that the buffer should not be replayed * during recovery because its blocks are being freed. */ -#define XFS_BLI_CANCEL 0x2 +#define XFS_BLF_CANCEL 0x2 /* * This flag indicates that the buffer contains on disk * user or group dquots and may require special recovery handling. */ -#define XFS_BLI_UDQUOT_BUF 0x4 -#define XFS_BLI_PDQUOT_BUF 0x8 -#define XFS_BLI_GDQUOT_BUF 0x10 +#define XFS_BLF_UDQUOT_BUF 0x4 +#define XFS_BLF_PDQUOT_BUF 0x8 +#define XFS_BLF_GDQUOT_BUF 0x10 -#define XFS_BLI_CHUNK 128 -#define XFS_BLI_SHIFT 7 +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 #define BIT_TO_WORD_SHIFT 5 #define NBWORD (NBBY * sizeof(unsigned int)) @@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format_t { #define XFS_BLI_LOGGED 0x08 #define XFS_BLI_INODE_ALLOC_BUF 0x10 #define XFS_BLI_STALE_INODE 0x20 +#define XFS_BLI_INODE_BUF 0x40 #define XFS_BLI_FLAGS \ { XFS_BLI_HOLD, "HOLD" }, \ @@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format_t { { XFS_BLI_STALE, "STALE" }, \ { XFS_BLI_LOGGED, "LOGGED" }, \ { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ - { XFS_BLI_STALE_INODE, "STALE_INODE" } + { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ + { XFS_BLI_INODE_BUF, "INODE_BUF" } #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 92d5cd5bf4f..047b8a8e5c2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) va_list ap; #ifdef DEBUG - xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); #endif if (xfs_panic_mask && (xfs_panic_mask & panic_tag) @@ -186,18 +186,18 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) void xfs_error_report( - char *tag, - int level, - xfs_mount_t *mp, - char *fname, - int linenum, - inst_t *ra) + const char *tag, + int level, + struct xfs_mount *mp, + const char *filename, + int linenum, + inst_t *ra) { if (level <= xfs_error_level) { xfs_cmn_err(XFS_PTAG_ERROR_REPORT, CE_ALERT, mp, "XFS internal error %s at line %d of file %s. Caller 0x%p\n", - tag, linenum, fname, ra); + tag, linenum, filename, ra); xfs_stack_trace(); } @@ -205,15 +205,15 @@ xfs_error_report( void xfs_corruption_error( - char *tag, - int level, - xfs_mount_t *mp, - void *p, - char *fname, - int linenum, - inst_t *ra) + const char *tag, + int level, + struct xfs_mount *mp, + void *p, + const char *filename, + int linenum, + inst_t *ra) { if (level <= xfs_error_level) xfs_hex_dump(p, 16); - xfs_error_report(tag, level, mp, fname, linenum, ra); + xfs_error_report(tag, level, mp, filename, linenum, ra); } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0c93051c465..c2c1a072bb8 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -29,10 +29,11 @@ extern int xfs_error_trap(int); struct xfs_mount; -extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, - char *fname, int linenum, inst_t *ra); -extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, - void *p, char *fname, int linenum, inst_t *ra); +extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, inst_t *ra); +extern void xfs_corruption_error(const char *tag, int level, + struct xfs_mount *mp, void *p, const char *filename, + int linenum, inst_t *ra); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6f35ed1b39b..409fe81585f 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -106,7 +106,7 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip) */ /*ARGSUSED*/ STATIC void -xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) +xfs_efi_item_unpin(xfs_efi_log_item_t *efip) { struct xfs_ail *ailp = efip->efi_item.li_ailp; @@ -224,7 +224,7 @@ static struct xfs_item_ops xfs_efi_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efi_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efi_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) xfs_efi_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, @@ -259,10 +259,7 @@ xfs_efi_init(xfs_mount_t *mp, KM_SLEEP); } - efip->efi_item.li_type = XFS_LI_EFI; - efip->efi_item.li_ops = &xfs_efi_item_ops; - efip->efi_item.li_mountp = mp; - efip->efi_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; @@ -428,7 +425,7 @@ xfs_efd_item_pin(xfs_efd_log_item_t *efdp) */ /*ARGSUSED*/ STATIC void -xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) +xfs_efd_item_unpin(xfs_efd_log_item_t *efdp) { return; } @@ -518,7 +515,7 @@ static struct xfs_item_ops xfs_efd_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efd_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_efd_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_efd_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, @@ -554,10 +551,7 @@ xfs_efd_init(xfs_mount_t *mp, KM_SLEEP); } - efdp->efd_item.li_type = XFS_LI_EFD; - efdp->efd_item.li_ops = &xfs_efd_item_ops; - efdp->efd_item.li_mountp = mp; - efdp->efd_item.li_ailp = mp->m_ail; + xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); efdp->efd_efip = efip; efdp->efd_format.efd_nextents = nextents; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 0ffd5644704..8cd6e8d8fe9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2449,6 +2449,8 @@ xfs_iunpin_nowait( { ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + /* Give the log a push to start the unpinning I/O */ xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7bfea854015..cf8249a6000 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -543,6 +543,7 @@ xfs_inode_item_pin( { ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); + trace_xfs_inode_pin(iip->ili_inode, _RET_IP_); atomic_inc(&iip->ili_inode->i_pincount); } @@ -556,11 +557,11 @@ xfs_inode_item_pin( /* ARGSUSED */ STATIC void xfs_inode_item_unpin( - xfs_inode_log_item_t *iip, - int stale) + xfs_inode_log_item_t *iip) { struct xfs_inode *ip = iip->ili_inode; + trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) wake_up(&ip->i_ipin_wait); @@ -572,7 +573,7 @@ xfs_inode_item_unpin_remove( xfs_inode_log_item_t *iip, xfs_trans_t *tp) { - xfs_inode_item_unpin(iip, 0); + xfs_inode_item_unpin(iip); } /* @@ -838,7 +839,7 @@ static struct xfs_item_ops xfs_inode_item_ops = { .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_inode_item_format, .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, + .iop_unpin = (void(*)(xfs_log_item_t*))xfs_inode_item_unpin, .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) xfs_inode_item_unpin_remove, .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, @@ -865,17 +866,9 @@ xfs_inode_item_init( ASSERT(ip->i_itemp == NULL); iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); - iip->ili_item.li_type = XFS_LI_INODE; - iip->ili_item.li_ops = &xfs_inode_item_ops; - iip->ili_item.li_mountp = mp; - iip->ili_item.li_ailp = mp->m_ail; iip->ili_inode = ip; - - /* - We have zeroed memory. No need ... - iip->ili_extents_buf = NULL; - */ - + xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, + &xfs_inode_item_ops); iip->ili_format.ilf_type = XFS_LI_INODE; iip->ili_format.ilf_ino = ip->i_ino; iip->ili_format.ilf_blkno = ip->i_imap.im_blkno; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 0b65039951a..ef14943829d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -55,71 +55,33 @@ #define XFS_STRAT_WRITE_IMAPS 2 #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP -STATIC int -xfs_imap_to_bmap( - xfs_inode_t *ip, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - xfs_iomap_t *iomapp, - int imaps, /* Number of imap entries */ - int iomaps, /* Number of iomap entries */ - int flags) -{ - xfs_mount_t *mp = ip->i_mount; - int pbm; - xfs_fsblock_t start_block; - - - for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) { - iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomapp->iomap_delta = offset - iomapp->iomap_offset; - iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomapp->iomap_flags = flags; - - if (XFS_IS_REALTIME_INODE(ip)) { - iomapp->iomap_flags |= IOMAP_REALTIME; - iomapp->iomap_target = mp->m_rtdev_targp; - } else { - iomapp->iomap_target = mp->m_ddev_targp; - } - start_block = imap->br_startblock; - if (start_block == HOLESTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags |= IOMAP_HOLE; - } else if (start_block == DELAYSTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags |= IOMAP_DELAY; - } else { - iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block); - if (ISUNWRITTEN(imap)) - iomapp->iomap_flags |= IOMAP_UNWRITTEN; - } - - offset += iomapp->iomap_bsize - iomapp->iomap_delta; - } - return pbm; /* Return the number filled */ -} +STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + int, struct xfs_bmbt_irec *, int *); +STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, + struct xfs_bmbt_irec *, int *); +STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int *); int xfs_iomap( - xfs_inode_t *ip, - xfs_off_t offset, - ssize_t count, - int flags, - xfs_iomap_t *iomapp, - int *niomaps) + struct xfs_inode *ip, + xfs_off_t offset, + ssize_t count, + int flags, + struct xfs_bmbt_irec *imap, + int *nimaps, + int *new) { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; - xfs_bmbt_irec_t imap; - int nimaps = 1; - int bmapi_flags = 0; - int iomap_flags = 0; + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + int bmapi_flags = 0; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + *new = 0; + if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); @@ -160,8 +122,8 @@ xfs_iomap( error = xfs_bmapi(NULL, ip, offset_fsb, (xfs_filblks_t)(end_fsb - offset_fsb), - bmapi_flags, NULL, 0, &imap, - &nimaps, NULL, NULL); + bmapi_flags, NULL, 0, imap, + nimaps, NULL, NULL); if (error) goto out; @@ -169,46 +131,41 @@ xfs_iomap( switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) { case BMAPI_WRITE: /* If we found an extent, return it */ - if (nimaps && - (imap.br_startblock != HOLESTARTBLOCK) && - (imap.br_startblock != DELAYSTARTBLOCK)) { - trace_xfs_iomap_found(ip, offset, count, flags, &imap); + if (*nimaps && + (imap->br_startblock != HOLESTARTBLOCK) && + (imap->br_startblock != DELAYSTARTBLOCK)) { + trace_xfs_iomap_found(ip, offset, count, flags, imap); break; } if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { error = xfs_iomap_write_direct(ip, offset, count, flags, - &imap, &nimaps, nimaps); + imap, nimaps); } else { error = xfs_iomap_write_delay(ip, offset, count, flags, - &imap, &nimaps); + imap, nimaps); } if (!error) { - trace_xfs_iomap_alloc(ip, offset, count, flags, &imap); + trace_xfs_iomap_alloc(ip, offset, count, flags, imap); } - iomap_flags = IOMAP_NEW; + *new = 1; break; case BMAPI_ALLOCATE: /* If we found an extent, return it */ xfs_iunlock(ip, lockmode); lockmode = 0; - if (nimaps && !isnullstartblock(imap.br_startblock)) { - trace_xfs_iomap_found(ip, offset, count, flags, &imap); + if (*nimaps && !isnullstartblock(imap->br_startblock)) { + trace_xfs_iomap_found(ip, offset, count, flags, imap); break; } error = xfs_iomap_write_allocate(ip, offset, count, - &imap, &nimaps); + imap, nimaps); break; } - if (nimaps) { - *niomaps = xfs_imap_to_bmap(ip, offset, &imap, - iomapp, nimaps, *niomaps, iomap_flags); - } else if (niomaps) { - *niomaps = 0; - } + ASSERT(*nimaps <= 1); out: if (lockmode) @@ -216,7 +173,6 @@ out: return XFS_ERROR(error); } - STATIC int xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, @@ -285,15 +241,14 @@ xfs_cmn_err_fsblock_zero( return EFSCORRUPTED; } -int +STATIC int xfs_iomap_write_direct( xfs_inode_t *ip, xfs_off_t offset, size_t count, int flags, xfs_bmbt_irec_t *ret_imap, - int *nmaps, - int found) + int *nmaps) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; @@ -330,7 +285,7 @@ xfs_iomap_write_direct( if (error) goto error_out; } else { - if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) + if (*nmaps && (ret_imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) ret_imap->br_blockcount + ret_imap->br_startoff); @@ -485,7 +440,7 @@ xfs_iomap_eof_want_preallocate( return 0; } -int +STATIC int xfs_iomap_write_delay( xfs_inode_t *ip, xfs_off_t offset, @@ -588,7 +543,7 @@ retry: * We no longer bother to look at the incoming map - all we have to * guarantee is that whatever we allocate fills the required range. */ -int +STATIC int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 174f2999099..81ac4afd45b 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,19 +18,6 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL)) - - -typedef enum { /* iomap_flags values */ - IOMAP_READ = 0, /* mapping for a read */ - IOMAP_HOLE = 0x02, /* mapping covers a hole */ - IOMAP_DELAY = 0x04, /* mapping covers delalloc region */ - IOMAP_REALTIME = 0x10, /* mapping on the realtime device */ - IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */ - /* but uninitialized file data */ - IOMAP_NEW = 0x40 /* just allocate */ -} iomap_flags_t; - typedef enum { /* base extent manipulation calls */ BMAPI_READ = (1 << 0), /* read extents */ @@ -52,43 +39,11 @@ typedef enum { { BMAPI_MMAP, "MMAP" }, \ { BMAPI_TRYLOCK, "TRYLOCK" } -/* - * xfs_iomap_t: File system I/O map - * - * The iomap_bn field is expressed in 512-byte blocks, and is where the - * mapping starts on disk. - * - * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes. - * iomap_offset is the offset of the mapping in the file itself. - * iomap_bsize is the size of the mapping, iomap_delta is the - * desired data's offset into the mapping, given the offset supplied - * to the file I/O map routine. - * - * When a request is made to read beyond the logical end of the object, - * iomap_size may be set to 0, but iomap_offset and iomap_length should be set - * to the actual amount of underlying storage that has been allocated, if any. - */ - -typedef struct xfs_iomap { - xfs_daddr_t iomap_bn; /* first 512B blk of mapping */ - xfs_buftarg_t *iomap_target; - xfs_off_t iomap_offset; /* offset of mapping, bytes */ - xfs_off_t iomap_bsize; /* size of mapping, bytes */ - xfs_off_t iomap_delta; /* offset into mapping, bytes */ - iomap_flags_t iomap_flags; -} xfs_iomap_t; - struct xfs_inode; struct xfs_bmbt_irec; extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, - struct xfs_iomap *, int *); -extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - int, struct xfs_bmbt_irec *, int *, int); -extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *); -extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int *); + struct xfs_bmbt_irec *, int *, int *); extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2be01913628..5215abc8023 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -44,13 +44,8 @@ kmem_zone_t *xfs_log_ticket_zone; -#define xlog_write_adv_cnt(ptr, len, off, bytes) \ - { (ptr) += (bytes); \ - (len) -= (bytes); \ - (off) += (bytes);} - /* Local miscellaneous function prototypes */ -STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, +STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket, xlog_in_core_t **, xfs_lsn_t *); STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, @@ -59,11 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); -STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], - int nentries, struct xlog_ticket *tic, - xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, - uint flags); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); @@ -93,16 +83,8 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log, STATIC void xlog_ungrant_log_space(xlog_t *log, xlog_ticket_t *ticket); - -/* local ticket functions */ -STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log, - int unit_bytes, - int count, - char clientid, - uint flags); - #if defined(DEBUG) -STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); +STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); STATIC void xlog_verify_grant_head(xlog_t *log, int equals); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); @@ -258,7 +240,7 @@ xfs_log_done( * If we get an error, just continue and give back the log ticket. */ (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(mp, ticket, iclog, &lsn)))) { + (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { flags |= XFS_LOG_REL_PERM_RESERV; @@ -367,6 +349,15 @@ xfs_log_reserve( ASSERT(flags & XFS_LOG_PERM_RESERV); internal_ticket = *ticket; + /* + * this is a new transaction on the ticket, so we need to + * change the transaction ID so that the next transaction has a + * different TID in the log. Just add one to the existing tid + * so that we can see chains of rolling transactions in the log + * easily. + */ + internal_ticket->t_tid++; + trace_xfs_log_reserve(log, internal_ticket); xlog_grant_push_ail(mp, internal_ticket->t_unit_res); @@ -374,7 +365,8 @@ xfs_log_reserve( } else { /* may sleep if need to allocate more tickets */ internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, - client, flags); + client, flags, + KM_SLEEP|KM_MAYFAIL); if (!internal_ticket) return XFS_ERROR(ENOMEM); internal_ticket->t_trans_type = t_type; @@ -459,6 +451,13 @@ xfs_log_mount( /* Normal transactions can now occur */ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + /* + * Now the log has been fully initialised and we know were our + * space grant counters are, we can initialise the permanent ticket + * needed for delayed logging to work. + */ + xlog_cil_init_post_recovery(mp->m_log); + return 0; out_destroy_ail: @@ -516,18 +515,10 @@ xfs_log_unmount_write(xfs_mount_t *mp) #ifdef DEBUG xlog_in_core_t *first_iclog; #endif - xfs_log_iovec_t reg[1]; xlog_ticket_t *tic = NULL; xfs_lsn_t lsn; int error; - /* the data section must be 32 bit size aligned */ - struct { - __uint16_t magic; - __uint16_t pad1; - __uint32_t pad2; /* may as well make it 64 bits */ - } magic = { XLOG_UNMOUNT_TYPE, 0, 0 }; - /* * Don't write out unmount record on read-only mounts. * Or, if we are doing a forced umount (typically because of IO errors). @@ -549,16 +540,30 @@ xfs_log_unmount_write(xfs_mount_t *mp) } while (iclog != first_iclog); #endif if (! (XLOG_FORCED_SHUTDOWN(log))) { - reg[0].i_addr = (void*)&magic; - reg[0].i_len = sizeof(magic); - reg[0].i_type = XLOG_REG_TYPE_UNMOUNT; - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); if (!error) { + /* the data section must be 32 bit size aligned */ + struct { + __uint16_t magic; + __uint16_t pad1; + __uint32_t pad2; /* may as well make it 64 bits */ + } magic = { + .magic = XLOG_UNMOUNT_TYPE, + }; + struct xfs_log_iovec reg = { + .i_addr = (void *)&magic, + .i_len = sizeof(magic), + .i_type = XLOG_REG_TYPE_UNMOUNT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + /* remove inited flag */ - ((xlog_ticket_t *)tic)->t_flags = 0; - error = xlog_write(mp, reg, 1, tic, &lsn, + tic->t_flags = 0; + error = xlog_write(log, &vec, tic, &lsn, NULL, XLOG_UNMOUNT_TRANS); /* * At this point, we're umounting anyway, @@ -648,10 +653,30 @@ xfs_log_unmount(xfs_mount_t *mp) xlog_dealloc_log(mp->m_log); } +void +xfs_log_item_init( + struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + struct xfs_item_ops *ops) +{ + item->li_mountp = mp; + item->li_ailp = mp->m_ail; + item->li_type = type; + item->li_ops = ops; + item->li_lv = NULL; + + INIT_LIST_HEAD(&item->li_ail); + INIT_LIST_HEAD(&item->li_cil); +} + /* * Write region vectors to log. The write happens using the space reservation * of the ticket (tic). It is not a requirement that all writes for a given - * transaction occur with one call to xfs_log_write(). + * transaction occur with one call to xfs_log_write(). However, it is important + * to note that the transaction reservation code makes an assumption about the + * number of log headers a transaction requires that may be violated if you + * don't pass all the transaction vectors in one call.... */ int xfs_log_write( @@ -663,11 +688,15 @@ xfs_log_write( { struct log *log = mp->m_log; int error; + struct xfs_log_vec vec = { + .lv_niovecs = nentries, + .lv_iovecp = reg, + }; if (XLOG_FORCED_SHUTDOWN(log)) return XFS_ERROR(EIO); - error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0); + error = xlog_write(log, &vec, tic, start_lsn, NULL, 0); if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return error; @@ -1020,6 +1049,7 @@ xlog_alloc_log(xfs_mount_t *mp, int i; int iclogsize; int error = ENOMEM; + uint log2_size = 0; log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); if (!log) { @@ -1045,29 +1075,30 @@ xlog_alloc_log(xfs_mount_t *mp, error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { - log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; - if (log->l_sectbb_log < 0 || - log->l_sectbb_log > mp->m_sectbb_log) { - xlog_warn("XFS: Log sector size (0x%x) out of range.", - log->l_sectbb_log); + log2_size = mp->m_sb.sb_logsectlog; + if (log2_size < BBSHIFT) { + xlog_warn("XFS: Log sector size too small " + "(0x%x < 0x%x)", log2_size, BBSHIFT); goto out_free_log; } - /* for larger sector sizes, must have v2 or external log */ - if (log->l_sectbb_log != 0 && - (log->l_logBBstart != 0 && - !xfs_sb_version_haslogv2(&mp->m_sb))) { - xlog_warn("XFS: log sector size (0x%x) invalid " - "for configuration.", log->l_sectbb_log); + log2_size -= BBSHIFT; + if (log2_size > mp->m_sectbb_log) { + xlog_warn("XFS: Log sector size too large " + "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); goto out_free_log; } - if (mp->m_sb.sb_logsectlog < BBSHIFT) { - xlog_warn("XFS: Log sector log (0x%x) too small.", - mp->m_sb.sb_logsectlog); + + /* for larger sector sizes, must have v2 or external log */ + if (log2_size && log->l_logBBstart > 0 && + !xfs_sb_version_haslogv2(&mp->m_sb)) { + + xlog_warn("XFS: log sector size (0x%x) invalid " + "for configuration.", log2_size); goto out_free_log; } } - log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; + log->l_sectBBsize = 1 << log2_size; xlog_get_iclog_buffer_size(mp, log); @@ -1147,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp, *iclogp = log->l_iclog; /* complete ring */ log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + error = xlog_cil_init(log); + if (error) + goto out_free_iclog; return log; out_free_iclog: @@ -1174,26 +1208,31 @@ out: * ticket. Return the lsn of the commit record. */ STATIC int -xlog_commit_record(xfs_mount_t *mp, - xlog_ticket_t *ticket, - xlog_in_core_t **iclog, - xfs_lsn_t *commitlsnp) +xlog_commit_record( + struct log *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp) { - int error; - xfs_log_iovec_t reg[1]; - - reg[0].i_addr = NULL; - reg[0].i_len = 0; - reg[0].i_type = XLOG_REG_TYPE_COMMIT; + struct xfs_mount *mp = log->l_mp; + int error; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; ASSERT_ALWAYS(iclog); - if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, - iclog, XLOG_COMMIT_TRANS))) { + error = xlog_write(log, &vec, ticket, commitlsnp, iclog, + XLOG_COMMIT_TRANS); + if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - } return error; -} /* xlog_commit_record */ - +} /* * Push on the buffer cache code if we ever use more than 75% of the on-disk @@ -1468,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log) xlog_in_core_t *iclog, *next_iclog; int i; + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { sv_destroy(&iclog->ic_force_wait); @@ -1510,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log, * print out info relating to regions written which consume * the reservation */ -STATIC void -xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) +void +xlog_print_tic_res( + struct xfs_mount *mp, + struct xlog_ticket *ticket) { uint i; uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); @@ -1611,6 +1654,196 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); } + + xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, + "xfs_log_write: reservation ran out. Need to up reservation"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +} + +/* + * Calculate the potential space needed by the log vector. Each region gets + * its own xlog_op_header_t and may need to be double word aligned. + */ +static int +xlog_write_calc_vec_length( + struct xlog_ticket *ticket, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + int headers = 0; + int len = 0; + int i; + + /* acct for start rec of xact */ + if (ticket->t_flags & XLOG_TIC_INITED) + headers++; + + for (lv = log_vector; lv; lv = lv->lv_next) { + headers += lv->lv_niovecs; + + for (i = 0; i < lv->lv_niovecs; i++) { + struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + + len += vecp->i_len; + xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); + } + } + + ticket->t_res_num_ophdrs += headers; + len += headers * sizeof(struct xlog_op_header); + + return len; +} + +/* + * If first write for transaction, insert start record We can't be trying to + * commit if we are inited. We can't have any "partial_copy" if we are inited. + */ +static int +xlog_write_start_rec( + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket) +{ + if (!(ticket->t_flags & XLOG_TIC_INITED)) + return 0; + + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_len = 0; + ophdr->oh_flags = XLOG_START_TRANS; + ophdr->oh_res2 = 0; + + ticket->t_flags &= ~XLOG_TIC_INITED; + + return sizeof(struct xlog_op_header); +} + +static xlog_op_header_t * +xlog_write_setup_ophdr( + struct log *log, + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket, + uint flags) +{ + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_res2 = 0; + + /* are we copying a commit or unmount record? */ + ophdr->oh_flags = flags; + + /* + * We've seen logs corrupted with bad transaction client ids. This + * makes sure that XFS doesn't generate them on. Turn this into an EIO + * and shut down the filesystem. + */ + switch (ophdr->oh_clientid) { + case XFS_TRANSACTION: + case XFS_VOLUME: + case XFS_LOG: + break; + default: + xfs_fs_cmn_err(CE_WARN, log->l_mp, + "Bad XFS transaction clientid 0x%x in ticket 0x%p", + ophdr->oh_clientid, ticket); + return NULL; + } + + return ophdr; +} + +/* + * Set up the parameters of the region copy into the log. This has + * to handle region write split across multiple log buffers - this + * state is kept external to this function so that this code can + * can be written in an obvious, self documenting manner. + */ +static int +xlog_write_setup_copy( + struct xlog_ticket *ticket, + struct xlog_op_header *ophdr, + int space_available, + int space_required, + int *copy_off, + int *copy_len, + int *last_was_partial_copy, + int *bytes_consumed) +{ + int still_to_copy; + + still_to_copy = space_required - *bytes_consumed; + *copy_off = *bytes_consumed; + + if (still_to_copy <= space_available) { + /* write of region completes here */ + *copy_len = still_to_copy; + ophdr->oh_len = cpu_to_be32(*copy_len); + if (*last_was_partial_copy) + ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); + *last_was_partial_copy = 0; + *bytes_consumed = 0; + return 0; + } + + /* partial write of region, needs extra log op header reservation */ + *copy_len = space_available; + ophdr->oh_len = cpu_to_be32(*copy_len); + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + if (*last_was_partial_copy) + ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; + *bytes_consumed += *copy_len; + (*last_was_partial_copy)++; + + /* account for new log op header */ + ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_res_num_ophdrs++; + + return sizeof(struct xlog_op_header); +} + +static int +xlog_write_copy_finish( + struct log *log, + struct xlog_in_core *iclog, + uint flags, + int *record_cnt, + int *data_cnt, + int *partial_copy, + int *partial_copy_len, + int log_offset, + struct xlog_in_core **commit_iclog) +{ + if (*partial_copy) { + /* + * This iclog has already been marked WANT_SYNC by + * xlog_state_get_iclog_space. + */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + return xlog_state_release_iclog(log, iclog); + } + + *partial_copy = 0; + *partial_copy_len = 0; + + if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { + /* no more space in this iclog - push it. */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + + spin_lock(&log->l_icloglock); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } + + return 0; } /* @@ -1653,211 +1886,163 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) * we don't update ic_offset until the end when we know exactly how many * bytes have been written out. */ -STATIC int +int xlog_write( - struct xfs_mount *mp, - struct xfs_log_iovec reg[], - int nentries, + struct log *log, + struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, uint flags) { - xlog_t *log = mp->m_log; - xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ - xlog_op_header_t *logop_head; /* ptr to log operation header */ - __psint_t ptr; /* copy address into data region */ - int len; /* # xlog_write() bytes 2 still copy */ - int index; /* region index currently copying */ - int log_offset; /* offset (from 0) into data region */ - int start_rec_copy; /* # bytes to copy for start record */ - int partial_copy; /* did we split a region? */ - int partial_copy_len;/* # bytes copied if split region */ - int need_copy; /* # bytes need to memcpy this region */ - int copy_len; /* # bytes actually memcpy'ing */ - int copy_off; /* # bytes from entry start */ - int contwr; /* continued write of in-core log? */ - int error; - int record_cnt = 0, data_cnt = 0; - - partial_copy_len = partial_copy = 0; - - /* Calculate potential maximum space. Each region gets its own - * xlog_op_header_t and may need to be double word aligned. - */ - len = 0; - if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */ - len += sizeof(xlog_op_header_t); - ticket->t_res_num_ophdrs++; - } - - for (index = 0; index < nentries; index++) { - len += sizeof(xlog_op_header_t); /* each region gets >= 1 */ - ticket->t_res_num_ophdrs++; - len += reg[index].i_len; - xlog_tic_add_region(ticket, reg[index].i_len, reg[index].i_type); - } - contwr = *start_lsn = 0; + struct xlog_in_core *iclog = NULL; + struct xfs_log_iovec *vecp; + struct xfs_log_vec *lv; + int len; + int index; + int partial_copy = 0; + int partial_copy_len = 0; + int contwr = 0; + int record_cnt = 0; + int data_cnt = 0; + int error; - if (ticket->t_curr_res < len) { - xlog_print_tic_res(mp, ticket); -#ifdef DEBUG - xlog_panic( - "xfs_log_write: reservation ran out. Need to up reservation"); -#else - /* Customer configurable panic */ - xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, - "xfs_log_write: reservation ran out. Need to up reservation"); - /* If we did not panic, shutdown the filesystem */ - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -#endif - } else - ticket->t_curr_res -= len; + *start_lsn = 0; - for (index = 0; index < nentries; ) { - if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, - &contwr, &log_offset))) - return error; + len = xlog_write_calc_vec_length(ticket, log_vector); + if (log->l_cilp) { + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); - ASSERT(log_offset <= iclog->ic_size - 1); - ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + } else + ticket->t_curr_res -= len; + + if (ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, ticket); + + index = 0; + lv = log_vector; + vecp = lv->lv_iovecp; + while (lv && index < lv->lv_niovecs) { + void *ptr; + int log_offset; + + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset); + if (error) + return error; - /* start_lsn is the first lsn written to. That's all we need. */ - if (! *start_lsn) - *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + ASSERT(log_offset <= iclog->ic_size - 1); + ptr = iclog->ic_datap + log_offset; - /* This loop writes out as many regions as can fit in the amount - * of space which was allocated by xlog_state_get_iclog_space(). - */ - while (index < nentries) { - ASSERT(reg[index].i_len % sizeof(__int32_t) == 0); - ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0); - start_rec_copy = 0; - - /* If first write for transaction, insert start record. - * We can't be trying to commit if we are inited. We can't - * have any "partial_copy" if we are inited. - */ - if (ticket->t_flags & XLOG_TIC_INITED) { - logop_head = (xlog_op_header_t *)ptr; - logop_head->oh_tid = cpu_to_be32(ticket->t_tid); - logop_head->oh_clientid = ticket->t_clientid; - logop_head->oh_len = 0; - logop_head->oh_flags = XLOG_START_TRANS; - logop_head->oh_res2 = 0; - ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */ - record_cnt++; - - start_rec_copy = sizeof(xlog_op_header_t); - xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy); - } + /* start_lsn is the first lsn written to. That's all we need. */ + if (!*start_lsn) + *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); - /* Copy log operation header directly into data section */ - logop_head = (xlog_op_header_t *)ptr; - logop_head->oh_tid = cpu_to_be32(ticket->t_tid); - logop_head->oh_clientid = ticket->t_clientid; - logop_head->oh_res2 = 0; + /* + * This loop writes out as many regions as can fit in the amount + * of space which was allocated by xlog_state_get_iclog_space(). + */ + while (lv && index < lv->lv_niovecs) { + struct xfs_log_iovec *reg = &vecp[index]; + struct xlog_op_header *ophdr; + int start_rec_copy; + int copy_len; + int copy_off; + + ASSERT(reg->i_len % sizeof(__int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); + + start_rec_copy = xlog_write_start_rec(ptr, ticket); + if (start_rec_copy) { + record_cnt++; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + start_rec_copy); + } - /* header copied directly */ - xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + if (!ophdr) + return XFS_ERROR(EIO); - /* are we copying a commit or unmount record? */ - logop_head->oh_flags = flags; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + sizeof(struct xlog_op_header)); + + len += xlog_write_setup_copy(ticket, ophdr, + iclog->ic_size-log_offset, + reg->i_len, + ©_off, ©_len, + &partial_copy, + &partial_copy_len); + xlog_verify_dest_ptr(log, ptr); + + /* copy region */ + ASSERT(copy_len >= 0); + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); + + copy_len += start_rec_copy + sizeof(xlog_op_header_t); + record_cnt++; + data_cnt += contwr ? copy_len : 0; + + error = xlog_write_copy_finish(log, iclog, flags, + &record_cnt, &data_cnt, + &partial_copy, + &partial_copy_len, + log_offset, + commit_iclog); + if (error) + return error; - /* - * We've seen logs corrupted with bad transaction client - * ids. This makes sure that XFS doesn't generate them on. - * Turn this into an EIO and shut down the filesystem. - */ - switch (logop_head->oh_clientid) { - case XFS_TRANSACTION: - case XFS_VOLUME: - case XFS_LOG: - break; - default: - xfs_fs_cmn_err(CE_WARN, mp, - "Bad XFS transaction clientid 0x%x in ticket 0x%p", - logop_head->oh_clientid, ticket); - return XFS_ERROR(EIO); - } + /* + * if we had a partial copy, we need to get more iclog + * space but we don't want to increment the region + * index because there is still more is this region to + * write. + * + * If we completed writing this region, and we flushed + * the iclog (indicated by resetting of the record + * count), then we also need to get more log space. If + * this was the last record, though, we are done and + * can just return. + */ + if (partial_copy) + break; - /* Partial write last time? => (partial_copy != 0) - * need_copy is the amount we'd like to copy if everything could - * fit in the current memcpy. - */ - need_copy = reg[index].i_len - partial_copy_len; - - copy_off = partial_copy_len; - if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ - copy_len = need_copy; - logop_head->oh_len = cpu_to_be32(copy_len); - if (partial_copy) - logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); - partial_copy_len = partial_copy = 0; - } else { /* partial write */ - copy_len = iclog->ic_size - log_offset; - logop_head->oh_len = cpu_to_be32(copy_len); - logop_head->oh_flags |= XLOG_CONTINUE_TRANS; - if (partial_copy) - logop_head->oh_flags |= XLOG_WAS_CONT_TRANS; - partial_copy_len += copy_len; - partial_copy++; - len += sizeof(xlog_op_header_t); /* from splitting of region */ - /* account for new log op header */ - ticket->t_curr_res -= sizeof(xlog_op_header_t); - ticket->t_res_num_ophdrs++; - } - xlog_verify_dest_ptr(log, ptr); - - /* copy region */ - ASSERT(copy_len >= 0); - memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len); - xlog_write_adv_cnt(ptr, len, log_offset, copy_len); - - /* make copy_len total bytes copied, including headers */ - copy_len += start_rec_copy + sizeof(xlog_op_header_t); - record_cnt++; - data_cnt += contwr ? copy_len : 0; - if (partial_copy) { /* copied partial region */ - /* already marked WANT_SYNC by xlog_state_get_iclog_space */ - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - record_cnt = data_cnt = 0; - if ((error = xlog_state_release_iclog(log, iclog))) - return error; - break; /* don't increment index */ - } else { /* copied entire region */ - index++; - partial_copy_len = partial_copy = 0; - - if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - record_cnt = data_cnt = 0; - spin_lock(&log->l_icloglock); - xlog_state_want_sync(log, iclog); - spin_unlock(&log->l_icloglock); - if (commit_iclog) { - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } else if ((error = xlog_state_release_iclog(log, iclog))) - return error; - if (index == nentries) - return 0; /* we are done */ - else - break; + if (++index == lv->lv_niovecs) { + lv = lv->lv_next; + index = 0; + if (lv) + vecp = lv->lv_iovecp; + } + if (record_cnt == 0) { + if (!lv) + return 0; + break; + } } - } /* if (partial_copy) */ - } /* while (index < nentries) */ - } /* for (index = 0; index < nentries; ) */ - ASSERT(len == 0); + } + + ASSERT(len == 0); + + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (commit_iclog) { ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; return 0; - } - return xlog_state_release_iclog(log, iclog); -} /* xlog_write */ +} /***************************************************************************** @@ -2840,6 +3025,8 @@ _xfs_log_force( XFS_STATS_INC(xs_log_force); + xlog_cil_push(log, 1); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; @@ -2989,6 +3176,12 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); + if (log->l_cilp) { + lsn = xlog_cil_push_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; + } + try_again: spin_lock(&log->l_icloglock); iclog = log->l_iclog; @@ -3153,20 +3346,30 @@ xfs_log_ticket_get( return ticket; } +xlog_tid_t +xfs_log_get_trans_ident( + struct xfs_trans *tp) +{ + return tp->t_ticket->t_tid; +} + /* * Allocate and initialise a new log ticket. */ -STATIC xlog_ticket_t * -xlog_ticket_alloc(xlog_t *log, - int unit_bytes, - int cnt, - char client, - uint xflags) +xlog_ticket_t * +xlog_ticket_alloc( + struct log *log, + int unit_bytes, + int cnt, + char client, + uint xflags, + int alloc_flags) { - xlog_ticket_t *tic; + struct xlog_ticket *tic; uint num_headers; + int iclog_space; - tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); if (!tic) return NULL; @@ -3208,16 +3411,40 @@ xlog_ticket_alloc(xlog_t *log, /* for start-rec */ unit_bytes += sizeof(xlog_op_header_t); - /* for LR headers */ - num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); + /* + * for LR headers - the space for data in an iclog is the size minus + * the space used for the headers. If we use the iclog size, then we + * undercalculate the number of headers required. + * + * Furthermore - the addition of op headers for split-recs might + * increase the space required enough to require more log and op + * headers, so take that into account too. + * + * IMPORTANT: This reservation makes the assumption that if this + * transaction is the first in an iclog and hence has the LR headers + * accounted to it, then the remaining space in the iclog is + * exclusively for this transaction. i.e. if the transaction is larger + * than the iclog, it will be the only thing in that iclog. + * Fundamentally, this means we must pass the entire log vector to + * xlog_write to guarantee this. + */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + num_headers = howmany(unit_bytes, iclog_space); + + /* for split-recs - ophdrs added when data split over LRs */ + unit_bytes += sizeof(xlog_op_header_t) * num_headers; + + /* add extra header reservations if we overrun */ + while (!num_headers || + howmany(unit_bytes, iclog_space) > num_headers) { + unit_bytes += sizeof(xlog_op_header_t); + num_headers++; + } unit_bytes += log->l_iclog_hsize * num_headers; /* for commit-rec LR header - note: padding will subsume the ophdr */ unit_bytes += log->l_iclog_hsize; - /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; - /* for roundoff padding for transaction data and one for commit record */ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && log->l_mp->m_sb.sb_logsunit > 1) { @@ -3233,13 +3460,13 @@ xlog_ticket_alloc(xlog_t *log, tic->t_curr_res = unit_bytes; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); + tic->t_tid = random32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; if (xflags & XFS_LOG_PERM_RESERV) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&(tic->t_wait), SV_DEFAULT, "logtick"); + sv_init(&tic->t_wait, SV_DEFAULT, "logtick"); xlog_tic_reset_res(tic); @@ -3260,20 +3487,22 @@ xlog_ticket_alloc(xlog_t *log, * part of the log in case we trash the log structure. */ void -xlog_verify_dest_ptr(xlog_t *log, - __psint_t ptr) +xlog_verify_dest_ptr( + struct log *log, + char *ptr) { int i; int good_ptr = 0; - for (i=0; i < log->l_iclog_bufs; i++) { - if (ptr >= (__psint_t)log->l_iclog_bak[i] && - ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) + for (i = 0; i < log->l_iclog_bufs; i++) { + if (ptr >= log->l_iclog_bak[i] && + ptr <= log->l_iclog_bak[i] + log->l_iclog_size) good_ptr++; } - if (! good_ptr) + + if (!good_ptr) xlog_panic("xlog_verify_dest_ptr: invalid ptr"); -} /* xlog_verify_dest_ptr */ +} STATIC void xlog_verify_grant_head(xlog_t *log, int equals) @@ -3459,6 +3688,11 @@ xlog_state_ioerror( * c. nothing new gets queued up after (a) and (b) are done. * d. if !logerror, flush the iclogs to disk, then seal them off * for business. + * + * Note: for delayed logging the !logerror case needs to flush the regions + * held in memory out to the iclogs before flushing them to disk. This needs + * to be done before the log is marked as shutdown, otherwise the flush to the + * iclogs will fail. */ int xfs_log_force_umount( @@ -3492,6 +3726,16 @@ xfs_log_force_umount( return 1; } retval = 0; + + /* + * Flush the in memory commit item list before marking the log as + * being shut down. We need to do it in this order to ensure all the + * completed transactions are flushed to disk with the xfs_log_force() + * call below. + */ + if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) + xlog_cil_push(log, 1); + /* * We must hold both the GRANT lock and the LOG lock, * before we mark the filesystem SHUTDOWN and wake diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 97a24c7795a..04c78e642cc 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -19,7 +19,6 @@ #define __XFS_LOG_H__ /* get lsn fields */ - #define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) #define BLOCK_LSN(lsn) ((uint)(lsn)) @@ -110,6 +109,15 @@ typedef struct xfs_log_iovec { uint i_type; /* type of region */ } xfs_log_iovec_t; +struct xfs_log_vec { + struct xfs_log_vec *lv_next; /* next lv in build list */ + int lv_niovecs; /* number of iovecs in lv */ + struct xfs_log_iovec *lv_iovecp; /* iovec array */ + struct xfs_log_item *lv_item; /* owner */ + char *lv_buf; /* formatted buffer */ + int lv_buf_len; /* size of formatted buffer */ +}; + /* * Structure used to pass callback function and the function's argument * to the log manager. @@ -126,6 +134,14 @@ typedef struct xfs_log_callback { struct xfs_mount; struct xlog_in_core; struct xlog_ticket; +struct xfs_log_item; +struct xfs_item_ops; +struct xfs_trans; + +void xfs_log_item_init(struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + struct xfs_item_ops *ops); xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, @@ -174,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp); void xlog_iodone(struct xfs_buf *); -struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); +struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); +xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); + +int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, int flags); +bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); + #endif diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c new file mode 100644 index 00000000000..bb17cc044bf --- /dev/null +++ b/fs/xfs/xfs_log_cil.c @@ -0,0 +1,725 @@ +/* + * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_bit.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log_priv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir2.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_alloc.h" + +/* + * Perform initial CIL structure initialisation. If the CIL is not + * enabled in this filesystem, ensure the log->l_cilp is null so + * we can check this conditional to determine if we are doing delayed + * logging or not. + */ +int +xlog_cil_init( + struct log *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + log->l_cilp = NULL; + if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) + return 0; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + init_rwsem(&cil->xc_ctx_lock); + sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct log *log) +{ + if (!log->l_cilp) + return; + + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + +/* + * Allocate a new ticket. Failing to get a new ticket makes it really hard to + * recover, so we don't allow failure here. Also, we allocate in a context that + * we don't want to be issuing transactions from, so we need to tell the + * allocation code this as well. + * + * We don't reserve any space for the ticket - we are going to steal whatever + * space we require from transactions as they commit. To ensure we reserve all + * the space required, we need to set the current reservation of the ticket to + * zero so that we know to steal the initial transaction overhead from the + * first transaction commit. + */ +static struct xlog_ticket * +xlog_cil_ticket_alloc( + struct log *log) +{ + struct xlog_ticket *tic; + + tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, + KM_SLEEP|KM_NOFS); + tic->t_trans_type = XFS_TRANS_CHECKPOINT; + + /* + * set the current reservation to zero so we know to steal the basic + * transaction overhead reservation from the first transaction commit. + */ + tic->t_curr_res = 0; + return tic; +} + +/* + * After the first stage of log recovery is done, we know where the head and + * tail of the log are. We need this log initialisation done before we can + * initialise the first CIL checkpoint context. + * + * Here we allocate a log ticket to track space usage during a CIL push. This + * ticket is passed to xlog_write() directly so that we don't slowly leak log + * space by failing to account for space used by log headers and additional + * region headers for split regions. + */ +void +xlog_cil_init_post_recovery( + struct log *log) +{ + if (!log->l_cilp) + return; + + log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); + log->l_cilp->xc_ctx->sequence = 1; + log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, + log->l_curr_block); +} + +/* + * Insert the log item into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we addded to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + * + * If this is the first time the item is being placed into the CIL in this + * context, pin it so it can't be written to disk until the CIL is flushed to + * the iclog and the iclog written to disk. + */ +static void +xlog_cil_insert( + struct log *log, + struct xlog_ticket *ticket, + struct xfs_log_item *item, + struct xfs_log_vec *lv) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *old = lv->lv_item->li_lv; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + int len; + int diff_iovecs; + int iclog_space; + + if (old) { + /* existing lv on log item, space used is a delta */ + ASSERT(!list_empty(&item->li_cil)); + ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); + + len = lv->lv_buf_len - old->lv_buf_len; + diff_iovecs = lv->lv_niovecs - old->lv_niovecs; + kmem_free(old->lv_buf); + kmem_free(old); + } else { + /* new lv, must pin the log item */ + ASSERT(!lv->lv_item->li_lv); + ASSERT(list_empty(&item->li_cil)); + + len = lv->lv_buf_len; + diff_iovecs = lv->lv_niovecs; + IOP_PIN(lv->lv_item); + + } + len += diff_iovecs * sizeof(xlog_op_header_t); + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + spin_lock(&cil->xc_cil_lock); + list_move_tail(&item->li_cil, &cil->xc_cil); + ctx->nvecs += diff_iovecs; + + /* + * If this is the first time the item is being committed to the CIL, + * store the sequence number on the log item so we can tell + * in future commits whether this is the first checkpoint the item is + * being committed into. + */ + if (!item->li_seq) + item->li_seq = ctx->sequence; + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + /* first commit in checkpoint, steal the header reservation */ + ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + if (len > 0 && (ctx->space_used / iclog_space != + (ctx->space_used + len) / iclog_space)) { + int hdrs; + + hdrs = (len + iclog_space - 1) / iclog_space; + /* need to take into account split region headers, too */ + hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += hdrs; + ctx->ticket->t_curr_res += hdrs; + ticket->t_curr_res -= hdrs; + ASSERT(ticket->t_curr_res >= len); + } + ticket->t_curr_res -= len; + ctx->space_used += len; + + spin_unlock(&cil->xc_cil_lock); +} + +/* + * Format log item into a flat buffers + * + * For delayed logging, we need to hold a formatted buffer containing all the + * changes on the log item. This enables us to relog the item in memory and + * write it out asynchronously without needing to relock the object that was + * modified at the time it gets written into the iclog. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and formats the vector for the item into the buffer. + * The buffer is then attached to the log item are then inserted into the + * Committed Item List for tracking until the next checkpoint is written out. + * + * We don't set up region headers during this process; we simply copy the + * regions into the flat buffer. We can do this because we still have to do a + * formatting step to write the regions into the iclog buffer. Writing the + * ophdrs during the iclog write means that we can support splitting large + * regions across iclog boundares without needing a change in the format of the + * item/region encapsulation. + * + * Hence what we need to do now is change the rewrite the vector array to point + * to the copied region inside the buffer we just allocated. This allows us to + * format the regions into the iclog as though they are being formatted + * directly out of the objects themselves. + */ +static void +xlog_cil_format_items( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn) +{ + struct xfs_log_vec *lv; + + if (start_lsn) + *start_lsn = log->l_cilp->xc_ctx->sequence; + + ASSERT(log_vector); + for (lv = log_vector; lv; lv = lv->lv_next) { + void *ptr; + int index; + int len = 0; + + /* build the vector array and calculate it's length */ + IOP_FORMAT(lv->lv_item, lv->lv_iovecp); + for (index = 0; index < lv->lv_niovecs; index++) + len += lv->lv_iovecp[index].i_len; + + lv->lv_buf_len = len; + lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); + ptr = lv->lv_buf; + + for (index = 0; index < lv->lv_niovecs; index++) { + struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; + + memcpy(ptr, vec->i_addr, vec->i_len); + vec->i_addr = ptr; + ptr += vec->i_len; + } + ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + + xlog_cil_insert(log, ticket, lv->lv_item, lv); + } +} + +static void +xlog_cil_free_logvec( + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; + kmem_free(lv->lv_buf); + kmem_free(lv); + lv = next; + } +} + +/* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +int +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct log *log = mp->m_log; + int log_flags = 0; + int push = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + if (XLOG_FORCED_SHUTDOWN(log)) { + xlog_cil_free_logvec(log_vector); + return XFS_ERROR(EIO); + } + + /* lock out background commit */ + down_read(&log->l_cilp->xc_ctx_lock); + xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, tp->t_ticket); + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) { + spin_lock(&log->l_cilp->xc_cil_lock); + list_splice_init(&tp->t_busy, + &log->l_cilp->xc_ctx->busy_extents); + spin_unlock(&log->l_cilp->xc_cil_lock); + } + + tp->t_commit_lsn = *commit_lsn; + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* check for background commit before unlock */ + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) + push = 1; + up_read(&log->l_cilp->xc_ctx_lock); + + /* + * We need to push CIL every so often so we don't cache more than we + * can fit in the log. The limit really is that a checkpoint can't be + * more than half the log (the current checkpoint is not allowed to + * overwrite the previous checkpoint), but commit latency and memory + * usage limit this to a smaller size in most cases. + */ + if (push) + xlog_cil_push(log, 0); + return 0; +} + +/* + * Mark all items committed and clear busy extents. We free the log vector + * chains in a separate pass so that we unpin the log items as quickly as + * possible. + */ +static void +xlog_cil_committed( + void *args, + int abort) +{ + struct xfs_cil_ctx *ctx = args; + struct xfs_log_vec *lv; + int abortflag = abort ? XFS_LI_ABORTED : 0; + struct xfs_busy_extent *busyp, *n; + + /* unpin all the log items */ + for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { + xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, + abortflag); + } + + list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) + xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); + + spin_lock(&ctx->cil->xc_cil_lock); + list_del(&ctx->committing); + spin_unlock(&ctx->cil->xc_cil_lock); + + xlog_cil_free_logvec(ctx->lv_chain); + kmem_free(ctx); +} + +/* + * Push the Committed Item List to the log. If the push_now flag is not set, + * then it is a background flush and so we can chose to ignore it. + */ +int +xlog_cil_push( + struct log *log, + int push_now) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *lv; + struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *new_ctx; + struct xlog_in_core *commit_iclog; + struct xlog_ticket *tic; + int num_lv; + int num_iovecs; + int len; + int error = 0; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t commit_lsn; + + if (!cil) + return 0; + + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx->ticket = xlog_cil_ticket_alloc(log); + + /* lock out transaction commit, but don't block on background push */ + if (!down_write_trylock(&cil->xc_ctx_lock)) { + if (!push_now) + goto out_free_ticket; + down_write(&cil->xc_ctx_lock); + } + ctx = cil->xc_ctx; + + /* check if we've anything to push */ + if (list_empty(&cil->xc_cil)) + goto out_skip; + + /* check for spurious background flush */ + if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + goto out_skip; + + /* + * pull all the log vectors off the items in the CIL, and + * remove the items from the CIL. We don't need the CIL lock + * here because it's only needed on the transaction commit + * side which is currently locked out by the flush lock. + */ + lv = NULL; + num_lv = 0; + num_iovecs = 0; + len = 0; + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + int i; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + + num_lv++; + num_iovecs += lv->lv_niovecs; + for (i = 0; i < lv->lv_niovecs; i++) + len += lv->lv_iovecp[i].i_len; + } + + /* + * initialise the new context and attach it to the CIL. Then attach + * the current context to the CIL committing lsit so it can be found + * during log forces to extract the commit lsn of the sequence that + * needs to be forced. + */ + INIT_LIST_HEAD(&new_ctx->committing); + INIT_LIST_HEAD(&new_ctx->busy_extents); + new_ctx->sequence = ctx->sequence + 1; + new_ctx->cil = cil; + cil->xc_ctx = new_ctx; + + /* + * The switch is now done, so we can drop the context lock and move out + * of a shared context. We can't just go straight to the commit record, + * though - we need to synchronise with previous and future commits so + * that the commit records are correctly ordered in the log to ensure + * that we process items during log IO completion in the correct order. + * + * For example, if we get an EFI in one checkpoint and the EFD in the + * next (e.g. due to log forces), we do not want the checkpoint with + * the EFD to be committed before the checkpoint with the EFI. Hence + * we must strictly order the commit records of the checkpoints so + * that: a) the checkpoint callbacks are attached to the iclogs in the + * correct order; and b) the checkpoints are replayed in correct order + * in log recovery. + * + * Hence we need to add this context to the committing context list so + * that higher sequences will wait for us to write out a commit record + * before they do. + */ + spin_lock(&cil->xc_cil_lock); + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + + /* + * Build a checkpoint transaction header and write it to the log to + * begin the transaction. We need to account for the space used by the + * transaction header here as it is not accounted for in xlog_write(). + * + * The LSN we need to pass to the log items on transaction commit is + * the LSN reported by the first log vector write. If we use the commit + * record lsn then we can move the tail beyond the grant write head. + */ + tic = ctx->ticket; + thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + thdr.th_type = XFS_TRANS_CHECKPOINT; + thdr.th_tid = tic->t_tid; + thdr.th_num_items = num_iovecs; + lhdr.i_addr = (xfs_caddr_t)&thdr; + lhdr.i_len = sizeof(xfs_trans_header_t); + lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); + + lvhdr.lv_niovecs = 1; + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + if (error) + goto out_abort; + + /* + * now that we've written the checkpoint into the log, strictly + * order the commit records so replay will get them in the right order. + */ +restart: + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(new_ctx, &cil->xc_committing, committing) { + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for own own sequence, either. + */ + if (new_ctx->sequence >= ctx->sequence) + continue; + if (!new_ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + goto restart; + } + } + spin_unlock(&cil->xc_cil_lock); + + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + if (error || commit_lsn == -1) + goto out_abort; + + /* attach all the transactions w/ busy extents to iclog */ + ctx->log_cb.cb_func = xlog_cil_committed; + ctx->log_cb.cb_arg = ctx; + error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + if (error) + goto out_abort; + + /* + * now the checkpoint commit is complete and we've attached the + * callbacks to the iclog we can assign the commit LSN to the context + * and wake up anyone who is waiting for the commit to complete. + */ + spin_lock(&cil->xc_cil_lock); + ctx->commit_lsn = commit_lsn; + sv_broadcast(&cil->xc_commit_wait); + spin_unlock(&cil->xc_cil_lock); + + /* release the hounds! */ + return xfs_log_release_iclog(log->l_mp, commit_iclog); + +out_skip: + up_write(&cil->xc_ctx_lock); +out_free_ticket: + xfs_log_ticket_put(new_ctx->ticket); + kmem_free(new_ctx); + return 0; + +out_abort: + xlog_cil_committed(ctx, XFS_LI_ABORTED); + return XFS_ERROR(EIO); +} + +/* + * Conditionally push the CIL based on the sequence passed in. + * + * We only need to push if we haven't already pushed the sequence + * number given. Hence the only time we will trigger a push here is + * if the push sequence is the same as the current context. + * + * We return the current commit lsn to allow the callers to determine if a + * iclog flush is necessary following this call. + * + * XXX: Initially, just push the CIL unconditionally and return whatever + * commit lsn is there. It'll be empty, so this is broken for now. + */ +xfs_lsn_t +xlog_cil_push_lsn( + struct log *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; + xfs_lsn_t commit_lsn = NULLCOMMITLSN; + +restart: + down_write(&cil->xc_ctx_lock); + ASSERT(push_seq <= cil->xc_ctx->sequence); + + /* check to see if we need to force out the current context */ + if (push_seq == cil->xc_ctx->sequence) { + up_write(&cil->xc_ctx_lock); + xlog_cil_push(log, 1); + goto restart; + } + + /* + * See if we can find a previous sequence still committing. + * We can drop the flush lock as soon as we have the cil lock + * because we are now only comparing contexts protected by + * the cil lock. + * + * We need to wait for all previous sequence commits to complete + * before allowing the force of push_seq to go ahead. Hence block + * on commits for those as well. + */ + spin_lock(&cil->xc_cil_lock); + up_write(&cil->xc_ctx_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + if (ctx->sequence > push_seq) + continue; + if (!ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); + goto restart; + } + if (ctx->sequence != push_seq) + continue; + /* found it! */ + commit_lsn = ctx->commit_lsn; + } + spin_unlock(&cil->xc_cil_lock); + return commit_lsn; +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + struct xfs_cil_ctx *ctx; + + if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) + return false; + if (list_empty(&lip->li_cil)) + return false; + + ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + return false; + return true; +} diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index fd02a18facd..8c072618965 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being shutdown */ -typedef __uint32_t xlog_tid_t; - #ifdef __KERNEL__ /* @@ -379,6 +377,99 @@ typedef struct xlog_in_core { } xlog_in_core_t; /* + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + xfs_log_callback_t log_cb; /* completion callback hook. */ + struct list_head committing; /* ctx committing list */ +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct log *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + struct xfs_cil_ctx *xc_ctx; + struct rw_semaphore xc_ctx_lock; + struct list_head xc_committing; + sv_t xc_commit_wait; +}; + +/* + * The amount of log space we should the CIL to aggregate is difficult to size. + * Whatever we chose we have to make we can get a reservation for the log space + * effectively, that it is large enough to capture sufficient relogging to + * reduce log buffer IO significantly, but it is not too large for the log or + * induces too much latency when writing out through the iclogs. We track both + * space consumed and the number of vectors in the checkpoint context, so we + * need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can basically make up arbitrary limits for the + * checkpoint size so long as they don't violate any other size rules. Hence + * the initial maximum size for the checkpoint transaction will be set to a + * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit + * right now based on the latency of writing out a large amount of data through + * the circular iclog buffers. + */ + +#define XLOG_CIL_SPACE_LIMIT(log) \ + (min((log->l_logsize >> 2), (8 * 1024 * 1024))) + +/* * The reservation head lsn is not made up of a cycle number and block number. * Instead, it uses a cycle number and byte number. Logs don't expect to * overflow 31 bits worth of byte offset, so using a byte number will mean @@ -388,6 +479,7 @@ typedef struct log { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ @@ -396,9 +488,7 @@ typedef struct log { struct xfs_buf_cancel **l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ - uint l_sectbb_log; /* log2 of sector size in BBs */ - uint l_sectbb_mask; /* sector size (in BBs) - * alignment mask */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ int l_iclog_size; /* size of log in bytes */ int l_iclog_size_log; /* log power size of log */ int l_iclog_bufs; /* number of iclog buffers */ @@ -440,14 +530,40 @@ typedef struct log { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) - /* common routines */ extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); -extern kmem_zone_t *xfs_log_ticket_zone; +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, + int count, char client, uint xflags, + int alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +int xlog_write(struct log *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + xlog_in_core_t **commit_iclog, uint flags); + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct log *log); +void xlog_cil_init_post_recovery(struct log *log); +void xlog_cil_destroy(struct log *log); + +int xlog_cil_push(struct log *log, int push_now); +xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); /* * Unmount record type is used as a pseudo transaction type for the ticket. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 22e6efdc17e..14a69aec2c0 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -56,33 +56,61 @@ STATIC void xlog_recover_check_summary(xlog_t *); #define xlog_recover_check_summary(log) #endif - /* * Sector aligned buffer routines for buffer create/read/write/access */ -#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ - ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ - ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) -#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) +/* + * Verify the given count of basic blocks is valid number of blocks + * to specify for an operation involving the given XFS log buffer. + * Returns nonzero if the count is valid, 0 otherwise. + */ +static inline int +xlog_buf_bbcount_valid( + xlog_t *log, + int bbcount) +{ + return bbcount > 0 && bbcount <= log->l_logBBsize; +} + +/* + * Allocate a buffer to hold log data. The buffer needs to be able + * to map to a range of nbblks basic blocks at any valid (basic + * block) offset within the log. + */ STATIC xfs_buf_t * xlog_get_bp( xlog_t *log, int nbblks) { - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_get_bp(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return NULL; } - if (log->l_sectbb_log) { - if (nbblks > 1) - nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + /* + * We do log I/O in units of log sectors (a power-of-2 + * multiple of the basic block size), so we round up the + * requested size to acommodate the basic blocks required + * for complete log sectors. + * + * In addition, the buffer may be used for a non-sector- + * aligned block offset, in which case an I/O of the + * requested size could extend beyond the end of the + * buffer. If the requested size is only 1 basic block it + * will never straddle a sector boundary, so this won't be + * an issue. Nor will this be a problem if the log I/O is + * done in basic blocks (sector size 1). But otherwise we + * extend the buffer by one extra log sector to ensure + * there's space to accomodate this possiblility. + */ + if (nbblks > 1 && log->l_sectBBsize > 1) + nbblks += log->l_sectBBsize; + nbblks = round_up(nbblks, log->l_sectBBsize); + return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); } @@ -93,6 +121,10 @@ xlog_put_bp( xfs_buf_free(bp); } +/* + * Return the address of the start of the given block number's data + * in a log buffer. The buffer covers a log sector-aligned region. + */ STATIC xfs_caddr_t xlog_align( xlog_t *log, @@ -100,14 +132,14 @@ xlog_align( int nbblks, xfs_buf_t *bp) { + xfs_daddr_t offset; xfs_caddr_t ptr; - if (!log->l_sectbb_log) - return XFS_BUF_PTR(bp); + offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1); + ptr = XFS_BUF_PTR(bp) + BBTOB(offset); + + ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp)); - ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); - ASSERT(XFS_BUF_SIZE(bp) >= - BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); return ptr; } @@ -124,21 +156,18 @@ xlog_bread_noalign( { int error; - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_bread(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; } - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); - ASSERT(bp); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); @@ -186,17 +215,15 @@ xlog_bwrite( { int error; - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_bwrite(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; } - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); @@ -327,39 +354,38 @@ xlog_find_cycle_start( { xfs_caddr_t offset; xfs_daddr_t mid_blk; + xfs_daddr_t end_blk; uint mid_cycle; int error; - mid_blk = BLK_AVG(first_blk, *last_blk); - while (mid_blk != first_blk && mid_blk != *last_blk) { + end_blk = *last_blk; + mid_blk = BLK_AVG(first_blk, end_blk); + while (mid_blk != first_blk && mid_blk != end_blk) { error = xlog_bread(log, mid_blk, 1, bp, &offset); if (error) return error; mid_cycle = xlog_get_cycle(offset); - if (mid_cycle == cycle) { - *last_blk = mid_blk; - /* last_half_cycle == mid_cycle */ - } else { - first_blk = mid_blk; - /* first_half_cycle == mid_cycle */ - } - mid_blk = BLK_AVG(first_blk, *last_blk); + if (mid_cycle == cycle) + end_blk = mid_blk; /* last_half_cycle == mid_cycle */ + else + first_blk = mid_blk; /* first_half_cycle == mid_cycle */ + mid_blk = BLK_AVG(first_blk, end_blk); } - ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || - (mid_blk == *last_blk && mid_blk-1 == first_blk)); + ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || + (mid_blk == end_blk && mid_blk-1 == first_blk)); + + *last_blk = end_blk; return 0; } /* - * Check that the range of blocks does not contain the cycle number - * given. The scan needs to occur from front to back and the ptr into the - * region must be updated since a later routine will need to perform another - * test. If the region is completely good, we end up returning the same - * last block number. - * - * Set blkno to -1 if we encounter no errors. This is an invalid block number - * since we don't ever expect logs to get this large. + * Check that a range of blocks does not contain stop_on_cycle_no. + * Fill in *new_blk with the block offset where such a block is + * found, or with -1 (an invalid block number) if there is no such + * block in the range. The scan needs to occur from front to back + * and the pointer into the region must be updated since a later + * routine will need to perform another test. */ STATIC int xlog_find_verify_cycle( @@ -376,12 +402,16 @@ xlog_find_verify_cycle( xfs_caddr_t buf = NULL; int error = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks we'll be examining. If that fails, + * try a smaller size. We need to be able to read at least + * a log sector, or we're out of luck. + */ bufblks = 1 << ffs(nbblks); - while (!(bp = xlog_get_bp(log, bufblks))) { - /* can't get enough memory to do everything in one big buffer */ bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < log->l_sectBBsize) return ENOMEM; } @@ -629,7 +659,7 @@ xlog_find_head( * In this case we want to find the first block with cycle * number matching last_half_cycle. We expect the log to be * some variation on - * x + 1 ... | x ... + * x + 1 ... | x ... | x * The first block with cycle number x (last_half_cycle) will * be where the new head belongs. First we do a binary search * for the first occurrence of last_half_cycle. The binary @@ -639,11 +669,13 @@ xlog_find_head( * the log, then we look for occurrences of last_half_cycle - 1 * at the end of the log. The cases we're looking for look * like - * x + 1 ... | x | x + 1 | x ... - * ^ binary search stopped here + * v binary search stopped here + * x + 1 ... | x | x + 1 | x ... | x + * ^ but we want to locate this spot * or - * x + 1 ... | x ... | x - 1 | x * <---------> less than scan distance + * x + 1 ... | x ... | x - 1 | x + * ^ we want to locate this spot */ stop_on_cycle = last_half_cycle; if ((error = xlog_find_cycle_start(log, bp, first_blk, @@ -699,16 +731,16 @@ xlog_find_head( * certainly not the head of the log. By searching for * last_half_cycle-1 we accomplish that. */ - start_blk = log_bbnum - num_scan_bblks + head_blk; ASSERT(head_blk <= INT_MAX && - (xfs_daddr_t) num_scan_bblks - head_blk >= 0); + (xfs_daddr_t) num_scan_bblks >= head_blk); + start_blk = log_bbnum - (num_scan_bblks - head_blk); if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks - (int)head_blk, (stop_on_cycle - 1), &new_blk))) goto bp_err; if (new_blk != -1) { head_blk = new_blk; - goto bad_blk; + goto validate_head; } /* @@ -726,7 +758,7 @@ xlog_find_head( head_blk = new_blk; } - bad_blk: +validate_head: /* * Now we need to make sure head_blk is not pointing to a block in * the middle of a log record. @@ -748,7 +780,7 @@ xlog_find_head( if ((error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0)) == -1) { /* We hit the beginning of the log during our search */ - start_blk = log_bbnum - num_scan_bblks + head_blk; + start_blk = log_bbnum - (num_scan_bblks - head_blk); new_blk = log_bbnum; ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0); @@ -833,12 +865,12 @@ xlog_find_tail( if (*head_blk == 0) { /* special case */ error = xlog_bread(log, 0, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (xlog_get_cycle(offset) == 0) { *tail_blk = 0; /* leave all other log inited values alone */ - goto exit; + goto done; } } @@ -849,7 +881,7 @@ xlog_find_tail( for (i = (int)(*head_blk) - 1; i >= 0; i--) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { found = 1; @@ -866,7 +898,7 @@ xlog_find_tail( for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { @@ -941,7 +973,7 @@ xlog_find_tail( umount_data_blk = (i + hblks) % log->l_logBBsize; error = xlog_bread(log, umount_data_blk, 1, bp, &offset); if (error) - goto bread_err; + goto done; op_head = (xlog_op_header_t *)offset; if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { @@ -987,12 +1019,10 @@ xlog_find_tail( * But... if the -device- itself is readonly, just skip this. * We can't recover this device anyway, so it won't matter. */ - if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) error = xlog_clear_stale_blocks(log, tail_lsn); - } -bread_err: -exit: +done: xlog_put_bp(bp); if (error) @@ -1152,16 +1182,22 @@ xlog_write_log_records( xfs_caddr_t offset; xfs_buf_t *bp; int balign, ealign; - int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); + int sectbb = log->l_sectBBsize; int end_block = start_block + blocks; int bufblks; int error = 0; int i, j = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks to be written. If that fails, try + * a smaller size. We need to be able to write at least a + * log sector, or we're out of luck. + */ bufblks = 1 << ffs(blocks); while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < sectbb) return ENOMEM; } @@ -1169,7 +1205,7 @@ xlog_write_log_records( * the buffer in the starting sector not covered by the first * write below. */ - balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); + balign = round_down(start_block, sectbb); if (balign != start_block) { error = xlog_bread_noalign(log, start_block, 1, bp); if (error) @@ -1188,7 +1224,7 @@ xlog_write_log_records( * the buffer in the final sector not covered by the write. * If this is the same sector as the above read, skip it. */ - ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); + ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { offset = XFS_BUF_PTR(bp); balign = BBTOB(ealign - start_block); @@ -1408,6 +1444,7 @@ xlog_recover_add_item( STATIC int xlog_recover_add_to_cont_trans( + struct log *log, xlog_recover_t *trans, xfs_caddr_t dp, int len) @@ -1434,6 +1471,7 @@ xlog_recover_add_to_cont_trans( memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } @@ -1452,6 +1490,7 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( + struct log *log, xlog_recover_t *trans, xfs_caddr_t dp, int len) @@ -1510,6 +1549,7 @@ xlog_recover_add_to_trans( item->ri_buf[item->ri_cnt].i_addr = ptr; item->ri_buf[item->ri_cnt].i_len = len; item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; } @@ -1521,7 +1561,9 @@ xlog_recover_add_to_trans( */ STATIC int xlog_recover_reorder_trans( - xlog_recover_t *trans) + struct log *log, + xlog_recover_t *trans, + int pass) { xlog_recover_item_t *item, *n; LIST_HEAD(sort_list); @@ -1534,7 +1576,9 @@ xlog_recover_reorder_trans( switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); list_move(&item->ri_list, &trans->r_itemq); break; } @@ -1543,6 +1587,8 @@ xlog_recover_reorder_trans( case XFS_LI_QUOTAOFF: case XFS_LI_EFD: case XFS_LI_EFI: + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); list_move_tail(&item->ri_list, &trans->r_itemq); break; default: @@ -1592,8 +1638,10 @@ xlog_recover_do_buffer_pass1( /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLI_CANCEL)) + if (!(flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_buf_not_cancel(log, buf_f); return; + } /* * Insert an xfs_buf_cancel record into the hash table of @@ -1627,6 +1675,7 @@ xlog_recover_do_buffer_pass1( while (nextp != NULL) { if (nextp->bc_blkno == blkno && nextp->bc_len == len) { nextp->bc_refcount++; + trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); return; } prevp = nextp; @@ -1640,13 +1689,14 @@ xlog_recover_do_buffer_pass1( bcp->bc_refcount = 1; bcp->bc_next = NULL; prevp->bc_next = bcp; + trace_xfs_log_recover_buf_cancel_add(log, buf_f); } /* * Check to see whether the buffer being recovered has a corresponding * entry in the buffer cancel record table. If it does then return 1 * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement + * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement * the refcount on the entry in the table and remove it from the table * if this is the last reference. * @@ -1671,7 +1721,7 @@ xlog_check_buffer_cancelled( * There is nothing in the table built in pass one, * so this buffer must not be cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1683,7 +1733,7 @@ xlog_check_buffer_cancelled( * There is no corresponding entry in the table built * in pass one, so this buffer has not been cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1702,7 +1752,7 @@ xlog_check_buffer_cancelled( * one in the table and remove it if this is the * last reference. */ - if (flags & XFS_BLI_CANCEL) { + if (flags & XFS_BLF_CANCEL) { bcp->bc_refcount--; if (bcp->bc_refcount == 0) { if (prevp == NULL) { @@ -1722,7 +1772,7 @@ xlog_check_buffer_cancelled( * We didn't find a corresponding entry in the table, so * return 0 so that the buffer is NOT cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); + ASSERT(!(flags & XFS_BLF_CANCEL)); return 0; } @@ -1779,6 +1829,8 @@ xlog_recover_do_inode_buffer( unsigned int *data_map = NULL; unsigned int map_size = 0; + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + switch (buf_f->blf_type) { case XFS_LI_BUF: data_map = buf_f->blf_data_map; @@ -1822,8 +1874,8 @@ xlog_recover_do_inode_buffer( nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); - reg_buf_offset = bit << XFS_BLI_SHIFT; - reg_buf_bytes = nbits << XFS_BLI_SHIFT; + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; item_index++; } @@ -1837,7 +1889,7 @@ xlog_recover_do_inode_buffer( } ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); /* @@ -1874,6 +1926,7 @@ xlog_recover_do_inode_buffer( /*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( + struct xfs_mount *mp, xlog_recover_item_t *item, xfs_buf_t *bp, xfs_buf_log_format_t *buf_f) @@ -1885,6 +1938,8 @@ xlog_recover_do_reg_buffer( unsigned int map_size = 0; int error; + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + switch (buf_f->blf_type) { case XFS_LI_BUF: data_map = buf_f->blf_data_map; @@ -1900,9 +1955,9 @@ xlog_recover_do_reg_buffer( nbits = xfs_contig_bits(data_map, map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); + ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); /* * Do a sanity check if this is a dquot buffer. Just checking @@ -1911,7 +1966,7 @@ xlog_recover_do_reg_buffer( */ error = 0; if (buf_f->blf_flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { if (item->ri_buf[i].i_addr == NULL) { cmn_err(CE_ALERT, "XFS: NULL dquot in %s.", __func__); @@ -1932,9 +1987,9 @@ xlog_recover_do_reg_buffer( } memcpy(xfs_buf_offset(bp, - (uint)bit << XFS_BLI_SHIFT), /* dest */ + (uint)bit << XFS_BLF_SHIFT), /* dest */ item->ri_buf[i].i_addr, /* source */ - nbits<<XFS_BLI_SHIFT); /* length */ + nbits<<XFS_BLF_SHIFT); /* length */ next: i++; bit += nbits; @@ -2083,6 +2138,8 @@ xlog_recover_do_dquot_buffer( { uint type; + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + /* * Filesystems are required to send in quota flags at mount time. */ @@ -2091,11 +2148,11 @@ xlog_recover_do_dquot_buffer( } type = 0; - if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) type |= XFS_DQ_USER; - if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) type |= XFS_DQ_PROJ; - if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) type |= XFS_DQ_GROUP; /* * This type of quotas was turned off, so ignore this buffer @@ -2103,7 +2160,7 @@ xlog_recover_do_dquot_buffer( if (log->l_quotaoffs_flag & type) return; - xlog_recover_do_reg_buffer(item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } /* @@ -2116,7 +2173,7 @@ xlog_recover_do_dquot_buffer( * here which overlaps that may be stale. * * When meta-data buffers are freed at run time we log a buffer item - * with the XFS_BLI_CANCEL bit set to indicate that previous copies + * with the XFS_BLF_CANCEL bit set to indicate that previous copies * of the buffer in the log should not be replayed at recovery time. * This is so that if the blocks covered by the buffer are reused for * file data before we crash we don't end up replaying old, freed @@ -2150,7 +2207,7 @@ xlog_recover_do_buffer_trans( if (pass == XLOG_RECOVER_PASS1) { /* * In this pass we're only looking for buf items - * with the XFS_BLI_CANCEL bit set. + * with the XFS_BLF_CANCEL bit set. */ xlog_recover_do_buffer_pass1(log, buf_f); return 0; @@ -2164,9 +2221,11 @@ xlog_recover_do_buffer_trans( */ cancel = xlog_recover_do_buffer_pass2(log, buf_f); if (cancel) { + trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; } } + trace_xfs_log_recover_buf_recover(log, buf_f); switch (buf_f->blf_type) { case XFS_LI_BUF: blkno = buf_f->blf_blkno; @@ -2185,7 +2244,7 @@ xlog_recover_do_buffer_trans( mp = log->l_mp; buf_flags = XBF_LOCK; - if (!(flags & XFS_BLI_INODE_BUF)) + if (!(flags & XFS_BLF_INODE_BUF)) buf_flags |= XBF_MAPPED; bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); @@ -2198,13 +2257,13 @@ xlog_recover_do_buffer_trans( } error = 0; - if (flags & XFS_BLI_INODE_BUF) { + if (flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { - xlog_recover_do_reg_buffer(item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) return XFS_ERROR(error); @@ -2284,8 +2343,10 @@ xlog_recover_do_inode_trans( if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len, 0)) { error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); goto error; } + trace_xfs_log_recover_inode_recover(log, in_f); bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, XBF_LOCK); @@ -2337,6 +2398,7 @@ xlog_recover_do_inode_trans( /* do nothing */ } else { xfs_buf_relse(bp); + trace_xfs_log_recover_inode_skip(log, in_f); error = 0; goto error; } @@ -2758,11 +2820,12 @@ xlog_recover_do_trans( int error = 0; xlog_recover_item_t *item; - error = xlog_recover_reorder_trans(trans); + error = xlog_recover_reorder_trans(log, trans, pass); if (error) return error; list_for_each_entry(item, &trans->r_itemq, ri_list) { + trace_xfs_log_recover_item_recover(log, trans, item, pass); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: error = xlog_recover_do_buffer_trans(log, item, pass); @@ -2919,8 +2982,9 @@ xlog_recover_process_data( error = xlog_recover_unmount_trans(trans); break; case XLOG_WAS_CONT_TRANS: - error = xlog_recover_add_to_cont_trans(trans, - dp, be32_to_cpu(ohead->oh_len)); + error = xlog_recover_add_to_cont_trans(log, + trans, dp, + be32_to_cpu(ohead->oh_len)); break; case XLOG_START_TRANS: xlog_warn( @@ -2930,7 +2994,7 @@ xlog_recover_process_data( break; case 0: case XLOG_CONTINUE_TRANS: - error = xlog_recover_add_to_trans(trans, + error = xlog_recover_add_to_trans(log, trans, dp, be32_to_cpu(ohead->oh_len)); break; default: @@ -3331,42 +3395,6 @@ xlog_pack_data( } } -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) -STATIC void -xlog_unpack_data_checksum( - xlog_rec_header_t *rhead, - xfs_caddr_t dp, - xlog_t *log) -{ - __be32 *up = (__be32 *)dp; - uint chksum = 0; - int i; - - /* divide length by 4 to get # words */ - for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - if (chksum != be32_to_cpu(rhead->h_chksum)) { - if (rhead->h_chksum || - ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) { - cmn_err(CE_DEBUG, - "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n", - be32_to_cpu(rhead->h_chksum), chksum); - cmn_err(CE_DEBUG, -"XFS: Disregard message if filesystem was created with non-DEBUG kernel"); - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - cmn_err(CE_DEBUG, - "XFS: LogR this is a LogV2 filesystem\n"); - } - log->l_flags |= XLOG_CHKSUM_MISMATCH; - } - } -} -#else -#define xlog_unpack_data_checksum(rhead, dp, log) -#endif - STATIC void xlog_unpack_data( xlog_rec_header_t *rhead, @@ -3390,8 +3418,6 @@ xlog_unpack_data( dp += BBSIZE; } } - - xlog_unpack_data_checksum(rhead, dp, log); } STATIC int @@ -3490,7 +3516,7 @@ xlog_do_recovery_pass( hblks = 1; } } else { - ASSERT(log->l_sectbb_log == 0); + ASSERT(log->l_sectBBsize == 1); hblks = 1; hbp = xlog_get_bp(log, 1); h_size = XLOG_BIG_RECORD_BSIZE; @@ -3946,10 +3972,6 @@ xlog_recover_check_summary( xfs_agf_t *agfp; xfs_buf_t *agfbp; xfs_buf_t *agibp; - xfs_buf_t *sbbp; -#ifdef XFS_LOUD_RECOVERY - xfs_sb_t *sbp; -#endif xfs_agnumber_t agno; __uint64_t freeblks; __uint64_t itotal; @@ -3984,30 +4006,5 @@ xlog_recover_check_summary( xfs_buf_relse(agibp); } } - - sbbp = xfs_getsb(mp, 0); -#ifdef XFS_LOUD_RECOVERY - sbp = &mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp)); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", - sbp->sb_icount, itotal); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu", - sbp->sb_ifree, ifree); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu", - sbp->sb_fdblocks, freeblks); -#if 0 - /* - * This is turned off until I account for the allocation - * btree blocks which live in free space. - */ - ASSERT(sbp->sb_icount == itotal); - ASSERT(sbp->sb_ifree == ifree); - ASSERT(sbp->sb_fdblocks == freeblks); -#endif -#endif - xfs_buf_relse(sbbp); } #endif /* DEBUG */ diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h index 75d74920725..1c55ccbb379 100644 --- a/fs/xfs/xfs_log_recover.h +++ b/fs/xfs/xfs_log_recover.h @@ -28,7 +28,7 @@ #define XLOG_RHASH(tid) \ ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) -#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index e79b56b4bca..d7bf38c8cd1 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1405,13 +1405,6 @@ xfs_mountfs( xfs_qm_mount_quotas(mp); } -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - if (XFS_IS_QUOTA_ON(mp)) - xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on"); - else - xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on"); -#endif - /* * Now we are mounted, reserve a small amount of unused space for * privileged transactions. This is needed so that transaction diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9ff48a16a7e..1d2c7eed4ed 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -268,6 +268,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ +#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ #define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index fdcab3f81dd..e0e64b113bd 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -201,9 +201,6 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */ -#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */ -#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */ #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index f73e358bae8..ce558efa2ea 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -44,24 +44,14 @@ #include "xfs_trans_priv.h" #include "xfs_trans_space.h" #include "xfs_inode_item.h" - - -STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); -STATIC uint xfs_trans_count_vecs(xfs_trans_t *); -STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *); -STATIC void xfs_trans_uncommit(xfs_trans_t *, uint); -STATIC void xfs_trans_committed(xfs_trans_t *, int); -STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int); -STATIC void xfs_trans_free(xfs_trans_t *); +#include "xfs_trace.h" kmem_zone_t *xfs_trans_zone; - /* * Reservation functions here avoid a huge stack in xfs_trans_init * due to register overflow from temporaries in the calculations. */ - STATIC uint xfs_calc_write_reservation(xfs_mount_t *mp) { @@ -254,13 +244,30 @@ _xfs_trans_alloc( tp->t_type = type; tp->t_mountp = mp; tp->t_items_free = XFS_LIC_NUM_SLOTS; - tp->t_busy_free = XFS_LBC_NUM_SLOTS; xfs_lic_init(&(tp->t_items)); - XFS_LBC_INIT(&(tp->t_busy)); + INIT_LIST_HEAD(&tp->t_busy); return tp; } /* + * Free the transaction structure. If there is more clean up + * to do when the structure is freed, add it here. + */ +STATIC void +xfs_trans_free( + struct xfs_trans *tp) +{ + struct xfs_busy_extent *busyp, *n; + + list_for_each_entry_safe(busyp, n, &tp->t_busy, list) + xfs_alloc_busy_clear(tp->t_mountp, busyp); + + atomic_dec(&tp->t_mountp->m_active_trans); + xfs_trans_free_dqinfo(tp); + kmem_zone_free(xfs_trans_zone, tp); +} + +/* * This is called to create a new transaction which will share the * permanent log reservation of the given transaction. The remaining * unused block and rt extent reservations are also inherited. This @@ -283,9 +290,8 @@ xfs_trans_dup( ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; ntp->t_items_free = XFS_LIC_NUM_SLOTS; - ntp->t_busy_free = XFS_LBC_NUM_SLOTS; xfs_lic_init(&(ntp->t_items)); - XFS_LBC_INIT(&(ntp->t_busy)); + INIT_LIST_HEAD(&ntp->t_busy); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); @@ -421,7 +427,6 @@ undo_blocks: return error; } - /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. @@ -650,7 +655,7 @@ xfs_trans_apply_sb_deltas( * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we * still need to update the incore superblock with the changes. */ -STATIC void +void xfs_trans_unreserve_and_mod_sb( xfs_trans_t *tp) { @@ -764,94 +769,256 @@ xfs_trans_unreserve_and_mod_sb( } } +/* + * Total up the number of log iovecs needed to commit this + * transaction. The transaction itself needs one for the + * transaction header. Ask each dirty item in turn how many + * it needs to get the total. + */ +static uint +xfs_trans_count_vecs( + struct xfs_trans *tp) +{ + int nvecs; + xfs_log_item_desc_t *lidp; + + nvecs = 1; + lidp = xfs_trans_first_item(tp); + ASSERT(lidp != NULL); + + /* In the non-debug case we need to start bailing out if we + * didn't find a log_item here, return zero and let trans_commit + * deal with it. + */ + if (lidp == NULL) + return 0; + + while (lidp != NULL) { + /* + * Skip items which aren't dirty in this transaction. + */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + lidp->lid_size = IOP_SIZE(lidp->lid_item); + nvecs += lidp->lid_size; + lidp = xfs_trans_next_item(tp, lidp); + } + + return nvecs; +} /* - * xfs_trans_commit + * Fill in the vector with pointers to data to be logged + * by this transaction. The transaction header takes + * the first vector, and then each dirty item takes the + * number of vectors it indicated it needed in xfs_trans_count_vecs(). * - * Commit the given transaction to the log a/synchronously. + * As each item fills in the entries it needs, also pin the item + * so that it cannot be flushed out until the log write completes. + */ +static void +xfs_trans_fill_vecs( + struct xfs_trans *tp, + struct xfs_log_iovec *log_vector) +{ + xfs_log_item_desc_t *lidp; + struct xfs_log_iovec *vecp; + uint nitems; + + /* + * Skip over the entry for the transaction header, we'll + * fill that in at the end. + */ + vecp = log_vector + 1; + + nitems = 0; + lidp = xfs_trans_first_item(tp); + ASSERT(lidp); + while (lidp) { + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + /* + * The item may be marked dirty but not log anything. This can + * be used to get called when a transaction is committed. + */ + if (lidp->lid_size) + nitems++; + IOP_FORMAT(lidp->lid_item, vecp); + vecp += lidp->lid_size; + IOP_PIN(lidp->lid_item); + lidp = xfs_trans_next_item(tp, lidp); + } + + /* + * Now that we've counted the number of items in this transaction, fill + * in the transaction header. Note that the transaction header does not + * have a log item. + */ + tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_header.th_type = tp->t_type; + tp->t_header.th_num_items = nitems; + log_vector->i_addr = (xfs_caddr_t)&tp->t_header; + log_vector->i_len = sizeof(xfs_trans_header_t); + log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; +} + +/* + * The committed item processing consists of calling the committed routine of + * each logged item, updating the item's position in the AIL if necessary, and + * unpinning each item. If the committed routine returns -1, then do nothing + * further with the item because it may have been freed. * - * XFS disk error handling mechanism is not based on a typical - * transaction abort mechanism. Logically after the filesystem - * gets marked 'SHUTDOWN', we can't let any new transactions - * be durable - ie. committed to disk - because some metadata might - * be inconsistent. In such cases, this returns an error, and the - * caller may assume that all locked objects joined to the transaction - * have already been unlocked as if the commit had succeeded. - * Do not reference the transaction structure after this call. + * Since items are unlocked when they are copied to the incore log, it is + * possible for two transactions to be completing and manipulating the same + * item simultaneously. The AIL lock will protect the lsn field of each item. + * The value of this field can never go backwards. + * + * We unpin the items after repositioning them in the AIL, because otherwise + * they could be immediately flushed and we'd have to race with the flusher + * trying to pull the item from the AIL as we add it. */ - /*ARGSUSED*/ -int -_xfs_trans_commit( - xfs_trans_t *tp, - uint flags, - int *log_flushed) +void +xfs_trans_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, + int aborted) { - xfs_log_iovec_t *log_vector; - int nvec; - xfs_mount_t *mp; - xfs_lsn_t commit_lsn; - /* REFERENCED */ - int error; - int log_flags; - int sync; -#define XFS_TRANS_LOGVEC_COUNT 16 - xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; - struct xlog_in_core *commit_iclog; - int shutdown; + xfs_lsn_t item_lsn; + struct xfs_ail *ailp; - commit_lsn = -1; + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = IOP_COMMITTED(lip, commit_lsn); + + /* If the committed routine returns -1, item has been freed. */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + return; /* - * Determine whether this commit is releasing a permanent - * log reservation or not. + * If the returned lsn is greater than what it contained before, update + * the location of the item in the AIL. If it is not, then do nothing. + * Items can never move backwards in the AIL. + * + * While the new lsn should usually be greater, it is possible that a + * later transaction completing simultaneously with an earlier one + * using the same item could complete first with a higher lsn. This + * would cause the earlier transaction to fail the test below. */ - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; + ailp = lip->li_ailp; + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { + /* + * This will set the item's lsn to item_lsn and update the + * position of the item in the AIL. + * + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_update(ailp, lip, item_lsn); } else { - log_flags = 0; + spin_unlock(&ailp->xa_lock); } - mp = tp->t_mountp; /* - * If there is nothing to be logged by the transaction, - * then unlock all of the items associated with the - * transaction and free the transaction structure. - * Also make sure to return any reserved blocks to - * the free pool. + * Now that we've repositioned the item in the AIL, unpin it so it can + * be flushed. Pass information about buffer stale state down from the + * log item flags, if anyone else stales the buffer we do not want to + * pay any attention to it. */ -shut_us_down: - shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; - if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { - xfs_trans_unreserve_and_mod_sb(tp); + IOP_UNPIN(lip); +} + +/* + * This is typically called by the LM when a transaction has been fully + * committed to disk. It needs to unpin the items which have + * been logged by the transaction and update their positions + * in the AIL if necessary. + * + * This also gets called when the transactions didn't get written out + * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. + */ +STATIC void +xfs_trans_committed( + struct xfs_trans *tp, + int abortflag) +{ + xfs_log_item_desc_t *lidp; + xfs_log_item_chunk_t *licp; + xfs_log_item_chunk_t *next_licp; + + /* Call the transaction's completion callback if there is one. */ + if (tp->t_callback != NULL) + tp->t_callback(tp, tp->t_callarg); + + for (lidp = xfs_trans_first_item(tp); + lidp != NULL; + lidp = xfs_trans_next_item(tp, lidp)) { + xfs_trans_item_committed(lidp->lid_item, tp->t_lsn, abortflag); + } + + /* free the item chunks, ignoring the embedded chunk */ + for (licp = tp->t_items.lic_next; licp != NULL; licp = next_licp) { + next_licp = licp->lic_next; + kmem_free(licp); + } + + xfs_trans_free(tp); +} + +/* + * Called from the trans_commit code when we notice that + * the filesystem is in the middle of a forced shutdown. + */ +STATIC void +xfs_trans_uncommit( + struct xfs_trans *tp, + uint flags) +{ + xfs_log_item_desc_t *lidp; + + for (lidp = xfs_trans_first_item(tp); + lidp != NULL; + lidp = xfs_trans_next_item(tp, lidp)) { /* - * It is indeed possible for the transaction to be - * not dirty but the dqinfo portion to be. All that - * means is that we have some (non-persistent) quota - * reservations that need to be unreserved. + * Unpin all but those that aren't dirty. */ - xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, - NULL, log_flags); - if (commit_lsn == -1 && !shutdown) - shutdown = XFS_ERROR(EIO); - } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0); - xfs_trans_free_busy(tp); - xfs_trans_free(tp); - XFS_STATS_INC(xs_trans_empty); - return (shutdown); + if (lidp->lid_flags & XFS_LID_DIRTY) + IOP_UNPIN_REMOVE(lidp->lid_item, tp); } - ASSERT(tp->t_ticket != NULL); - /* - * If we need to update the superblock, then do it now. - */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) - xfs_trans_apply_sb_deltas(tp); - xfs_trans_apply_dquot_deltas(tp); + xfs_trans_unreserve_and_mod_sb(tp); + xfs_trans_unreserve_and_mod_dquots(tp); + + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free(tp); +} + +/* + * Format the transaction direct to the iclog. This isolates the physical + * transaction commit operation from the logical operation and hence allows + * other methods to be introduced without affecting the existing commit path. + */ +static int +xfs_trans_commit_iclog( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + int shutdown; + int error; + int log_flags = 0; + struct xlog_in_core *commit_iclog; +#define XFS_TRANS_LOGVEC_COUNT 16 + struct xfs_log_iovec log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; + struct xfs_log_iovec *log_vector; + uint nvec; + /* * Ask each log item how many log_vector entries it will @@ -861,8 +1028,7 @@ shut_us_down: */ nvec = xfs_trans_count_vecs(tp); if (nvec == 0) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - goto shut_us_down; + return ENOMEM; /* triggers a shutdown! */ } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { log_vector = log_vector_fast; } else { @@ -877,6 +1043,9 @@ shut_us_down: */ xfs_trans_fill_vecs(tp, log_vector); + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); /* @@ -884,18 +1053,19 @@ shut_us_down: * at any time after this call. However, all the items associated * with the transaction are still locked and pinned in memory. */ - commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); + *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); - tp->t_commit_lsn = commit_lsn; - if (nvec > XFS_TRANS_LOGVEC_COUNT) { + tp->t_commit_lsn = *commit_lsn; + trace_xfs_trans_commit_lsn(tp); + + if (nvec > XFS_TRANS_LOGVEC_COUNT) kmem_free(log_vector); - } /* * If we got a log write error. Unpin the logitems that we * had pinned, clean up, free trans structure, and return error. */ - if (error || commit_lsn == -1) { + if (error || *commit_lsn == -1) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); return XFS_ERROR(EIO); @@ -909,8 +1079,6 @@ shut_us_down: */ xfs_trans_unreserve_and_mod_sb(tp); - sync = tp->t_flags & XFS_TRANS_SYNC; - /* * Tell the LM to call the transaction completion routine * when the log write with LSN commit_lsn completes (e.g. @@ -953,7 +1121,7 @@ shut_us_down: * the commit lsn of this transaction for dependency tracking * purposes. */ - xfs_trans_unlock_items(tp, commit_lsn); + xfs_trans_unlock_items(tp, *commit_lsn); /* * If we detected a log error earlier, finish committing @@ -973,156 +1141,204 @@ shut_us_down: * and the items are released we can finally allow the iclog to * go to disk. */ - error = xfs_log_release_iclog(mp, commit_iclog); - - /* - * If the transaction needs to be synchronous, then force the - * log out now and wait for it. - */ - if (sync) { - if (!error) { - error = _xfs_log_force_lsn(mp, commit_lsn, - XFS_LOG_SYNC, log_flushed); - } - XFS_STATS_INC(xs_trans_sync); - } else { - XFS_STATS_INC(xs_trans_async); - } - - return (error); + return xfs_log_release_iclog(mp, commit_iclog); } - /* - * Total up the number of log iovecs needed to commit this - * transaction. The transaction itself needs one for the - * transaction header. Ask each dirty item in turn how many - * it needs to get the total. + * Walk the log items and allocate log vector structures for + * each item large enough to fit all the vectors they require. + * Note that this format differs from the old log vector format in + * that there is no transaction header in these log vectors. */ -STATIC uint -xfs_trans_count_vecs( +STATIC struct xfs_log_vec * +xfs_trans_alloc_log_vecs( xfs_trans_t *tp) { - int nvecs; xfs_log_item_desc_t *lidp; + struct xfs_log_vec *lv = NULL; + struct xfs_log_vec *ret_lv = NULL; - nvecs = 1; lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); - /* In the non-debug case we need to start bailing out if we - * didn't find a log_item here, return zero and let trans_commit - * deal with it. - */ - if (lidp == NULL) - return 0; + /* Bail out if we didn't find a log item. */ + if (!lidp) { + ASSERT(0); + return NULL; + } while (lidp != NULL) { - /* - * Skip items which aren't dirty in this transaction. - */ + struct xfs_log_vec *new_lv; + + /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) { lidp = xfs_trans_next_item(tp, lidp); continue; } + + /* Skip items that do not have any vectors for writing */ lidp->lid_size = IOP_SIZE(lidp->lid_item); - nvecs += lidp->lid_size; + if (!lidp->lid_size) { + lidp = xfs_trans_next_item(tp, lidp); + continue; + } + + new_lv = kmem_zalloc(sizeof(*new_lv) + + lidp->lid_size * sizeof(struct xfs_log_iovec), + KM_SLEEP); + + /* The allocated iovec region lies beyond the log vector. */ + new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; + new_lv->lv_niovecs = lidp->lid_size; + new_lv->lv_item = lidp->lid_item; + if (!ret_lv) + ret_lv = new_lv; + else + lv->lv_next = new_lv; + lv = new_lv; lidp = xfs_trans_next_item(tp, lidp); } - return nvecs; + return ret_lv; } -/* - * Called from the trans_commit code when we notice that - * the filesystem is in the middle of a forced shutdown. - */ -STATIC void -xfs_trans_uncommit( - xfs_trans_t *tp, - uint flags) +static int +xfs_trans_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) { - xfs_log_item_desc_t *lidp; + struct xfs_log_vec *log_vector; + int error; - for (lidp = xfs_trans_first_item(tp); - lidp != NULL; - lidp = xfs_trans_next_item(tp, lidp)) { - /* - * Unpin all but those that aren't dirty. - */ - if (lidp->lid_flags & XFS_LID_DIRTY) - IOP_UNPIN_REMOVE(lidp->lid_item, tp); - } + /* + * Get each log item to allocate a vector structure for + * the log item to to pass to the log write code. The + * CIL commit code will format the vector and save it away. + */ + log_vector = xfs_trans_alloc_log_vecs(tp); + if (!log_vector) + return ENOMEM; - xfs_trans_unreserve_and_mod_sb(tp); - xfs_trans_unreserve_and_mod_dquots(tp); + error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); + if (error) + return error; - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + + /* xfs_trans_free_items() unlocks them first */ + xfs_trans_free_items(tp, *commit_lsn, 0); xfs_trans_free(tp); + return 0; } /* - * Fill in the vector with pointers to data to be logged - * by this transaction. The transaction header takes - * the first vector, and then each dirty item takes the - * number of vectors it indicated it needed in xfs_trans_count_vecs(). + * xfs_trans_commit * - * As each item fills in the entries it needs, also pin the item - * so that it cannot be flushed out until the log write completes. + * Commit the given transaction to the log a/synchronously. + * + * XFS disk error handling mechanism is not based on a typical + * transaction abort mechanism. Logically after the filesystem + * gets marked 'SHUTDOWN', we can't let any new transactions + * be durable - ie. committed to disk - because some metadata might + * be inconsistent. In such cases, this returns an error, and the + * caller may assume that all locked objects joined to the transaction + * have already been unlocked as if the commit had succeeded. + * Do not reference the transaction structure after this call. */ -STATIC void -xfs_trans_fill_vecs( - xfs_trans_t *tp, - xfs_log_iovec_t *log_vector) +int +_xfs_trans_commit( + struct xfs_trans *tp, + uint flags, + int *log_flushed) { - xfs_log_item_desc_t *lidp; - xfs_log_iovec_t *vecp; - uint nitems; + struct xfs_mount *mp = tp->t_mountp; + xfs_lsn_t commit_lsn = -1; + int error = 0; + int log_flags = 0; + int sync = tp->t_flags & XFS_TRANS_SYNC; /* - * Skip over the entry for the transaction header, we'll - * fill that in at the end. + * Determine whether this commit is releasing a permanent + * log reservation or not. */ - vecp = log_vector + 1; /* pointer arithmetic */ + if (flags & XFS_TRANS_RELEASE_LOG_RES) { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + log_flags = XFS_LOG_REL_PERM_RESERV; + } - nitems = 0; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); - while (lidp != NULL) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); - continue; - } - /* - * The item may be marked dirty but not log anything. - * This can be used to get called when a transaction - * is committed. - */ - if (lidp->lid_size) { - nitems++; + /* + * If there is nothing to be logged by the transaction, + * then unlock all of the items associated with the + * transaction and free the transaction structure. + * Also make sure to return any reserved blocks to + * the free pool. + */ + if (!(tp->t_flags & XFS_TRANS_DIRTY)) + goto out_unreserve; + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + goto out_unreserve; + } + + ASSERT(tp->t_ticket != NULL); + + /* + * If we need to update the superblock, then do it now. + */ + if (tp->t_flags & XFS_TRANS_SB_DIRTY) + xfs_trans_apply_sb_deltas(tp); + xfs_trans_apply_dquot_deltas(tp); + + if (mp->m_flags & XFS_MOUNT_DELAYLOG) + error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); + else + error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); + + if (error == ENOMEM) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + error = XFS_ERROR(EIO); + goto out_unreserve; + } + + /* + * If the transaction needs to be synchronous, then force the + * log out now and wait for it. + */ + if (sync) { + if (!error) { + error = _xfs_log_force_lsn(mp, commit_lsn, + XFS_LOG_SYNC, log_flushed); } - IOP_FORMAT(lidp->lid_item, vecp); - vecp += lidp->lid_size; /* pointer arithmetic */ - IOP_PIN(lidp->lid_item); - lidp = xfs_trans_next_item(tp, lidp); + XFS_STATS_INC(xs_trans_sync); + } else { + XFS_STATS_INC(xs_trans_async); } + return error; + +out_unreserve: + xfs_trans_unreserve_and_mod_sb(tp); + /* - * Now that we've counted the number of items in this - * transaction, fill in the transaction header. + * It is indeed possible for the transaction to be not dirty but + * the dqinfo portion to be. All that means is that we have some + * (non-persistent) quota reservations that need to be unreserved. */ - tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_header.th_type = tp->t_type; - tp->t_header.th_num_items = nitems; - log_vector->i_addr = (xfs_caddr_t)&tp->t_header; - log_vector->i_len = sizeof(xfs_trans_header_t); - log_vector->i_type = XLOG_REG_TYPE_TRANSHDR; -} + xfs_trans_unreserve_and_mod_dquots(tp); + if (tp->t_ticket) { + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + if (commit_lsn == -1 && !error) + error = XFS_ERROR(EIO); + } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free(tp); + XFS_STATS_INC(xs_trans_empty); + return error; +} /* * Unlock all of the transaction's items and free the transaction. @@ -1195,25 +1411,10 @@ xfs_trans_cancel( /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); xfs_trans_free(tp); } - -/* - * Free the transaction structure. If there is more clean up - * to do when the structure is freed, add it here. - */ -STATIC void -xfs_trans_free( - xfs_trans_t *tp) -{ - atomic_dec(&tp->t_mountp->m_active_trans); - xfs_trans_free_dqinfo(tp); - kmem_zone_free(xfs_trans_zone, tp); -} - /* * Roll from one trans in the sequence of PERMANENT transactions to * the next: permanent transactions are only flushed out when @@ -1283,174 +1484,3 @@ xfs_trans_roll( xfs_trans_ihold(trans, dp); return 0; } - -/* - * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). - * - * This is typically called by the LM when a transaction has been fully - * committed to disk. It needs to unpin the items which have - * been logged by the transaction and update their positions - * in the AIL if necessary. - * This also gets called when the transactions didn't get written out - * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. - * - * Call xfs_trans_chunk_committed() to process the items in - * each chunk. - */ -STATIC void -xfs_trans_committed( - xfs_trans_t *tp, - int abortflag) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i; - - /* - * Call the transaction's completion callback if there - * is one. - */ - if (tp->t_callback != NULL) { - tp->t_callback(tp, tp->t_callarg); - } - - /* - * Special case the chunk embedded in the transaction. - */ - licp = &(tp->t_items); - if (!(xfs_lic_are_all_free(licp))) { - xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); - } - - /* - * Process the items in each chunk in turn. - */ - licp = licp->lic_next; - while (licp != NULL) { - ASSERT(!xfs_lic_are_all_free(licp)); - xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); - next_licp = licp->lic_next; - kmem_free(licp); - licp = next_licp; - } - - /* - * Clear all the per-AG busy list items listed in this transaction - */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) { - if (!XFS_LBC_ISFREE(lbcp, i)) { - xfs_alloc_clear_busy(tp, lbsp->lbc_ag, - lbsp->lbc_idx); - } - } - lbcp = lbcp->lbc_next; - } - xfs_trans_free_busy(tp); - - /* - * That's it for the transaction structure. Free it. - */ - xfs_trans_free(tp); -} - -/* - * This is called to perform the commit processing for each - * item described by the given chunk. - * - * The commit processing consists of unlocking items which were - * held locked with the SYNC_UNLOCK attribute, calling the committed - * routine of each logged item, updating the item's position in the AIL - * if necessary, and unpinning each item. If the committed routine - * returns -1, then do nothing further with the item because it - * may have been freed. - * - * Since items are unlocked when they are copied to the incore - * log, it is possible for two transactions to be completing - * and manipulating the same item simultaneously. The AIL lock - * will protect the lsn field of each item. The value of this - * field can never go backwards. - * - * We unpin the items after repositioning them in the AIL, because - * otherwise they could be immediately flushed and we'd have to race - * with the flusher trying to pull the item from the AIL as we add it. - */ -STATIC void -xfs_trans_chunk_committed( - xfs_log_item_chunk_t *licp, - xfs_lsn_t lsn, - int aborted) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - xfs_lsn_t item_lsn; - int i; - - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - struct xfs_ail *ailp; - - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lip = lidp->lid_item; - if (aborted) - lip->li_flags |= XFS_LI_ABORTED; - - /* - * Send in the ABORTED flag to the COMMITTED routine - * so that it knows whether the transaction was aborted - * or not. - */ - item_lsn = IOP_COMMITTED(lip, lsn); - - /* - * If the committed routine returns -1, make - * no more references to the item. - */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) { - continue; - } - - /* - * If the returned lsn is greater than what it - * contained before, update the location of the - * item in the AIL. If it is not, then do nothing. - * Items can never move backwards in the AIL. - * - * While the new lsn should usually be greater, it - * is possible that a later transaction completing - * simultaneously with an earlier one using the - * same item could complete first with a higher lsn. - * This would cause the earlier transaction to fail - * the test below. - */ - ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { - /* - * This will set the item's lsn to item_lsn - * and update the position of the item in - * the AIL. - * - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_update(ailp, lip, item_lsn); - } else { - spin_unlock(&ailp->xa_lock); - } - - /* - * Now that we've repositioned the item in the AIL, - * unpin it so it can be flushed. Pass information - * about buffer stale state down from the log item - * flags, if anyone else stales the buffer we do not - * want to pay any attention to it. - */ - IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE); - } -} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 79c8bab9dff..8c69e7824f6 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -49,6 +49,15 @@ typedef struct xfs_trans_header { #define XFS_LI_DQUOT 0x123d #define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" } + /* * Transaction types. Used to distinguish types of buffers. */ @@ -97,7 +106,8 @@ typedef struct xfs_trans_header { #define XFS_TRANS_GROWFSRT_FREE 39 #define XFS_TRANS_SWAPEXT 40 #define XFS_TRANS_SB_COUNT 41 -#define XFS_TRANS_TYPE_MAX 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_TYPE_MAX 42 /* new transaction types need to be reflected in xfs_logprint(8) */ #define XFS_TRANS_TYPES \ @@ -139,6 +149,7 @@ typedef struct xfs_trans_header { { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ { XFS_TRANS_DUMMY1, "DUMMY1" }, \ { XFS_TRANS_DUMMY2, "DUMMY2" }, \ { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } @@ -159,7 +170,6 @@ typedef struct xfs_log_item_desc { #define XFS_LID_DIRTY 0x1 #define XFS_LID_PINNED 0x2 -#define XFS_LID_BUF_STALE 0x8 /* * This structure is used to maintain a chunk list of log_item_desc @@ -805,6 +815,7 @@ struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_dquot_acct; +struct xfs_busy_extent; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -820,6 +831,11 @@ typedef struct xfs_log_item { /* buffer item iodone */ /* callback func */ struct xfs_item_ops *li_ops; /* function list */ + + /* delayed logging */ + struct list_head li_cil; /* CIL pointers */ + struct xfs_log_vec *li_lv; /* active log vector */ + xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 @@ -833,7 +849,7 @@ typedef struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); - void (*iop_unpin)(xfs_log_item_t *, int); + void (*iop_unpin)(xfs_log_item_t *); void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); uint (*iop_trylock)(xfs_log_item_t *); void (*iop_unlock)(xfs_log_item_t *); @@ -846,7 +862,7 @@ typedef struct xfs_item_ops { #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) #define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) -#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags) +#define IOP_UNPIN(ip) (*(ip)->li_ops->iop_unpin)(ip) #define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) #define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) @@ -864,34 +880,6 @@ typedef struct xfs_item_ops { #define XFS_ITEM_PUSHBUF 3 /* - * This structure is used to maintain a list of block ranges that have been - * freed in the transaction. The ranges are listed in the perag[] busy list - * between when they're freed and the transaction is committed to disk. - */ - -typedef struct xfs_log_busy_slot { - xfs_agnumber_t lbc_ag; - ushort lbc_idx; /* index in perag.busy[] */ -} xfs_log_busy_slot_t; - -#define XFS_LBC_NUM_SLOTS 31 -typedef struct xfs_log_busy_chunk { - struct xfs_log_busy_chunk *lbc_next; - uint lbc_free; /* free slots bitmask */ - ushort lbc_unused; /* first unused */ - xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS]; -} xfs_log_busy_chunk_t; - -#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1) -#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1) - -#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK) -#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot))) -#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)])) -#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK) -#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot))) - -/* * This is the type of function which can be given to xfs_trans_callback() * to be called upon the transaction's commit to disk. */ @@ -942,8 +930,7 @@ typedef struct xfs_trans { unsigned int t_items_free; /* log item descs free */ xfs_log_item_chunk_t t_items; /* first log item desc chunk */ xfs_trans_header_t t_header; /* header for in-log trans */ - unsigned int t_busy_free; /* busy descs free */ - xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */ + struct list_head t_busy; /* list of busy extents */ unsigned long t_pflags; /* saved process flags state */ } xfs_trans_t; @@ -1017,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *, void xfs_trans_cancel(xfs_trans_t *, int); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); extern kmem_zone_t *xfs_trans_zone; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index fb586360d1c..63d81a22f4f 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -40,11 +40,51 @@ #include "xfs_rw.h" #include "xfs_trace.h" +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. + */ +STATIC struct xfs_buf * +xfs_trans_buf_item_match( + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int len) +{ + xfs_log_item_chunk_t *licp; + xfs_log_item_desc_t *lidp; + xfs_buf_log_item_t *blip; + int i; + + len = BBTOB(len); + for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { + if (xfs_lic_are_all_free(licp)) { + ASSERT(licp == &tp->t_items); + ASSERT(licp->lic_next == NULL); + return NULL; + } + + for (i = 0; i < licp->lic_unused; i++) { + /* + * Skip unoccupied slots. + */ + if (xfs_lic_isfree(licp, i)) + continue; + + lidp = xfs_lic_slot(licp, i); + blip = (xfs_buf_log_item_t *)lidp->lid_item; + if (blip->bli_item.li_type != XFS_LI_BUF) + continue; + + if (XFS_BUF_TARGET(blip->bli_buf) == target && + XFS_BUF_ADDR(blip->bli_buf) == blkno && + XFS_BUF_COUNT(blip->bli_buf) == len) + return blip->bli_buf; + } + } -STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, - xfs_daddr_t, int); -STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, - xfs_daddr_t, int); + return NULL; +} /* * Add the locked buffer to the transaction. @@ -74,7 +114,7 @@ _xfs_trans_bjoin( xfs_buf_item_init(bp, tp->t_mountp); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); if (reset_recur) bip->bli_recur = 0; @@ -112,14 +152,6 @@ xfs_trans_bjoin( * within the transaction, just increment its lock recursion count * and return a pointer to it. * - * Use the fast path function xfs_trans_buf_item_match() or the buffer - * cache routine incore_match() to find the buffer - * if it is already owned by this transaction. - * - * If we don't already own the buffer, use get_buf() to get it. - * If it doesn't yet have an associated xfs_buf_log_item structure, - * then allocate one and add the item to this transaction. - * * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ @@ -149,11 +181,7 @@ xfs_trans_get_buf(xfs_trans_t *tp, * have it locked. In this case we just increment the lock * recursion count and return the buffer to the caller. */ - if (tp->t_items.lic_next == NULL) { - bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); - } else { - bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len); - } + bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); if (bp != NULL) { ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) @@ -259,14 +287,6 @@ int xfs_error_mod = 33; * within the transaction and already read in, just increment its * lock recursion count and return a pointer to it. * - * Use the fast path function xfs_trans_buf_item_match() or the buffer - * cache routine incore_match() to find the buffer - * if it is already owned by this transaction. - * - * If we don't already own the buffer, use read_buf() to get it. - * If it doesn't yet have an associated xfs_buf_log_item structure, - * then allocate one and add the item to this transaction. - * * If the transaction pointer is NULL, make this just a normal * read_buf() call. */ @@ -328,11 +348,7 @@ xfs_trans_read_buf( * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ - if (tp->t_items.lic_next == NULL) { - bp = xfs_trans_buf_item_match(tp, target, blkno, len); - } else { - bp = xfs_trans_buf_item_match_all(tp, target, blkno, len); - } + bp = xfs_trans_buf_item_match(tp, target, blkno, len); if (bp != NULL) { ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); @@ -495,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); /* @@ -603,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; trace_xfs_trans_bhold(bip); @@ -625,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp, bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); bip->bli_flags &= ~XFS_BLI_HOLD; @@ -688,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(XFS_BUF_ISSTALE(bp)); XFS_BUF_UNSTALE(bp); - bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; + bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; } lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); @@ -696,7 +712,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, tp->t_flags |= XFS_TRANS_DIRTY; lidp->lid_flags |= XFS_LID_DIRTY; - lidp->lid_flags &= ~XFS_LID_BUF_STALE; bip->bli_flags |= XFS_BLI_LOGGED; xfs_buf_item_log(bip, first, last); } @@ -747,8 +762,8 @@ xfs_trans_binval( ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); + ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(lidp->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; @@ -759,7 +774,7 @@ xfs_trans_binval( * in the buf log item. The STALE flag will be used in * xfs_buf_item_unpin() to determine if it should clean up * when the last reference to the buf item is given up. - * We set the XFS_BLI_CANCEL flag in the buf log format structure + * We set the XFS_BLF_CANCEL flag in the buf log format structure * and log the buf item. This will be used at recovery time * to determine that copies of the buffer in the log before * this should not be replayed. @@ -777,26 +792,26 @@ xfs_trans_binval( XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_STALE(bp); bip->bli_flags |= XFS_BLI_STALE; - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); - bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; - bip->bli_format.blf_flags |= XFS_BLI_CANCEL; + bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->bli_format.blf_flags |= XFS_BLF_CANCEL; memset((char *)(bip->bli_format.blf_data_map), 0, (bip->bli_format.blf_map_size * sizeof(uint))); - lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; + lidp->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; } /* - * This call is used to indicate that the buffer contains on-disk - * inodes which must be handled specially during recovery. They - * require special handling because only the di_next_unlinked from - * the inodes in the buffer should be recovered. The rest of the - * data in the buffer is logged via the inodes themselves. + * This call is used to indicate that the buffer contains on-disk inodes which + * must be handled specially during recovery. They require special handling + * because only the di_next_unlinked from the inodes in the buffer should be + * recovered. The rest of the data in the buffer is logged via the inodes + * themselves. * - * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log - * format structure so that we'll know what to do at recovery time. + * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be + * transferred to the buffer's log format structure so that we'll know what to + * do at recovery time. */ -/* ARGSUSED */ void xfs_trans_inode_buf( xfs_trans_t *tp, @@ -811,7 +826,7 @@ xfs_trans_inode_buf( bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; + bip->bli_flags |= XFS_BLI_INODE_BUF; } /* @@ -893,120 +908,12 @@ xfs_trans_dquot_buf( ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - ASSERT(type == XFS_BLI_UDQUOT_BUF || - type == XFS_BLI_PDQUOT_BUF || - type == XFS_BLI_GDQUOT_BUF); + ASSERT(type == XFS_BLF_UDQUOT_BUF || + type == XFS_BLF_PDQUOT_BUF || + type == XFS_BLF_GDQUOT_BUF); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_format.blf_flags |= type; } - -/* - * Check to see if a buffer matching the given parameters is already - * a part of the given transaction. Only check the first, embedded - * chunk, since we don't want to spend all day scanning large transactions. - */ -STATIC xfs_buf_t * -xfs_trans_buf_item_match( - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - xfs_buf_t *bp; - int i; - - bp = NULL; - len = BBTOB(len); - licp = &tp->t_items; - if (!xfs_lic_are_all_free(licp)) { - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lidp = xfs_lic_slot(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) { - continue; - } - - bp = blip->bli_buf; - if ((XFS_BUF_TARGET(bp) == target) && - (XFS_BUF_ADDR(bp) == blkno) && - (XFS_BUF_COUNT(bp) == len)) { - /* - * We found it. Break out and - * return the pointer to the buffer. - */ - break; - } else { - bp = NULL; - } - } - } - return bp; -} - -/* - * Check to see if a buffer matching the given parameters is already - * a part of the given transaction. Check all the chunks, we - * want to be thorough. - */ -STATIC xfs_buf_t * -xfs_trans_buf_item_match_all( - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - xfs_buf_t *bp; - int i; - - bp = NULL; - len = BBTOB(len); - for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { - if (xfs_lic_are_all_free(licp)) { - ASSERT(licp == &tp->t_items); - ASSERT(licp->lic_next == NULL); - return NULL; - } - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (xfs_lic_isfree(licp, i)) { - continue; - } - - lidp = xfs_lic_slot(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) { - continue; - } - - bp = blip->bli_buf; - if ((XFS_BUF_TARGET(bp) == target) && - (XFS_BUF_ADDR(bp) == blkno) && - (XFS_BUF_COUNT(bp) == len)) { - /* - * We found it. Break out and - * return the pointer to the buffer. - */ - return bp; - } - } - } - return NULL; -} diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index eb3fc57f9ee..f11d37d06dc 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c @@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) void xfs_trans_free_items( xfs_trans_t *tp, + xfs_lsn_t commit_lsn, int flags) { xfs_log_item_chunk_t *licp; @@ -311,7 +312,7 @@ xfs_trans_free_items( * Special case the embedded chunk so we don't free it below. */ if (!xfs_lic_are_all_free(licp)) { - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); xfs_lic_all_free(licp); licp->lic_unused = 0; } @@ -322,7 +323,7 @@ xfs_trans_free_items( */ while (licp != NULL) { ASSERT(!xfs_lic_are_all_free(licp)); - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); + (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); next_licp = licp->lic_next; kmem_free(licp); licp = next_licp; @@ -438,112 +439,3 @@ xfs_trans_unlock_chunk( return freed; } - - -/* - * This is called to add the given busy item to the transaction's - * list of busy items. It must find a free busy item descriptor - * or allocate a new one and add the item to that descriptor. - * The function returns a pointer to busy descriptor used to point - * to the new busy entry. The log busy entry will now point to its new - * descriptor with its ???? field. - */ -xfs_log_busy_slot_t * -xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i=0; - - /* - * If there are no free descriptors, allocate a new chunk - * of them and put it at the front of the chunk list. - */ - if (tp->t_busy_free == 0) { - lbcp = (xfs_log_busy_chunk_t*) - kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP); - ASSERT(lbcp != NULL); - /* - * Initialize the chunk, and then - * claim the first slot in the newly allocated chunk. - */ - XFS_LBC_INIT(lbcp); - XFS_LBC_CLAIM(lbcp, 0); - lbcp->lbc_unused = 1; - lbsp = XFS_LBC_SLOT(lbcp, 0); - - /* - * Link in the new chunk and update the free count. - */ - lbcp->lbc_next = tp->t_busy.lbc_next; - tp->t_busy.lbc_next = lbcp; - tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1; - - /* - * Initialize the descriptor and the generic portion - * of the log item. - * - * Point the new slot at this item and return it. - * Also point the log item at its currently active - * descriptor and set the item's mount pointer. - */ - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; - } - - /* - * Find the free descriptor. It is somewhere in the chunklist - * of descriptors. - */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - if (XFS_LBC_VACANCY(lbcp)) { - if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) { - i = lbcp->lbc_unused; - break; - } else { - /* out-of-order vacancy */ - cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp); - ASSERT(0); - } - } - lbcp = lbcp->lbc_next; - } - ASSERT(lbcp != NULL); - /* - * If we find a free descriptor, claim it, - * initialize it, and return it. - */ - XFS_LBC_CLAIM(lbcp, i); - if (lbcp->lbc_unused <= i) { - lbcp->lbc_unused = i + 1; - } - lbsp = XFS_LBC_SLOT(lbcp, i); - tp->t_busy_free--; - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; -} - - -/* - * xfs_trans_free_busy - * Free all of the busy lists from a transaction - */ -void -xfs_trans_free_busy(xfs_trans_t *tp) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_chunk_t *lbcq; - - lbcp = tp->t_busy.lbc_next; - while (lbcp != NULL) { - lbcq = lbcp->lbc_next; - kmem_free(lbcp); - lbcp = lbcq; - } - - XFS_LBC_INIT(&tp->t_busy); - tp->t_busy.lbc_unused = 0; -} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 73e2ad39743..c6e4f2c8de6 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *, struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, struct xfs_log_item_desc *); -void xfs_trans_free_items(struct xfs_trans *, int); -void xfs_trans_unlock_items(struct xfs_trans *, - xfs_lsn_t); -void xfs_trans_free_busy(xfs_trans_t *tp); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); + +void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); + +void xfs_trans_item_committed(struct xfs_log_item *lip, + xfs_lsn_t commit_lsn, int aborted); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); /* * AIL traversal cursor. diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index b09904555d0..320775295e3 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ +typedef __uint32_t xlog_tid_t; /* transaction ID type */ + /* * These types are 64 bits on disk but are either 32 or 64 bits in memory. * Disk based types: |