aboutsummaryrefslogtreecommitdiff
path: root/fs/gfs2
diff options
context:
space:
mode:
authorBob Peterson <rpeterso@redhat.com>2012-07-19 08:12:40 -0400
committerSteven Whitehouse <swhiteho@redhat.com>2012-07-19 14:51:08 +0100
commit8e2e00473598dd5379d8408cb974dade000acafc (patch)
tree1f7bfdf0d07b6c0315bbd11ffee174742d66a459 /fs/gfs2
parent294f2ad5a545eb71d397623743ddd8201131bdad (diff)
GFS2: Reduce file fragmentation
This patch reduces GFS2 file fragmentation by pre-reserving blocks. The resulting improved on-disk layout greatly speeds up operations in cases which would have resulted in interlaced allocation of blocks previously. A typical example of this is 10 parallel dd processes, each writing to a file in a common directory. The implementation uses an rbtree of reservations attached to each resource group (and each inode). Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Diffstat (limited to 'fs/gfs2')
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/gfs2/file.c24
-rw-r--r--fs/gfs2/incore.h49
-rw-r--r--fs/gfs2/inode.c37
-rw-r--r--fs/gfs2/rgrp.c578
-rw-r--r--fs/gfs2/rgrp.h31
-rw-r--r--fs/gfs2/super.c7
-rw-r--r--fs/gfs2/trace_gfs2.h59
-rw-r--r--fs/gfs2/xattr.c12
9 files changed, 708 insertions, 92 deletions
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d957a86482..49cd7dd4a9f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
if (error)
goto out_rlist;
+ if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
+ gfs2_rs_deltree(ip->i_res);
+
error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
RES_INDIRECT + RES_STATFS + RES_QUOTA,
revokes);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fbf3cbd974..9f94832cefe 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret)
return ret;
+ atomic_set(&ip->i_res->rs_sizehint,
+ PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
+
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
ret = gfs2_glock_nq(&gh);
if (ret)
@@ -571,22 +574,15 @@ fail:
static int gfs2_release(struct inode *inode, struct file *file)
{
- struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
- struct gfs2_file *fp;
struct gfs2_inode *ip = GFS2_I(inode);
- fp = file->private_data;
+ kfree(file->private_data);
file->private_data = NULL;
- if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
+ if ((file->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1))
gfs2_rs_delete(ip);
- if (gfs2_assert_warn(sdp, fp))
- return -EIO;
-
- kfree(fp);
-
return 0;
}
@@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
+ size_t writesize = iov_length(iov, nr_segs);
struct dentry *dentry = file->f_dentry;
struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ struct gfs2_sbd *sdp;
int ret;
+ sdp = GFS2_SB(file->f_mapping->host);
ret = gfs2_rs_alloc(ip);
if (ret)
return ret;
+ atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
if (file->f_flags & O_APPEND) {
struct gfs2_holder gh;
@@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
if (unlikely(error))
goto out_uninit;
+ atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
+
while (len > 0) {
if (len < bytes)
bytes = len;
@@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
offset += bytes;
continue;
}
- error = gfs2_rindex_update(sdp);
- if (error)
- goto out_unlock;
-
error = gfs2_quota_lock_check(ip);
if (error)
goto out_unlock;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index dc730700b3b..aaecc8085fc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -84,6 +84,7 @@ struct gfs2_rgrpd {
u32 rd_data; /* num of data blocks in rgrp */
u32 rd_bitbytes; /* number of bytes in data bitmaps */
u32 rd_free;
+ u32 rd_reserved; /* number of blocks reserved */
u32 rd_free_clone;
u32 rd_dinodes;
u64 rd_igeneration;
@@ -96,6 +97,9 @@ struct gfs2_rgrpd {
#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
+ spinlock_t rd_rsspin; /* protects reservation related vars */
+ struct rb_root rd_rstree; /* multi-block reservation tree */
+ u32 rd_rs_cnt; /* count of current reservations */
};
enum gfs2_state_bits {
@@ -233,6 +237,38 @@ struct gfs2_holder {
unsigned long gh_ip;
};
+/* Resource group multi-block reservation, in order of appearance:
+
+ Step 1. Function prepares to write, allocates a mb, sets the size hint.
+ Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
+ Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
+ Step 4. Bits are assigned from the rgrp based on either the reservation
+ or wherever it can.
+*/
+
+struct gfs2_blkreserv {
+ /* components used during write (step 1): */
+ atomic_t rs_sizehint; /* hint of the write size */
+
+ /* components used during inplace_reserve (step 2): */
+ u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
+
+ /* components used during get_local_rgrp (step 3): */
+ struct gfs2_rgrpd *rs_rgd; /* pointer to the gfs2_rgrpd */
+ struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
+ struct rb_node rs_node; /* link to other block reservations */
+
+ /* components used during block searches and assignments (step 4): */
+ struct gfs2_bitmap *rs_bi; /* bitmap for the current allocation */
+ u32 rs_biblk; /* start block relative to the bi */
+ u32 rs_free; /* how many blocks are still free */
+
+ /* ancillary quota stuff */
+ struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
+ struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
+ unsigned int rs_qa_qd_num;
+};
+
enum {
GLF_LOCK = 1,
GLF_DEMOTE = 3,
@@ -290,16 +326,6 @@ struct gfs2_glock {
#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
-struct gfs2_blkreserv {
- u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
- struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
-
- /* ancillary quota stuff */
- struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
- struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
- unsigned int rs_qa_qd_num;
-};
-
enum {
GIF_INVALID = 0,
GIF_QD_LOCKED = 1,
@@ -307,7 +333,6 @@ enum {
GIF_SW_PAGED = 3,
};
-
struct gfs2_inode {
struct inode i_inode;
u64 i_no_addr;
@@ -318,7 +343,7 @@ struct gfs2_inode {
struct gfs2_glock *i_gl; /* Move into i_gh? */
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
- struct gfs2_blkreserv *i_res; /* resource group block reservation */
+ struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
struct gfs2_rgrpd *i_rgd;
u64 i_goal; /* goal block for allocations */
struct rw_semaphore i_rw_mutex;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2b035e0959b..c53c67e30bd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
int error;
munge_mode_uid_gid(dip, &mode, &uid, &gid);
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
error = gfs2_quota_lock(dip, uid, gid);
if (error)
@@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
struct buffer_head *dibh;
int error;
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ return error;
+
error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
if (error)
goto fail;
@@ -596,7 +603,8 @@ fail_end_trans:
gfs2_trans_end(sdp);
fail_ipreserv:
- gfs2_inplace_release(dip);
+ if (alloc_required)
+ gfs2_inplace_release(dip);
fail_quota_locks:
gfs2_quota_unlock(dip);
@@ -647,7 +655,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
const struct qstr *name = &dentry->d_name;
struct gfs2_holder ghs[2];
struct inode *inode = NULL;
- struct gfs2_inode *dip = GFS2_I(dir);
+ struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
int error;
@@ -657,6 +665,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
+ /* We need a reservation to allocate the new dinode block. The
+ directory ip temporarily points to the reservation, but this is
+ being done to get a set of contiguous blocks for the new dinode.
+ Since this is a create, we don't have a sizehint yet, so it will
+ have to use the minimum reservation size. */
error = gfs2_rs_alloc(dip);
if (error)
return error;
@@ -694,24 +707,29 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (IS_ERR(inode))
goto fail_gunlock2;
- error = gfs2_inode_refresh(GFS2_I(inode));
+ ip = GFS2_I(inode);
+ error = gfs2_inode_refresh(ip);
if (error)
goto fail_gunlock2;
- /* the new inode needs a reservation so it can allocate xattrs. */
- error = gfs2_rs_alloc(GFS2_I(inode));
- if (error)
- goto fail_gunlock2;
+ /* The newly created inode needs a reservation so it can allocate
+ xattrs. At the same time, we want new blocks allocated to the new
+ dinode to be as contiguous as possible. Since we allocated the
+ dinode block under the directory's reservation, we transfer
+ ownership of that reservation to the new inode. The directory
+ doesn't need a reservation unless it needs a new allocation. */
+ ip->i_res = dip->i_res;
+ dip->i_res = NULL;
error = gfs2_acl_create(dip, inode);
if (error)
goto fail_gunlock2;
- error = gfs2_security_init(dip, GFS2_I(inode), name);
+ error = gfs2_security_init(dip, ip, name);
if (error)
goto fail_gunlock2;
- error = link_dinode(dip, name, GFS2_I(inode));
+ error = link_dinode(dip, name, ip);
if (error)
goto fail_gunlock2;
@@ -738,6 +756,7 @@ fail_gunlock:
iput(inode);
}
fail:
+ gfs2_rs_delete(dip);
if (bh)
brelse(bh);
return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fb7079263ea..4d34887a601 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -35,6 +35,9 @@
#define BFITNOENT ((u32)~0)
#define NO_BLOCK ((u64)~0)
+#define RSRV_CONTENTION_FACTOR 4
+#define RGRP_RSRV_MAX_CONTENDERS 2
+
#if BITS_PER_LONG == 32
#define LBITMASK (0x55555555UL)
#define LBITSKIP55 (0x55555555UL)
@@ -178,6 +181,57 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
}
/**
+ * rs_cmp - multi-block reservation range compare
+ * @blk: absolute file system block number of the new reservation
+ * @len: number of blocks in the new reservation
+ * @rs: existing reservation to compare against
+ *
+ * returns: 1 if the block range is beyond the reach of the reservation
+ * -1 if the block range is before the start of the reservation
+ * 0 if the block range overlaps with the reservation
+ */
+static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
+{
+ u64 startblk = gfs2_rs_startblk(rs);
+
+ if (blk >= startblk + rs->rs_free)
+ return 1;
+ if (blk + len - 1 < startblk)
+ return -1;
+ return 0;
+}
+
+/**
+ * rs_find - Find a rgrp multi-block reservation that contains a given block
+ * @rgd: The rgrp
+ * @rgblk: The block we're looking for, relative to the rgrp
+ */
+static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
+{
+ struct rb_node **newn;
+ int rc;
+ u64 fsblk = rgblk + rgd->rd_data0;
+
+ spin_lock(&rgd->rd_rsspin);
+ newn = &rgd->rd_rstree.rb_node;
+ while (*newn) {
+ struct gfs2_blkreserv *cur =
+ rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+ rc = rs_cmp(fsblk, 1, cur);
+ if (rc < 0)
+ newn = &((*newn)->rb_left);
+ else if (rc > 0)
+ newn = &((*newn)->rb_right);
+ else {
+ spin_unlock(&rgd->rd_rsspin);
+ return cur;
+ }
+ }
+ spin_unlock(&rgd->rd_rsspin);
+ return NULL;
+}
+
+/**
* gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
* a block in a given allocation state.
* @buf: the buffer that holds the bitmaps
@@ -424,19 +478,93 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
int gfs2_rs_alloc(struct gfs2_inode *ip)
{
int error = 0;
+ struct gfs2_blkreserv *res;
+
+ if (ip->i_res)
+ return 0;
+
+ res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
+ if (!res)
+ error = -ENOMEM;
down_write(&ip->i_rw_mutex);
- if (!ip->i_res) {
- ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
- if (!ip->i_res)
- error = -ENOMEM;
- }
+ if (ip->i_res)
+ kmem_cache_free(gfs2_rsrv_cachep, res);
+ else
+ ip->i_res = res;
up_write(&ip->i_rw_mutex);
return error;
}
+static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
+{
+ gfs2_print_dbg(seq, " r: %llu s:%llu b:%u f:%u\n",
+ rs->rs_rgd->rd_addr, gfs2_rs_startblk(rs), rs->rs_biblk,
+ rs->rs_free);
+}
+
/**
- * gfs2_rs_delete - delete a reservation
+ * __rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+static void __rs_deltree(struct gfs2_blkreserv *rs)
+{
+ struct gfs2_rgrpd *rgd;
+
+ if (!gfs2_rs_active(rs))
+ return;
+
+ rgd = rs->rs_rgd;
+ /* We can't do this: The reason is that when the rgrp is invalidated,
+ it's in the "middle" of acquiring the glock, but the HOLDER bit
+ isn't set yet:
+ BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
+ trace_gfs2_rs(NULL, rs, TRACE_RS_TREEDEL);
+
+ if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
+ rb_erase(&rs->rs_node, &rgd->rd_rstree);
+ BUG_ON(!rgd->rd_rs_cnt);
+ rgd->rd_rs_cnt--;
+
+ if (rs->rs_free) {
+ /* return reserved blocks to the rgrp and the ip */
+ BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
+ rs->rs_rgd->rd_reserved -= rs->rs_free;
+ rs->rs_free = 0;
+ clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
+ smp_mb__after_clear_bit();
+ }
+ /* We can't change any of the step 1 or step 2 components of the rs.
+ E.g. We can't set rs_rgd to NULL because the rgd glock is held and
+ dequeued through this pointer.
+ Can't: atomic_set(&rs->rs_sizehint, 0);
+ Can't: rs->rs_requested = 0;
+ Can't: rs->rs_rgd = NULL;*/
+ rs->rs_bi = NULL;
+ rs->rs_biblk = 0;
+}
+
+/**
+ * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
+{
+ struct gfs2_rgrpd *rgd;
+
+ if (!gfs2_rs_active(rs))
+ return;
+
+ rgd = rs->rs_rgd;
+ spin_lock(&rgd->rd_rsspin);
+ __rs_deltree(rs);
+ spin_unlock(&rgd->rd_rsspin);
+}
+
+/**
+ * gfs2_rs_delete - delete a multi-block reservation
* @ip: The inode for this reservation
*
*/
@@ -444,12 +572,36 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
{
down_write(&ip->i_rw_mutex);
if (ip->i_res) {
+ gfs2_rs_deltree(ip->i_res);
+ trace_gfs2_rs(ip, ip->i_res, TRACE_RS_DELETE);
+ BUG_ON(ip->i_res->rs_free);
kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
ip->i_res = NULL;
}
up_write(&ip->i_rw_mutex);
}
+/**
+ * return_all_reservations - return all reserved blocks back to the rgrp.
+ * @rgd: the rgrp that needs its space back
+ *
+ * We previously reserved a bunch of blocks for allocation. Now we need to
+ * give them back. This leaves the reservation structures intact, but removes
+ * all of their corresponding "no-fly zones".
+ */
+static void return_all_reservations(struct gfs2_rgrpd *rgd)
+{
+ struct rb_node *n;
+ struct gfs2_blkreserv *rs;
+
+ spin_lock(&rgd->rd_rsspin);
+ while ((n = rb_first(&rgd->rd_rstree))) {
+ rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+ __rs_deltree(rs);
+ }
+ spin_unlock(&rgd->rd_rsspin);
+}
+
void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
{
struct rb_node *n;
@@ -472,6 +624,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
gfs2_free_clones(rgd);
kfree(rgd->rd_bits);
+ return_all_reservations(rgd);
kmem_cache_free(gfs2_rgrpd_cachep, rgd);
}
}
@@ -649,6 +802,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
rgd->rd_data = be32_to_cpu(buf.ri_data);
rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
+ spin_lock_init(&rgd->rd_rsspin);
error = compute_bitstructs(rgd);
if (error)
@@ -1115,29 +1269,212 @@ out:
}
/**
+ * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
+ * @bi: the bitmap with the blocks
+ * @ip: the inode structure
+ * @biblk: the 32-bit block number relative to the start of the bitmap
+ * @amount: the number of blocks to reserve
+ *
+ * Returns: NULL - reservation was already taken, so not inserted
+ * pointer to the inserted reservation
+ */
+static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
+ struct gfs2_inode *ip, u32 biblk,
+ int amount)
+{
+ struct rb_node **newn, *parent = NULL;
+ int rc;
+ struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_rgrpd *rgd = rs->rs_rgd;
+ u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
+
+ spin_lock(&rgd->rd_rsspin);
+ newn = &rgd->rd_rstree.rb_node;
+ BUG_ON(!ip->i_res);
+ BUG_ON(gfs2_rs_active(rs));
+ /* Figure out where to put new node */
+ /*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
+ while (*newn) {
+ struct gfs2_blkreserv *cur =
+ rb_entry(*newn, struct gfs2_blkreserv, rs_node);
+
+ parent = *newn;
+ rc = rs_cmp(fsblock, amount, cur);
+ if (rc > 0)
+ newn = &((*newn)->rb_right);
+ else if (rc < 0)
+ newn = &((*newn)->rb_left);
+ else {
+ spin_unlock(&rgd->rd_rsspin);
+ return NULL; /* reservation already in use */
+ }
+ }
+
+ /* Do our reservation work */
+ rs = ip->i_res;
+ rs->rs_free = amount;
+ rs->rs_biblk = biblk;
+ rs->rs_bi = bi;
+ rb_link_node(&rs->rs_node, parent, newn);
+ rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
+
+ /* Do our inode accounting for the reservation */
+ /*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
+
+ /* Do our rgrp accounting for the reservation */
+ rgd->rd_reserved += amount; /* blocks reserved */
+ rgd->rd_rs_cnt++; /* number of in-tree reservations */
+ spin_unlock(&rgd->rd_rsspin);
+ trace_gfs2_rs(ip, rs, TRACE_RS_INSERT);
+ return rs;
+}
+
+/**
+ * unclaimed_blocks - return number of blocks that aren't spoken for
+ */
+static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
+{
+ return rgd->rd_free_clone - rgd->rd_reserved;
+}
+
+/**
+ * rg_mblk_search - find a group of multiple free blocks
+ * @rgd: the resource group descriptor
+ * @rs: the block reservation
+ * @ip: pointer to the inode for which we're reserving blocks
+ *
+ * This is very similar to rgblk_search, except we're looking for whole
+ * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
+ * on aligned dwords for speed's sake.
+ *
+ * Returns: 0 if successful or BFITNOENT if there isn't enough free space
+ */
+
+static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+{
+ struct gfs2_bitmap *bi = rgd->rd_bits;
+ const u32 length = rgd->rd_length;
+ u32 blk;
+ unsigned int buf, x, search_bytes;
+ u8 *buffer = NULL;
+ u8 *ptr, *end, *nonzero;
+ u32 goal, rsv_bytes;
+ struct gfs2_blkreserv *rs;
+ u32 best_rs_bytes, unclaimed;
+ int best_rs_blocks;
+
+ /* Find bitmap block that contains bits for goal block */
+ if (rgrp_contains_block(rgd, ip->i_goal))
+ goal = ip->i_goal - rgd->rd_data0;
+ else
+ goal = rgd->rd_last_alloc;
+ for (buf = 0; buf < length; buf++) {
+ bi = rgd->rd_bits + buf;
+ /* Convert scope of "goal" from rgrp-wide to within
+ found bit block */
+ if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
+ goal -= bi->bi_start * GFS2_NBBY;
+ goto do_search;
+ }
+ }
+ buf = 0;
+ goal = 0;
+
+do_search:
+ best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint),
+ (RGRP_RSRV_MINBLKS * rgd->rd_length));
+ best_rs_bytes = (best_rs_blocks *
+ (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
+ GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
+ best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
+ unclaimed = unclaimed_blocks(rgd);
+ if (best_rs_bytes * GFS2_NBBY > unclaimed)
+ best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
+
+ for (x = 0; x <= length; x++) {
+ bi = rgd->rd_bits + buf;
+
+ if (test_bit(GBF_FULL, &bi->bi_flags))
+ goto skip;
+
+ WARN_ON(!buffer_uptodate(bi->bi_bh));
+ if (bi->bi_clone)
+ buffer = bi->bi_clone + bi->bi_offset;
+ else
+ buffer = bi->bi_bh->b_data + bi->bi_offset;
+
+ /* We have to keep the reservations aligned on u64 boundaries
+ otherwise we could get situations where a byte can't be
+ used because it's after a reservation, but a free bit still
+ is within the reservation's area. */
+ ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
+ end = (buffer + bi->bi_len);
+ while (ptr < end) {
+ rsv_bytes = 0;
+ if ((ptr + best_rs_bytes) <= end)
+ search_bytes = best_rs_bytes;
+ else
+ search_bytes = end - ptr;
+ BUG_ON(!search_bytes);
+ nonzero = memchr_inv(ptr, 0, search_bytes);
+ /* If the lot is all zeroes, reserve the whole size. If
+ there's enough zeroes to satisfy the request, use
+ what we can. If there's not enough, keep looking. */
+ if (nonzero == NULL)
+ rsv_bytes = search_bytes;
+ else if ((nonzero - ptr) * GFS2_NBBY >=
+ ip->i_res->rs_requested)
+ rsv_bytes = (nonzero - ptr);
+
+ if (rsv_bytes) {
+ blk = ((ptr - buffer) * GFS2_NBBY);
+ BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
+ rs = rs_insert(bi, ip, blk,
+ rsv_bytes * GFS2_NBBY);
+ if (IS_ERR(rs))
+ return PTR_ERR(rs);
+ if (rs)
+ return 0;
+ }
+ ptr += ALIGN(search_bytes, sizeof(u64));
+ }
+skip:
+ /* Try next bitmap block (wrap back to rgrp header
+ if at end) */
+ buf++;
+ buf %= length;
+ goal = 0;
+ }
+
+ return BFITNOENT;
+}
+
+/**
* try_rgrp_fit - See if a given reservation will fit in a given RG
* @rgd: the RG data
* @ip: the inode
*
* If there's room for the requested blocks to be allocated from the RG:
+ * This will try to get a multi-block reservation first, and if that doesn't
+ * fit, it will take what it can.
*
* Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
*/
-static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
+static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
{
- const struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_blkreserv *rs = ip->i_res;
if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
return 0;
- if (rgd->rd_free_clone >= rs->rs_requested)
+ /* Look for a multi-block reservation. */
+ if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS &&
+ rg_mblk_search(rgd, ip) != BFITNOENT)
+ return 1;
+ if (unclaimed_blocks(rgd) >= rs->rs_requested)
return 1;
- return 0;
-}
-static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
-{
- return (bi->bi_start * GFS2_NBBY) + blk;
+ return 0;
}
/**
@@ -1217,7 +1554,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_rgrpd *rgd, *begin = NULL;
+ struct gfs2_rgrpd *begin = NULL;
struct gfs2_blkreserv *rs = ip->i_res;
int error = 0, rg_locked, flags = LM_FLAG_TRY;
u64 last_unlinked = NO_BLOCK;
@@ -1225,32 +1562,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
if (sdp->sd_args.ar_rgrplvb)
flags |= GL_SKIP;
- rs = ip->i_res;
rs->rs_requested = requested;
if (gfs2_assert_warn(sdp, requested)) {
error = -EINVAL;
goto out;
}
-
- if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
- rgd = begin = ip->i_rgd;
- else
- rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
-
- if (rgd == NULL)
+ if (gfs2_rs_active(rs)) {
+ begin = rs->rs_rgd;
+ flags = 0; /* Yoda: Do or do not. There is no try */
+ } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
+ rs->rs_rgd = begin = ip->i_rgd;
+ } else {
+ rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
+ }
+ if (rs->rs_rgd == NULL)
return -EBADSLT;
while (loops < 3) {
rg_locked = 0;
- if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
+ if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
rg_locked = 1;
error = 0;
+ } else if (!loops && !gfs2_rs_active(rs) &&
+ rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
+ /* If the rgrp already is maxed out for contenders,
+ we can eliminate it as a "first pass" without even
+ requesting the rgrp glock. */
+ error = GLR_TRYFAILED;
} else {
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
- flags, &rs->rs_rgd_gh);
+ error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
+ LM_ST_EXCLUSIVE, flags,
+ &rs->rs_rgd_gh);
if (!error && sdp->sd_args.ar_rgrplvb) {
- error = update_rgrp_lvb(rgd);
+ error = update_rgrp_lvb(rs->rs_rgd);
if (error) {
gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
return error;
@@ -1259,25 +1604,37 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
}
switch (error) {
case 0:
- if (try_rgrp_fit(rgd, ip)) {
+ if (gfs2_rs_active(rs)) {
+ if (unclaimed_blocks(rs->rs_rgd) +
+ rs->rs_free >= rs->rs_requested) {
+ ip->i_rgd = rs->rs_rgd;
+ return 0;
+ }
+ /* We have a multi-block reservation, but the
+ rgrp doesn't have enough free blocks to
+ satisfy the request. Free the reservation
+ and look for a suitable rgrp. */
+ gfs2_rs_deltree(rs);
+ }
+ if (try_rgrp_fit(rs->rs_rgd, ip)) {
if (sdp->sd_args.ar_rgrplvb)
- gfs2_rgrp_bh_get(rgd);
- ip->i_rgd = rgd;
+ gfs2_rgrp_bh_get(rs->rs_rgd);
+ ip->i_rgd = rs->rs_rgd;
return 0;
}
- if (rgd->rd_flags & GFS2_RDF_CHECK) {
+ if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
if (sdp->sd_args.ar_rgrplvb)
- gfs2_rgrp_bh_get(rgd);
- try_rgrp_unlink(rgd, &last_unlinked,
+ gfs2_rgrp_bh_get(rs->rs_rgd);
+ try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
ip->i_no_addr);
}
if (!rg_locked)
gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
/* fall through */
case GLR_TRYFAILED:
- rgd = gfs2_rgrpd_get_next(rgd);
- rgd = rgd ? : begin; /* if NULL, wrap */
- if (rgd != begin) /* If we didn't wrap */
+ rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
+ rs->rs_rgd = rs->rs_rgd ? : begin; /* if NULL, wrap */
+ if (rs->rs_rgd != begin) /* If we didn't wrap */
break;
flags &= ~LM_FLAG_TRY;
@@ -1315,6 +1672,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
{
struct gfs2_blkreserv *rs = ip->i_res;
+ if (!rs)
+ return;
+
+ if (!rs->rs_free)
+ gfs2_rs_deltree(rs);
+
if (rs->rs_rgd_gh.gh_gl)
gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
rs->rs_requested = 0;
@@ -1413,7 +1776,27 @@ do_search:
if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
buffer = bi->bi_clone + bi->bi_offset;
- biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+ while (1) {
+ struct gfs2_blkreserv *rs;
+ u32 rgblk;
+
+ biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
+ if (biblk == BFITNOENT)
+ break;
+ /* Check if this block is reserved() */
+ rgblk = gfs2_bi2rgd_blk(bi, biblk);
+ rs = rs_find(rgd, rgblk);
+ if (rs == NULL)
+ break;
+
+ BUG_ON(rs->rs_bi != bi);
+ biblk = BFITNOENT;
+ /* This should jump to the first block after the
+ reservation. */
+ goal = rs->rs_biblk + rs->rs_free;
+ if (goal >= bi->bi_len * GFS2_NBBY)
+ break;
+ }
if (biblk != BFITNOENT)
break;
@@ -1449,8 +1832,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
u32 blk, bool dinode, unsigned int *n)
{
const unsigned int elen = *n;
- u32 goal;
+ u32 goal, rgblk;
const u8 *buffer = NULL;
+ struct gfs2_blkreserv *rs;
*n = 0;
buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1463,6 +1847,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
goal++;
if (goal >= (bi->bi_len * GFS2_NBBY))
break;
+ rgblk = gfs2_bi2rgd_blk(bi, goal);
+ rs = rs_find(rgd, rgblk);
+ if (rs) /* Oops, we bumped into someone's reservation */
+ break;
if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
GFS2_BLKST_FREE)
break;
@@ -1538,12 +1926,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
{
- const struct gfs2_rgrpd *rgd = gl->gl_object;
+ struct gfs2_rgrpd *rgd = gl->gl_object;
+ struct gfs2_blkreserv *trs;
+ const struct rb_node *n;
+
if (rgd == NULL)
return 0;
- gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+ gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
(unsigned long long)rgd->rd_addr, rgd->rd_flags,
- rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
+ rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
+ rgd->rd_reserved);
+ spin_lock(&rgd->rd_rsspin);
+ for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
+ trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
+ dump_rs(seq, trs);
+ }
+ spin_unlock(&rgd->rd_rsspin);
return 0;
}
@@ -1558,10 +1956,63 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
}
/**
+ * claim_reserved_blks - Claim previously reserved blocks
+ * @ip: the inode that's claiming the reservation
+ * @dinode: 1 if this block is a dinode block, otherwise data block
+ * @nblocks: desired extent length
+ *
+ * Lay claim to previously allocated block reservation blocks.
+ * Returns: Starting block number of the blocks claimed.
+ * Sets *nblocks to the actual extent length allocated.
+ */
+static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
+ unsigned int *nblocks)
+{
+ struct gfs2_blkreserv *rs = ip->i_res;
+ struct gfs2_rgrpd *rgd = rs->rs_rgd;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_bitmap *bi;
+ u64 start_block = gfs2_rs_startblk(rs);
+ const unsigned int elen = *nblocks;
+
+ /*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
+ gfs2_assert_withdraw(sdp, rgd);
+ /*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
+ bi = rs->rs_bi;
+ gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+
+ for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
+ /* Make sure the bitmap hasn't changed */
+ gfs2_setbit(rgd, bi->bi_clone, bi, rs->rs_biblk,
+ dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
+ rs->rs_biblk++;
+ rs->rs_free--;
+
+ BUG_ON(!rgd->rd_reserved);
+ rgd->rd_reserved--;
+ dinode = false;
+ trace_gfs2_rs(ip, rs, TRACE_RS_CLAIM);
+ }
+
+ if (!rs->rs_free) {
+ struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
+
+ gfs2_rs_deltree(rs);
+ /* -nblocks because we haven't returned to do the math yet.
+ I'm doing the math backwards to prevent negative numbers,
+ but think of it as:
+ if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */
+ if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
+ rg_mblk_search(rgd, ip);
+ }
+ return start_block;
+}
+
+/**
* gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
* @ip: the inode to allocate the block for
* @bn: Used to return the starting block number
- * @ndata: requested number of blocks/extent length (value/result)
+ * @nblocks: requested number of blocks/extent length (value/result)
* @dinode: 1 if we're allocating a dinode block, else 0
* @generation: the generation number of the inode
*
@@ -1586,20 +2037,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
if (ip->i_res->rs_requested == 0)
return -ECANCELED;
- rgd = ip->i_rgd;
-
- if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
- goal = ip->i_goal - rgd->rd_data0;
- else
- goal = rgd->rd_last_alloc;
-
- blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
+ /* Check if we have a multi-block reservation, and if so, claim the
+ next free block from it. */
+ if (gfs2_rs_active(ip->i_res)) {
+ BUG_ON(!ip->i_res->rs_free);
+ rgd = ip->i_res->rs_rgd;
+ block = claim_reserved_blks(ip, dinode