aboutsummaryrefslogtreecommitdiff
path: root/fs/jfs/jfs_dmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/jfs/jfs_dmap.c')
-rw-r--r--fs/jfs/jfs_dmap.c4272
1 files changed, 4272 insertions, 0 deletions
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
new file mode 100644
index 00000000000..d86e467c6e4
--- /dev/null
+++ b/fs/jfs/jfs_dmap.c
@@ -0,0 +1,4272 @@
+/*
+ * Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/fs.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_lock.h"
+#include "jfs_metapage.h"
+#include "jfs_debug.h"
+
+/*
+ * Debug code for double-checking block map
+ */
+/* #define _JFS_DEBUG_DMAP 1 */
+
+#ifdef _JFS_DEBUG_DMAP
+/*
+ * debug build: each upper-case wrapper forwards its arguments unchanged
+ * to the file-local checker of the same name that is declared below.
+ */
+#define DBINITMAP(size,ipbmap,results) \
+ DBinitmap(size,ipbmap,results)
+#define DBALLOC(dbmap,mapsize,blkno,nblocks) \
+ DBAlloc(dbmap,mapsize,blkno,nblocks)
+#define DBFREE(dbmap,mapsize,blkno,nblocks) \
+ DBFree(dbmap,mapsize,blkno,nblocks)
+#define DBALLOCCK(dbmap,mapsize,blkno,nblocks) \
+ DBAllocCK(dbmap,mapsize,blkno,nblocks)
+#define DBFREECK(dbmap,mapsize,blkno,nblocks) \
+ DBFreeCK(dbmap,mapsize,blkno,nblocks)
+
+/* prototypes for the checkers the wrappers above expand to */
+static void DBinitmap(s64, struct inode *, u32 **);
+static void DBAlloc(uint *, s64, s64, s64);
+static void DBFree(uint *, s64, s64, s64);
+static void DBAllocCK(uint *, s64, s64, s64);
+static void DBFreeCK(uint *, s64, s64, s64);
+#else
+/*
+ * normal build: the wrappers expand to nothing, so a call site such as
+ * "DBALLOC(...);" is left as an empty statement and the arguments are
+ * never evaluated.
+ */
+#define DBINITMAP(size,ipbmap,results)
+#define DBALLOC(dbmap, mapsize, blkno, nblocks)
+#define DBFREE(dbmap, mapsize, blkno, nblocks)
+#define DBALLOCCK(dbmap, mapsize, blkno, nblocks)
+#define DBFREECK(dbmap, mapsize, blkno, nblocks)
+#endif /* _JFS_DEBUG_DMAP */
+
+/*
+ * SERIALIZATION of the Block Allocation Map.
+ *
+ * the working state of the block allocation map is accessed in
+ * two directions:
+ *
+ * 1) allocation and free requests that start at the dmap
+ * level and move up through the dmap control pages (i.e.
+ * the vast majority of requests).
+ *
+ * 2) allocation requests that start at dmap control page
+ * level and work down towards the dmaps.
+ *
+ * the serialization scheme used here is as follows.
+ *
+ * requests which start at the bottom are serialized against each
+ * other through buffers, and each request holds onto its buffers
+ * as it works its way up from a single dmap to the required level
+ * of dmap control page.
+ * requests that start at the top are serialized against each other
+ * and against requests that start from the bottom by the multiple
+ * read/single write inode lock of the bmap inode. requests starting
+ * at the top take this lock in write mode while requests starting at
+ * the bottom take the lock in read mode. a single top-down request may
+ * proceed exclusively while multiple bottom-up requests may proceed
+ * simultaneously (under the protection of busy buffers).
+ *
+ * in addition to information found in dmaps and dmap control pages,
+ * the working state of the block allocation map also includes read/
+ * write information maintained in the bmap descriptor (i.e. total
+ * free block count, allocation group level free block counts).
+ * a single exclusive lock (BMAP_LOCK) is used to guard this information
+ * in the face of multiple bottom-up requests.
+ * (lock ordering: IREAD_LOCK, then BMAP_LOCK).
+ *
+ * accesses to the persistent state of the block allocation map (limited
+ * to the persistent bitmaps in dmaps) is guarded by (busy) buffers.
+ */
+
+/*
+ * BMAP_LOCK helpers: initialize, take and drop the db_bmaplock of the
+ * given bmap descriptor.  the macro argument is parenthesized so that
+ * any pointer expression (not just a plain identifier) expands correctly.
+ */
+#define BMAP_LOCK_INIT(bmp)	init_MUTEX(&(bmp)->db_bmaplock)
+#define BMAP_LOCK(bmp)		down(&(bmp)->db_bmaplock)
+#define BMAP_UNLOCK(bmp)	up(&(bmp)->db_bmaplock)
+
+/*
+ * forward references
+ */
+/*
+ * every helper declared here is file-local (static) except
+ * dbMapFileSizeToMapSize(), which is declared without static.
+ */
+static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
+ int nblocks);
+static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
+static void dbBackSplit(dmtree_t * tp, int leafno);
+static void dbJoin(dmtree_t * tp, int leafno, int newval);
+static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
+static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
+ int level);
+static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
+static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
+ int nblocks);
+static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno,
+ int nblocks,
+ int l2nb, s64 * results);
+static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
+ int nblocks);
+static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks,
+ int l2nb,
+ s64 * results);
+static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb,
+ s64 * results);
+static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
+ s64 * results);
+static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
+static int dbFindBits(u32 word, int l2nb);
+static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
+static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
+static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
+ int nblocks);
+static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
+ int nblocks);
+static int dbMaxBud(u8 * cp);
+s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
+static int blkstol2(s64 nb);
+
+static int cntlz(u32 value);
+static int cnttz(u32 word);
+
+static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
+ int nblocks);
+static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks);
+static int dbInitDmapTree(struct dmap * dp);
+static int dbInitTree(struct dmaptree * dtp);
+static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i);
+static int dbGetL2AGSize(s64 nblocks);
+
+/*
+ * buddy table
+ *
+ * table used for determining buddy sizes within characters of
+ * dmap bitmap words. the characters themselves serve as indexes
+ * into the table, with the table elements yielding the maximum
+ * binary buddy of free bits within the character.
+ */
+/*
+ * read-only lookup table: index is one byte of a dmap bitmap word, value
+ * is the log2 size of the largest aligned run of zero bits in that byte
+ * (3 for 0x00, i.e. all eight bits clear; -1 for 0xff, i.e. none clear).
+ * declared const so it cannot be modified by accident.
+ */
+static const s8 budtab[256] = {
+	3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
+	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
+};
+
+
+/*
+ * NAME:	dbMount()
+ *
+ * FUNCTION:	initialize the block allocation map.
+ *
+ *		memory is allocated for the in-core bmap descriptor and
+ *		the in-core descriptor is initialized from disk.
+ *
+ *		the allocation group count read from disk is range checked
+ *		before the descriptor is published: it is later used as a
+ *		divisor (dbNextAG) and as a loop bound over the per-AG
+ *		arrays (dbUnmount, dbNextAG), so a corrupt value must not
+ *		be accepted.
+ *
+ * PARAMETERS:
+ *	ipbmap	- pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-ENOMEM	- insufficient memory
+ *	-EIO	- i/o error, or the on-disk descriptor is corrupt
+ */
+int dbMount(struct inode *ipbmap)
+{
+	struct bmap *bmp;
+	struct dbmap_disk *dbmp_le;
+	struct metapage *mp;
+	int i;
+
+	/*
+	 * allocate/initialize the in-memory bmap descriptor
+	 */
+	/* allocate memory for the in-memory bmap descriptor */
+	bmp = kmalloc(sizeof(*bmp), GFP_KERNEL);
+	if (bmp == NULL)
+		return -ENOMEM;
+
+	/* read the on-disk bmap descriptor. */
+	mp = read_metapage(ipbmap,
+			   BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
+			   PSIZE, 0);
+	if (mp == NULL) {
+		kfree(bmp);
+		return -EIO;
+	}
+
+	/* copy the on-disk bmap descriptor to its in-memory version. */
+	dbmp_le = (struct dbmap_disk *) mp->data;
+	bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
+	bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
+	bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
+	bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
+	bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
+	bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
+	bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
+	bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
+	bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth);
+	bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
+	bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
+	bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
+	for (i = 0; i < MAXAG; i++)
+		bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
+	bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
+	/* single byte field: no byte swapping is applied */
+	bmp->db_maxfreebud = dbmp_le->dn_maxfreebud;
+
+	/* release the buffer. */
+	release_metapage(mp);
+
+	/*
+	 * reject a descriptor whose allocation group count cannot be
+	 * right.  nothing has been published yet, so freeing the
+	 * descriptor is the only cleanup required.
+	 */
+	if (bmp->db_numag < 1 || bmp->db_numag > MAXAG) {
+		jfs_err("dbMount: invalid allocation group count %d",
+			(int) bmp->db_numag);
+		kfree(bmp);
+		return -EIO;
+	}
+
+	/* bind the bmap inode and the bmap descriptor to each other. */
+	bmp->db_ipbmap = ipbmap;
+	JFS_SBI(ipbmap->i_sb)->bmap = bmp;
+
+	/* no allocation group has an active allocator at mount time */
+	memset(bmp->db_active, 0, sizeof(bmp->db_active));
+	DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap);
+
+	/*
+	 * allocate/initialize the bmap lock
+	 */
+	BMAP_LOCK_INIT(bmp);
+
+	return (0);
+}
+
+
+/*
+ * NAME: dbUnmount()
+ *
+ * FUNCTION: terminate the block allocation map in preparation for
+ * file system unmount.
+ *
+ * the in-core bmap descriptor is written to disk and
+ * the memory for this descriptor is freed.
+ *
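+ * PARAMETERS:
+ * ipbmap - pointer to in-core inode for the block map.
+ * mounterror - if non-zero, the map is not written back to disk
+ * (dbSync() is skipped) before the descriptor is freed.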
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -EIO - i/o error
+ */
+int dbUnmount(struct inode *ipbmap, int mounterror)
+{
+	struct bmap *map = JFS_SBI(ipbmap->i_sb)->bmap;
+	int ag;
+
+	/* write the descriptor back only when no mount error was
+	 * reported and the block map is not read-only.
+	 */
+	if (!mounterror && !isReadOnly(ipbmap))
+		dbSync(ipbmap);
+
+	/* drop every page cached for the block map inode */
+	truncate_inode_pages(ipbmap->i_mapping, 0);
+
+	/* sanity check: complain about any allocation group that still
+	 * shows a non-zero active count.
+	 */
+	for (ag = 0; ag < map->db_numag; ag++) {
+		if (atomic_read(&map->db_active[ag]) == 0)
+			continue;
+
+		printk(KERN_ERR "dbUnmount: db_active[%d] = %d\n",
+		       ag, atomic_read(&map->db_active[ag]));
+	}
+
+	/* the in-memory descriptor is no longer needed */
+	kfree(map);
+
+	return 0;
+}
+
+/*
+ * dbSync()
+ */
+int dbSync(struct inode *ipbmap)
+{
+	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+	struct metapage *ctl_mp;
+	struct dbmap_disk *disk;
+	int ag;
+
+	/*
+	 * step 1: bring in the page holding the on-disk bmap descriptor
+	 */
+	ctl_mp = read_metapage(ipbmap,
+			       BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
+			       PSIZE, 0);
+	if (!ctl_mp) {
+		jfs_err("dbSync: read_metapage failed!");
+		return -EIO;
+	}
+
+	/*
+	 * step 2: overwrite the on-disk descriptor with the in-memory
+	 * values, converting each multi-byte field to little-endian.
+	 */
+	disk = (struct dbmap_disk *) ctl_mp->data;
+
+	/* 64-bit fields */
+	disk->dn_mapsize = cpu_to_le64(bmp->db_mapsize);
+	disk->dn_nfree = cpu_to_le64(bmp->db_nfree);
+	disk->dn_agsize = cpu_to_le64(bmp->db_agsize);
+	for (ag = 0; ag < MAXAG; ag++)
+		disk->dn_agfree[ag] = cpu_to_le64(bmp->db_agfree[ag]);
+
+	/* 32-bit fields */
+	disk->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage);
+	disk->dn_numag = cpu_to_le32(bmp->db_numag);
+	disk->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel);
+	disk->dn_maxag = cpu_to_le32(bmp->db_maxag);
+	disk->dn_agpref = cpu_to_le32(bmp->db_agpref);
+	disk->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
+	disk->dn_agheigth = cpu_to_le32(bmp->db_agheigth);
+	disk->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
+	disk->dn_agstart = cpu_to_le32(bmp->db_agstart);
+	disk->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
+
+	/* copied as-is, without any byte-order conversion */
+	disk->dn_maxfreebud = bmp->db_maxfreebud;
+
+	/* hand the updated descriptor page back for writing */
+	write_metapage(ctl_mp);
+
+	/*
+	 * step 3: push out the dirty pages of the bmap and wait for them
+	 */
+	filemap_fdatawrite(ipbmap->i_mapping);
+	filemap_fdatawait(ipbmap->i_mapping);
+
+	/*
+	 * step 4: mark the bmap inode dirty and write it out
+	 */
+	ipbmap->i_state |= I_DIRTY;
+	diWriteSpecial(ipbmap, 0);
+
+	return 0;
+}
+
+
+/*
+ * NAME:	dbFree()
+ *
+ * FUNCTION:	free the specified block range from the working block
+ *		allocation map.
+ *
+ *		the blocks will be freed from the working map one dmap
+ *		at a time.
+ *
+ *		if the range check passes but nblocks is not positive,
+ *		no dmap is touched and 0 is returned.
+ *
+ * PARAMETERS:
+ *	ip	- pointer to in-core inode;
+ *	blkno	- starting block number to be freed.
+ *	nblocks	- number of blocks to be freed.
+ *
+ * RETURN VALUES:
+ *	0	- success
+ *	-EIO	- i/o error
+ */
+int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
+{
+	struct metapage *mp;
+	struct dmap *dp;
+	int nb, rc;
+	s64 lblkno, rem;
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
+
+	/* bottom-up request: the bmap inode lock is taken in read mode */
+	IREAD_LOCK(ipbmap);
+
+	/* block to be freed better be within the mapsize. */
+	if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
+		IREAD_UNLOCK(ipbmap);
+		printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
+		       (unsigned long long) blkno,
+		       (unsigned long long) nblocks);
+		jfs_error(ip->i_sb,
+			  "dbFree: block to be freed is outside the map");
+		return -EIO;
+	}
+
+	/*
+	 * free the blocks a dmap at a time.
+	 */
+	mp = NULL;
+	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
+		/* release previous dmap if any */
+		if (mp) {
+			write_metapage(mp);
+		}
+
+		/* get the buffer for the current dmap. */
+		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+		if (mp == NULL) {
+			IREAD_UNLOCK(ipbmap);
+			return -EIO;
+		}
+		dp = (struct dmap *) mp->data;
+
+		/* determine the number of blocks to be freed from
+		 * this dmap: either everything that remains, or only
+		 * up to the end of the dmap that contains blkno.
+		 */
+		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
+
+		DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
+
+		/* free the blocks. */
+		if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
+			release_metapage(mp);
+			IREAD_UNLOCK(ipbmap);
+			return (rc);
+		}
+
+		DBFREE(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
+	}
+
+	/* write the last buffer.  mp is still NULL when the loop body
+	 * never ran (nblocks <= 0), in which case there is nothing to
+	 * write and passing NULL on would be a NULL pointer dereference.
+	 */
+	if (mp)
+		write_metapage(mp);
+
+	IREAD_UNLOCK(ipbmap);
+
+	return (0);
+}
+
+
+/*
+ * NAME: dbUpdatePMap()
+ *
+ * FUNCTION: update the allocation state (free or allocate) of the
+ * specified block range in the persistent block allocation map.
+ *
+ * the blocks will be updated in the persistent map one
+ * dmap at a time.
+ *
+ * PARAMETERS:
+ * ipbmap - pointer to in-core inode for the block map.
+ * free - TRUE if block range is to be freed from the persistent
+ * map; FALSE if it is to be allocated.
+ * blkno - starting block number of the range.
+ * nblocks - number of contiguous blocks in the range.
+ * tblk - transaction block;
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -EIO - i/o error
+ */
+int
+dbUpdatePMap(struct inode *ipbmap,
+ int free, s64 blkno, s64 nblocks, struct tblock * tblk)
+{
+ int nblks, dbitno, wbitno, rbits;
+ int word, nbits, nwords;
+ struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+ s64 lblkno, rem, lastlblkno;
+ u32 mask;
+ struct dmap *dp;
+ struct metapage *mp;
+ struct jfs_log *log;
+ int lsn, difft, diffp;
+
+ /* the blocks better be within the mapsize. */
+ if (blkno + nblocks > bmp->db_mapsize) {
+ printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
+ (unsigned long long) blkno,
+ (unsigned long long) nblocks);
+ jfs_error(ipbmap->i_sb,
+ "dbUpdatePMap: blocks are outside the map");
+ return -EIO;
+ }
+
+ /* compute delta of transaction lsn from log syncpt */
+ lsn = tblk->lsn;
+ log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
+ logdiff(difft, lsn, log);
+
+ /*
+ * update the block state a dmap at a time.
+ */
+ mp = NULL;
+ /* 0 doubles as "no dmap page loaded yet".
+ * NOTE(review): this relies on BLKTODMAP() never yielding 0 for the
+ * first dmap; otherwise mp would still be NULL at the mp->data
+ * access below - confirm against the BLKTODMAP definition.
+ */
+ lastlblkno = 0;
+ for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
+ /* get the buffer for the current dmap. */
+ lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+ if (lblkno != lastlblkno) {
+ /* moving on to a different dmap page: hand the
+ * previous one (if any) back before reading the next.
+ */
+ if (mp) {
+ write_metapage(mp);
+ }
+
+ mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
+ 0);
+ if (mp == NULL)
+ return -EIO;
+ }
+ dp = (struct dmap *) mp->data;
+
+ /* determine the bit number and word within the dmap of
+ * the starting block. also determine how many blocks
+ * are to be updated within this dmap.
+ */
+ dbitno = blkno & (BPERDMAP - 1);
+ word = dbitno >> L2DBWORD;
+ nblks = min(rem, (s64)BPERDMAP - dbitno);
+
+ /* update the bits of the dmap words. the first and last
+ * words may only have a subset of their bits updated. if
+ * this is the case, we'll work against that word (i.e.
+ * partial first and/or last) only in a single pass. a
+ * single pass will also be used to update all words that
+ * are to have all their bits updated.
+ */
+ for (rbits = nblks; rbits > 0;
+ rbits -= nbits, dbitno += nbits) {
+ /* determine the bit number within the word and
+ * the number of bits within the word.
+ */
+ wbitno = dbitno & (DBWORD - 1);
+ nbits = min(rbits, DBWORD - wbitno);
+
+ /* check if only part of the word is to be updated. */
+ if (nbits < DBWORD) {
+ /* update (free or allocate) the bits
+ * in this word.
+ */
+ /* build a run of nbits one-bits that starts wbitno
+ * bits below the most significant bit (assumes ONES
+ * is an all-ones 32-bit value - confirm).
+ */
+ mask =
+ (ONES << (DBWORD - nbits) >> wbitno);
+ /* pmap words are stored little-endian, so the mask
+ * is converted before being applied.
+ */
+ if (free)
+ dp->pmap[word] &=
+ cpu_to_le32(~mask);
+ else
+ dp->pmap[word] |=
+ cpu_to_le32(mask);
+
+ word += 1;
+ } else {
+ /* one or more words are to have all
+ * their bits updated. determine how
+ * many words and how many bits.
+ */
+ nwords = rbits >> L2DBWORD;
+ nbits = nwords << L2DBWORD;
+
+ /* update (free or allocate) the bits
+ * in these words.
+ */
+ /* nwords * 4: byte length of nwords pmap words,
+ * which are treated as 32-bit values above.
+ */
+ if (free)
+ memset(&dp->pmap[word], 0,
+ nwords * 4);
+ else
+ memset(&dp->pmap[word], (int) ONES,
+ nwords * 4);
+
+ word += nwords;
+ }
+ }
+
+ /*
+ * update dmap lsn
+ */
+ /* the lsn bookkeeping below is done once per dmap page: skip it
+ * when this iteration stayed on the page handled last time.
+ */
+ if (lblkno == lastlblkno)
+ continue;
+
+ lastlblkno = lblkno;
+
+ if (mp->lsn != 0) {
+ /* the page already carries an lsn (non-zero) */
+ /* inherit older/smaller lsn */
+ logdiff(diffp, mp->lsn, log);
+ if (difft < diffp) {
+ mp->lsn = lsn;
+
+ /* move bp after tblock in logsync list */
+ LOGSYNC_LOCK(log);
+ list_move(&mp->synclist, &tblk->synclist);
+ LOGSYNC_UNLOCK(log);
+ }
+
+ /* inherit younger/larger clsn */
+ LOGSYNC_LOCK(log);
+ /* note: difft is overwritten here with the clsn delta */
+ logdiff(difft, tblk->clsn, log);
+ logdiff(diffp, mp->clsn, log);
+ if (difft > diffp)
+ mp->clsn = tblk->clsn;
+ LOGSYNC_UNLOCK(log);
+ } else {
+ /* page has no lsn yet: adopt the transaction's log and lsn */
+ mp->log = log;
+ mp->lsn = lsn;
+
+ /* insert bp after tblock in logsync list */
+ LOGSYNC_LOCK(log);
+
+ log->count++;
+ list_add(&mp->synclist, &tblk->synclist);
+
+ mp->clsn = tblk->clsn;
+ LOGSYNC_UNLOCK(log);
+ }
+ }
+
+ /* write the last buffer. */
+ /* mp stays NULL when the loop never ran (nblocks <= 0) */
+ if (mp) {
+ write_metapage(mp);
+ }
+
+ return (0);
+}
+
+
+/*
+ * NAME: dbNextAG()
+ *
+ * FUNCTION: find the preferred allocation group for new allocations.
+ *
+ * Within the allocation groups, we maintain a preferred
+ * allocation group which consists of a group with at least
+ * average free space. It is the preferred group that we target
+ * new inode allocation towards. The tie-in between inode
+ * allocation and block allocation occurs as we allocate the
+ * first (data) block of an inode and specify the inode (block)
+ * as the allocation hint for this block.
+ *
+ * We try to avoid having more than one open file growing in
+ * an allocation group, as this will lead to fragmentation.
+ * This differs from the old OS/2 method of trying to keep
+ * empty ags around for large allocations.
+ *
+ * PARAMETERS:
+ * ipbmap - pointer to in-core inode for the block map.
+ *
+ * RETURN VALUES:
+ * the preferred allocation group number.
+ */
+int dbNextAG(struct inode *ipbmap)
+{
+	s64 avgfree;
+	int agpref;
+	s64 hwm = 0;
+	int i;
+	int next_best = -1;
+	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
+
+	/* db_agpref, db_agfree[] and db_nfree are guarded by BMAP_LOCK */
+	BMAP_LOCK(bmp);
+
+	/* determine the average number of free blocks within the ags.
+	 * NOTE(review): the (u32) cast keeps only the low 32 bits of
+	 * db_nfree, so the average is understated once more than 2^32
+	 * blocks are free.  this looks like a way to avoid a 64-bit
+	 * division; left unchanged here - confirm before widening.
+	 */
+	avgfree = (u32)bmp->db_nfree / bmp->db_numag;
+
+	/*
+	 * if the current preferred ag does not have an active allocator
+	 * and has at least average freespace, return it
+	 */
+	agpref = bmp->db_agpref;
+	if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
+	    (bmp->db_agfree[agpref] >= avgfree))
+		goto unlock;
+
+	/* From the last preferred ag, find the next one with at least
+	 * average free space.  every ag is visited once, wrapping
+	 * around to ag 0 after the last one.
+	 */
+	for (i = 0 ; i < bmp->db_numag; i++, agpref++) {
+		if (agpref == bmp->db_numag)
+			agpref = 0;
+
+		if (atomic_read(&bmp->db_active[agpref]))
+			/* open file is currently growing in this ag */
+			continue;
+		if (bmp->db_agfree[agpref] >= avgfree) {
+			/* Return this one */
+			bmp->db_agpref = agpref;
+			goto unlock;
+		} else if (bmp->db_agfree[agpref] > hwm) {
+			/* Less than avg. freespace, but best so far */
+			hwm = bmp->db_agfree[agpref];
+			next_best = agpref;
+		}
+	}
+
+	/*
+	 * If no inactive ag was found with average freespace, use the
+	 * next best
+	 */
+	if (next_best != -1)
+		bmp->db_agpref = next_best;
+	/* else leave db_agpref unchanged */
+unlock:
+	/* capture the result while the lock is still held so that the
+	 * value returned is the one selected above, not whatever another
+	 * caller may have stored after the lock was dropped.
+	 */
+	agpref = bmp->db_agpref;
+	BMAP_UNLOCK(bmp);
+
+	/* return the preferred group.
+	 */
+	return (agpref);
+}
+
+/*
+ * NAME: dbAlloc()
+ *
+ * FUNCTION: attempt to allocate a specified number of contiguous free
+ * blocks from the working allocation block map.
+ *
+ * the block allocation policy uses hints and a multi-step
+ * approach.
+ *
+ * for allocation requests smaller than the number of blocks
+ * per dmap, we first try to allocate the new blocks
+ * immediately following the hint. if these blocks are not
+ * available, we try to allocate blocks near the hint. if
+ * no blocks near the hint are available, we next try to
+ * allocate within the same dmap as contains the hint.
+ *
+ * if no blocks are available in the dmap or the allocation
+ * request is larger than the dmap size, we try to allocate
+ * within the same allocation group as contains the hint. if
+ * this does not succeed, we finally try to allocate anywhere
+ * within the aggregate.
+ *
+ * we also try to allocate anywhere within the aggregate for
+ * allocation requests larger than the allocation group
+ * size or requests that specify no hint value.
+ *
+ * PARAMETERS:
+ * ip - pointer to in-core inode;
+ * hint - allocation hint.
+ * nblocks - number of contiguous blocks in the range.
+ * results - on successful return, set to the starting block number
+ * of the newly allocated contiguous range.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -ENOSPC - insufficient disk resources
+ * -EIO - i/o error
+ */
+int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
+{
+ int rc, agno;
+ struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+ struct bmap *bmp;
+ struct metapage *mp;
+ s64 lblkno, blkno;
+ struct dmap *dp;
+ int l2nb;
+ s64 mapSize;
+ int writers;
+
+ /* assert that nblocks is valid */
+ assert(nblocks > 0);
+
+#ifdef _STILL_TO_PORT
+ /* DASD limit check F226941 */
+ if (OVER_LIMIT(ip, nblocks))
+ return -ENOSPC;
+#endif /* _STILL_TO_PORT */
+
+ /* get the log2 number of blocks to be allocated.
+ * if the number of blocks is not a log2 multiple,
+ * it will be rounded up to the next log2 multiple.
+ */
+ l2nb = BLKSTOL2(nblocks);
+
+ bmp = JFS_SBI(ip->i_sb)->bmap;
+
+//retry: /* serialize w.r.t.extendfs() */
+ /* read before any bmap inode lock is taken in this function */
+ mapSize = bmp->db_mapsize;
+
+ /* the hint should be within the map */
+ if (hint >= mapSize) {
+ jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map");
+ return -EIO;
+ }
+
+ /* if the number of blocks to be allocated is greater than the
+ * allocation group size, try to allocate anywhere.
+ */
+ if (l2nb > bmp->db_agl2size) {
+ /* dbAllocAny() is called with the bmap inode lock held in
+ * write mode; the lock is dropped at write_unlock.
+ */
+ IWRITE_LOCK(ipbmap);
+
+ rc = dbAllocAny(bmp, nblocks, l2nb, results);
+ if (rc == 0) {
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results,
+ nblocks);
+ }
+
+ goto write_unlock;
+ }
+
+ /*
+ * If no hint, let dbNextAG recommend an allocation group
+ */
+ if (hint == 0)
+ goto pref_ag;
+
+ /* we would like to allocate close to the hint. adjust the
+ * hint to the block following the hint since the allocators
+ * will start looking for free space starting at this point.
+ */
+ blkno = hint + 1;
+
+ /* the hint was the last block of the map: nothing follows it */
+ if (blkno >= bmp->db_mapsize)
+ goto pref_ag;
+
+ /* allocation group number: block number divided by the ag size */
+ agno = blkno >> bmp->db_agl2size;
+
+ /* check if blkno crosses over into a new allocation group.
+ * if so, check if we should allow allocations within this
+ * allocation group.
+ */
+ if ((blkno & (bmp->db_agsize - 1)) == 0)
+ /* check if the AG is currently being written to.
+ * if so, call dbNextAG() to find a non-busy
+ * AG with sufficient free space.
+ */
+ if (atomic_read(&bmp->db_active[agno]))
+ goto pref_ag;
+
+ /* check if the allocation request size can be satisfied from a
+ * single dmap. if so, try to allocate from the dmap containing
+ * the hint using a tiered strategy.
+ */
+ if (nblocks <= BPERDMAP) {
+ /* the dmap-level attempts below run under the read lock */
+ IREAD_LOCK(ipbmap);
+
+ /* get the buffer for the dmap containing the hint.
+ */
+ /* preset so that a failed read is reported as an i/o error */
+ rc = -EIO;
+ lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
+ mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
+ if (mp == NULL)
+ goto read_unlock;
+
+ dp = (struct dmap *) mp->data;
+
+ /* first, try to satisfy the allocation request with the
+ * blocks beginning at the hint.
+ */
+ /* any result other than -ENOSPC (success or a hard error)
+ * ends the search here.
+ */
+ if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
+ != -ENOSPC) {
+ if (rc == 0) {
+ *results = blkno;
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+ *results, nblocks);
+ mark_metapage_dirty(mp);
+ }
+
+ release_metapage(mp);
+ goto read_unlock;
+ }
+
+ /* more than one active writer in this ag, or exactly one
+ * that is not this inode (its active_ag is a different ag).
+ */
+ writers = atomic_read(&bmp->db_active[agno]);
+ if ((writers > 1) ||
+ ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
+ /*
+ * Someone else is writing in this allocation
+ * group. To avoid fragmenting, try another ag
+ */
+ release_metapage(mp);
+ IREAD_UNLOCK(ipbmap);
+ goto pref_ag;
+ }
+
+ /* next, try to satisfy the allocation request with blocks
+ * near the hint.
+ */
+ if ((rc =
+ dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
+ != -ENOSPC) {
+ if (rc == 0) {
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+ *results, nblocks);
+ mark_metapage_dirty(mp);
+ }
+
+ release_metapage(mp);
+ goto read_unlock;
+ }
+
+ /* try to satisfy the allocation request with blocks within
+ * the same dmap as the hint.
+ */
+ if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
+ != -ENOSPC) {
+ if (rc == 0) {
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+ *results, nblocks);
+ mark_metapage_dirty(mp);
+ }
+
+ release_metapage(mp);
+ goto read_unlock;
+ }
+
+ /* nothing available in this dmap: drop the page and the read
+ * lock before the ag-level search takes the write lock.
+ */
+ release_metapage(mp);
+ IREAD_UNLOCK(ipbmap);
+ }
+
+ /* try to satisfy the allocation request with blocks within
+ * the same allocation group as the hint.
+ */
+ IWRITE_LOCK(ipbmap);
+ if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results))
+ != -ENOSPC) {
+ if (rc == 0)
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
+ *results, nblocks);
+ goto write_unlock;
+ }
+ /* released here because pref_ag re-acquires it after dbNextAG() */
+ IWRITE_UNLOCK(ipbmap);
+
+
+ pref_ag:
+ /*
+ * Let dbNextAG recommend a preferred allocation group
+ */
+ /* every jump to this label arrives with no bmap inode lock held */
+ agno = dbNextAG(ipbmap);
+ IWRITE_LOCK(ipbmap);
+
+ /* Try to allocate within this allocation group. if that fails, try to
+ * allocate anywhere in the map.
+ */
+ if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC)
+ rc = dbAllocAny(bmp, nblocks, l2nb, results);
+ if (rc == 0) {
+ DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, nblocks);
+ }
+
+ /* exit path for callers holding the lock in write mode */
+ write_unlock:
+ IWRITE_UNLOCK(ipbmap);
+
+ return (rc);
+
+ /* exit path for callers holding the lock in read mode */
+ read_unlock:
+ IREAD_UNLOCK(ipbmap);
+
+ return (rc);
+}
+
+#ifdef _NOTYET
+/*
+ * NAME: dbAllocExact()
+ *
+ * FUNCTION: try to allocate the requested extent;
+ *
+ * PARAMETERS:
+ * ip - pointer to in-core inode;
+ * blkno - extent address;
+ * nblocks - extent length;
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -ENOSPC - insufficient disk resources
+ * -EIO - i/o error
+ */
+int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
+{
+	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
+	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
+	struct metapage *dmap_mp = NULL;
+	int rc;
+
+	IREAD_LOCK(ipbmap);
+
+	if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
+		/*
+		 * malformed request.  defragfs policy: at most 64 blocks
+		 * are moved, and the request must be satisfiable from a
+		 * single dmap.
+		 */
+		rc = -EINVAL;
+	} else if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
+		/* the free space is no longer available */
+		rc = -ENOSPC;
+	} else {
+		/* bring in the dmap that covers the extent */
+		dmap_mp = read_metapage(ipbmap,
+					BLKTODMAP(blkno, bmp->db_l2nbperpage),
+					PSIZE, 0);
+		if (dmap_mp == NULL)
+			rc = -EIO;
+		else
+			/* attempt to claim exactly the requested blocks */
+			rc = dbAllocNext(bmp, (struct dmap *) dmap_mp->data,
+					 blkno, nblocks);
+	}
+
+	IREAD_UNLOCK(ipbmap);
+
+	/* no dmap page was obtained: nothing to dirty or release */
+	if (dmap_mp == NULL)
+		return rc;
+
+	if (rc == 0) {
+		DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
+		mark_metapage_dirty(dmap_mp);
+	}
+	release_metapage(dmap_mp);
+
+	return rc;
+}
+#endif /* _NOTYET */
+
+/*
+ * NAME: dbReAlloc()
+ *
+ * FUNCTION: attempt to extend a current allocation by a specified
+ * number of blocks.
+ *
+ * this routine attempts to satisfy the allocation request
+ * by first trying to extend the existing allocation in
+ * place by allocating the additional blocks as the blocks
+ * immediately following the current allocation. if these
+ * blocks are not available, this routine will attempt to
+ * allocate a new set of contiguous blocks large enough
+ * to cover the existing allocation plus the additional
+ * number of blocks required.
+ *
+ * PARAMETERS:
+ * ip - pointer to in-core inode requiring allocation.
+ * blkno - starting block of the current allocation.
+ * nblocks - number of contiguous blocks within the current
+ * allocation.
+ * addnblocks - number of blocks to add to the allocation.
+ * results - on successful return, set to the starting block number
+ * of the existing allocation if the existing allocation
+ * was extended in place or to a newly allocated contiguous
+ * range if the existing allocation could not be extended
+ * in place.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -ENOSPC - insufficient disk resources
+ * -EIO - i/o error
+ */
+int
+dbReAlloc(struct inode *ip,
+	  s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
+{
+	int status;
+
+	/* first choice: grow the existing extent where it already lives */
+	status = dbExtend(ip, blkno, nblocks, addnblocks);
+
+	switch (status) {
+	case 0:
+		/* extended in place, so the start block is unchanged */
+		*results = blkno;
+		return 0;
+	case -ENOSPC:
+		/* no room behind the extent: fall through to relocation */
+		break;
+	default:
+		/* any other failure is passed straight back */
+		return status;
+	}
+
+	/* second choice: ask for one contiguous range big enough for the
+	 * old blocks plus the additional ones, using the last block of
+	 * the current allocation as the hint.
+	 */
+	return dbAlloc(ip, blkno + nblocks - 1, addnblocks + nblocks,
+		       results);
+}
+
+
+/*
+ * NAME: dbExtend()
+ *
+ * FUNCTION: attempt to extend a current allocation by a specified
+ * number of blocks.
+ *
+ * this routine attempts to satisfy the allocation request
+ * by first trying to extend the existing allocation in
+ * place by allocating the additional blocks as the blocks
+ * immediately following the current allocation.
+ *
+ * PARAMETERS:
+ * ip - pointer to in-core inode requiring allocation.
+ * blkno - starting block of the current allocation.
+ * nblocks - number of contiguous blocks within the current
+ * allocation.
+ * addnblocks - number of blocks to add to the allocation.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -ENOSPC - insufficient disk resources
+ * -EIO - i/o error
+ */
+static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ s64 lblkno, lastblkno, extblkno;
+ uint rel_block;
+ struct metapage *mp;
+ struct dmap *dp;
+ int rc;
+ struct inode *ipbmap = sbi->ipbmap;
+ struct bmap *bmp;
+
+ /*
+ * We don't want a non-aligned extent to cross a page boundary
+ */
+ if (((rel_block = blkno & (sbi->nbperpage - 1))) &&
+ (rel_block + nblocks + addnblocks > sbi->nbperpage))
+ return -ENOSPC;
+
+ /* get the last block of the current allocation */
+ lastblkno = blkno + nblocks - 1;
+
+ /* determine the block number of the block following
+ * the existing allocation.
+ */
+ extblkno = lastblkno + 1;
+
+ IREAD_LOCK(ipbmap);
+
+ /* better be within the file system */
+ bmp = sbi->bmap;
+ if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
+ IREAD_UNLOCK(ipbmap);
+ jfs_error(ip->i_sb,
+ "dbExtend: the block is outside the filesystem");
+ return -EIO;
+ }
+
+ /* we'll attempt to extend the current allocation in place by
+ * allocating the additional blocks as the blocks immediately
+ * following the current allocation. we only try to extend the
+ * current allocation in place if the number of additional blocks
+ * can fit into a dmap, the last block of the current allocation
+ * is not the last block of the file system, and the start of the
+ * inplace extension is not on an allocation group boundary.
+ */
+ if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize ||
+ (extblkno & (bmp->db_agsize - 1)) == 0) {
+ IREAD_UNLOCK(ipbmap);
+ return -ENOSPC;
+ }
+
+ /* get the buffer for the dmap containing the first block
+ * of the extension.
+ */
+ lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage);
+ mp = r