diff options
Diffstat (limited to 'fs/jfs/jfs_dmap.c')
-rw-r--r-- | fs/jfs/jfs_dmap.c | 4272 |
1 files changed, 4272 insertions, 0 deletions
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c new file mode 100644 index 00000000000..d86e467c6e4 --- /dev/null +++ b/fs/jfs/jfs_dmap.c @@ -0,0 +1,4272 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_lock.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + +/* + * Debug code for double-checking block map + */ +/* #define _JFS_DEBUG_DMAP 1 */ + +#ifdef _JFS_DEBUG_DMAP +#define DBINITMAP(size,ipbmap,results) \ + DBinitmap(size,ipbmap,results) +#define DBALLOC(dbmap,mapsize,blkno,nblocks) \ + DBAlloc(dbmap,mapsize,blkno,nblocks) +#define DBFREE(dbmap,mapsize,blkno,nblocks) \ + DBFree(dbmap,mapsize,blkno,nblocks) +#define DBALLOCCK(dbmap,mapsize,blkno,nblocks) \ + DBAllocCK(dbmap,mapsize,blkno,nblocks) +#define DBFREECK(dbmap,mapsize,blkno,nblocks) \ + DBFreeCK(dbmap,mapsize,blkno,nblocks) + +static void DBinitmap(s64, struct inode *, u32 **); +static void DBAlloc(uint *, s64, s64, s64); +static void DBFree(uint *, s64, s64, s64); +static void DBAllocCK(uint *, s64, s64, s64); +static void DBFreeCK(uint *, s64, s64, s64); +#else +#define DBINITMAP(size,ipbmap,results) +#define 
DBALLOC(dbmap, mapsize, blkno, nblocks) +#define DBFREE(dbmap, mapsize, blkno, nblocks) +#define DBALLOCCK(dbmap, mapsize, blkno, nblocks) +#define DBFREECK(dbmap, mapsize, blkno, nblocks) +#endif /* _JFS_DEBUG_DMAP */ + +/* + * SERIALIZATION of the Block Allocation Map. + * + * the working state of the block allocation map is accessed in + * two directions: + * + * 1) allocation and free requests that start at the dmap + * level and move up through the dmap control pages (i.e. + * the vast majority of requests). + * + * 2) allocation requests that start at dmap control page + * level and work down towards the dmaps. + * + * the serialization scheme used here is as follows. + * + * requests which start at the bottom are serialized against each + * other through buffers, and each request holds onto its buffers + * as it works its way up from a single dmap to the required level + * of dmap control page. + * requests that start at the top are serialized against each other + * and against requests that start from the bottom by the multiple + * read/single write inode lock of the bmap inode. requests starting + * at the top take this lock in write mode while requests starting at + * the bottom take the lock in read mode. a single top-down request + * may proceed exclusively while multiple bottom-up requests may + * proceed simultaneously (under the protection of busy buffers). + * + * in addition to information found in dmaps and dmap control pages, + * the working state of the block allocation map also includes read/ + * write information maintained in the bmap descriptor (i.e. total + * free block count, allocation group level free block counts). + * a single exclusive lock (BMAP_LOCK) is used to guard this information + * in the face of multiple bottom-up requests. + * (lock ordering: IREAD_LOCK, BMAP_LOCK); + * + * accesses to the persistent state of the block allocation map (limited + * to the persistent bitmaps in dmaps) are guarded by (busy) buffers. 
+ */ + +#define BMAP_LOCK_INIT(bmp) init_MUTEX(&bmp->db_bmaplock) +#define BMAP_LOCK(bmp) down(&bmp->db_bmaplock) +#define BMAP_UNLOCK(bmp) up(&bmp->db_bmaplock) + +/* + * forward references + */ +static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); +static void dbBackSplit(dmtree_t * tp, int leafno); +static void dbJoin(dmtree_t * tp, int leafno, int newval); +static void dbAdjTree(dmtree_t * tp, int leafno, int newval); +static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, + int level); +static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results); +static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks, + int l2nb, s64 * results); +static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks, + int l2nb, + s64 * results); +static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, + s64 * results); +static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, + s64 * results); +static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); +static int dbFindBits(u32 word, int l2nb); +static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno); +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); +static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbMaxBud(u8 * cp); +s64 dbMapFileSizeToMapSize(struct inode *ipbmap); +static int blkstol2(s64 nb); + +static int cntlz(u32 value); +static int cnttz(u32 word); + +static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static 
int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks); +static int dbInitDmapTree(struct dmap * dp); +static int dbInitTree(struct dmaptree * dtp); +static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i); +static int dbGetL2AGSize(s64 nblocks); + +/* + * buddy table + * + * table used for determining buddy sizes within characters of + * dmap bitmap words. the characters themselves serve as indexes + * into the table, with the table elements yielding the maximum + * binary buddy of free bits within the character. + */ +static s8 budtab[256] = { + 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1 +}; + + +/* + * NAME: dbMount() + * + * FUNCTION: initializate the block allocation map. + * + * memory is allocated for the in-core bmap descriptor and + * the in-core descriptor is initialized from disk. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. 
+ * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient memory + * -EIO - i/o error + */ +int dbMount(struct inode *ipbmap) +{ + struct bmap *bmp; + struct dbmap_disk *dbmp_le; + struct metapage *mp; + int i; + + /* + * allocate/initialize the in-memory bmap descriptor + */ + /* allocate memory for the in-memory bmap descriptor */ + bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL); + if (bmp == NULL) + return -ENOMEM; + + /* read the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + kfree(bmp); + return -EIO; + } + + /* copy the on-disk bmap descriptor to its in-memory version. */ + dbmp_le = (struct dbmap_disk *) mp->data; + bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); + bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); + bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); + bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); + bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); + bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); + bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); + bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); + bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); + bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); + bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); + bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); + for (i = 0; i < MAXAG; i++) + bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); + bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); + bmp->db_maxfreebud = dbmp_le->dn_maxfreebud; + + /* release the buffer. */ + release_metapage(mp); + + /* bind the bmap inode and the bmap descriptor to each other. 
*/ + bmp->db_ipbmap = ipbmap; + JFS_SBI(ipbmap->i_sb)->bmap = bmp; + + memset(bmp->db_active, 0, sizeof(bmp->db_active)); + DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap); + + /* + * allocate/initialize the bmap lock + */ + BMAP_LOCK_INIT(bmp); + + return (0); +} + + +/* + * NAME: dbUnmount() + * + * FUNCTION: terminate the block allocation map in preparation for + * file system unmount. + * + * the in-core bmap descriptor is written to disk and + * the memory for this descriptor is freed. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbUnmount(struct inode *ipbmap, int mounterror) +{ + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + int i; + + if (!(mounterror || isReadOnly(ipbmap))) + dbSync(ipbmap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipbmap->i_mapping, 0); + + /* + * Sanity Check + */ + for (i = 0; i < bmp->db_numag; i++) + if (atomic_read(&bmp->db_active[i])) + printk(KERN_ERR "dbUnmount: db_active[%d] = %d\n", + i, atomic_read(&bmp->db_active[i])); + + /* free the memory for the in-memory bmap. */ + kfree(bmp); + + return (0); +} + +/* + * dbSync() + */ +int dbSync(struct inode *ipbmap) +{ + struct dbmap_disk *dbmp_le; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + struct metapage *mp; + int i; + + /* + * write bmap global control page + */ + /* get the buffer for the on-disk bmap descriptor. 
*/ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jfs_err("dbSync: read_metapage failed!"); + return -EIO; + } + /* copy the in-memory version of the bmap to the on-disk version */ + dbmp_le = (struct dbmap_disk *) mp->data; + dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize); + dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree); + dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage); + dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag); + dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel); + dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); + dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); + dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); + dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); + dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); + dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); + dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); + for (i = 0; i < MAXAG; i++) + dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]); + dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize); + dbmp_le->dn_maxfreebud = bmp->db_maxfreebud; + + /* write the buffer */ + write_metapage(mp); + + /* + * write out dirty pages of bmap + */ + filemap_fdatawrite(ipbmap->i_mapping); + filemap_fdatawait(ipbmap->i_mapping); + + ipbmap->i_state |= I_DIRTY; + diWriteSpecial(ipbmap, 0); + + return (0); +} + + +/* + * NAME: dbFree() + * + * FUNCTION: free the specified block range from the working block + * allocation map. + * + * the blocks will be free from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. 
+ * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbFree(struct inode *ip, s64 blkno, s64 nblocks) +{ + struct metapage *mp; + struct dmap *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + + IREAD_LOCK(ipbmap); + + /* block to be freed better be within the mapsize. */ + if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { + IREAD_UNLOCK(ipbmap); + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ip->i_sb, + "dbFree: block to be freed is outside the map"); + return -EIO; + } + + /* + * free the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* determine the number of blocks to be freed from + * this dmap. + */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); + + /* free the blocks. */ + if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + + DBFREE(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); + } + + /* write the last buffer. */ + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +/* + * NAME: dbUpdatePMap() + * + * FUNCTION: update the allocation state (free or allocate) of the + * specified block range in the persistent block allocation map. + * + * the blocks will be updated in the persistent map one + * dmap at a time. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. 
+ * free - TRUE if block range is to be freed from the persistent + * map; FALSE if it is to be allocated. + * blkno - starting block number of the range. + * nblocks - number of contiguous blocks in the range. + * tblk - transaction block; + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int +dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, struct tblock * tblk) +{ + int nblks, dbitno, wbitno, rbits; + int word, nbits, nwords; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + s64 lblkno, rem, lastlblkno; + u32 mask; + struct dmap *dp; + struct metapage *mp; + struct jfs_log *log; + int lsn, difft, diffp; + + /* the blocks better be within the mapsize. */ + if (blkno + nblocks > bmp->db_mapsize) { + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ipbmap->i_sb, + "dbUpdatePMap: blocks are outside the map"); + return -EIO; + } + + /* compute delta of transaction lsn from log syncpt */ + lsn = tblk->lsn; + log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; + logdiff(difft, lsn, log); + + /* + * update the block state a dmap at a time. + */ + mp = NULL; + lastlblkno = 0; + for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) { + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + if (lblkno != lastlblkno) { + if (mp) { + write_metapage(mp); + } + + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, + 0); + if (mp == NULL) + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* determine the bit number and word within the dmap of + * the starting block. also determine how many blocks + * are to be updated within this dmap. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + nblks = min(rem, (s64)BPERDMAP - dbitno); + + /* update the bits of the dmap words. the first and last + * words may only have a subset of their bits updated. if + * this is the case, we'll work against that word (i.e. 
+ * partial first and/or last) only in a single pass. a + * single pass will also be used to update all words that + * are to have all their bits updated. + */ + for (rbits = nblks; rbits > 0; + rbits -= nbits, dbitno += nbits) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nbits = min(rbits, DBWORD - wbitno); + + /* check if only part of the word is to be updated. */ + if (nbits < DBWORD) { + /* update (free or allocate) the bits + * in this word. + */ + mask = + (ONES << (DBWORD - nbits) >> wbitno); + if (free) + dp->pmap[word] &= + cpu_to_le32(~mask); + else + dp->pmap[word] |= + cpu_to_le32(mask); + + word += 1; + } else { + /* one or more words are to have all + * their bits updated. determine how + * many words and how many bits. + */ + nwords = rbits >> L2DBWORD; + nbits = nwords << L2DBWORD; + + /* update (free or allocate) the bits + * in these words. + */ + if (free) + memset(&dp->pmap[word], 0, + nwords * 4); + else + memset(&dp->pmap[word], (int) ONES, + nwords * 4); + + word += nwords; + } + } + + /* + * update dmap lsn + */ + if (lblkno == lastlblkno) + continue; + + lastlblkno = lblkno; + + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + + /* move bp after tblock in logsync list */ + LOGSYNC_LOCK(log); + list_move(&mp->synclist, &tblk->synclist); + LOGSYNC_UNLOCK(log); + } + + /* inherit younger/larger clsn */ + LOGSYNC_LOCK(log); + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log); + } else { + mp->log = log; + mp->lsn = lsn; + + /* insert bp after tblock in logsync list */ + LOGSYNC_LOCK(log); + + log->count++; + list_add(&mp->synclist, &tblk->synclist); + + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log); + } + } + + /* write the last buffer. 
*/ + if (mp) { + write_metapage(mp); + } + + return (0); +} + + +/* + * NAME: dbNextAG() + * + * FUNCTION: find the preferred allocation group for new allocations. + * + * Within the allocation groups, we maintain a preferred + * allocation group which consists of a group with at least + * average free space. It is the preferred group that we target + * new inode allocation towards. The tie-in between inode + * allocation and block allocation occurs as we allocate the + * first (data) block of an inode and specify the inode (block) + * as the allocation hint for this block. + * + * We try to avoid having more than one open file growing in + * an allocation group, as this will lead to fragmentation. + * This differs from the old OS/2 method of trying to keep + * empty ags around for large allocations. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * the preferred allocation group number. + */ +int dbNextAG(struct inode *ipbmap) +{ + s64 avgfree; + int agpref; + s64 hwm = 0; + int i; + int next_best = -1; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + BMAP_LOCK(bmp); + + /* determine the average number of free blocks within the ags. */ + avgfree = (u32)bmp->db_nfree / bmp->db_numag; + + /* + * if the current preferred ag does not have an active allocator + * and has at least average freespace, return it + */ + agpref = bmp->db_agpref; + if ((atomic_read(&bmp->db_active[agpref]) == 0) && + (bmp->db_agfree[agpref] >= avgfree)) + goto unlock; + + /* From the last preferred ag, find the next one with at least + * average free space. + */ + for (i = 0 ; i < bmp->db_numag; i++, agpref++) { + if (agpref == bmp->db_numag) + agpref = 0; + + if (atomic_read(&bmp->db_active[agpref])) + /* open file is currently growing in this ag */ + continue; + if (bmp->db_agfree[agpref] >= avgfree) { + /* Return this one */ + bmp->db_agpref = agpref; + goto unlock; + } else if (bmp->db_agfree[agpref] > hwm) { + /* Less than avg. 
freespace, but best so far */ + hwm = bmp->db_agfree[agpref]; + next_best = agpref; + } + } + + /* + * If no inactive ag was found with average freespace, use the + * next best + */ + if (next_best != -1) + bmp->db_agpref = next_best; + /* else leave db_agpref unchanged */ +unlock: + BMAP_UNLOCK(bmp); + + /* return the preferred group. + */ + return (bmp->db_agpref); +} + +/* + * NAME: dbAlloc() + * + * FUNCTION: attempt to allocate a specified number of contiguous free + * blocks from the working allocation block map. + * + * the block allocation policy uses hints and a multi-step + * approach. + * + * for allocation requests smaller than the number of blocks + * per dmap, we first try to allocate the new blocks + * immediately following the hint. if these blocks are not + * available, we try to allocate blocks near the hint. if + * no blocks near the hint are available, we next try to + * allocate within the same dmap as contains the hint. + * + * if no blocks are available in the dmap or the allocation + * request is larger than the dmap size, we try to allocate + * within the same allocation group as contains the hint. if + * this does not succeed, we finally try to allocate anywhere + * within the aggregate. + * + * we also try to allocate anywhere within the aggregate for + * for allocation requests larger than the allocation group + * size or requests that specify no hint value. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * hint - allocation hint. + * nblocks - number of contiguous blocks in the range. + * results - on successful return, set to the starting block number + * of the newly allocated contiguous range. 
+ * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) +{ + int rc, agno; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp; + struct metapage *mp; + s64 lblkno, blkno; + struct dmap *dp; + int l2nb; + s64 mapSize; + int writers; + + /* assert that nblocks is valid */ + assert(nblocks > 0); + +#ifdef _STILL_TO_PORT + /* DASD limit check F226941 */ + if (OVER_LIMIT(ip, nblocks)) + return -ENOSPC; +#endif /* _STILL_TO_PORT */ + + /* get the log2 number of blocks to be allocated. + * if the number of blocks is not a log2 multiple, + * it will be rounded up to the next log2 multiple. + */ + l2nb = BLKSTOL2(nblocks); + + bmp = JFS_SBI(ip->i_sb)->bmap; + +//retry: /* serialize w.r.t.extendfs() */ + mapSize = bmp->db_mapsize; + + /* the hint should be within the map */ + if (hint >= mapSize) { + jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); + return -EIO; + } + + /* if the number of blocks to be allocated is greater than the + * allocation group size, try to allocate anywhere. + */ + if (l2nb > bmp->db_agl2size) { + IWRITE_LOCK(ipbmap); + + rc = dbAllocAny(bmp, nblocks, l2nb, results); + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, + nblocks); + } + + goto write_unlock; + } + + /* + * If no hint, let dbNextAG recommend an allocation group + */ + if (hint == 0) + goto pref_ag; + + /* we would like to allocate close to the hint. adjust the + * hint to the block following the hint since the allocators + * will start looking for free space starting at this point. + */ + blkno = hint + 1; + + if (blkno >= bmp->db_mapsize) + goto pref_ag; + + agno = blkno >> bmp->db_agl2size; + + /* check if blkno crosses over into a new allocation group. + * if so, check if we should allow allocations within this + * allocation group. 
+ */ + if ((blkno & (bmp->db_agsize - 1)) == 0) + /* check if the AG is currenly being written to. + * if so, call dbNextAG() to find a non-busy + * AG with sufficient free space. + */ + if (atomic_read(&bmp->db_active[agno])) + goto pref_ag; + + /* check if the allocation request size can be satisfied from a + * single dmap. if so, try to allocate from the dmap containing + * the hint using a tiered strategy. + */ + if (nblocks <= BPERDMAP) { + IREAD_LOCK(ipbmap); + + /* get the buffer for the dmap containing the hint. + */ + rc = -EIO; + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + goto read_unlock; + + dp = (struct dmap *) mp->data; + + /* first, try to satisfy the allocation request with the + * blocks beginning at the hint. + */ + if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks)) + != -ENOSPC) { + if (rc == 0) { + *results = blkno; + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, + *results, nblocks); + mark_metapage_dirty(mp); + } + + release_metapage(mp); + goto read_unlock; + } + + writers = atomic_read(&bmp->db_active[agno]); + if ((writers > 1) || + ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) { + /* + * Someone else is writing in this allocation + * group. To avoid fragmenting, try another ag + */ + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + goto pref_ag; + } + + /* next, try to satisfy the allocation request with blocks + * near the hint. + */ + if ((rc = + dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, + *results, nblocks); + mark_metapage_dirty(mp); + } + + release_metapage(mp); + goto read_unlock; + } + + /* try to satisfy the allocation request with blocks within + * the same dmap as the hint. 
+ */ + if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, + *results, nblocks); + mark_metapage_dirty(mp); + } + + release_metapage(mp); + goto read_unlock; + } + + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + } + + /* try to satisfy the allocation request with blocks within + * the same allocation group as the hint. + */ + IWRITE_LOCK(ipbmap); + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, + *results, nblocks); + goto write_unlock; + } + IWRITE_UNLOCK(ipbmap); + + + pref_ag: + /* + * Let dbNextAG recommend a preferred allocation group + */ + agno = dbNextAG(ipbmap); + IWRITE_LOCK(ipbmap); + + /* Try to allocate within this allocation group. if that fails, try to + * allocate anywhere in the map. + */ + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC) + rc = dbAllocAny(bmp, nblocks, l2nb, results); + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, nblocks); + } + + write_unlock: + IWRITE_UNLOCK(ipbmap); + + return (rc); + + read_unlock: + IREAD_UNLOCK(ipbmap); + + return (rc); +} + +#ifdef _NOTYET +/* + * NAME: dbAllocExact() + * + * FUNCTION: try to allocate the requested extent; + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - extent address; + * nblocks - extent length; + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) +{ + int rc; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct dmap *dp; + s64 lblkno; + struct metapage *mp; + + IREAD_LOCK(ipbmap); + + /* + * validate extent request: + * + * note: defragfs policy: + * max 64 blocks will be moved. + * allocation request size must be satisfied from a single dmap. 
+ */ + if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + return -EINVAL; + } + + if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) { + /* the free space is no longer available */ + IREAD_UNLOCK(ipbmap); + return -ENOSPC; + } + + /* read in the dmap covering the extent */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* try to allocate the requested extent */ + rc = dbAllocNext(bmp, dp, blkno, nblocks); + + IREAD_UNLOCK(ipbmap); + + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + return (rc); +} +#endif /* _NOTYET */ + +/* + * NAME: dbReAlloc() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. if these + * blocks are not available, this routine will attempt to + * allocate a new set of contiguous blocks large enough + * to cover the existing allocation plus the additional + * number of blocks required. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. + * results - on successful return, set to the starting block number + * of the existing allocation if the existing allocation + * was extended in place or to a newly allocated contiguous + * range if the existing allocation could not be extended + * in place. 
+ * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int +dbReAlloc(struct inode *ip, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results) +{ + int rc; + + /* try to extend the allocation in place. + */ + if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) { + *results = blkno; + return (0); + } else { + if (rc != -ENOSPC) + return (rc); + } + + /* could not extend the allocation in place, so allocate a + * new set of blocks for the entire request (i.e. try to get + * a range of contiguous blocks large enough to cover the + * existing allocation plus the additional blocks.) + */ + return (dbAlloc + (ip, blkno + nblocks - 1, addnblocks + nblocks, results)); +} + + +/* + * NAME: dbExtend() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. 
+ * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 lblkno, lastblkno, extblkno; + uint rel_block; + struct metapage *mp; + struct dmap *dp; + int rc; + struct inode *ipbmap = sbi->ipbmap; + struct bmap *bmp; + + /* + * We don't want a non-aligned extent to cross a page boundary + */ + if (((rel_block = blkno & (sbi->nbperpage - 1))) && + (rel_block + nblocks + addnblocks > sbi->nbperpage)) + return -ENOSPC; + + /* get the last block of the current allocation */ + lastblkno = blkno + nblocks - 1; + + /* determine the block number of the block following + * the existing allocation. + */ + extblkno = lastblkno + 1; + + IREAD_LOCK(ipbmap); + + /* better be within the file system */ + bmp = sbi->bmap; + if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + jfs_error(ip->i_sb, + "dbExtend: the block is outside the filesystem"); + return -EIO; + } + + /* we'll attempt to extend the current allocation in place by + * allocating the additional blocks as the blocks immediately + * following the current allocation. we only try to extend the + * current allocation in place if the number of additional blocks + * can fit into a dmap, the last block of the current allocation + * is not the last block of the file system, and the start of the + * inplace extension is not on an allocation group boundary. + */ + if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize || + (extblkno & (bmp->db_agsize - 1)) == 0) { + IREAD_UNLOCK(ipbmap); + return -ENOSPC; + } + + /* get the buffer for the dmap containing the first block + * of the extension. |