aboutsummaryrefslogtreecommitdiff
path: root/fs/jfs/jfs_imap.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/jfs/jfs_imap.c
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'fs/jfs/jfs_imap.c')
-rw-r--r--fs/jfs/jfs_imap.c3270
1 files changed, 3270 insertions, 0 deletions
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
new file mode 100644
index 00000000000..78383130162
--- /dev/null
+++ b/fs/jfs/jfs_imap.c
@@ -0,0 +1,3270 @@
+/*
+ * Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * jfs_imap.c: inode allocation map manager
+ *
+ * Serialization:
+ * Each AG has a simple lock which is used to control the serialization of
+ * the AG level lists. This lock should be taken first whenever an AG
+ * level list will be modified or accessed.
+ *
+ * Each IAG is locked by obtaining the buffer for the IAG page.
+ *
+ * There is also a inode lock for the inode map inode. A read lock needs to
+ * be taken whenever an IAG is read from the map or the global level
+ * information is read. A write lock needs to be taken whenever the global
+ * level information is modified or an atomic operation needs to be used.
+ *
+ * If more than one IAG is read at one time, the read lock may not
+ * be given up until all of the IAG's are read. Otherwise, a deadlock
+ * may occur when trying to obtain the read lock while another thread
+ * holding the read lock is waiting on the IAG already being held.
+ *
+ * The control page of the inode map is read into memory by diMount().
+ * Thereafter it should only be modified in memory and then it will be
+ * written out when the filesystem is unmounted by diUnmount().
+ */
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+
+#include "jfs_incore.h"
+#include "jfs_filsys.h"
+#include "jfs_dinode.h"
+#include "jfs_dmap.h"
+#include "jfs_imap.h"
+#include "jfs_metapage.h"
+#include "jfs_superblock.h"
+#include "jfs_debug.h"
+
+/*
+ * imap locks
+ */
+/* iag free list lock */
+#define IAGFREE_LOCK_INIT(imap) init_MUTEX(&imap->im_freelock)
+#define IAGFREE_LOCK(imap) down(&imap->im_freelock)
+#define IAGFREE_UNLOCK(imap) up(&imap->im_freelock)
+
+/* per ag iag list locks */
+#define AG_LOCK_INIT(imap,index) init_MUTEX(&(imap->im_aglock[index]))
+#define AG_LOCK(imap,agno) down(&imap->im_aglock[agno])
+#define AG_UNLOCK(imap,agno) up(&imap->im_aglock[agno])
+
+/*
+ * external references
+ */
+extern struct address_space_operations jfs_aops;
+
+/*
+ * forward references
+ */
+static int diAllocAG(struct inomap *, int, boolean_t, struct inode *);
+static int diAllocAny(struct inomap *, int, boolean_t, struct inode *);
+static int diAllocBit(struct inomap *, struct iag *, int);
+static int diAllocExt(struct inomap *, int, struct inode *);
+static int diAllocIno(struct inomap *, int, struct inode *);
+static int diFindFree(u32, int);
+static int diNewExt(struct inomap *, struct iag *, int);
+static int diNewIAG(struct inomap *, int *, int, struct metapage **);
+static void duplicateIXtree(struct super_block *, s64, int, s64 *);
+
+static int diIAGRead(struct inomap * imap, int, struct metapage **);
+static int copy_from_dinode(struct dinode *, struct inode *);
+static void copy_to_dinode(struct dinode *, struct inode *);
+
+/*
+ * debug code for double-checking inode map
+ */
+/* #define _JFS_DEBUG_IMAP 1 */
+
+#ifdef _JFS_DEBUG_IMAP
+#define DBG_DIINIT(imap) DBGdiInit(imap)
+#define DBG_DIALLOC(imap, ino) DBGdiAlloc(imap, ino)
+#define DBG_DIFREE(imap, ino) DBGdiFree(imap, ino)
+
+static void *DBGdiInit(struct inomap * imap);
+static void DBGdiAlloc(struct inomap * imap, ino_t ino);
+static void DBGdiFree(struct inomap * imap, ino_t ino);
+#else
+#define DBG_DIINIT(imap)
+#define DBG_DIALLOC(imap, ino)
+#define DBG_DIFREE(imap, ino)
+#endif /* _JFS_DEBUG_IMAP */
+
+/*
+ * NAME: diMount()
+ *
+ * FUNCTION: initialize the incore inode map control structures for
+ * a fileset or aggregate init time.
+ *
+ * the inode map's control structure (dinomap) is
+ * brought in from disk and placed in virtual memory.
+ *
+ * PARAMETERS:
+ * ipimap - pointer to inode map inode for the aggregate or fileset.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -ENOMEM - insufficient free virtual memory.
+ * -EIO - i/o error.
+ */
+int diMount(struct inode *ipimap)
+{
+ struct inomap *imap;
+ struct metapage *mp;
+ int index;
+ struct dinomap_disk *dinom_le;
+
+ /*
+ * allocate/initialize the in-memory inode map control structure
+ */
+ /* allocate the in-memory inode map control structure. */
+ imap = (struct inomap *) kmalloc(sizeof(struct inomap), GFP_KERNEL);
+ if (imap == NULL) {
+ jfs_err("diMount: kmalloc returned NULL!");
+ return -ENOMEM;
+ }
+
+ /* read the on-disk inode map control structure. */
+
+ mp = read_metapage(ipimap,
+ IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
+ PSIZE, 0);
+ if (mp == NULL) {
+ kfree(imap);
+ return -EIO;
+ }
+
+ /* copy the on-disk version to the in-memory version. */
+ dinom_le = (struct dinomap_disk *) mp->data;
+ imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag);
+ imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag);
+ atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos));
+ atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree));
+ imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext);
+ imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext);
+ for (index = 0; index < MAXAG; index++) {
+ imap->im_agctl[index].inofree =
+ le32_to_cpu(dinom_le->in_agctl[index].inofree);
+ imap->im_agctl[index].extfree =
+ le32_to_cpu(dinom_le->in_agctl[index].extfree);
+ imap->im_agctl[index].numinos =
+ le32_to_cpu(dinom_le->in_agctl[index].numinos);
+ imap->im_agctl[index].numfree =
+ le32_to_cpu(dinom_le->in_agctl[index].numfree);
+ }
+
+ /* release the buffer. */
+ release_metapage(mp);
+
+ /*
+ * allocate/initialize inode allocation map locks
+ */
+ /* allocate and init iag free list lock */
+ IAGFREE_LOCK_INIT(imap);
+
+ /* allocate and init ag list locks */
+ for (index = 0; index < MAXAG; index++) {
+ AG_LOCK_INIT(imap, index);
+ }
+
+ /* bind the inode map inode and inode map control structure
+ * to each other.
+ */
+ imap->im_ipimap = ipimap;
+ JFS_IP(ipimap)->i_imap = imap;
+
+// DBG_DIINIT(imap);
+
+ return (0);
+}
+
+
+/*
+ * NAME: diUnmount()
+ *
+ * FUNCTION: write to disk the incore inode map control structures for
+ * a fileset or aggregate at unmount time.
+ *
+ * PARAMETERS:
+ * ipimap - pointer to inode map inode for the aggregate or fileset.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -ENOMEM - insufficient free virtual memory.
+ * -EIO - i/o error.
+ */
+int diUnmount(struct inode *ipimap, int mounterror)
+{
+ struct inomap *imap = JFS_IP(ipimap)->i_imap;
+
+ /*
+ * update the on-disk inode map control structure
+ */
+
+ if (!(mounterror || isReadOnly(ipimap)))
+ diSync(ipimap);
+
+ /*
+ * Invalidate the page cache buffers
+ */
+ truncate_inode_pages(ipimap->i_mapping, 0);
+
+ /*
+ * free in-memory control structure
+ */
+ kfree(imap);
+
+ return (0);
+}
+
+
+/*
+ * diSync()
+ */
+int diSync(struct inode *ipimap)
+{
+ struct dinomap_disk *dinom_le;
+ struct inomap *imp = JFS_IP(ipimap)->i_imap;
+ struct metapage *mp;
+ int index;
+
+ /*
+ * write imap global conrol page
+ */
+ /* read the on-disk inode map control structure */
+ mp = get_metapage(ipimap,
+ IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage,
+ PSIZE, 0);
+ if (mp == NULL) {
+ jfs_err("diSync: get_metapage failed!");
+ return -EIO;
+ }
+
+ /* copy the in-memory version to the on-disk version */
+ dinom_le = (struct dinomap_disk *) mp->data;
+ dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag);
+ dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag);
+ dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos));
+ dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree));
+ dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext);
+ dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext);
+ for (index = 0; index < MAXAG; index++) {
+ dinom_le->in_agctl[index].inofree =
+ cpu_to_le32(imp->im_agctl[index].inofree);
+ dinom_le->in_agctl[index].extfree =
+ cpu_to_le32(imp->im_agctl[index].extfree);
+ dinom_le->in_agctl[index].numinos =
+ cpu_to_le32(imp->im_agctl[index].numinos);
+ dinom_le->in_agctl[index].numfree =
+ cpu_to_le32(imp->im_agctl[index].numfree);
+ }
+
+ /* write out the control structure */
+ write_metapage(mp);
+
+ /*
+ * write out dirty pages of imap
+ */
+ filemap_fdatawrite(ipimap->i_mapping);
+ filemap_fdatawait(ipimap->i_mapping);
+
+ diWriteSpecial(ipimap, 0);
+
+ return (0);
+}
+
+
+/*
+ * NAME: diRead()
+ *
+ * FUNCTION: initialize an incore inode from disk.
+ *
+ * on entry, the specifed incore inode should itself
+ * specify the disk inode number corresponding to the
+ * incore inode (i.e. i_number should be initialized).
+ *
+ * this routine handles incore inode initialization for
+ * both "special" and "regular" inodes. special inodes
+ * are those required early in the mount process and
+ * require special handling since much of the file system
+ * is not yet initialized. these "special" inodes are
+ * identified by a NULL inode map inode pointer and are
+ * actually initialized by a call to diReadSpecial().
+ *
+ * for regular inodes, the iag describing the disk inode
+ * is read from disk to determine the inode extent address
+ * for the disk inode. with the inode extent address in
+ * hand, the page of the extent that contains the disk
+ * inode is read and the disk inode is copied to the
+ * incore inode.
+ *
+ * PARAMETERS:
+ * ip - pointer to incore inode to be initialized from disk.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -EIO - i/o error.
+ * -ENOMEM - insufficient memory
+ *
+ */
+int diRead(struct inode *ip)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ int iagno, ino, extno, rc;
+ struct inode *ipimap;
+ struct dinode *dp;
+ struct iag *iagp;
+ struct metapage *mp;
+ s64 blkno, agstart;
+ struct inomap *imap;
+ int block_offset;
+ int inodes_left;
+ uint pageno;
+ int rel_inode;
+
+ jfs_info("diRead: ino = %ld", ip->i_ino);
+
+ ipimap = sbi->ipimap;
+ JFS_IP(ip)->ipimap = ipimap;
+
+ /* determine the iag number for this inode (number) */
+ iagno = INOTOIAG(ip->i_ino);
+
+ /* read the iag */
+ imap = JFS_IP(ipimap)->i_imap;
+ IREAD_LOCK(ipimap);
+ rc = diIAGRead(imap, iagno, &mp);
+ IREAD_UNLOCK(ipimap);
+ if (rc) {
+ jfs_err("diRead: diIAGRead returned %d", rc);
+ return (rc);
+ }
+
+ iagp = (struct iag *) mp->data;
+
+ /* determine inode extent that holds the disk inode */
+ ino = ip->i_ino & (INOSPERIAG - 1);
+ extno = ino >> L2INOSPEREXT;
+
+ if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) ||
+ (addressPXD(&iagp->inoext[extno]) == 0)) {
+ release_metapage(mp);
+ return -ESTALE;
+ }
+
+ /* get disk block number of the page within the inode extent
+ * that holds the disk inode.
+ */
+ blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage);
+
+ /* get the ag for the iag */
+ agstart = le64_to_cpu(iagp->agstart);
+
+ release_metapage(mp);
+
+ rel_inode = (ino & (INOSPERPAGE - 1));
+ pageno = blkno >> sbi->l2nbperpage;
+
+ if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
+ /*
+ * OS/2 didn't always align inode extents on page boundaries
+ */
+ inodes_left =
+ (sbi->nbperpage - block_offset) << sbi->l2niperblk;
+
+ if (rel_inode < inodes_left)
+ rel_inode += block_offset << sbi->l2niperblk;
+ else {
+ pageno += 1;
+ rel_inode -= inodes_left;
+ }
+ }
+
+ /* read the page of disk inode */
+ mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
+ if (mp == 0) {
+ jfs_err("diRead: read_metapage failed");
+ return -EIO;
+ }
+
+ /* locate the the disk inode requested */
+ dp = (struct dinode *) mp->data;
+ dp += rel_inode;
+
+ if (ip->i_ino != le32_to_cpu(dp->di_number)) {
+ jfs_error(ip->i_sb, "diRead: i_ino != di_number");
+ rc = -EIO;
+ } else if (le32_to_cpu(dp->di_nlink) == 0)
+ rc = -ESTALE;
+ else
+ /* copy the disk inode to the in-memory inode */
+ rc = copy_from_dinode(dp, ip);
+
+ release_metapage(mp);
+
+ /* set the ag for the inode */
+ JFS_IP(ip)->agno = BLKTOAG(agstart, sbi);
+ JFS_IP(ip)->active_ag = -1;
+
+ return (rc);
+}
+
+
+/*
+ * NAME: diReadSpecial()
+ *
+ * FUNCTION: initialize a 'special' inode from disk.
+ *
+ * this routines handles aggregate level inodes. The
+ * inode cache cannot differentiate between the
+ * aggregate inodes and the filesystem inodes, so we
+ * handle these here. We don't actually use the aggregate
+ * inode map, since these inodes are at a fixed location
+ * and in some cases the aggregate inode map isn't initialized
+ * yet.
+ *
+ * PARAMETERS:
+ * sb - filesystem superblock
+ * inum - aggregate inode number
+ * secondary - 1 if secondary aggregate inode table
+ *
+ * RETURN VALUES:
+ * new inode - success
+ * NULL - i/o error.
+ */
+struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(sb);
+ uint address;
+ struct dinode *dp;
+ struct inode *ip;
+ struct metapage *mp;
+
+ ip = new_inode(sb);
+ if (ip == NULL) {
+ jfs_err("diReadSpecial: new_inode returned NULL!");
+ return ip;
+ }
+
+ if (secondary) {
+ address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
+ JFS_IP(ip)->ipimap = sbi->ipaimap2;
+ } else {
+ address = AITBL_OFF >> L2PSIZE;
+ JFS_IP(ip)->ipimap = sbi->ipaimap;
+ }
+
+ ASSERT(inum < INOSPEREXT);
+
+ ip->i_ino = inum;
+
+ address += inum >> 3; /* 8 inodes per 4K page */
+
+ /* read the page of fixed disk inode (AIT) in raw mode */
+ mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
+ if (mp == NULL) {
+ ip->i_nlink = 1; /* Don't want iput() deleting it */
+ iput(ip);
+ return (NULL);
+ }
+
+ /* get the pointer to the disk inode of interest */
+ dp = (struct dinode *) (mp->data);
+ dp += inum % 8; /* 8 inodes per 4K page */
+
+ /* copy on-disk inode to in-memory inode */
+ if ((copy_from_dinode(dp, ip)) != 0) {
+ /* handle bad return by returning NULL for ip */
+ ip->i_nlink = 1; /* Don't want iput() deleting it */
+ iput(ip);
+ /* release the page */
+ release_metapage(mp);
+ return (NULL);
+
+ }
+
+ ip->i_mapping->a_ops = &jfs_aops;
+ mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS);
+
+ /* Allocations to metadata inodes should not affect quotas */
+ ip->i_flags |= S_NOQUOTA;
+
+ if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) {
+ sbi->gengen = le32_to_cpu(dp->di_gengen);
+ sbi->inostamp = le32_to_cpu(dp->di_inostamp);
+ }
+
+ /* release the page */
+ release_metapage(mp);
+
+ return (ip);
+}
+
+/*
+ * NAME: diWriteSpecial()
+ *
+ * FUNCTION: Write the special inode to disk
+ *
+ * PARAMETERS:
+ * ip - special inode
+ * secondary - 1 if secondary aggregate inode table
+ *
+ * RETURN VALUES: none
+ */
+
+void diWriteSpecial(struct inode *ip, int secondary)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ uint address;
+ struct dinode *dp;
+ ino_t inum = ip->i_ino;
+ struct metapage *mp;
+
+ ip->i_state &= ~I_DIRTY;
+
+ if (secondary)
+ address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage;
+ else
+ address = AITBL_OFF >> L2PSIZE;
+
+ ASSERT(inum < INOSPEREXT);
+
+ address += inum >> 3; /* 8 inodes per 4K page */
+
+ /* read the page of fixed disk inode (AIT) in raw mode */
+ mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1);
+ if (mp == NULL) {
+ jfs_err("diWriteSpecial: failed to read aggregate inode "
+ "extent!");
+ return;
+ }
+
+ /* get the pointer to the disk inode of interest */
+ dp = (struct dinode *) (mp->data);
+ dp += inum % 8; /* 8 inodes per 4K page */
+
+ /* copy on-disk inode to in-memory inode */
+ copy_to_dinode(dp, ip);
+ memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288);
+
+ if (inum == FILESYSTEM_I)
+ dp->di_gengen = cpu_to_le32(sbi->gengen);
+
+ /* write the page */
+ write_metapage(mp);
+}
+
+/*
+ * NAME: diFreeSpecial()
+ *
+ * FUNCTION: Free allocated space for special inode
+ */
+void diFreeSpecial(struct inode *ip)
+{
+ if (ip == NULL) {
+ jfs_err("diFreeSpecial called with NULL ip!");
+ return;
+ }
+ filemap_fdatawrite(ip->i_mapping);
+ filemap_fdatawait(ip->i_mapping);
+ truncate_inode_pages(ip->i_mapping, 0);
+ iput(ip);
+}
+
+
+
+/*
+ * NAME: diWrite()
+ *
+ * FUNCTION: write the on-disk inode portion of the in-memory inode
+ * to its corresponding on-disk inode.
+ *
+ * on entry, the specifed incore inode should itself
+ * specify the disk inode number corresponding to the
+ * incore inode (i.e. i_number should be initialized).
+ *
+ * the inode contains the inode extent address for the disk
+ * inode. with the inode extent address in hand, the
+ * page of the extent that contains the disk inode is
+ * read and the disk inode portion of the incore inode
+ * is copied to the disk inode.
+ *
+ * PARAMETERS:
+ * tid - transacation id
+ * ip - pointer to incore inode to be written to the inode extent.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -EIO - i/o error.
+ */
+int diWrite(tid_t tid, struct inode *ip)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+ int rc = 0;
+ s32 ino;
+ struct dinode *dp;
+ s64 blkno;
+ int block_offset;
+ int inodes_left;
+ struct metapage *mp;
+ uint pageno;
+ int rel_inode;
+ int dioffset;
+ struct inode *ipimap;
+ uint type;
+ lid_t lid;
+ struct tlock *ditlck, *tlck;
+ struct linelock *dilinelock, *ilinelock;
+ struct lv *lv;
+ int n;
+
+ ipimap = jfs_ip->ipimap;
+
+ ino = ip->i_ino & (INOSPERIAG - 1);
+
+ if (!addressPXD(&(jfs_ip->ixpxd)) ||
+ (lengthPXD(&(jfs_ip->ixpxd)) !=
+ JFS_IP(ipimap)->i_imap->im_nbperiext)) {
+ jfs_error(ip->i_sb, "diWrite: ixpxd invalid");
+ return -EIO;
+ }
+
+ /*
+ * read the page of disk inode containing the specified inode:
+ */
+ /* compute the block address of the page */
+ blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage);
+
+ rel_inode = (ino & (INOSPERPAGE - 1));
+ pageno = blkno >> sbi->l2nbperpage;
+
+ if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) {
+ /*
+ * OS/2 didn't always align inode extents on page boundaries
+ */
+ inodes_left =
+ (sbi->nbperpage - block_offset) << sbi->l2niperblk;
+
+ if (rel_inode < inodes_left)
+ rel_inode += block_offset << sbi->l2niperblk;
+ else {
+ pageno += 1;
+ rel_inode -= inodes_left;
+ }
+ }
+ /* read the page of disk inode */
+ retry:
+ mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1);
+ if (mp == 0)
+ return -EIO;
+
+ /* get the pointer to the disk inode */
+ dp = (struct dinode *) mp->data;
+ dp += rel_inode;
+
+ dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE;
+
+ /*
+ * acquire transaction lock on the on-disk inode;
+ * N.B. tlock is acquired on ipimap not ip;
+ */
+ if ((ditlck =
+ txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL)
+ goto retry;
+ dilinelock = (struct linelock *) & ditlck->lock;
+
+ /*
+ * copy btree root from in-memory inode to on-disk inode
+ *
+ * (tlock is taken from inline B+-tree root in in-memory
+ * inode when the B+-tree root is updated, which is pointed
+ * by jfs_ip->blid as well as being on tx tlock list)
+ *
+ * further processing of btree root is based on the copy
+ * in in-memory inode, where txLog() will log from, and,
+ * for xtree root, txUpdateMap() will update map and reset
+ * XAD_NEW bit;
+ */
+
+ if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) {
+ /*
+ * This is the special xtree inside the directory for storing
+ * the directory table
+ */
+ xtpage_t *p, *xp;
+ xad_t *xad;
+
+ jfs_ip->xtlid = 0;
+ tlck = lid_to_tlock(lid);
+ assert(tlck->type & tlckXTREE);
+ tlck->type |= tlckBTROOT;
+ tlck->mp = mp;
+ ilinelock = (struct linelock *) & tlck->lock;
+
+ /*
+ * copy xtree root from inode to dinode:
+ */
+ p = &jfs_ip->i_xtroot;
+ xp = (xtpage_t *) &dp->di_dirtable;
+ lv = ilinelock->lv;
+ for (n = 0; n < ilinelock->index; n++, lv++) {
+ memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
+ lv->length << L2XTSLOTSIZE);
+ }
+
+ /* reset on-disk (metadata page) xtree XAD_NEW bit */
+ xad = &xp->xad[XTENTRYSTART];
+ for (n = XTENTRYSTART;
+ n < le16_to_cpu(xp->header.nextindex); n++, xad++)
+ if (xad->flag & (XAD_NEW | XAD_EXTENDED))
+ xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
+ }
+
+ if ((lid = jfs_ip->blid) == 0)
+ goto inlineData;
+ jfs_ip->blid = 0;
+
+ tlck = lid_to_tlock(lid);
+ type = tlck->type;
+ tlck->type |= tlckBTROOT;
+ tlck->mp = mp;
+ ilinelock = (struct linelock *) & tlck->lock;
+
+ /*
+ * regular file: 16 byte (XAD slot) granularity
+ */
+ if (type & tlckXTREE) {
+ xtpage_t *p, *xp;
+ xad_t *xad;
+
+ /*
+ * copy xtree root from inode to dinode:
+ */
+ p = &jfs_ip->i_xtroot;
+ xp = &dp->di_xtroot;
+ lv = ilinelock->lv;
+ for (n = 0; n < ilinelock->index; n++, lv++) {
+ memcpy(&xp->xad[lv->offset], &p->xad[lv->offset],
+ lv->length << L2XTSLOTSIZE);
+ }
+
+ /* reset on-disk (metadata page) xtree XAD_NEW bit */
+ xad = &xp->xad[XTENTRYSTART];
+ for (n = XTENTRYSTART;
+ n < le16_to_cpu(xp->header.nextindex); n++, xad++)
+ if (xad->flag & (XAD_NEW | XAD_EXTENDED))
+ xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
+ }
+ /*
+ * directory: 32 byte (directory entry slot) granularity
+ */
+ else if (type & tlckDTREE) {
+ dtpage_t *p, *xp;
+
+ /*
+ * copy dtree root from inode to dinode:
+ */
+ p = (dtpage_t *) &jfs_ip->i_dtroot;
+ xp = (dtpage_t *) & dp->di_dtroot;
+ lv = ilinelock->lv;
+ for (n = 0; n < ilinelock->index; n++, lv++) {
+ memcpy(&xp->slot[lv->offset], &p->slot[lv->offset],
+ lv->length << L2DTSLOTSIZE);
+ }
+ } else {
+ jfs_err("diWrite: UFO tlock");
+ }
+
+ inlineData:
+ /*
+ * copy inline symlink from in-memory inode to on-disk inode
+ */
+ if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) {
+ lv = & dilinelock->lv[dilinelock->index];
+ lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE;
+ lv->length = 2;
+ memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE);
+ dilinelock->index++;
+ }
+ /*
+ * copy inline data from in-memory inode to on-disk inode:
+ * 128 byte slot granularity
+ */
+ if (test_cflag(COMMIT_Inlineea, ip)) {
+ lv = & dilinelock->lv[dilinelock->index];
+ lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE;
+ lv->length = 1;
+ memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE);
+ dilinelock->index++;
+
+ clear_cflag(COMMIT_Inlineea, ip);
+ }
+
+ /*
+ * lock/copy inode base: 128 byte slot granularity
+ */
+// baseDinode:
+ lv = & dilinelock->lv[dilinelock->index];
+ lv->offset = dioffset >> L2INODESLOTSIZE;
+ copy_to_dinode(dp, ip);
+ if (test_and_clear_cflag(COMMIT_Dirtable, ip)) {
+ lv->length = 2;
+ memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96);
+ } else
+ lv->length = 1;
+ dilinelock->index++;
+
+#ifdef _JFS_FASTDASD
+ /*
+ * We aren't logging changes to the DASD used in directory inodes,
+ * but we need to write them to disk. If we don't unmount cleanly,
+ * mount will recalculate the DASD used.
+ */
+ if (S_ISDIR(ip->i_mode)
+ && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED))
+ memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd));
+#endif /* _JFS_FASTDASD */
+
+ /* release the buffer holding the updated on-disk inode.
+ * the buffer will be later written by commit processing.
+ */
+ write_metapage(mp);
+
+ return (rc);
+}
+
+
+/*
+ * NAME: diFree(ip)
+ *
+ * FUNCTION: free a specified inode from the inode working map
+ * for a fileset or aggregate.
+ *
+ * if the inode to be freed represents the first (only)
+ * free inode within the iag, the iag will be placed on
+ * the ag free inode list.
+ *
+ * freeing the inode will cause the inode extent to be
+ * freed if the inode is the only allocated inode within
+ * the extent. in this case all the disk resource backing
+ * up the inode extent will be freed. in addition, the iag
+ * will be placed on the ag extent free list if the extent
+ * is the first free extent in the iag. if freeing the
+ * extent also means that no free inodes will exist for
+ * the iag, the iag will also be removed from the ag free
+ * inode list.
+ *
+ * the iag describing the inode will be freed if the extent
+ * is to be freed and it is the only backed extent within
+ * the iag. in this case, the iag will be removed from the
+ * ag free extent list and ag free inode list and placed on
+ * the inode map's free iag list.
+ *
+ * a careful update approach is used to provide consistency
+ * in the face of updates to multiple buffers. under this
+ * approach, all required buffers are obtained before making
+ * any updates and are held until all updates are complete.
+ *
+ * PARAMETERS:
+ * ip - inode to be freed.
+ *
+ * RETURN VALUES:
+ * 0 - success
+ * -EIO - i/o error.
+ */
+int diFree(struct inode *ip)
+{
+ int rc;
+ ino_t inum = ip->i_ino;
+ struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp;
+ struct metapage *mp, *amp, *bmp, *cmp, *dmp;
+ int iagno, ino, extno, bitno, sword, agno;
+ int back, fwd;
+ u32 bitmap, mask;
+ struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap;
+ struct inomap *imap = JFS_IP(ipimap)->i_imap;
+ pxd_t freepxd;
+ tid_t tid;
+ struct inode *iplist[3];
+ struct tlock *tlck;
+ struct pxd_lock *pxdlock;
+
+ /*
+ * This is just to suppress compiler warnings. The same logic that
+ * references these variables is used to initialize them.
+ */
+ aiagp = biagp = ciagp = diagp = NULL;
+
+ /* get the iag number containing the inode.
+ */
+ iagno = INOTOIAG(inum);
+
+ /* make sure that the iag is contained within
+ * the map.
+ */
+ if (iagno >= imap->im_nextiag) {
+ dump_mem("imap", imap, 32);
+ jfs_error(ip->i_sb,
+ "diFree: inum = %d, iagno = %d, nextiag = %d",
+ (uint) inum, iagno, imap->im_nextiag);
+ return -EIO;
+ }
+
+ /* get the allocation group for this ino.
+ */
+ agno = JFS_IP(ip)->agno;
+
+ /* Lock the AG specific inode map information
+ */
+ AG_LOCK(imap, agno);
+
+ /* Obtain read lock in imap inode. Don't release it until we have
+ * read all of the IAG's that we are going to.
+ */
+ IREAD_LOCK(ipimap);
+
+ /* read the iag.
+ */
+ if ((rc = diIAGRead(imap, iagno, &mp))) {
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ return (rc);
+ }
+ iagp = (struct iag *) mp->data;
+
+ /* get the inode number and extent number of the inode within
+ * the iag and the inode number within the extent.
+ */
+ ino = inum & (INOSPERIAG - 1);
+ extno = ino >> L2INOSPEREXT;
+ bitno = ino & (INOSPEREXT - 1);
+ mask = HIGHORDER >> bitno;
+
+ if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
+ jfs_error(ip->i_sb,
+ "diFree: wmap shows inode already free");
+ }
+
+ if (!addressPXD(&iagp->inoext[extno])) {
+ release_metapage(mp);
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ jfs_error(ip->i_sb, "diFree: invalid inoext");
+ return -EIO;
+ }
+
+ /* compute the bitmap for the extent reflecting the freed inode.
+ */
+ bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask;
+
+ if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) {
+ release_metapage(mp);
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ jfs_error(ip->i_sb, "diFree: numfree > numinos");
+ return -EIO;
+ }
+ /*
+ * inode extent still has some inodes or below low water mark:
+ * keep the inode extent;
+ */
+ if (bitmap ||
+ imap->im_agctl[agno].numfree < 96 ||
+ (imap->im_agctl[agno].numfree < 288 &&
+ (((imap->im_agctl[agno].numfree * 100) /
+ imap->im_agctl[agno].numinos) <= 25))) {
+ /* if the iag currently has no free inodes (i.e.,
+ * the inode being freed is the first free inode of iag),
+ * insert the iag at head of the inode free list for the ag.
+ */
+ if (iagp->nfreeinos == 0) {
+ /* check if there are any iags on the ag inode
+ * free list. if so, read the first one so that
+ * we can link the current iag onto the list at
+ * the head.
+ */
+ if ((fwd = imap->im_agctl[agno].inofree) >= 0) {
+ /* read the iag that currently is the head
+ * of the list.
+ */
+ if ((rc = diIAGRead(imap, fwd, &amp))) {
+ IREAD_UNLOCK(ipimap);
+ AG_UNLOCK(imap, agno);
+ release_metapage(mp);
+ return (rc);
+ }
+ aiagp = (struct iag *) amp->data;
+
+ /* make current head point back to the iag.
+ */
+ aiagp->inofreeback = cpu_to_le32(iagno);
+
+ write_metapage(amp);
+ }
+
+ /* iag points forward to current head and iag
+ * becomes the new head of the list.
+ */
+ iagp->inofreefwd =
+ cpu_to_le32(imap->im_agctl[agno].inofree);
+ iagp->inofreeback = cpu_to_le32(-1);
+ imap->im_agctl[agno].inofree = iagno;
+ }
+ IREAD_UNLOCK(ipimap);
+
+ /* update the free inode summary map for the extent if
+ * freeing the inode means the extent will now have free
+ * inodes (i.e., the inode being freed is the first free
+ * inode of extent),
+ */
+ if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
+ sword = extno >> L2EXTSPERSUM;
+ bitno = extno & (EXTSPERSUM - 1);
+ iagp->inosmap[sword] &=
+ cpu_to_le32(~(HIGHORDER >> bitno));
+ }
+
+ /* update the bitmap.
+ */
+ iagp->wmap[extno] = cpu_to_le32(bitmap);
+ DBG_DIFREE(imap, inum);
+
+ /* update the free inode counts at the iag, ag and
+ * map level.
+ */
+ iagp->nfreeinos =
+ cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1);
+ imap->im_agctl[agno].numfree += 1;
+ atomic_inc(&imap->im_numfree);
+
+ /* release the AG inode map lock
+ */
+ AG_UNLOCK(imap, agno);
+
+ /* write the iag */
+ write_metapage(mp);
+
+ return (0);
+ }
+
+
+ /*
+ * inode extent has become free and above low water mark:
+ * free the inode extent;
+ */
+
+ /*
+ * prepare to update iag list(s) (careful update step 1)
+ */
+ amp = bmp = cmp = dmp = NULL;
+ fwd = back = -1;
+
+ /* check if the iag currently has no free extents. if so,
+ * it will be placed on the head of the ag extent free list.
+ */
+ if (iagp->nfreeexts == 0) {
+ /* check if the ag extent free list has any iags.
+ * if so, read the iag at the head of the list now.
+ * this (head) iag will be updated later to reflect
+ * the addition of the current iag at the head of
+ * the list.
+ */
+ if ((fwd = imap->im_agctl[agno].extfree) >= 0) {
+ if ((rc = diIAGRead(imap, fwd, &amp)))
+ goto error_out;
+ aiagp = (struct iag *) amp->data;
+ }
+ } else {
+ /* iag has free extents. check if the addition of a free
+ * extent will cause all extents to be free within this
+ * iag. if so, the iag will be removed from the ag extent
+ * free list and placed on the inode map's free iag list.
+ */
+ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
+ /* in preparation for removing the iag from the
+ * ag extent free list, read the iags preceeding
+ * and following the iag on the ag extent free
+ * list.
+ */
+ if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) {
+ if ((rc = diIAGRead(imap, fwd, &amp)))
+ goto error_out;
+ aiagp = (struct iag *) amp->data;
+ }
+
+ if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) {
+ if ((rc = diIAGRead(imap, back, &bmp)))
+ goto error_out;
+ biagp = (struct iag *) bmp->data;
+ }
+ }
+ }
+
+ /* remove the iag from the ag inode free list if freeing
+ * this extent cause the iag to have no free inodes.
+ */
+ if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) {
+ int inofreeback = le32_to_cpu(iagp->inofreeback);
+ int inofreefwd = le32_to_cpu(iagp->inofreefwd);
+
+ /* in preparation for removing the iag from the
+ * ag inode free list, read the iags preceeding
+