10 files changed, 92 insertions, 49 deletions
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index f5719117edf..e285022f006 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -182,9 +182,9 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
 	 * Take appropriate lock on inode
 	 */
 	if (create)
-		IWRITE_LOCK(ip);
+		IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
 	else
-		IREAD_LOCK(ip);
+		IREAD_LOCK(ip, RDWRLOCK_NORMAL);
 
 	if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) &&
 	    (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) &&
@@ -359,7 +359,7 @@ void jfs_truncate(struct inode *ip)
 
 	nobh_truncate_page(ip->i_mapping, ip->i_size);
 
-	IWRITE_LOCK(ip);
+	IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
 	jfs_truncate_nolock(ip, ip->i_size);
 	IWRITE_UNLOCK(ip);
 }
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index ddffbbd4d95..7378798f0b2 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -39,10 +39,6 @@ extern void jfs_proc_clean(void);
 /*
  *	assert with traditional printf/panic
  */
-#ifdef CONFIG_KERNEL_ASSERTS
-/* kgdb stuff */
-#define assert(p) KERNEL_ASSERT(#p, p)
-#else
 #define assert(p) do {	\
 	if (!(p)) {	\
 		printk(KERN_CRIT "BUG at %s:%d assert(%s)\n",	\
@@ -50,7 +46,6 @@ extern void jfs_proc_clean(void);
 		BUG();	\
 	}		\
 } while (0)
-#endif
 
 /*
  *	debug ON
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 23546c8fd48..82b0544bd76 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -337,7 +337,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
 	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
 
-	IREAD_LOCK(ipbmap);
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 	/* block to be freed better be within the mapsize. */
 	if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
@@ -733,7 +733,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	 * allocation group size, try to allocate anywhere.
 	 */
 	if (l2nb > bmp->db_agl2size) {
-		IWRITE_LOCK(ipbmap);
+		IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 		rc = dbAllocAny(bmp, nblocks, l2nb, results);
 
@@ -774,7 +774,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	 * the hint using a tiered strategy.
 	 */
 	if (nblocks <= BPERDMAP) {
-		IREAD_LOCK(ipbmap);
+		IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 		/* get the buffer for the dmap containing the hint.
 		 */
@@ -844,7 +844,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	/* try to satisfy the allocation request with blocks within
 	 * the same allocation group as the hint.
 	 */
-	IWRITE_LOCK(ipbmap);
+	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
 	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC)
 		goto write_unlock;
 
@@ -856,7 +856,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	 * Let dbNextAG recommend a preferred allocation group
 	 */
 	agno = dbNextAG(ipbmap);
-	IWRITE_LOCK(ipbmap);
+	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 	/* Try to allocate within this allocation group.  if that fails, try to
 	 * allocate anywhere in the map.
@@ -900,7 +900,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
 	s64 lblkno;
 	struct metapage *mp;
 
-	IREAD_LOCK(ipbmap);
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 	/*
 	 * validate extent request:
@@ -1050,7 +1050,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
 	 */
 	extblkno = lastblkno + 1;
 
-	IREAD_LOCK(ipbmap);
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 	/* better be within the file system */
 	bmp = sbi->bmap;
@@ -3116,7 +3116,7 @@ int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
 	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
 
-	IREAD_LOCK(ipbmap);
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 	/* block to be allocated better be within the mapsize. */
 	ASSERT(nblocks <= bmp->db_mapsize - blkno);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 53f63b47a6d..aa5124b643b 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -331,7 +331,7 @@ int diRead(struct inode *ip)
 
 	/* read the iag */
 	imap = JFS_IP(ipimap)->i_imap;
-	IREAD_LOCK(ipimap);
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 	rc = diIAGRead(imap, iagno, &mp);
 	IREAD_UNLOCK(ipimap);
 	if (rc) {
@@ -920,7 +920,7 @@ int diFree(struct inode *ip)
 	/* Obtain read lock in imap inode.  Don't release it until we have
 	 * read all of the IAG's that we are going to.
 	 */
-	IREAD_LOCK(ipimap);
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 
 	/* read the iag.
 	 */
@@ -1415,7 +1415,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
 	AG_LOCK(imap, agno);
 
 	/* Get read lock on imap inode */
-	IREAD_LOCK(ipimap);
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 
 	/* get the iag number and read the iag */
 	iagno = INOTOIAG(inum);
@@ -1808,7 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
 		return -ENOSPC;
 
 	/* obtain read lock on imap inode */
-	IREAD_LOCK(imap->im_ipimap);
+	IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
 
 	/* read the iag at the head of the list.
 	 */
@@ -1946,7 +1946,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
 	} else {
 		/* read the iag.
 		 */
-		IREAD_LOCK(imap->im_ipimap);
+		IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
 		if ((rc = diIAGRead(imap, iagno, &mp))) {
 			IREAD_UNLOCK(imap->im_ipimap);
 			jfs_error(ip->i_sb, "diAllocExt: error reading iag");
@@ -2509,7 +2509,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
 		 */
 
 		/* acquire inode map lock */
-		IWRITE_LOCK(ipimap);
+		IWRITE_LOCK(ipimap, RDWRLOCK_IMAP);
 
 		if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) {
 			IWRITE_UNLOCK(ipimap);
@@ -2648,7 +2648,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
 	}
 
 	/* obtain read lock on map */
-	IREAD_LOCK(ipimap);
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 
 	/* read the iag */
 	if ((rc = diIAGRead(imap, iagno, &mp))) {
@@ -2779,7 +2779,7 @@ diUpdatePMap(struct inode *ipimap,
 		return -EIO;
 	}
 	/* read the iag */
-	IREAD_LOCK(ipimap);
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 	rc = diIAGRead(imap, iagno, &mp);
 	IREAD_UNLOCK(ipimap);
 	if (rc)
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 94005584445..8f453eff3c8 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -109,9 +109,11 @@ struct jfs_inode_info {
 
 #define JFS_ACL_NOT_CACHED ((void *)-1)
 
-#define IREAD_LOCK(ip)		down_read(&JFS_IP(ip)->rdwrlock)
+#define IREAD_LOCK(ip, subclass) \
+	down_read_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IREAD_UNLOCK(ip)	up_read(&JFS_IP(ip)->rdwrlock)
-#define IWRITE_LOCK(ip)		down_write(&JFS_IP(ip)->rdwrlock)
+#define IWRITE_LOCK(ip, subclass) \
+	down_write_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IWRITE_UNLOCK(ip)	up_write(&JFS_IP(ip)->rdwrlock)
 
 /*
@@ -127,6 +129,29 @@ enum cflags {
 	COMMIT_Synclist,	/* metadata pages on group commit synclist */
 };
 
+/*
+ * commit_mutex nesting subclasses:
+ */
+enum commit_mutex_class
+{
+	COMMIT_MUTEX_PARENT,
+	COMMIT_MUTEX_CHILD,
+	COMMIT_MUTEX_SECOND_PARENT,	/* Renaming */
+	COMMIT_MUTEX_VICTIM		/* Inode being unlinked due to rename */
+};
+
+/*
+ * rdwrlock subclasses:
+ * The dmap inode may be locked while a normal inode or the imap inode are
+ * locked.
+ */
+enum rdwrlock_class
+{
+	RDWRLOCK_NORMAL,
+	RDWRLOCK_IMAP,
+	RDWRLOCK_DMAP
+};
+
 #define set_cflag(flag, ip)	set_bit(flag, &(JFS_IP(ip)->cflag))
 #define clear_cflag(flag, ip)	clear_bit(flag, &(JFS_IP(ip)->cflag))
 #define test_cflag(flag, ip)	test_bit(flag, &(JFS_IP(ip)->cflag))
diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h
index 7d78e83d7c4..df48ece4b7a 100644
--- a/fs/jfs/jfs_lock.h
+++ b/fs/jfs/jfs_lock.h
@@ -42,7 +42,7 @@ do {							\
 		if (cond)				\
 			break;				\
 		unlock_cmd;				\
-		schedule();				\
+		io_schedule();				\
 		lock_cmd;				\
 	}						\
 	current->state = TASK_RUNNING;			\
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index ceaf03b9493..58deae00750 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -56,7 +56,7 @@ static inline void __lock_metapage(struct metapage *mp)
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (metapage_locked(mp)) {
 			unlock_page(mp->page);
-			schedule();
+			io_schedule();
 			lock_page(mp->page);
 		}
 	} while (trylock_metapage(mp));
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d558e51b0df..6988a1082f5 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -135,7 +135,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
 	add_wait_queue(event, &wait);
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	TXN_UNLOCK();
-	schedule();
+	io_schedule();
 	current->state = TASK_RUNNING;
 	remove_wait_queue(event, &wait);
 }
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index e98eb03e531..acc97c46d8a 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -757,6 +757,11 @@ static int xtSearch(struct inode *ip, s64 xoff,	s64 *nextp,
 			nsplit = 0;
 
 		/* push (bn, index) of the parent page/entry */
+		if (BT_STACK_FULL(btstack)) {
+			jfs_error(ip->i_sb, "stack overrun in xtSearch!");
+			XT_PUTPAGE(mp);
+			return -EIO;
+		}
 		BT_PUSH(btstack, bn, index);
 
 		/* get the child page block number */
@@ -3915,6 +3920,11 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
 	 */
       getChild:
 	/* save current parent entry for the child page */
+	if (BT_STACK_FULL(&btstack)) {
+		jfs_error(ip->i_sb, "stack overrun in xtTruncate!");
+		XT_PUTPAGE(mp);
+		return -EIO;
+	}
 	BT_PUSH(&btstack, bn, index);
 
 	/* get child page */
@@ -4112,6 +4122,11 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
 	 */
       getChild:
 	/* save current parent entry for the child page */
+	if (BT_STACK_FULL(&btstack)) {
+		jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!");
+		XT_PUTPAGE(mp);
+		return -EIO;
+	}
 	BT_PUSH(&btstack, bn, index);
 
 	/* get child page */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a6a8c16c872..7ab47561b68 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -104,8 +104,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 
 	tid = txBegin(dip->i_sb, 0);
 
-	mutex_lock(&JFS_IP(dip)->commit_mutex);
-	mutex_lock(&JFS_IP(ip)->commit_mutex);
+	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
 	rc = jfs_init_acl(tid, ip, dip);
 	if (rc)
@@ -238,8 +238,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 
 	tid = txBegin(dip->i_sb, 0);
 
-	mutex_lock(&JFS_IP(dip)->commit_mutex);
-	mutex_lock(&JFS_IP(ip)->commit_mutex);
+	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
 	rc = jfs_init_acl(tid, ip, dip);
 	if (rc)
@@ -365,8 +365,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 
 	tid = txBegin(dip->i_sb, 0);
 
-	mutex_lock(&JFS_IP(dip)->commit_mutex);
-	mutex_lock(&JFS_IP(ip)->commit_mutex);
+	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
 	iplist[0] = dip;
 	iplist[1] = ip;
@@ -483,12 +483,12 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
 	if ((rc = get_UCSname(&dname, dentry)))
 		goto out;
 
-	IWRITE_LOCK(ip);
+	IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
 
 	tid = txBegin(dip->i_sb, 0);
 
-	mutex_lock(&JFS_IP(dip)->commit_mutex);
-	mutex_lock(&JFS_IP(ip)->commit_mutex);
+	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
 	iplist[0] = dip;
 	iplist[1] = ip;
@@ -802,8 +802,8 @@ static int jfs_link(struct dentry *old_dentry,
 
 	tid = txBegin(ip->i_sb, 0);
 
-	mutex_lock(&JFS_IP(dir)->commit_mutex);
-	mutex_lock(&JFS_IP(ip)->commit_mutex);
+	mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
 	/*
 	 * scan parent directory for entry/freespace
@@ -913,8 +913,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 
 	tid = txBegin(dip->i_sb, 0);
 
-	mutex_lock(&JFS_IP(dip)->commit_mutex);
-	mutex_lock(&JFS_IP(ip)->commit_mutex);
+	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
 	rc = jfs_init_security(tid, ip, dip);
 	if (rc)
@@ -1127,7 +1127,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out3;
 		}
 	} else if (new_ip) {
-		IWRITE_LOCK(new_ip);
+		IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
 		/* Init inode for quota operations. */
 		DQUOT_INIT(new_ip);
 	}
@@ -1137,13 +1137,21 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 */
 	tid = txBegin(new_dir->i_sb, 0);
 
-	mutex_lock(&JFS_IP(new_dir)->commit_mutex);
-	mutex_lock(&JFS_IP(old_ip)->commit_mutex);
+	/*
+	 * How do we know the locking is safe from deadlocks?
+	 * The vfs does the hard part for us.  Any time we are taking nested
+	 * commit_mutexes, the vfs already has i_mutex held on the parent.
+	 * Here, the vfs has already taken i_mutex on both old_dir and new_dir.
+	 */
+	mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 	if (old_dir != new_dir)
-		mutex_lock(&JFS_IP(old_dir)->commit_mutex);
+		mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex,
+				  COMMIT_MUTEX_SECOND_PARENT);
 
 	if (new_ip) {
-		mutex_lock(&JFS_IP(new_ip)->commit_mutex);
+		mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex,
+				  COMMIT_MUTEX_VICTIM);
 		/*
 		 * Change existing directory entry to new inode number
 		 */
@@ -1357,8 +1365,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 
 	tid = txBegin(dir->i_sb, 0);
 
-	mutex_lock(&JFS_IP(dir)->commit_mutex);
-	mutex_lock(&JFS_IP(ip)->commit_mutex);
+	mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
+	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
 	rc = jfs_init_acl(tid, ip, dir);
 	if (rc)