aboutsummaryrefslogtreecommitdiff
path: root/fs/jfs/jfs_dtree.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-04-16 15:20:36 -0700
commit1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/jfs/jfs_dtree.c
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'fs/jfs/jfs_dtree.c')
-rw-r--r--fs/jfs/jfs_dtree.c4752
1 files changed, 4752 insertions, 0 deletions
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
new file mode 100644
index 00000000000..e357890adfb
--- /dev/null
+++ b/fs/jfs/jfs_dtree.c
@@ -0,0 +1,4752 @@
+/*
+ * Copyright (C) International Business Machines Corp., 2000-2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * jfs_dtree.c: directory B+-tree manager
+ *
+ * B+-tree with variable length key directory:
+ *
+ * each directory page is structured as an array of 32-byte
+ * directory entry slots initialized as a freelist
+ * to avoid search/compaction of free space at insertion.
+ * when an entry is inserted, a number of slots are allocated
+ * from the freelist as required to store variable length data
+ * of the entry; when the entry is deleted, slots of the entry
+ * are returned to freelist.
+ *
+ * leaf entry stores full name as key and file serial number
+ * (aka inode number) as data.
+ * internal/router entry stores suffix compressed name
+ * as key and simple extent descriptor as data.
+ *
+ * each directory page maintains a sorted entry index table
+ * which stores the start slot index of sorted entries
+ * to allow binary search on the table.
+ *
+ * directory starts as a root/leaf page in on-disk inode
+ * inline data area.
+ * when it becomes full, it starts a leaf of an external extent
+ * of length of 1 block. each time the first leaf becomes full,
+ * it is extended rather than split (its size is doubled),
+ * until its length becomes 4 KBytes, from then the extent is split
+ * with new 4 Kbyte extent when it becomes full
+ * to reduce external fragmentation of small directories.
+ *
+ * blah, blah, blah, for linear scan of directory in pieces by
+ * readdir().
+ *
+ *
+ * case-insensitive directory file system
+ *
+ * names are stored in case-sensitive way in leaf entry.
+ * but stored, searched and compared in case-insensitive (uppercase) order
+ * (i.e., both search key and entry key are folded for search/compare):
+ * (note that case-sensitive order is BROKEN in storage, e.g.,
+ * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad
+ *
+ * entries which fold to the same key make up an equivalence class
+ * whose members are stored as contiguous cluster (may cross page boundary)
+ * but whose order is arbitrary and acts as duplicate, e.g.,
+ * abc, Abc, aBc, abC)
+ *
+ * once match is found at leaf, requires scan forward/backward
+ * either for, in case-insensitive search, duplicate
+ * or for, in case-sensitive search, for exact match
+ *
+ * router entry must be created/stored in case-insensitive way
+ * in internal entry:
+ * (right most key of left page and left most key of right page
+ * are folded, and its suffix compression is propagated as router
+ * key in parent)
+ * (e.g., if split occurs <abc> and <aBd>, <ABD> rather than <aB>
+ * should be made the router key for the split)
+ *
+ * case-insensitive search:
+ *
+ * fold search key;
+ *
+ * case-insensitive search of B-tree:
+ * for internal entry, router key is already folded;
+ * for leaf entry, fold the entry key before comparison.
+ *
+ * if (leaf entry case-insensitive match found)
+ * if (next entry satisfies case-insensitive match)
+ * return EDUPLICATE;
+ * if (prev entry satisfies case-insensitive match)
+ * return EDUPLICATE;
+ * return match;
+ * else
+ * return no match;
+ *
+ * serialization:
+ * target directory inode lock is being held on entry/exit
+ * of all main directory service routines.
+ *
+ * log based recovery:
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "jfs_incore.h"
+#include "jfs_superblock.h"
+#include "jfs_filsys.h"
+#include "jfs_metapage.h"
+#include "jfs_dmap.h"
+#include "jfs_unicode.h"
+#include "jfs_debug.h"
+
+/*
+ * dtree split parameter
+ *
+ * Filled in by dtInsert() when the target leaf page has too few free
+ * slots for the new entry, then handed to dtSplitUp(), which sets
+ * pxdlist before passing the structure on to dtSplitRoot(),
+ * dtExtendPage() or dtSplitPage().
+ */
+struct dtsplit {
+ struct metapage *mp; /* pinned page to be split/extended */
+ s16 index; /* position at which the new entry is inserted */
+ s16 nslot; /* number of slots the new entry needs */
+ struct component_name *key; /* name of the new entry */
+ ddata_t *data; /* data of the new entry */
+ struct pxdlist *pxdlist; /* extents allocated by dtSplitUp() for the split */
+};
+
+#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) /* dtpage_t view of MP, via BT_PAGE */
+
+/*
+ * get page buffer for specified block address
+ *
+ * Fetches the page through BT_GETPAGE and, when that succeeded (RC == 0),
+ * sanity-checks the page header:
+ * - header.nextindex must not exceed DTROOTMAXSLOT for the root (BN == 0)
+ *   or header.maxslot for any other page;
+ * - for a non-root page, header.maxslot must not exceed DTPAGEMAXSLOT.
+ * If a check fails the page is released with BT_PUTPAGE, the corruption is
+ * reported through jfs_error(), MP is set to NULL and RC to -EIO.
+ *
+ * n.b. MP and RC are assigned to, so they must be lvalues; BN and P are
+ * evaluated more than once.
+ */
+#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\
+{\
+ BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\
+ if (!(RC))\
+ {\
+ if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\
+ ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\
+ {\
+ BT_PUTPAGE(MP);\
+ jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\
+ MP = NULL;\
+ RC = -EIO;\
+ }\
+ }\
+}
+
+/*
+ * for consistency: plain alias of BT_PUTPAGE, so that pages obtained
+ * with DT_GETPAGE are also released under a DT_ name
+ */
+#define DT_PUTPAGE(MP) BT_PUTPAGE(MP)
+
+#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \
+ BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) /* load the search result of frame LEAF into BN, MP, P, INDEX */
+
+/*
+ * forward references
+ *
+ * Prototypes of the file-local (static) helpers, declared up front so
+ * that the routines below can call them regardless of definition order:
+ * the split/extend/delete propagation routines, the sequential read
+ * helpers, the key comparison/extraction helpers and the slot-level
+ * entry manipulation helpers.
+ */
+static int dtSplitUp(tid_t tid, struct inode *ip,
+ struct dtsplit * split, struct btstack * btstack);
+
+static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
+ struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp);
+
+static int dtExtendPage(tid_t tid, struct inode *ip,
+ struct dtsplit * split, struct btstack * btstack);
+
+static int dtSplitRoot(tid_t tid, struct inode *ip,
+ struct dtsplit * split, struct metapage ** rmpp);
+
+static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp,
+ dtpage_t * fp, struct btstack * btstack);
+
+static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p);
+
+static int dtReadFirst(struct inode *ip, struct btstack * btstack);
+
+static int dtReadNext(struct inode *ip,
+ loff_t * offset, struct btstack * btstack);
+
+static int dtCompare(struct component_name * key, dtpage_t * p, int si);
+
+static int ciCompare(struct component_name * key, dtpage_t * p, int si,
+ int flag);
+
+static void dtGetKey(dtpage_t * p, int i, struct component_name * key,
+ int flag);
+
+static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
+ int ri, struct component_name * key, int flag);
+
+static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key,
+ ddata_t * data, struct dt_lock **);
+
+static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp,
+ struct dt_lock ** sdtlock, struct dt_lock ** ddtlock,
+ int do_index);
+
+static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock);
+
+static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock);
+
+static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock);
+
+#define ciToUpper(c) UniStrupr((c)->name) /* fold the name of component_name *c to uppercase with UniStrupr */
+
+/*
+ * read_index_page()
+ *
+ * Reads one page of a directory's index table.
+ *
+ * Logical block <blkno> of the table is translated with xtLookup() and
+ * the page is then read through read_metapage() at the address the
+ * lookup returned. Having metadata mapped into the directory inode's
+ * address space presents a multitude of problems; we avoid this by
+ * mapping to the absolute address space outside of the *_metapage
+ * routines.
+ *
+ * return: the metapage returned by read_metapage();
+ *	   NULL if the lookup failed or found a zero-length extent.
+ */
+static struct metapage *read_index_page(struct inode *inode, s64 blkno)
+{
+	s64 ext_addr;
+	s32 ext_len;
+	int ext_flag;
+
+	if (xtLookup(inode, blkno, 1, &ext_flag, &ext_addr, &ext_len, 1))
+		return NULL;
+
+	/* nothing is mapped at this block of the table */
+	if (ext_len == 0)
+		return NULL;
+
+	return read_metapage(inode, ext_addr, PSIZE, 1);
+}
+
+/*
+ * get_index_page()
+ *
+ * Counterpart of read_index_page() that hands the translated address to
+ * get_metapage() instead of read_metapage(), i.e. it gets a new page
+ * without reading it.
+ *
+ * return: the metapage returned by get_metapage();
+ *	   NULL if the lookup failed or found a zero-length extent.
+ */
+static struct metapage *get_index_page(struct inode *inode, s64 blkno)
+{
+	s64 ext_addr;
+	s32 ext_len;
+	int ext_flag;
+	int lookup_rc;
+
+	lookup_rc = xtLookup(inode, blkno, 1, &ext_flag, &ext_addr, &ext_len, 1);
+	if (lookup_rc == 0 && ext_len != 0)
+		return get_metapage(inode, ext_addr, PSIZE, 1);
+
+	return NULL;
+}
+
+/*
+ * find_index()
+ *
+ * Locates the directory table entry for <index>.
+ *
+ * parameter:
+ *	ip	- directory inode
+ *	index	- directory table index; valid values are
+ *		  2 <= index < next_index
+ *	mp	- in/out: metapage cached by the caller from a previous
+ *		  call (or NULL). It is released and replaced when the
+ *		  entry lives in a different table block, and set to NULL
+ *		  when the table is inline in the inode.
+ *	lblock	- in/out: table block that *mp holds
+ *
+ * return: pointer to the table slot, or NULL on a bad index or when
+ *	   the table page cannot be read.
+ *
+ * *mp must be released by caller.
+ */
+static struct dir_table_slot *find_index(struct inode *ip, u32 index,
+					 struct metapage ** mp, s64 *lblock)
+{
+	static int warn_budget = 10;	/* rate-limits the index < 2 warning */
+	struct jfs_inode_info *ji = JFS_IP(ip);
+	s64 byte_off;
+	s64 table_blk;
+	int in_page;
+
+	if (index < 2) {
+		if (warn_budget) {
+			jfs_warn("find_entry called with index = %d", index);
+			warn_budget--;
+		}
+		return NULL;
+	}
+
+	if (index >= ji->next_index) {
+		jfs_warn("find_entry called with index >= next_index");
+		return NULL;
+	}
+
+	/*
+	 * Inline directory table: the slot lives in the inode itself
+	 */
+	if (jfs_dirtable_inline(ip)) {
+		*mp = NULL;
+		return &ji->i_dirtable[index - 2];
+	}
+
+	/*
+	 * External directory table: work out block and offset of the slot
+	 */
+	byte_off = (index - 2) * sizeof(struct dir_table_slot);
+	in_page = byte_off & (PSIZE - 1);
+	table_blk = ((byte_off + 1) >> L2PSIZE) <<
+	    JFS_SBI(ip->i_sb)->l2nbperpage;
+
+	/* the caller's page belongs to another table block: drop it */
+	if (*mp && *lblock != table_blk) {
+		release_metapage(*mp);
+		*mp = NULL;
+	}
+
+	if (!*mp) {
+		*lblock = table_blk;
+		*mp = read_index_page(ip, table_blk);
+		if (!*mp) {
+			jfs_err("free_index: error reading directory table");
+			return NULL;
+		}
+	}
+
+	return (struct dir_table_slot *) ((char *) (*mp)->data + in_page);
+}
+
+/*
+ * lock_index()
+ *
+ * Takes a tlckDATA transaction lock on directory table page <mp> and
+ * records one line vector covering the table entry for <index>.
+ */
+static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp,
+			      u32 index)
+{
+	struct tlock *tl = txLock(tid, ip, mp, tlckDATA);
+	struct linelock *ll = (struct linelock *) tl->lock;
+	struct lv *vec;
+
+	/* no vector left in this linelock: get another from txLinelock() */
+	if (ll->index >= ll->maxcnt)
+		ll = txLinelock(ll);
+
+	vec = &ll->lv[ll->index];
+	ll->index++;
+
+	/*
+	 * Linelock slot size is twice the size of directory table
+	 * slot size. 512 entries per page.
+	 */
+	vec->offset = ((index - 2) & 511) >> 1;
+	vec->length = 1;
+}
+
+/*
+ * add_index()
+ *
+ * Adds an entry to the directory index table. This is used to provide
+ * each directory entry with a persistent index in which to resume
+ * directory traversals
+ *
+ * parameter:
+ *	tid	- transaction id
+ *	ip	- directory inode (must satisfy DO_INDEX())
+ *	bn	- block address of the dtree page holding the entry
+ *	slot	- slot of the entry within that page
+ *
+ * return: the index assigned to the entry (always >= 2);
+ *	   0 on failure, in which case next_index is rolled back.
+ *
+ * The first MAX_INLINE_DIRTABLE_ENTRY entries live in the inode
+ * (i_dirtable). When the next one is added, the inline table is copied
+ * to a newly allocated external page and an xtree is started in its
+ * place; from then on one page is added whenever an entry starts a new
+ * page.
+ *
+ * While the inline table is being moved out, the quota charge and the
+ * block allocation are undone on every failure path that has not yet
+ * handed the blocks to the xtree, so a failed call leaks neither.
+ */
+static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
+{
+	struct super_block *sb = ip->i_sb;
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
+	u64 blkno;
+	struct dir_table_slot *dirtab_slot;
+	u32 index;
+	struct linelock *llck;
+	struct lv *lv;
+	struct metapage *mp;
+	s64 offset;
+	uint page_offset;
+	struct tlock *tlck;
+	s64 xaddr;
+
+	ASSERT(DO_INDEX(ip));
+
+	if (jfs_ip->next_index < 2) {
+		jfs_warn("add_index: next_index = %d. Resetting!",
+			 jfs_ip->next_index);
+		jfs_ip->next_index = 2;
+	}
+
+	index = jfs_ip->next_index++;
+
+	if (index <= MAX_INLINE_DIRTABLE_ENTRY) {
+		/*
+		 * i_size reflects size of index table, or 8 bytes per entry.
+		 */
+		ip->i_size = (loff_t) (index - 1) << 3;
+
+		/*
+		 * dir table fits inline within inode
+		 */
+		dirtab_slot = &jfs_ip->i_dirtable[index-2];
+		dirtab_slot->flag = DIR_INDEX_VALID;
+		dirtab_slot->slot = slot;
+		DTSaddress(dirtab_slot, bn);
+
+		set_cflag(COMMIT_Dirtable, ip);
+
+		return index;
+	}
+	if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) {
+		struct dir_table_slot temp_table[12];
+
+		/*
+		 * It's time to move the inline table to an external
+		 * page and begin to build the xtree
+		 */
+		if (DQUOT_ALLOC_BLOCK(ip, sbi->nbperpage))
+			goto clean_up;	/* No quota */
+		if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
+			/* No space: give the quota charge back */
+			DQUOT_FREE_BLOCK(ip, sbi->nbperpage);
+			goto clean_up;
+		}
+
+		/*
+		 * Save the table, we're going to overwrite it with the
+		 * xtree root
+		 */
+		memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table));
+
+		/*
+		 * Initialize empty x-tree
+		 */
+		xtInitRoot(tid, ip);
+
+		/*
+		 * Add the block allocated above to the xtree
+		 */
+		if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) {
+			/* This really shouldn't fail */
+			jfs_warn("add_index: xtInsert failed!");
+			memcpy(&jfs_ip->i_dirtable, temp_table,
+			       sizeof (temp_table));
+			/*
+			 * The blocks never made it into the xtree, so
+			 * nothing else will release them or their quota
+			 */
+			dbFree(ip, xaddr, sbi->nbperpage);
+			DQUOT_FREE_BLOCK(ip, sbi->nbperpage);
+			goto clean_up;
+		}
+		ip->i_size = PSIZE;
+
+		mp = get_index_page(ip, 0);
+		if (mp == NULL) {
+			jfs_err("add_index: get_metapage failed!");
+			xtTruncate(tid, ip, 0, COMMIT_PWMAP);
+			memcpy(&jfs_ip->i_dirtable, temp_table,
+			       sizeof (temp_table));
+			goto clean_up;
+		}
+		tlck = txLock(tid, ip, mp, tlckDATA);
+		llck = (struct linelock *) & tlck->lock;
+		ASSERT(llck->index == 0);
+		lv = &llck->lv[0];
+
+		/* log the 12 copied entries (8 bytes each) as 6 lines */
+		lv->offset = 0;
+		lv->length = 6;	/* tlckDATA slot size is 16 bytes */
+		llck->index++;
+
+		memcpy(mp->data, temp_table, sizeof(temp_table));
+
+		mark_metapage_dirty(mp);
+		release_metapage(mp);
+
+		/*
+		 * Logging is now directed by xtree tlocks
+		 */
+		clear_cflag(COMMIT_Dirtable, ip);
+	}
+
+	offset = (index - 2) * sizeof(struct dir_table_slot);
+	page_offset = offset & (PSIZE - 1);
+	blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage;
+	if (page_offset == 0) {
+		/*
+		 * This will be the beginning of a new page
+		 */
+		xaddr = 0;
+		if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) {
+			jfs_warn("add_index: xtInsert failed!");
+			goto clean_up;
+		}
+		ip->i_size += PSIZE;
+
+		mp = get_index_page(ip, blkno);
+		if (mp)
+			memset(mp->data, 0, PSIZE);	/* Just looks better */
+		else
+			xtTruncate(tid, ip, offset, COMMIT_PWMAP);
+	} else
+		mp = read_index_page(ip, blkno);
+
+	if (mp == NULL) {
+		jfs_err("add_index: get/read_metapage failed!");
+		goto clean_up;
+	}
+
+	lock_index(tid, ip, mp, index);
+
+	dirtab_slot =
+	    (struct dir_table_slot *) ((char *) mp->data + page_offset);
+	dirtab_slot->flag = DIR_INDEX_VALID;
+	dirtab_slot->slot = slot;
+	DTSaddress(dirtab_slot, bn);
+
+	mark_metapage_dirty(mp);
+	release_metapage(mp);
+
+	return index;
+
+      clean_up:
+
+	/* hand the unused index back */
+	jfs_ip->next_index--;
+
+	return 0;
+}
+
+/*
+ * free_index()
+ *
+ * Marks the directory index table entry for <index> as free and stores
+ * <next> in its addr2 field. Silently does nothing when find_index()
+ * cannot locate the entry.
+ */
+static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next)
+{
+	struct metapage *table_mp = NULL;
+	s64 table_blk;
+	struct dir_table_slot *entry;
+
+	entry = find_index(ip, index, &table_mp, &table_blk);
+	if (!entry)
+		return;
+
+	entry->flag = DIR_INDEX_FREE;
+	entry->addr1 = 0;
+	entry->slot = 0;
+	entry->addr2 = cpu_to_le32(next);
+
+	if (!table_mp) {
+		/* inline table: flag it for commit with the inode */
+		set_cflag(COMMIT_Dirtable, ip);
+		return;
+	}
+
+	/* external table: lock the entry and mark the page dirty */
+	lock_index(tid, ip, table_mp, index);
+	mark_metapage_dirty(table_mp);
+	release_metapage(table_mp);
+}
+
+/*
+ * modify_index()
+ *
+ * Points the directory index table entry for <index> at a new location
+ * (dtree page address <bn>, slot <slot>).
+ *
+ * <mp> and <lblock> are the caller-owned page cache passed straight to
+ * find_index(); the page is not released here, so the caller can reuse
+ * it for further calls and must release it when done.
+ *
+ * NOTE(review): lblock is u64 * here but s64 * in find_index().
+ */
+static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn,
+			 int slot, struct metapage ** mp, u64 *lblock)
+{
+	struct dir_table_slot *entry = find_index(ip, index, mp, lblock);
+
+	if (!entry)
+		return;
+
+	DTSaddress(entry, bn);
+	entry->slot = slot;
+
+	if (!*mp) {
+		/* inline table: flag it for commit with the inode */
+		set_cflag(COMMIT_Dirtable, ip);
+		return;
+	}
+
+	lock_index(tid, ip, *mp, index);
+	mark_metapage_dirty(*mp);
+}
+
+/*
+ * read_index()
+ *
+ * Copies the directory table slot for <index> into *dirtab_slot.
+ *
+ * return: 0 - slot copied;
+ *	   -EIO - find_index() could not locate the slot.
+ */
+static int read_index(struct inode *ip, u32 index,
+		      struct dir_table_slot * dirtab_slot)
+{
+	struct metapage *table_mp = NULL;
+	s64 table_blk;
+	struct dir_table_slot *src;
+
+	src = find_index(ip, index, &table_mp, &table_blk);
+	if (!src)
+		return -EIO;
+
+	memcpy(dirtab_slot, src, sizeof(struct dir_table_slot));
+
+	/* only an external table hands back a page to release */
+	if (table_mp)
+		release_metapage(table_mp);
+
+	return 0;
+}
+
+/*
+ * dtSearch()
+ *
+ * function:
+ * Search for the entry with specified key
+ *
+ * parameter:
+ * ip - directory inode
+ * key - name to search for; a private copy is made and, when the
+ * JFS_OS2 mount flag is set, folded to uppercase before comparing
+ * data - in/out inode number: for JFS_REMOVE/JFS_RENAME the value
+ * passed in must match the entry found; on a hit it receives the
+ * inode number of the entry, on a miss that is not an error it is
+ * set to 0
+ * btstack - traversal stack; reset here, parent (bn, index) pairs are
+ * pushed while descending and the top frame receives the result
+ * flag - JFS_LOOKUP, JFS_CREATE, JFS_REMOVE, JFS_RENAME or JFS_FINDDIR
+ *
+ * return: 0 - search result on stack, leaf page pinned
+ * (except for a JFS_LOOKUP hit, which returns 0 with the page
+ * already released and nothing stored in the stack frame);
+ * -ENOMEM - no memory for the folded copy of the key;
+ * -EEXIST - JFS_CREATE and the name already exists;
+ * -ESTALE - JFS_REMOVE/JFS_RENAME and *data differs from the
+ * inode number of the entry found;
+ * -ENOENT - JFS_LOOKUP/JFS_REMOVE/JFS_RENAME and no entry found;
+ * -EIO - page could not be read/was corrupt, or stack overrun
+ * (no page is left pinned on any error return)
+ */
+int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
+ struct btstack * btstack, int flag)
+{
+ int rc = 0;
+ int cmp = 1; /* init for empty page */
+ s64 bn;
+ struct metapage *mp;
+ dtpage_t *p;
+ s8 *stbl;
+ int base, index, lim;
+ struct btframe *btsp;
+ pxd_t *pxd;
+ int psize = 288; /* initial in-line directory */
+ ino_t inumber;
+ struct component_name ciKey;
+ struct super_block *sb = ip->i_sb;
+
+ ciKey.name =
+ (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
+ GFP_NOFS);
+ if (ciKey.name == 0) {
+ rc = -ENOMEM;
+ goto dtSearch_Exit2;
+ }
+
+
+ /* uppercase search key for c-i directory */
+ UniStrcpy(ciKey.name, key->name);
+ ciKey.namlen = key->namlen;
+
+ /* only uppercase if case-insensitive support is on */
+ if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) {
+ ciToUpper(&ciKey);
+ }
+ BT_CLR(btstack); /* reset stack */
+
+ /* init level count for max pages to split */
+ btstack->nsplit = 1;
+
+ /*
+ * search down tree from root:
+ *
+ * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of
+ * internal page, child page Pi contains entry with k, Ki <= K < Kj.
+ *
+ * if entry with search key K is not found
+ * internal page search find the entry with largest key Ki
+ * less than K which point to the child page to search;
+ * leaf page search find the entry with smallest key Kj
+ * greater than K so that the returned index is the position of
+ * the entry to be shifted right for insertion of new entry.
+ * for empty tree, search key is greater than any key of the tree.
+ *
+ * by convention, root bn = 0.
+ */
+ for (bn = 0;;) {
+ /* get/pin the page to search */
+ DT_GETPAGE(ip, bn, mp, psize, p, rc);
+ if (rc)
+ goto dtSearch_Exit1;
+
+ /* get sorted entry table of the page */
+ stbl = DT_GETSTBL(p);
+
+ /*
+ * binary search with search key K on the current page.
+ */
+ for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
+ index = base + (lim >> 1);
+
+ if (p->header.flag & BT_LEAF) {
+ /* uppercase leaf name to compare */
+ cmp =
+ ciCompare(&ciKey, p, stbl[index],
+ JFS_SBI(sb)->mntflag);
+ } else {
+ /* router key is in uppercase */
+
+ cmp = dtCompare(&ciKey, p, stbl[index]);
+
+
+ }
+ if (cmp == 0) {
+ /*
+ * search hit
+ */
+ /* search hit - leaf page:
+ * return the entry found
+ */
+ if (p->header.flag & BT_LEAF) {
+ inumber = le32_to_cpu(
+ ((struct ldtentry *) & p->slot[stbl[index]])->inumber);
+
+ /*
+ * search for JFS_LOOKUP
+ */
+ if (flag == JFS_LOOKUP) {
+ *data = inumber;
+ rc = 0;
+ goto out;
+ }
+
+ /*
+ * search for JFS_CREATE
+ */
+ if (flag == JFS_CREATE) {
+ *data = inumber;
+ rc = -EEXIST;
+ goto out;
+ }
+
+ /*
+ * search for JFS_REMOVE or JFS_RENAME
+ */
+ if ((flag == JFS_REMOVE ||
+ flag == JFS_RENAME) &&
+ *data != inumber) {
+ rc = -ESTALE;
+ goto out;
+ }
+
+ /*
+ * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME
+ */
+ /* save search result */
+ *data = inumber;
+ btsp = btstack->top;
+ btsp->bn = bn;
+ btsp->index = index;
+ btsp->mp = mp;
+
+ rc = 0;
+ goto dtSearch_Exit1;
+ }
+
+ /* search hit - internal page:
+ * descend/search its child page
+ */
+ goto getChild;
+ }
+
+ if (cmp > 0) {
+ base = index + 1;
+ --lim;
+ }
+ }
+
+ /*
+ * search miss
+ *
+ * base is the smallest index with key (Kj) greater than
+ * search key (K) and may be zero or (maxindex + 1) index.
+ */
+ /*
+ * search miss - leaf page
+ *
+ * return location of entry (base) where new entry with
+ * search key K is to be inserted.
+ */
+ if (p->header.flag & BT_LEAF) {
+ /*
+ * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME
+ */
+ if (flag == JFS_LOOKUP || flag == JFS_REMOVE ||
+ flag == JFS_RENAME) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ /*
+ * search for JFS_CREATE|JFS_FINDDIR:
+ *
+ * save search result
+ */
+ *data = 0;
+ btsp = btstack->top;
+ btsp->bn = bn;
+ btsp->index = base;
+ btsp->mp = mp;
+
+ rc = 0;
+ goto dtSearch_Exit1;
+ }
+
+ /*
+ * search miss - internal page
+ *
+ * if base is non-zero, decrement base by one to get the parent
+ * entry of the child page to search.
+ */
+ index = base ? base - 1 : base;
+
+ /*
+ * go down to child page
+ */
+ getChild:
+ /* update max. number of pages to split */
+ if (BT_STACK_FULL(btstack)) {
+ /* Something's corrupted, mark filesystem dirty so
+ * chkdsk will fix it.
+ */
+ jfs_error(sb, "stack overrun in dtSearch!");
+ BT_STACK_DUMP(btstack);
+ rc = -EIO;
+ goto out;
+ }
+ btstack->nsplit++;
+
+ /* push (bn, index) of the parent page/entry */
+ BT_PUSH(btstack, bn, index);
+
+ /* get the child page block number */
+ pxd = (pxd_t *) & p->slot[stbl[index]];
+ bn = addressPXD(pxd);
+ psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize;
+
+ /* unpin the parent page */
+ DT_PUTPAGE(mp);
+ }
+
+ out: /* exits that do not hand the page to the caller: unpin it */
+ DT_PUTPAGE(mp);
+
+ dtSearch_Exit1: /* page, if any, stays pinned; free the private key copy */
+
+ kfree(ciKey.name);
+
+ dtSearch_Exit2:
+
+ return rc;
+}
+
+
+/*
+ * dtInsert()
+ *
+ * function: insert an entry to directory tree
+ *
+ * parameter:
+ * tid - transaction id
+ * ip - directory inode
+ * name - name of the new entry
+ * fsn - points to the inode number stored as the entry's data
+ * btstack - result of a preceding dtSearch(): its top frame names the
+ * pinned leaf page and the index at which to insert
+ *
+ * return: 0 - success;
+ * errno - failure;
+ * (-EMLINK when the directory is indexed and next_index has
+ * reached DIREND; otherwise whatever dtSplitUp() returns)
+ *
+ * The leaf page is unpinned on every return path, either here or,
+ * when the page has to be split/extended, by dtSplitUp().
+ */
+int dtInsert(tid_t tid, struct inode *ip,
+ struct component_name * name, ino_t * fsn, struct btstack * btstack)
+{
+ int rc = 0;
+ struct metapage *mp; /* meta-page buffer */
+ dtpage_t *p; /* base B+-tree index page */
+ s64 bn;
+ int index;
+ struct dtsplit split; /* split information */
+ ddata_t data;
+ struct dt_lock *dtlck;
+ int n;
+ struct tlock *tlck;
+ struct lv *lv;
+
+ /*
+ * retrieve search result
+ *
+ * dtSearch() returns (leaf page pinned, index at which to insert).
+ * n.b. dtSearch() may return index of (maxindex + 1) of
+ * the full page.
+ */
+ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index);
+
+ /*
+ * insert entry for new key
+ */
+ if (DO_INDEX(ip)) {
+ if (JFS_IP(ip)->next_index == DIREND) {
+ DT_PUTPAGE(mp);
+ return -EMLINK;
+ }
+ n = NDTLEAF(name->namlen);
+ data.leaf.tid = tid;
+ data.leaf.ip = ip;
+ } else {
+ n = NDTLEAF_LEGACY(name->namlen);
+ data.leaf.ip = NULL; /* signifies legacy directory format */
+ }
+ data.leaf.ino = *fsn;
+
+ /*
+ * leaf page does not have enough room for new entry:
+ *
+ * extend/split the leaf page;
+ *
+ * dtSplitUp() will insert the entry and unpin the leaf page.
+ */
+ if (n > p->header.freecnt) {
+ split.mp = mp;
+ split.index = index;
+ split.nslot = n;
+ split.key = name;
+ split.data = &data;
+ rc = dtSplitUp(tid, ip, &split, btstack);
+ return rc;
+ }
+
+ /*
+ * leaf page does have enough room for new entry:
+ *
+ * insert the new data entry into the leaf page;
+ */
+ BT_MARK_DIRTY(mp, ip);
+ /*
+ * acquire a transaction lock on the leaf page
+ */
+ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY);
+ dtlck = (struct dt_lock *) & tlck->lock;
+ ASSERT(dtlck->index == 0);
+ lv = & dtlck->lv[0];
+
+ /* linelock header */
+ lv->offset = 0;
+ lv->length = 1;
+ dtlck->index++;
+
+ dtInsertEntry(p, index, name, &data, &dtlck);
+
+ /* linelock stbl of non-root leaf page */
+ if (!(p->header.flag & BT_ROOT)) {
+ if (dtlck->index >= dtlck->maxcnt)
+ dtlck = (struct dt_lock *) txLinelock(dtlck);
+ lv = & dtlck->lv[dtlck->index];
+ n = index >> L2DTSLOTSIZE; /* first stbl slot touched by the insertion */
+ lv->offset = p->header.stblindex + n;
+ lv->length =
+ ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; /* through the slot holding the last index */
+ dtlck->index++;
+ }
+
+ /* unpin the leaf page */
+ DT_PUTPAGE(mp);
+
+ return 0;
+}
+
+
+/*
+ * dtSplitUp()
+ *
+ * function: propagate insertion bottom up;
+ *
+ * parameter:
+ *
+ * return: 0 - success;
+ * errno - failure;
+ * leaf page unpinned;
+ */
+static int dtSplitUp(tid_t tid,
+ struct inode *ip, struct dtsplit * split, struct btstack * btstack)
+{
+ struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ int rc = 0;
+ struct metapage *smp;
+ dtpage_t *sp; /* split page */
+ struct metapage *rmp;
+ dtpage_t *rp; /* new right page split from sp */
+ pxd_t rpxd; /* new right page extent descriptor */
+ struct metapage *lmp;
+ dtpage_t *lp; /* left child page */
+ int skip; /* index of entry of insertion */
+ struct btframe *parent; /* parent page entry on traverse stack */
+ s64 xaddr, nxaddr;
+ int xlen, xsize;
+ struct pxdlist pxdlist;
+ pxd_t *pxd;
+ struct component_name key = { 0, NULL };
+ ddata_t *data = split->data;
+ int n;
+ struct dt_lock *dtlck;
+ struct tlock *tlck;
+ struct lv *lv;
+ int quota_allocation = 0;
+
+ /* get split page */
+ smp = split->mp;
+ sp = DT_PAGE(ip, smp);
+
+ key.name =
+ (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t),
+ GFP_NOFS);
+ if (key.name == 0) {
+ DT_PUTPAGE(smp);
+ rc = -ENOMEM;
+ goto dtSplitUp_Exit;
+ }
+
+ /*
+ * split leaf page
+ *
+ * The split routines insert the new entry, and
+ * acquire txLock as appropriate.
+ */
+ /*
+ * split root leaf page:
+ */
+ if (sp->header.flag & BT_ROOT) {
+ /*
+ * allocate a single extent child page
+ */
+ xlen = 1;
+ n = sbi->bsize >> L2DTSLOTSIZE;
+ n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */
+ n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */
+ if (n <= split->nslot)
+ xlen++;
+ if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) {
+ DT_PUTPAGE(smp);
+ goto freeKeyName;
+ }
+
+ pxdlist.maxnpxd = 1;
+ pxdlist.npxd = 0;
+ pxd = &pxdlist.pxd[0];
+ PXDaddress(pxd, xaddr);
+ PXDlength(pxd, xlen);
+ split->pxdlist = &pxdlist;
+ rc = dtSplitRoot(tid, ip, split, &rmp);
+
+ if (rc)
+ dbFree(ip, xaddr, xlen);
+ else
+ DT_PUTPAGE(rmp);
+
+ DT_PUTPAGE(smp);
+
+ goto freeKeyName;
+ }
+
+ /*
+ * extend first leaf page
+ *
+ * extend the 1st extent if less than buffer page size
+ * (dtExtendPage() returns leaf page unpinned)
+ */
+ pxd = &sp->header.self;
+ xlen = lengthPXD(pxd);
+ xsize = xlen << sbi->l2bsize;
+ if (xsize < PSIZE) {
+ xaddr = addressPXD(pxd);
+ n = xsize >> L2DTSLOTSIZE;
+ n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */
+ if ((n + sp->header.freecnt) <= split->nslot)
+ n = xlen + (xlen << 1);
+ else
+ n = xlen;
+
+ /* Allocate blocks to quota. */
+ if (DQUOT_ALLOC_BLOCK(ip, n)) {
+ rc = -EDQUOT;
+ goto extendOut;
+ }
+ quota_allocation += n;
+
+ if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen,
+ (s64) n, &nxaddr)))
+ goto extendOut;
+
+ pxdlist.maxnpxd = 1;
+ pxdlist.npxd = 0;
+ pxd = &pxdlist.pxd[0];
+ PXDaddress(pxd, nxaddr)
+ PXDlength(pxd, xlen + n);
+ split->pxdlist = &pxdlist;
+ if ((rc = dtExtendPage(tid, ip, split, btstack))) {
+ nxaddr = addressPXD(pxd);
+ if (xaddr != nxaddr) {
+ /* free relocated extent */
+ xlen = lengthPXD(pxd);
+ dbFree(ip, nxaddr, (s64) xlen);
+ } else {
+ /* free extended delta */
+ xlen = lengthPXD(pxd) - n;
+ xaddr = addressPXD(pxd) + xlen;
+ dbFree(ip, xaddr, (s64) n);
+ }
+ }
+
+ extendOut:
+ DT_PUTPAGE(smp);
+ goto freeKeyName;
+ }
+
+ /*
+ * split leaf page <sp> into <sp> and a new right page <rp>.
+ *
+ * return <rp> pinned and its extent descriptor <rpxd>
+ */
+ /*
+ * allocate new directory page extent and
+ * new index page(s) to cover page split(s)
+ *
+ * allocation hint: ?
+ */
+ n = btstack->nsplit;
+ pxdlist.maxnpxd = pxdlist.npxd = 0;
+ xlen = sbi->nbperpage;
+ for (pxd = pxdlist.pxd; n > 0; n--, pxd++) {
+ if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) {
+ PXDaddress(pxd, xaddr);
+ PXDlength(pxd, xlen);
+ pxdlist.maxnpxd++;
+ continue;
+ }
+
+ DT_PUTPAGE(smp);
+
+ /* undo allocation */
+ goto splitOut;
+ }
+
+ split->pxdlist = &pxdlist;
+ if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) {
+ DT_PUTPAGE(smp);
+
+ /* undo allocation */
+ goto splitOut;
+ }
+
+ /*
+ * propagate up the router entry for the leaf page just split
+ *
+ * insert a router entry for the new page into the parent page,
+ * propagate the insert/split up the tree by walking back the stack
+ * of (bn of parent page, index of child page entry in parent page)
+ * that were traversed during the search for the page that split.
+ *
+ * the propagation of insert/split up the tree stops if the root
+ * splits or the page inserted into doesn't have to split to hold
+ * the new entry.
+ *
+ * the parent entry for the split page remains the same, and
+ * a new entry is inserted at its right with the first key and
+ * block number of the new right page.
+ *
+ * There are a maximum of 4 pages pinned at any time:
+ * two children, left parent and right parent (when the parent splits).
+ * keep the child pages pinned while working on the parent.
+ * make sure that all pins are released at exit.
+ */
+ while ((parent = BT_POP(btstack)) != NULL) {
+ /* parent page specified by stack frame <parent> */
+
+ /* keep current child pages (<lp>, <rp>) pinned */
+ lmp = smp;
+ lp = sp;
+
+ /*
+ * insert router entry in parent for new right child page <rp>
+ */
+ /* get the parent page <sp> */
+ DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
+ if (rc) {
+ DT_PUTPAGE(lmp);
+ DT_PUTPAGE(rmp);
+ goto splitOut;
+ }
+
+ /*
+ * The new key entry goes ONE AFTER the index of parent entry,
+ * because the split was to the right.
+ */
+ skip = parent->index + 1;
+
+ /*
+ * compute the key for the router entry
+ *
+ * key suffix compression:
+ * for internal pages that have leaf pages as children,
+ * retain only what's needed to distinguish between
+ * the new entry and the entry on the page to its left.
+ * If the keys compare equal, retain the entire key.
+ *
+ * note that compression is performed only at computing
+ * router key at the lowest internal level.
+ * further compression of the key between pairs of higher
+ * level internal pages loses too much information and
+ * the search may fail.
+ * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,}
+ * results in two adjacent parent entries (a)(xx).
+ * if split occurs between these two entries, and
+ * if compression is applied, the router key of parent entry
+ * of right page (x) will divert search for x into right
+ * subtree and miss x in the left subtree.)
+ *
+ * the entire key must be retained for the next-to-leftmost
+ * internal key at any level of the tree, or search may fail
+ * (e.g., ?)
+ */
+ switch (rp->header.flag & BT_TYPE) {
+ case BT_LEAF:
+ /*
+ * compute the length of prefix for suffix compression
+ * between last entry of left page and first entry
+ * of right page
+ */
+ if ((sp->header.flag & BT_ROOT && skip > 1) ||
+ sp->header.prev != 0 || skip > 1) {
+ /* compute uppercase router prefix key */
+ rc = ciGetLeafPrefixKey(lp,
+ lp->header.nextindex-1,
+ rp, 0, &key,
+ sbi->mntflag);
+ if (rc) {
+ DT_PUTPAGE(lmp);
+ DT_PUTPAGE(rmp);
+ DT_PUTPAGE(smp);
+ goto splitOut;
+ }
+ } else {
+ /* next to leftmost entry of
+ lowest internal level */
+
+ /* compute uppercase router key */
+ dtGetKey(rp, 0, &key, sbi->mntflag);
+ key.name[key.namlen] = 0;
+
+ if ((sbi->mntflag & JFS_OS2) == JFS_OS2)
+ ciToUpper(&key);
+ }
+
+ n = NDTINTERNAL(key.namlen);
+ break;
+
+ case BT_INTERNAL:
+ dtGetKey(rp, 0, &key, sbi->mntflag);
+ n = NDTINTERNAL(key.namlen);
+ break;
+
+ default:
+ jfs_err("dtSplitUp(): UFO!");
+