diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/jfs/jfs_dtree.c |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'fs/jfs/jfs_dtree.c')
-rw-r--r-- | fs/jfs/jfs_dtree.c | 4752 |
1 files changed, 4752 insertions, 0 deletions
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c new file mode 100644 index 00000000000..e357890adfb --- /dev/null +++ b/fs/jfs/jfs_dtree.c @@ -0,0 +1,4752 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_dtree.c: directory B+-tree manager + * + * B+-tree with variable length key directory: + * + * each directory page is structured as an array of 32-byte + * directory entry slots initialized as a freelist + * to avoid search/compaction of free space at insertion. + * when an entry is inserted, a number of slots are allocated + * from the freelist as required to store variable length data + * of the entry; when the entry is deleted, slots of the entry + * are returned to freelist. + * + * leaf entry stores full name as key and file serial number + * (aka inode number) as data. + * internal/router entry stores sufffix compressed name + * as key and simple extent descriptor as data. + * + * each directory page maintains a sorted entry index table + * which stores the start slot index of sorted entries + * to allow binary search on the table. + * + * directory starts as a root/leaf page in on-disk inode + * inline data area. + * when it becomes full, it starts a leaf of a external extent + * of length of 1 block. each time the first leaf becomes full, + * it is extended rather than split (its size is doubled), + * until its length becoms 4 KBytes, from then the extent is split + * with new 4 Kbyte extent when it becomes full + * to reduce external fragmentation of small directories. + * + * blah, blah, blah, for linear scan of directory in pieces by + * readdir(). + * + * + * case-insensitive directory file system + * + * names are stored in case-sensitive way in leaf entry. + * but stored, searched and compared in case-insensitive (uppercase) order + * (i.e., both search key and entry key are folded for search/compare): + * (note that case-sensitive order is BROKEN in storage, e.g., + * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad + * + * entries which folds to the same key makes up a equivalent class + * whose members are stored as contiguous cluster (may cross page boundary) + * but whose order is arbitrary and acts as duplicate, e.g., + * abc, Abc, aBc, abC) + * + * once match is found at leaf, requires scan forward/backward + * either for, in case-insensitive search, duplicate + * or for, in case-sensitive search, for exact match + * + * router entry must be created/stored in case-insensitive way + * in internal entry: + * (right most key of left page and left most key of right page + * are folded, and its suffix compression is propagated as router + * key in parent) + * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB> + * should be made the router key for the split) + * + * case-insensitive search: + * + * fold search key; + * + * case-insensitive search of B-tree: + * for internal entry, router key is already folded; + * for leaf entry, fold the entry key before comparison. + * + * if (leaf entry case-insensitive match found) + * if (next entry satisfies case-insensitive match) + * return EDUPLICATE; + * if (prev entry satisfies case-insensitive match) + * return EDUPLICATE; + * return match; + * else + * return no match; + * + * serialization: + * target directory inode lock is being held on entry/exit + * of all main directory service routines. + * + * log based recovery: + */ + +#include <linux/fs.h> +#include <linux/quotaops.h> +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* dtree split parameter */ +struct dtsplit { + struct metapage *mp; + s16 index; + s16 nslot; + struct component_name *key; + ddata_t *data; + struct pxdlist *pxdlist; +}; + +#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) + +/* get page buffer for specified block address */ +#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ +{\ + BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\ + if (!(RC))\ + {\ + if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ + ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ + {\ + BT_PUTPAGE(MP);\ + jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\ + MP = NULL;\ + RC = -EIO;\ + }\ + }\ +} + +/* for consistency */ +#define DT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) + +/* + * forward references + */ +static int dtSplitUp(tid_t tid, struct inode *ip, + struct dtsplit * split, struct btstack * btstack); + +static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, + struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); + +static int dtExtendPage(tid_t tid, struct inode *ip, + struct dtsplit * split, struct btstack * btstack); + +static int dtSplitRoot(tid_t tid, struct inode *ip, + struct dtsplit * split, struct metapage ** rmpp); + +static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, + dtpage_t * fp, struct btstack * btstack); + +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); + +static int dtReadFirst(struct inode *ip, struct btstack * btstack); + +static int dtReadNext(struct inode *ip, + loff_t * offset, struct btstack * btstack); + +static int dtCompare(struct component_name * key, dtpage_t * p, int si); + +static int ciCompare(struct component_name * key, dtpage_t * p, int si, + int flag); + +static void dtGetKey(dtpage_t * p, int i, struct component_name * key, + int flag); + +static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, struct component_name * key, int flag); + +static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, + ddata_t * data, struct dt_lock **); + +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, + int do_index); + +static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); + +static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); + +static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); + +#define ciToUpper(c) UniStrupr((c)->name) + +/* + * read_index_page() + * + * Reads a page of a directory's index table. + * Having metadata mapped into the directory inode's address space + * presents a multitude of problems. We avoid this by mapping to + * the absolute address space outside of the *_metapage routines + */ +static struct metapage *read_index_page(struct inode *inode, s64 blkno) +{ + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xlen == 0)) + return NULL; + + return read_metapage(inode, xaddr, PSIZE, 1); +} + +/* + * get_index_page() + * + * Same as get_index_page(), but get's a new page without reading + */ +static struct metapage *get_index_page(struct inode *inode, s64 blkno) +{ + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xlen == 0)) + return NULL; + + return get_metapage(inode, xaddr, PSIZE, 1); +} + +/* + * find_index() + * + * Returns dtree page containing directory table entry for specified + * index and pointer to its entry. + * + * mp must be released by caller. + */ +static struct dir_table_slot *find_index(struct inode *ip, u32 index, + struct metapage ** mp, s64 *lblock) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + s64 blkno; + s64 offset; + int page_offset; + struct dir_table_slot *slot; + static int maxWarnings = 10; + + if (index < 2) { + if (maxWarnings) { + jfs_warn("find_entry called with index = %d", index); + maxWarnings--; + } + return NULL; + } + + if (index >= jfs_ip->next_index) { + jfs_warn("find_entry called with index >= next_index"); + return NULL; + } + + if (jfs_dirtable_inline(ip)) { + /* + * Inline directory table + */ + *mp = NULL; + slot = &jfs_ip->i_dirtable[index - 2]; + } else { + offset = (index - 2) * sizeof(struct dir_table_slot); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << + JFS_SBI(ip->i_sb)->l2nbperpage; + + if (*mp && (*lblock != blkno)) { + release_metapage(*mp); + *mp = NULL; + } + if (*mp == 0) { + *lblock = blkno; + *mp = read_index_page(ip, blkno); + } + if (*mp == 0) { + jfs_err("free_index: error reading directory table"); + return NULL; + } + + slot = + (struct dir_table_slot *) ((char *) (*mp)->data + + page_offset); + } + return slot; +} + +static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, + u32 index) +{ + struct tlock *tlck; + struct linelock *llck; + struct lv *lv; + + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (struct linelock *) tlck->lock; + + if (llck->index >= llck->maxcnt) + llck = txLinelock(llck); + lv = &llck->lv[llck->index]; + + /* + * Linelock slot size is twice the size of directory table + * slot size. 512 entries per page. + */ + lv->offset = ((index - 2) & 511) >> 1; + lv->length = 1; + llck->index++; +} + +/* + * add_index() + * + * Adds an entry to the directory index table. This is used to provide + * each directory entry with a persistent index in which to resume + * directory traversals + */ +static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) +{ + struct super_block *sb = ip->i_sb; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + u64 blkno; + struct dir_table_slot *dirtab_slot; + u32 index; + struct linelock *llck; + struct lv *lv; + struct metapage *mp; + s64 offset; + uint page_offset; + struct tlock *tlck; + s64 xaddr; + + ASSERT(DO_INDEX(ip)); + + if (jfs_ip->next_index < 2) { + jfs_warn("add_index: next_index = %d. Resetting!", + jfs_ip->next_index); + jfs_ip->next_index = 2; + } + + index = jfs_ip->next_index++; + + if (index <= MAX_INLINE_DIRTABLE_ENTRY) { + /* + * i_size reflects size of index table, or 8 bytes per entry. + */ + ip->i_size = (loff_t) (index - 1) << 3; + + /* + * dir table fits inline within inode + */ + dirtab_slot = &jfs_ip->i_dirtable[index-2]; + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + set_cflag(COMMIT_Dirtable, ip); + + return index; + } + if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { + struct dir_table_slot temp_table[12]; + + /* + * It's time to move the inline table to an external + * page and begin to build the xtree + */ + if (DQUOT_ALLOC_BLOCK(ip, sbi->nbperpage) || + dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) + goto clean_up; /* No space */ + + /* + * Save the table, we're going to overwrite it with the + * xtree root + */ + memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); + + /* + * Initialize empty x-tree + */ + xtInitRoot(tid, ip); + + /* + * Allocate the first block & add it to the xtree + */ + if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { + /* This really shouldn't fail */ + jfs_warn("add_index: xtInsert failed!"); + memcpy(&jfs_ip->i_dirtable, temp_table, + sizeof (temp_table)); + goto clean_up; + } + ip->i_size = PSIZE; + + if ((mp = get_index_page(ip, 0)) == 0) { + jfs_err("add_index: get_metapage failed!"); + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + memcpy(&jfs_ip->i_dirtable, temp_table, + sizeof (temp_table)); + goto clean_up; + } + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (struct linelock *) & tlck->lock; + ASSERT(llck->index == 0); + lv = &llck->lv[0]; + + lv->offset = 0; + lv->length = 6; /* tlckDATA slot size is 16 bytes */ + llck->index++; + + memcpy(mp->data, temp_table, sizeof(temp_table)); + + mark_metapage_dirty(mp); + release_metapage(mp); + + /* + * Logging is now directed by xtree tlocks + */ + clear_cflag(COMMIT_Dirtable, ip); + } + + offset = (index - 2) * sizeof(struct dir_table_slot); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; + if (page_offset == 0) { + /* + * This will be the beginning of a new page + */ + xaddr = 0; + if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { + jfs_warn("add_index: xtInsert failed!"); + goto clean_up; + } + ip->i_size += PSIZE; + + if ((mp = get_index_page(ip, blkno))) + memset(mp->data, 0, PSIZE); /* Just looks better */ + else + xtTruncate(tid, ip, offset, COMMIT_PWMAP); + } else + mp = read_index_page(ip, blkno); + + if (mp == 0) { + jfs_err("add_index: get/read_metapage failed!"); + goto clean_up; + } + + lock_index(tid, ip, mp, index); + + dirtab_slot = + (struct dir_table_slot *) ((char *) mp->data + page_offset); + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + mark_metapage_dirty(mp); + release_metapage(mp); + + return index; + + clean_up: + + jfs_ip->next_index--; + + return 0; +} + +/* + * free_index() + * + * Marks an entry to the directory index table as free. + */ +static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) +{ + struct dir_table_slot *dirtab_slot; + s64 lblock; + struct metapage *mp = NULL; + + dirtab_slot = find_index(ip, index, &mp, &lblock); + + if (dirtab_slot == 0) + return; + + dirtab_slot->flag = DIR_INDEX_FREE; + dirtab_slot->slot = dirtab_slot->addr1 = 0; + dirtab_slot->addr2 = cpu_to_le32(next); + + if (mp) { + lock_index(tid, ip, mp, index); + mark_metapage_dirty(mp); + release_metapage(mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * modify_index() + * + * Changes an entry in the directory index table + */ +static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, + int slot, struct metapage ** mp, u64 *lblock) +{ + struct dir_table_slot *dirtab_slot; + + dirtab_slot = find_index(ip, index, mp, lblock); + + if (dirtab_slot == 0) + return; + + DTSaddress(dirtab_slot, bn); + dirtab_slot->slot = slot; + + if (*mp) { + lock_index(tid, ip, *mp, index); + mark_metapage_dirty(*mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * read_index() + * + * reads a directory table slot + */ +static int read_index(struct inode *ip, u32 index, + struct dir_table_slot * dirtab_slot) +{ + s64 lblock; + struct metapage *mp = NULL; + struct dir_table_slot *slot; + + slot = find_index(ip, index, &mp, &lblock); + if (slot == 0) { + return -EIO; + } + + memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); + + if (mp) + release_metapage(mp); + + return 0; +} + +/* + * dtSearch() + * + * function: + * Search for the entry with specified key + * + * parameter: + * + * return: 0 - search result on stack, leaf page pinned; + * errno - I/O error + */ +int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, + struct btstack * btstack, int flag) +{ + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; + struct metapage *mp; + dtpage_t *p; + s8 *stbl; + int base, index, lim; + struct btframe *btsp; + pxd_t *pxd; + int psize = 288; /* initial in-line directory */ + ino_t inumber; + struct component_name ciKey; + struct super_block *sb = ip->i_sb; + + ciKey.name = + (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + GFP_NOFS); + if (ciKey.name == 0) { + rc = -ENOMEM; + goto dtSearch_Exit2; + } + + + /* uppercase search key for c-i directory */ + UniStrcpy(ciKey.name, key->name); + ciKey.namlen = key->namlen; + + /* only uppercase if case-insensitive support is on */ + if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { + ciToUpper(&ciKey); + } + BT_CLR(btstack); /* reset stack */ + + /* init level count for max pages to split */ + btstack->nsplit = 1; + + /* + * search down tree from root: + * + * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + goto dtSearch_Exit1; + + /* get sorted entry table of the page */ + stbl = DT_GETSTBL(p); + + /* + * binary search with search key K on the current page. + */ + for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { + index = base + (lim >> 1); + + if (p->header.flag & BT_LEAF) { + /* uppercase leaf name to compare */ + cmp = + ciCompare(&ciKey, p, stbl[index], + JFS_SBI(sb)->mntflag); + } else { + /* router key is in uppercase */ + + cmp = dtCompare(&ciKey, p, stbl[index]); + + + } + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + inumber = le32_to_cpu( + ((struct ldtentry *) & p->slot[stbl[index]])->inumber); + + /* + * search for JFS_LOOKUP + */ + if (flag == JFS_LOOKUP) { + *data = inumber; + rc = 0; + goto out; + } + + /* + * search for JFS_CREATE + */ + if (flag == JFS_CREATE) { + *data = inumber; + rc = -EEXIST; + goto out; + } + + /* + * search for JFS_REMOVE or JFS_RENAME + */ + if ((flag == JFS_REMOVE || + flag == JFS_RENAME) && + *data != inumber) { + rc = -ESTALE; + goto out; + } + + /* + * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME + */ + /* save search result */ + *data = inumber; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* search hit - internal page: + * descend/search its child page + */ + goto getChild; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or (maxindex + 1) index. + */ + /* + * search miss - leaf page + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + /* + * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME + */ + if (flag == JFS_LOOKUP || flag == JFS_REMOVE || + flag == JFS_RENAME) { + rc = -ENOENT; + goto out; + } + + /* + * search for JFS_CREATE|JFS_FINDDIR: + * + * save search result + */ + *data = 0; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* + * search miss - internal page + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + getChild: + /* update max. number of pages to split */ + if (BT_STACK_FULL(btstack)) { + /* Something's corrupted, mark filesytem dirty so + * chkdsk will fix it. + */ + jfs_error(sb, "stack overrun in dtSearch!"); + BT_STACK_DUMP(btstack); + rc = -EIO; + goto out; + } + btstack->nsplit++; + + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + pxd = (pxd_t *) & p->slot[stbl[index]]; + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + out: + DT_PUTPAGE(mp); + + dtSearch_Exit1: + + kfree(ciKey.name); + + dtSearch_Exit2: + + return rc; +} + + +/* + * dtInsert() + * + * function: insert an entry to directory tree + * + * parameter: + * + * return: 0 - success; + * errno - failure; + */ +int dtInsert(tid_t tid, struct inode *ip, + struct component_name * name, ino_t * fsn, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; /* meta-page buffer */ + dtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index; + struct dtsplit split; /* split information */ + ddata_t data; + struct dt_lock *dtlck; + int n; + struct tlock *tlck; + struct lv *lv; + + /* + * retrieve search result + * + * dtSearch() returns (leaf page pinned, index at which to insert). + * n.b. dtSearch() may return index of (maxindex + 1) of + * the full page. + */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* + * insert entry for new key + */ + if (DO_INDEX(ip)) { + if (JFS_IP(ip)->next_index == DIREND) { + DT_PUTPAGE(mp); + return -EMLINK; + } + n = NDTLEAF(name->namlen); + data.leaf.tid = tid; + data.leaf.ip = ip; + } else { + n = NDTLEAF_LEGACY(name->namlen); + data.leaf.ip = NULL; /* signifies legacy directory format */ + } + data.leaf.ino = *fsn; + + /* + * leaf page does not have enough room for new entry: + * + * extend/split the leaf page; + * + * dtSplitUp() will insert the entry and unpin the leaf page. + */ + if (n > p->header.freecnt) { + split.mp = mp; + split.index = index; + split.nslot = n; + split.key = name; + split.data = &data; + rc = dtSplitUp(tid, ip, &split, btstack); + return rc; + } + + /* + * leaf page does have enough room for new entry: + * + * insert the new data entry into the leaf page; + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + dtInsertEntry(p, index, name, &data, &dtlck); + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + n = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + n; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} + + +/* + * dtSplitUp() + * + * function: propagate insertion bottom up; + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * leaf page unpinned; + */ +static int dtSplitUp(tid_t tid, + struct inode *ip, struct dtsplit * split, struct btstack * btstack) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int rc = 0; + struct metapage *smp; + dtpage_t *sp; /* split page */ + struct metapage *rmp; + dtpage_t *rp; /* new right page split from sp */ + pxd_t rpxd; /* new right page extent descriptor */ + struct metapage *lmp; + dtpage_t *lp; /* left child page */ + int skip; /* index of entry of insertion */ + struct btframe *parent; /* parent page entry on traverse stack */ + s64 xaddr, nxaddr; + int xlen, xsize; + struct pxdlist pxdlist; + pxd_t *pxd; + struct component_name key = { 0, NULL }; + ddata_t *data = split->data; + int n; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int quota_allocation = 0; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + key.name = + (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), + GFP_NOFS); + if (key.name == 0) { + DT_PUTPAGE(smp); + rc = -ENOMEM; + goto dtSplitUp_Exit; + } + + /* + * split leaf page + * + * The split routines insert the new entry, and + * acquire txLock as appropriate. + */ + /* + * split root leaf page: + */ + if (sp->header.flag & BT_ROOT) { + /* + * allocate a single extent child page + */ + xlen = 1; + n = sbi->bsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ + if (n <= split->nslot) + xlen++; + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { + DT_PUTPAGE(smp); + goto freeKeyName; + } + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + split->pxdlist = &pxdlist; + rc = dtSplitRoot(tid, ip, split, &rmp); + + if (rc) + dbFree(ip, xaddr, xlen); + else + DT_PUTPAGE(rmp); + + DT_PUTPAGE(smp); + + goto freeKeyName; + } + + /* + * extend first leaf page + * + * extend the 1st extent if less than buffer page size + * (dtExtendPage() reurns leaf page unpinned) + */ + pxd = &sp->header.self; + xlen = lengthPXD(pxd); + xsize = xlen << sbi->l2bsize; + if (xsize < PSIZE) { + xaddr = addressPXD(pxd); + n = xsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + if ((n + sp->header.freecnt) <= split->nslot) + n = xlen + (xlen << 1); + else + n = xlen; + + /* Allocate blocks to quota. */ + if (DQUOT_ALLOC_BLOCK(ip, n)) { + rc = -EDQUOT; + goto extendOut; + } + quota_allocation += n; + + if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, + (s64) n, &nxaddr))) + goto extendOut; + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, nxaddr) + PXDlength(pxd, xlen + n); + split->pxdlist = &pxdlist; + if ((rc = dtExtendPage(tid, ip, split, btstack))) { + nxaddr = addressPXD(pxd); + if (xaddr != nxaddr) { + /* free relocated extent */ + xlen = lengthPXD(pxd); + dbFree(ip, nxaddr, (s64) xlen); + } else { + /* free extended delta */ + xlen = lengthPXD(pxd) - n; + xaddr = addressPXD(pxd) + xlen; + dbFree(ip, xaddr, (s64) n); + } + } + + extendOut: + DT_PUTPAGE(smp); + goto freeKeyName; + } + + /* + * split leaf page <sp> into <sp> and a new right page <rp>. + * + * return <rp> pinned and its extent descriptor <rpxd> + */ + /* + * allocate new directory page extent and + * new index page(s) to cover page split(s) + * + * allocation hint: ? + */ + n = btstack->nsplit; + pxdlist.maxnpxd = pxdlist.npxd = 0; + xlen = sbi->nbperpage; + for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + pxdlist.maxnpxd++; + continue; + } + + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + split->pxdlist = &pxdlist; + if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. + * + * There are a maximum of 4 pages pinned at any time: + * two children, left parent and right parent (when the parent splits). + * keep the child pages pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame <parent> */ + + /* keep current child pages (<lp>, <rp>) pinned */ + lmp = smp; + lp = sp; + + /* + * insert router entry in parent for new right child page <rp> + */ + /* get the parent page <sp> */ + DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + goto splitOut; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * compute the key for the router entry + * + * key suffix compression: + * for internal pages that have leaf pages as children, + * retain only what's needed to distinguish between + * the new entry and the entry on the page to its left. + * If the keys compare equal, retain the entire key. + * + * note that compression is performed only at computing + * router key at the lowest internal level. + * further compression of the key between pairs of higher + * level internal pages loses too much information and + * the search may fail. + * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} + * results in two adjacent parent entries (a)(xx). + * if split occurs between these two entries, and + * if compression is applied, the router key of parent entry + * of right page (x) will divert search for x into right + * subtree and miss x in the left subtree.) + * + * the entire key must be retained for the next-to-leftmost + * internal key at any level of the tree, or search may fail + * (e.g., ?) + */ + switch (rp->header.flag & BT_TYPE) { + case BT_LEAF: + /* + * compute the length of prefix for suffix compression + * between last entry of left page and first entry + * of right page + */ + if ((sp->header.flag & BT_ROOT && skip > 1) || + sp->header.prev != 0 || skip > 1) { + /* compute uppercase router prefix key */ + rc = ciGetLeafPrefixKey(lp, + lp->header.nextindex-1, + rp, 0, &key, + sbi->mntflag); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + DT_PUTPAGE(smp); + goto splitOut; + } + } else { + /* next to leftmost entry of + lowest internal level */ + + /* compute uppercase router key */ + dtGetKey(rp, 0, &key, sbi->mntflag); + key.name[key.namlen] = 0; + + if ((sbi->mntflag & JFS_OS2) == JFS_OS2) + ciToUpper(&key); + } + + n = NDTINTERNAL(key.namlen); + break; + + case BT_INTERNAL: + dtGetKey(rp, 0, &key, sbi->mntflag); + n = NDTINTERNAL(key.namlen); + break; + + default: + jfs_err("dtSplitUp(): UFO!"); + |