diff options
author | Alex Tomas <alex@clusterfs.com> | 2008-01-29 00:19:52 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-01-29 00:19:52 -0500 |
commit | c9de560ded61faa5b754137b7753da252391c55a (patch) | |
tree | 2c4311377c4aa72450e27f531e198fe3e1c67db0 | |
parent | 1988b51e476bd097d910c9245b53f2e38aedaf0d (diff) |
ext4: Add multi block allocator for ext4
Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r-- | Documentation/filesystems/ext4.txt | 10 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 39 | ||||
-rw-r--r-- | fs/ext4/Makefile | 2 | ||||
-rw-r--r-- | fs/ext4/balloc.c | 67 | ||||
-rw-r--r-- | fs/ext4/extents.c | 45 | ||||
-rw-r--r-- | fs/ext4/inode.c | 15 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 4552 | ||||
-rw-r--r-- | fs/ext4/migrate.c | 10 | ||||
-rw-r--r-- | fs/ext4/super.c | 62 | ||||
-rw-r--r-- | fs/ext4/xattr.c | 4 | ||||
-rw-r--r-- | include/linux/ext4_fs.h | 76 | ||||
-rw-r--r-- | include/linux/ext4_fs_i.h | 4 | ||||
-rw-r--r-- | include/linux/ext4_fs_sb.h | 52 |
13 files changed, 4900 insertions, 38 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 4f329afe20e..560f88dc709 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -86,9 +86,11 @@ Alex is working on a new set of patches right now. When mounting an ext4 filesystem, the following option are accepted: (*) == default -extents ext4 will use extents to address file data. The +extents (*) ext4 will use extents to address file data. The file system will no longer be mountable by ext3. +noextents ext4 will not use extents for newly created files + journal_checksum Enable checksumming of the journal transactions. This will allow the recovery code in e2fsck and the kernel to detect corruption in the kernel. It is a @@ -206,6 +208,12 @@ nobh (a) cache disk block mapping information "nobh" option tries to avoid associating buffer heads (supported only for "writeback" mode). +mballoc (*) Use the multiple block allocator for block allocation +nomballoc disabled multiple block allocator for block allocation. +stripe=n Number of filesystem blocks that mballoc will try + to use for allocation size and alignment. For RAID5/6 + systems this should be the number of data + disks * RAID chunk size in file system blocks. Data Mode --------- diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index dec99455321..4413a2d4646 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -857,6 +857,45 @@ CPUs. The "procs_blocked" line gives the number of processes currently blocked, waiting for I/O to complete. +1.9 Ext4 file system parameters +------------------------------ +Ext4 file system have one directory per partition under /proc/fs/ext4/ +# ls /proc/fs/ext4/hdc/ +group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req +stats stream_req + +mb_groups: +This file gives the details of mutiblock allocator buddy cache of free blocks + +mb_history: +Multiblock allocation history. + +stats: +This file indicate whether the multiblock allocator should start collecting +statistics. The statistics are shown during unmount + +group_prealloc: +The multiblock allocator normalize the block allocation request to +group_prealloc filesystem blocks if we don't have strip value set. +The stripe value can be specified at mount time or during mke2fs. + +max_to_scan: +How long multiblock allocator can look for a best extent (in found extents) + +min_to_scan: +How long multiblock allocator must look for a best extent + +order2_req: +Multiblock allocator use 2^N search using buddies only for requests greater +than or equal to order2_req. The request size is specfied in file system +blocks. A value of 2 indicate only if the requests are greater than or equal +to 4 blocks. + +stream_req: +Files smaller than stream_req are served by the stream allocator, whose +purpose is to pack requests as close each to other as possible to +produce smooth I/O traffic. Avalue of 16 indicate that file smaller than 16 +filesystem block size will use group based preallocation. ------------------------------------------------------------------------------ Summary diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index d5fd80bc0d0..ac6fa8ca0a2 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o + ext4_jbd2.o migrate.o mballoc.o ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 80a4616c824..ac75ea953d8 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -577,6 +577,8 @@ void ext4_discard_reservation(struct inode *inode) struct ext4_reserve_window_node *rsv; spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock; + ext4_mb_discard_inode_preallocations(inode); + if (!block_i) return; @@ -785,19 +787,29 @@ error_return: * @inode: inode * @block: start physical block to free * @count: number of blocks to count + * @metadata: Are these metadata blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count) + ext4_fsblk_t block, unsigned long count, + int metadata) { struct super_block * sb; unsigned long dquot_freed_blocks; + /* this isn't the right place to decide whether block is metadata + * inode.c/extents.c knows better, but for safety ... */ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || + ext4_should_journal_data(inode)) + metadata = 1; + sb = inode->i_sb; - if (!sb) { - printk ("ext4_free_blocks: nonexistent device"); - return; - } - ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); + + if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info) + ext4_free_blocks_sb(handle, sb, block, count, + &dquot_freed_blocks); + else + ext4_mb_free_blocks(handle, inode, block, count, + metadata, &dquot_freed_blocks); if (dquot_freed_blocks) DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); return; @@ -1576,7 +1588,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) } /** - * ext4_new_blocks() -- core block(s) allocation function + * ext4_new_blocks_old() -- core block(s) allocation function * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) @@ -1589,7 +1601,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * any specific goal block. * */ -ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, +ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp) { struct buffer_head *bitmap_bh = NULL; @@ -1849,13 +1861,46 @@ out: } ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp) + ext4_fsblk_t goal, int *errp) +{ + struct ext4_allocation_request ar; + ext4_fsblk_t ret; + + if (!test_opt(inode->i_sb, MBALLOC)) { + unsigned long count = 1; + ret = ext4_new_blocks_old(handle, inode, goal, &count, errp); + return ret; + } + + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = 1; + ret = ext4_mb_new_blocks(handle, &ar, errp); + return ret; +} + +ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp) { - unsigned long count = 1; + struct ext4_allocation_request ar; + ext4_fsblk_t ret; - return ext4_new_blocks(handle, inode, goal, &count, errp); + if (!test_opt(inode->i_sb, MBALLOC)) { + ret = ext4_new_blocks_old(handle, inode, goal, count, errp); + return ret; + } + + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = *count; + ret = ext4_mb_new_blocks(handle, &ar, errp); + *count = ar.len; + return ret; } + /** * ext4_count_free_blocks() -- count filesystem free blocks * @sb: superblock diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f5cf2a94b6f..0cffb59fff4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -853,7 +853,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext4_free_blocks(handle, inode, ablocks[i], 1); + ext4_free_blocks(handle, inode, ablocks[i], 1, 1); } } kfree(ablocks); @@ -1698,7 +1698,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, ext_debug("index is empty, remove it, free block %llu\n", leaf); bh = sb_find_get_block(inode->i_sb, leaf); ext4_forget(handle, 1, inode, bh, leaf); - ext4_free_blocks(handle, inode, leaf, 1); + ext4_free_blocks(handle, inode, leaf, 1, 1); return err; } @@ -1759,8 +1759,10 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, { struct buffer_head *bh; unsigned short ee_len = ext4_ext_get_actual_len(ex); - int i; + int i, metadata = 0; + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + metadata = 1; #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -1789,7 +1791,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, bh = sb_find_get_block(inode->i_sb, start + i); ext4_forget(handle, 0, inode, bh, start + i); } - ext4_free_blocks(handle, inode, start, num); + ext4_free_blocks(handle, inode, start, num, metadata); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", @@ -2287,6 +2289,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, newblock; int err = 0, depth, ret; unsigned long allocated = 0; + struct ext4_allocation_request ar; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%lu requested for inode %u\n", @@ -2397,8 +2400,15 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) ext4_init_block_alloc_info(inode); - /* allocate new block */ - goal = ext4_ext_find_goal(inode, path, iblock); + /* find neighbour allocated blocks */ + ar.lleft = iblock; + err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); + if (err) + goto out2; + ar.lright = iblock; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); + if (err) + goto out2; /* * See if request is beyond maximum number of blocks we can have in @@ -2421,7 +2431,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, allocated = le16_to_cpu(newex.ee_len); else allocated = max_blocks; - newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err); + + /* allocate new block */ + ar.inode = inode; + ar.goal = ext4_ext_find_goal(inode, path, iblock); + ar.logical = iblock; + ar.len = allocated; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + else + /* disable in-core preallocation for non-regular files */ + ar.flags = 0; + newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; ext_debug("allocate new block: goal %llu, found %llu/%lu\n", @@ -2429,14 +2450,17 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); - newex.ee_len = cpu_to_le16(allocated); + newex.ee_len = cpu_to_le16(ar.len); if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ ext4_ext_mark_uninitialized(&newex); err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) { /* free data blocks we just allocated */ + /* not a good idea to call discard here directly, + * but otherwise we'd need to call it every free() */ + ext4_mb_discard_inode_preallocations(inode); ext4_free_blocks(handle, inode, ext_pblock(&newex), - le16_to_cpu(newex.ee_len)); + le16_to_cpu(newex.ee_len), 0); goto out2; } @@ -2445,6 +2469,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); + allocated = le16_to_cpu(newex.ee_len); outnew: __set_bit(BH_New, &bh_result->b_state); @@ -2496,6 +2521,8 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); + ext4_mb_discard_inode_preallocations(inode); + /* * TODO: optimization is possible here. * Probably we need not scan at all, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a06a3b7cfc3..bb717cbb749 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -551,7 +551,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, return ret; failed_out: for (i = 0; i <index; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1); + ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); return ret; } @@ -650,9 +650,9 @@ failed: ext4_journal_forget(handle, branch[i].bh); } for (i = 0; i <indirect_blks; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1); + ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); - ext4_free_blocks(handle, inode, new_blocks[i], num); + ext4_free_blocks(handle, inode, new_blocks[i], num, 0); return err; } @@ -749,9 +749,10 @@ err_out: for (i = 1; i <= num; i++) { BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); ext4_journal_forget(handle, where[i].bh); - ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1); + ext4_free_blocks(handle, inode, + le32_to_cpu(where[i-1].key), 1, 0); } - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks); + ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); return err; } @@ -2052,7 +2053,7 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, } } - ext4_free_blocks(handle, inode, block_to_free, count); + ext4_free_blocks(handle, inode, block_to_free, count, 0); } /** @@ -2225,7 +2226,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, ext4_journal_test_restart(handle, inode); } - ext4_free_blocks(handle, inode, nr, 1); + ext4_free_blocks(handle, inode, nr, 1, 1); if (parent_bh) { /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c new file mode 100644 index 00000000000..76e5fedc0a0 --- /dev/null +++ b/fs/ext4/mballoc.c @@ -0,0 +1,4552 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas <alex@clusterfs.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + + +/* + * mballoc.c contains the multiblocks allocation routines + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/ext4_jbd2.h> +#include <linux/ext4_fs.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/proc_fs.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/version.h> +#include "group.h" + +/* + * MUSTDO: + * - test ext4_ext_search_left() and ext4_ext_search_right() + * - search for metadata in few groups + * + * TODO v4: + * - normalization should take into account whether file is still open + * - discard preallocations if no free space left (policy?) + * - don't normalize tails + * - quota + * - reservation for superuser + * + * TODO v3: + * - bitmap read-ahead (proposed by Oleg Drokin aka green) + * - track min/max extents in each group for better group selection + * - mb_mark_used() may allocate chunk right after splitting buddy + * - tree of groups sorted by number of free blocks + * - error handling + */ + +/* + * The allocation request involve request for multiple number of blocks + * near to the goal(block) value specified. + * + * During initialization phase of the allocator we decide to use the group + * preallocation or inode preallocation depending on the size file. The + * size of the file could be the resulting file size we would have after + * allocation or the current file size which ever is larger. If the size is + * less that sbi->s_mb_stream_request we select the group + * preallocation. The default value of s_mb_stream_request is 16 + * blocks. This can also be tuned via + * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms + * of number of blocks. + * + * The main motivation for having small file use group preallocation is to + * ensure that we have small file closer in the disk. + * + * First stage the allocator looks at the inode prealloc list + * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for + * this particular inode. The inode prealloc space is represented as: + * + * pa_lstart -> the logical start block for this prealloc space + * pa_pstart -> the physical start block for this prealloc space + * pa_len -> lenght for this prealloc space + * pa_free -> free space available in this prealloc space + * + * The inode preallocation space is used looking at the _logical_ start + * block. If only the logical file block falls within the range of prealloc + * space we will consume the particular prealloc space. This make sure that + * that the we have contiguous physical blocks representing the file blocks + * + * The important thing to be noted in case of inode prealloc space is that + * we don't modify the values associated to inode prealloc space except + * pa_free. + * + * If we are not able to find blocks in the inode prealloc space and if we + * have the group allocation flag set then we look at the locality group + * prealloc space. These are per CPU prealloc list repreasented as + * + * ext4_sb_info.s_locality_groups[smp_processor_id()] + * + * The reason for having a per cpu locality group is to reduce the contention + * between CPUs. It is possible to get scheduled at this point. + * + * The locality group prealloc space is used looking at whether we have + * enough free space (pa_free) withing the prealloc space. + * + * If we can't allocate blocks via inode prealloc or/and locality group + * prealloc then we look at the buddy cache. The buddy cache is represented + * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets + * mapped to the buddy and bitmap information regarding different + * groups. The buddy information is attached to buddy cache inode so that + * we can access them through the page cache. The information regarding + * each group is loaded via ext4_mb_load_buddy. The information involve + * block bitmap and buddy information. The information are stored in the + * inode as: + * + * { page } + * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * + * + * one block each for bitmap and buddy information. So for each group we + * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / + * blocksize) blocks. So it can have information regarding groups_per_page + * which is blocks_per_page/2 + * + * The buddy cache inode is not stored on disk. The inode is thrown + * away when the filesystem is unmounted. + * + * We look for count number of blocks in the buddy cache. If we were able + * to locate that many free blocks we return with additional information + * regarding rest of the contiguous physical block available + * + * Before allocating blocks via buddy cache we normalize the request + * blocks. This ensure we ask for more blocks that we needed. The extra + * blocks that we get after allocation is added to the respective prealloc + * list. In case of inode preallocation we follow a list of heuristics + * based on file size. This can be found in ext4_mb_normalize_request. If + * we are doing a group prealloc we try to normalize the request to + * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to + * 512 blocks. This can be tuned via + * /proc/fs/ext4/<partition/group_prealloc. The value is represented in + * terms of number of blocks. If we have mounted the file system with -O + * stripe=<value> option the group prealloc request is normalized to the + * stripe value (sbi->s_stripe) + * + * The regular allocator(using the buddy cache) support few tunables. + * + * /proc/fs/ext4/<partition>/min_to_scan + * /proc/fs/ext4/<partition>/max_to_scan + * /proc/fs/ext4/<partition>/order2_req + * + * The regular allocator use buddy scan only if the request len is power of + * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The + * value of s_mb_order2_reqs can be tuned via + * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to + * stripe size (sbi->s_stripe), we try to search for contigous block in + * stripe size. This should result in better allocation on RAID setup. If + * not we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan controll the behaviour here. + * min_to_scan indicate how long the mballoc __must__ look for a best + * extent and max_to_scanindicate how long the mballoc __can__ look for a + * best extent in the found extents. Searching for the blocks starts with + * the group specified as the goal value in allocation context via + * ac_g_ex. Each group is first checked based on the criteria whether it + * can used for allocation. ext4_mb_good_group explains how the groups are + * checked. + * + * Both the prealloc space are getting populated as above. So for the first + * request we will hit the buddy cache which will result in this prealloc + * space getting filled. The prealloc space is then later used for the + * subsequent request. + */ + +/* + * mballoc operates on the following data: + * - on-disk bitmap + * - in-core buddy (actually includes buddy and bitmap) + * - preallocation descriptors (PAs) + * + * there are two types of preallocations: + * - inode + * assiged to specific inode and can be used for this inode only. + * it describes part of inode's space preallocated to specific + * physical blocks. any block from that preallocated can be used + * independent. the descriptor just tracks number of blocks left + * unused. so, before taking some block from descriptor, one must + * make sure corresponded logical block isn't allocated yet. this + * also means that freeing any block within descriptor's range + * must discard all preallocated blocks. + * - locality group + * assigned to specific locality group which does not translate to + * permanent set of inodes: inode can join and leave group. space + * from this type of preallocation can be used for any inode. thus + * it's consumed from the beginning to the end. + * + * relation between them can be expressed as: + * in-core buddy = on-disk bitmap + preallocation descriptors + * + * this mean blocks mballoc considers used are: + * - allocated blocks (persistent) + * - preallocated blocks (non-persistent) + * + * consistency in mballoc world means that at any time a block is either + * free or used in ALL structures. notice: "any time" should not be read + * literally -- time is discrete and delimited by locks. + * + * to keep it simple, we don't use block numbers, instead we count number of + * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. + * + * all operations can be expressed as: + * - init buddy: buddy = on-disk + PAs + * - new PA: buddy += N; PA = N + * - use inode PA: on-disk += N; PA -= N + * - discard inode PA buddy -= on-disk - PA; PA = 0 + * - use locality group PA on-disk += N; PA -= N + * - discard locality group PA buddy -= PA; PA = 0 + * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap + * is used in real operation because we can't know actual used + * bits from PA, only from on-disk bitmap + * + * if we follow this strict logic, then all operations above should be atomic. + * given some of them can block, we'd have to use something like semaphores + * killing performance on high-end SMP hardware. let's try to relax it using + * the following knowledge: + * 1) if buddy is referenced, it's already initialized + * 2) while block is used in buddy and the buddy is referenced, + * nobody can re-allocate that block + * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has + * bit set and PA claims same block, it's OK. IOW, one can set bit in + * on-disk bitmap if buddy has same bit set or/and PA covers corresponded + * block + * + * so, now we're building a concurrency table: + * - init buddy vs. + * - new PA + * blocks for PA are allocated in the buddy, buddy must be referenced + * until PA is linked to allocation group to avoid concurrent buddy init + * - use inode PA + * we need to make sure that either on-disk bitmap or PA has uptodate data + * given (3) we care that PA-=N operation doesn't interfere with init + * - discard inode PA + * the simplest way would be to have buddy initialized by the discard + * - use locality group PA + * again PA-=N must be serialized with init + * - discard locality group PA + * the simplest way would be to have buddy initialized by the discard + * - new PA vs. + * - use inode PA + * i_data_sem serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * some mutex should serialize them + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * - use inode PA + * - use inode PA + * i_data_sem or another mutex should serializes them + * - discard inode PA + * discard process must wait until PA isn't used by another process + * - use locality group PA + * nothing wrong here -- they're different PAs covering different blocks + * - discard locality group PA + * discard process must wait until PA isn't used by another process + * + * now we're ready to make few consequences: + * - PA is referenced and while it is no discard is possible + * - PA is referenced until block isn't marked in on-disk bitmap + * - PA changes only after on-disk bitmap + * - discard must not compete with init. either init is done before + * any discard or they're serialized somehow + * - buddy init as sum of on-disk bitmap and PAs is done atomically + * + * a special case when we've used PA to emptiness. no need to modify buddy + * in this case, but we should care about concurrent init + * + */ + + /* + * Logic in few words: + * + * - allocation: + * load group + * find blocks + * mark bits in on-disk bitmap + * release group + * + * - use preallocation: + * find proper PA (per-inode or group) + * load group + * mark bits in on-disk bitmap + * release group + * release PA + * + * - free: + * load group + * mark bits in on-disk bitmap + * release group + * + * - discard preallocations in group: + * mark PAs deleted + * move them onto local list + * load on-disk bitmap + * load group + * remove PA from object (inode or locality group) + * mark free blocks in-core + * + * - discard inode's preallocations: + */ + +/* + * Locking rules + * + * Locks: + * - bitlock on a group (group) + * - object (inode/locality) (object) + * - per-pa lock (pa) + * + * Paths: + * - new pa + * object + * group + * + * - find and use pa: + * pa + * + * - release consumed pa: + * pa + * group + * object + * + * - generate in-core bitmap: + * group + * pa + * + * - discard all for given object (inode, locality group): + * object + * pa + * group + * + * - discard all for given group: + * group + * pa + * group + * object + * + */ + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + */ +#define MB_DEBUG__ +#ifdef MB_DEBUG +#define mb_debug(fmt, a...) printk(fmt, ##a) +#else +#define mb_debug(fmt, a...) +#endif + +/* + * with EXT4_MB_HISTORY mballoc stores last N allocations in memory + * and you can monitor it in /proc/fs/ext4/<dev>/mb_history + */ +#define EXT4_MB_HISTORY +#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ +#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ +#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ +#define EXT4_MB_HISTORY_FREE 8 /* free */ + +#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ + EXT4_MB_HISTORY_PREALLOC) + +/* + * How long mballoc can look for a best extent (in found extents) + */ +#define MB_DEFAULT_MAX_TO_SCAN 200 + +/* + * How long mballoc must look for a best extent + */ +#define MB_DEFAULT_MIN_TO_SCAN 10 + +/* + * How many groups mballoc will scan looking for the best chunk + */ +#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 + +/* + * with 'ext4_mb_stats' allocator will collect stats that will be + * shown at umount. The collecting costs though! + */ +#define MB_DEFAULT_STATS 1 + +/* + * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served + * by the stream allocator, which purpose is to pack requests + * as close each to other as possible to produce smooth I/O traffic + * We use locality group prealloc space for stream request. + * We can tune the same via /proc/fs/ext4/<parition>/stream_req + */ +#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ + +/* + * for which requests use 2^N search using buddies + */ +#define MB_DEFAULT_ORDER2_REQS 2 + +/* + * default group prealloc size 512 blocks + */ +#define MB_DEFAULT_GROUP_PREALLOC 512 + +static struct kmem_cache *ext4_pspace_cachep; + +#ifdef EXT4_BB_MAX_BLOCKS +#undef EXT4_BB_MAX_BLOCKS +#endif +#define EXT4_BB_MAX_BLOCKS 30 + +struct ext4_free_metadata { + ext4_group_t group; + unsigned short num; + ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; + struct list_head list; +}; + +struct ext4_group_info { + unsigned long bb_state; + unsigned long bb_tid; + struct ext4_free_metadata *bb_md_cur; + unsigned short bb_first_free; + unsigned short bb_free; + unsigned short bb_fragments; + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + unsigned short bb_counters[]; +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_LOCKED_BIT 1 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + + +struct ext4_prealloc_space { + struct list_head pa_inode_list; + struct list_head pa_group_list; + union { + struct list_head pa_tmp_list; + struct rcu_head pa_rcu; + } u; + spinlock_t pa_lock; + atomic_t pa_count; + unsigned pa_deleted; + ext4_fsblk_t pa_pstart; /* phys. block */ + ext4_lblk_t pa_lstart; /* log. block */ + unsigned short pa_len; /* len of preallocated chunk */ + unsigned short pa_free; /* how many blocks are free */ + unsigned short pa_linear; /* consumed in one direction + * strictly, for grp prealloc */ + spinlock_t *pa_obj_lock; + struct inode *pa_inode; /* hack, for history only */ +}; + + +struct ext4_free_extent { + ext4_lblk_t fe_logical; + ext4_grpblk_t fe_start; + ext4_group_t fe_group; + int fe_len; +}; + +/* + * Locality group: + * we try to group all related changes together + * so that writeback can flush/allocate them together as well + */ +struct ext4_locality_group { + /* for allocator */ + struct mutex lg_mutex; /* to serialize allocates */ + struct list_head lg_prealloc_list;/* list of preallocations */ + spinlock_t lg_prealloc_lock; +}; + +struct ext4_allocation_context { + struct inode *ac_inode; + struct super_block *ac_sb; + + /* original request */ + struct ext4_free_extent ac_o_ex; + + /* goal request (after normalization) */ + struct ext4_free_extent ac_g_ex; + + /* the best found extent */ + struct ext4_free_extent ac_b_ex; + + /* copy of the bext found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + + /* number of iterations done. we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; + __u16 ac_flags; /* allocation hints */ + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_repeats; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + __u8 ac_op; /* operation, for history only */ + struct page *ac_bitmap_page; + struct page *ac_buddy_page; + struct ext4_prealloc_space *ac_pa; + struct ext4_locality_group *ac_lg; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext4_mb_history { + struct ext4 |