diff options
Diffstat (limited to 'fs/ext4/mballoc.c')
| -rw-r--r-- | fs/ext4/mballoc.c | 4116 |
1 files changed, 2362 insertions, 1754 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ef97f19c2f9..2dcb936be90 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,20 +21,19 @@ * mballoc.c contains the multiblocks allocation routines */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/namei.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/quotaops.h> -#include <linux/buffer_head.h> +#include "ext4_jbd2.h" +#include "mballoc.h" +#include <linux/log2.h> #include <linux/module.h> -#include <linux/swap.h> -#include <linux/proc_fs.h> -#include <linux/pagemap.h> -#include <linux/seq_file.h> -#include <linux/version.h> -#include "group.h" +#include <linux/slab.h> +#include <trace/events/ext4.h> + +#ifdef CONFIG_EXT4_DEBUG +ushort ext4_mballoc_debug __read_mostly; + +module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); +MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); +#endif /* * MUSTDO: @@ -60,32 +59,33 @@ * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * - * During initialization phase of the allocator we decide to use the group - * preallocation or inode preallocation depending on the size file. The - * size of the file could be the resulting file size we would have after - * allocation or the current file size which ever is larger. If the size is - * less that sbi->s_mb_stream_request we select the group - * preallocation. The default value of s_mb_stream_request is 16 - * blocks. This can also be tuned via - * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms - * of number of blocks. + * During initialization phase of the allocator we decide to use the + * group preallocation or inode preallocation depending on the size of + * the file. The size of the file could be the resulting file size we + * would have after allocation, or the current file size, which ever + * is larger. If the size is less than sbi->s_mb_stream_request we + * select to use the group preallocation. The default value of + * s_mb_stream_request is 16 blocks. This can also be tuned via + * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in + * terms of number of blocks. * * The main motivation for having small file use group preallocation is to - * ensure that we have small file closer in the disk. + * ensure that we have small files closer together on the disk. * - * First stage the allocator looks at the inode prealloc list - * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for - * this particular inode. The inode prealloc space is represented as: + * First stage the allocator looks at the inode prealloc list, + * ext4_inode_info->i_prealloc_list, which contains list of prealloc + * spaces for this particular inode. The inode prealloc space is + * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space - * pa_len -> lenght for this prealloc space - * pa_free -> free space available in this prealloc space + * pa_len -> length for this prealloc space (in clusters) + * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc - * space we will consume the particular prealloc space. This make sure that - * that the we have contiguous physical blocks representing the file blocks + * space we will consume the particular prealloc space. This makes sure that + * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except @@ -93,7 +93,7 @@ * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group - * prealloc space. These are per CPU prealloc list repreasented as + * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * @@ -101,7 +101,7 @@ * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have - * enough free space (pa_free) withing the prealloc space. + * enough free space (pa_free) within the prealloc space. * * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented @@ -114,7 +114,7 @@ * inode as: * * { page } - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we @@ -135,33 +135,35 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to + * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is + * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via - * /proc/fs/ext4/<partition/group_prealloc. The value is represented in + * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the - * stripe value (sbi->s_stripe) + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. * - * The regular allocator(using the buddy cache) support few tunables. + * The regular allocator (using the buddy cache) supports a few tunables. * - * /proc/fs/ext4/<partition>/min_to_scan - * /proc/fs/ext4/<partition>/max_to_scan - * /proc/fs/ext4/<partition>/order2_req + * /sys/fs/ext4/<partition>/mb_min_to_scan + * /sys/fs/ext4/<partition>/mb_max_to_scan + * /sys/fs/ext4/<partition>/mb_order2_req * - * The regular allocator use buddy scan only if the request len is power of + * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via - * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to - * stripe size (sbi->s_stripe), we try to search for contigous block in - * stripe size. This should result in better allocation on RAID setup. If - * not we search in the specific group using bitmap for best extents. The - * tunable min_to_scan and max_to_scan controll the behaviour here. + * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to + * stripe size (sbi->s_stripe), we try to search for contiguous block in + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best - * extent and max_to_scanindicate how long the mballoc __can__ look for a + * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it - * can used for allocation. ext4_mb_good_group explains how the groups are + * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * Both the prealloc space are getting populated as above. So for the first @@ -344,288 +346,28 @@ * object * */ - -/* - * with AGGRESSIVE_CHECK allocator runs consistency checks over - * structures. these checks slow things down a lot - */ -#define AGGRESSIVE_CHECK__ - -/* - * with DOUBLE_CHECK defined mballoc creates persistent in-core - * bitmaps, maintains and uses them to check for double allocations - */ -#define DOUBLE_CHECK__ - -/* - */ -#define MB_DEBUG__ -#ifdef MB_DEBUG -#define mb_debug(fmt, a...) printk(fmt, ##a) -#else -#define mb_debug(fmt, a...) -#endif - -/* - * with EXT4_MB_HISTORY mballoc stores last N allocations in memory - * and you can monitor it in /proc/fs/ext4/<dev>/mb_history - */ -#define EXT4_MB_HISTORY -#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ -#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ -#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ -#define EXT4_MB_HISTORY_FREE 8 /* free */ - -#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ - EXT4_MB_HISTORY_PREALLOC) - -/* - * How long mballoc can look for a best extent (in found extents) - */ -#define MB_DEFAULT_MAX_TO_SCAN 200 - -/* - * How long mballoc must look for a best extent - */ -#define MB_DEFAULT_MIN_TO_SCAN 10 - -/* - * How many groups mballoc will scan looking for the best chunk - */ -#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 - -/* - * with 'ext4_mb_stats' allocator will collect stats that will be - * shown at umount. The collecting costs though! - */ -#define MB_DEFAULT_STATS 1 - -/* - * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served - * by the stream allocator, which purpose is to pack requests - * as close each to other as possible to produce smooth I/O traffic - * We use locality group prealloc space for stream request. - * We can tune the same via /proc/fs/ext4/<parition>/stream_req - */ -#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ - -/* - * for which requests use 2^N search using buddies - */ -#define MB_DEFAULT_ORDER2_REQS 2 - -/* - * default group prealloc size 512 blocks - */ -#define MB_DEFAULT_GROUP_PREALLOC 512 - static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; - -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 - -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; - struct list_head list; -}; - -struct ext4_group_info { - unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; - unsigned short bb_first_free; - unsigned short bb_free; - unsigned short bb_fragments; - struct list_head bb_prealloc_list; -#ifdef DOUBLE_CHECK - void *bb_bitmap; -#endif - unsigned short bb_counters[]; -}; - -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 - -#define EXT4_MB_GRP_NEED_INIT(grp) \ - (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) - - -struct ext4_prealloc_space { - struct list_head pa_inode_list; - struct list_head pa_group_list; - union { - struct list_head pa_tmp_list; - struct rcu_head pa_rcu; - } u; - spinlock_t pa_lock; - atomic_t pa_count; - unsigned pa_deleted; - ext4_fsblk_t pa_pstart; /* phys. block */ - ext4_lblk_t pa_lstart; /* log. block */ - unsigned short pa_len; /* len of preallocated chunk */ - unsigned short pa_free; /* how many blocks are free */ - unsigned short pa_linear; /* consumed in one direction - * strictly, for grp prealloc */ - spinlock_t *pa_obj_lock; - struct inode *pa_inode; /* hack, for history only */ -}; - - -struct ext4_free_extent { - ext4_lblk_t fe_logical; - ext4_grpblk_t fe_start; - ext4_group_t fe_group; - int fe_len; +static struct kmem_cache *ext4_free_data_cachep; + +/* We create slab caches for groupinfo data structures based on the + * superblock block size. There will be one per mounted filesystem for + * each unique s_blocksize_bits */ +#define NR_GRPINFO_CACHES 8 +static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; + +static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", + "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", + "ext4_groupinfo_64k", "ext4_groupinfo_128k" }; -/* - * Locality group: - * we try to group all related changes together - * so that writeback can flush/allocate them together as well - */ -struct ext4_locality_group { - /* for allocator */ - struct mutex lg_mutex; /* to serialize allocates */ - struct list_head lg_prealloc_list;/* list of preallocations */ - spinlock_t lg_prealloc_lock; -}; - -struct ext4_allocation_context { - struct inode *ac_inode; - struct super_block *ac_sb; - - /* original request */ - struct ext4_free_extent ac_o_ex; - - /* goal request (after normalization) */ - struct ext4_free_extent ac_g_ex; - - /* the best found extent */ - struct ext4_free_extent ac_b_ex; - - /* copy of the bext found extent taken before preallocation efforts */ - struct ext4_free_extent ac_f_ex; - - /* number of iterations done. we have to track to limit searching */ - unsigned long ac_ex_scanned; - __u16 ac_groups_scanned; - __u16 ac_found; - __u16 ac_tail; - __u16 ac_buddy; - __u16 ac_flags; /* allocation hints */ - __u8 ac_status; - __u8 ac_criteria; - __u8 ac_repeats; - __u8 ac_2order; /* if request is to allocate 2^N blocks and - * N > 0, the field stores N, otherwise 0 */ - __u8 ac_op; /* operation, for history only */ - struct page *ac_bitmap_page; - struct page *ac_buddy_page; - struct ext4_prealloc_space *ac_pa; - struct ext4_locality_group *ac_lg; -}; - -#define AC_STATUS_CONTINUE 1 -#define AC_STATUS_FOUND 2 -#define AC_STATUS_BREAK 3 - -struct ext4_mb_history { - struct ext4_free_extent orig; /* orig allocation */ - struct ext4_free_extent goal; /* goal allocation */ - struct ext4_free_extent result; /* result allocation */ - unsigned pid; - unsigned ino; - __u16 found; /* how many extents have been found */ - __u16 groups; /* how many groups have been scanned */ - __u16 tail; /* what tail broke some buddy */ - __u16 buddy; /* buddy the tail ^^^ broke */ - __u16 flags; - __u8 cr:3; /* which phase the result extent was found at */ - __u8 op:4; - __u8 merged:1; -}; - -struct ext4_buddy { - struct page *bd_buddy_page; - void *bd_buddy; - struct page *bd_bitmap_page; - void *bd_bitmap; - struct ext4_group_info *bd_info; - struct super_block *bd_sb; - __u16 bd_blkbits; - ext4_group_t bd_group; -}; -#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) -#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) - -#ifndef EXT4_MB_HISTORY -static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - return; -} -#else -static void ext4_mb_store_history(struct ext4_allocation_context *ac); -#endif - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - -static struct proc_dir_entry *proc_root_ext4; -struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp); - static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static void ext4_mb_free_committed_blocks(struct super_block *); -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, sector_t block, - int count); -static void ext4_mb_put_pa(struct ext4_allocation_context *, - struct super_block *, struct ext4_prealloc_space *pa); -static int ext4_mb_init_per_dev_proc(struct super_block *sb); -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); - - -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline void ext4_unlock_group(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline int ext4_is_group_locked(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); -} - -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, - struct ext4_free_extent *fex) -{ - ext4_fsblk_t block; - - block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + fex->fe_start - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - return block; -} +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -657,49 +399,49 @@ static inline void mb_set_bit(int bit, void *addr) ext4_set_bit(bit, addr); } -static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_set_bit_atomic(lock, bit, addr); -} - static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } -static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) +static inline int mb_test_and_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); - ext4_clear_bit_atomic(lock, bit, addr); + return ext4_test_and_clear_bit(bit, addr); } static inline int mb_find_next_zero_bit(void *addr, int max, int start) { - int fix = 0; + int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); - max += fix; + tmpmax = max + fix; start += fix; - return ext4_find_next_zero_bit(addr, max, start) - fix; + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { - int fix = 0; + int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); - max += fix; + tmpmax = max + fix; start += fix; - return ext4_find_next_bit(addr, max, start) - fix; + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { @@ -708,11 +450,12 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) } /* at order 0 we see each particular block */ - *max = 1 << (e4b->bd_blkbits + 3); - if (order == 0) - return EXT4_MB_BITMAP(e4b); + if (order == 0) { + *max = 1 << (e4b->bd_blkbits + 3); + return e4b->bd_bitmap; + } - bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; @@ -727,19 +470,19 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); - blocknr += first + i; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - - ext4_error(sb, __FUNCTION__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)\n", - inode ? inode->i_ino : 0, blocknr, - first + i, e4b->bd_group); + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -751,7 +494,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); @@ -767,9 +510,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk("corruption in group %lu at byte %u(%u):" - " %x in copy != %x on disk/prealloc\n", - e4b->bd_group, i, i * 8, b1[i], b2[i]); + ext4_msg(e4b->bd_sb, KERN_ERR, + "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc", + e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } @@ -823,9 +568,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy; void *buddy2; - if (!test_opt(sb, MBALLOC)) - return 0; - { static int mb_check_counter; if (mb_check_counter++ % 100 != 0) @@ -855,14 +597,14 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, continue; } - /* both bits in buddy2 must be 0 */ + /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( - !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); + !mb_test_bit(k, e4b->bd_bitmap)); } count++; } @@ -894,37 +636,41 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); - buddy = mb_find_buddy(e4b, 0, &max); list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; - pa = list_entry(cur, struct ext4_prealloc_space, group_list); - ext4_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k); + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); - for (i = 0; i < pa->len; i++) + for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } return 0; } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ - __FILE__, __FUNCTION__, __LINE__) + __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif -/* FIXME!! need more doc */ +/* + * Divide blocks started from @first with length @len into + * smaller chunks with power of 2 blocks. + * Clear the bits in bitmap which the blocks of the chunk(s) covered, + * then increase bb_counters[] for corresponded chunk size. + */ static void ext4_mb_mark_free_simple(struct super_block *sb, - void *buddy, unsigned first, int len, + void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned short min; - unsigned short max; - unsigned short chunk; + ext4_grpblk_t min; + ext4_grpblk_t max; + ext4_grpblk_t chunk; unsigned short border; - BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; @@ -950,14 +696,37 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } -static void ext4_mb_generate_buddy(struct super_block *sb, +/* + * Cache the order of the largest free extent we have available in this block + * group. + */ +static void +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) +{ + int i; + int bits; + + grp->bb_largest_free_order = -1; /* uninit */ + + bits = sb->s_blocksize_bits + 1; + for (i = bits; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; + break; + } + } +} + +static noinline_for_stack +void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); - unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); - unsigned short i = 0; - unsigned short first; - unsigned short len; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_grpblk_t first; + ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); @@ -982,15 +751,21 @@ static void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_error(sb, __FUNCTION__, - "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", - group, free, grp->bb_free); + ext4_grp_locked_error(sb, group, 0, 0, + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", + free, grp->bb_free); /* - * If we intent to continue, we consider group descritor + * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } + mb_set_largest_free_order(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); @@ -1001,6 +776,24 @@ static void ext4_mb_generate_buddy(struct super_block *sb, spin_unlock(&EXT4_SB(sb)->s_bal_lock); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) { + ext4_set_bits(buddy, 0, count); + } + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve @@ -1008,7 +801,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, * stored in the inode as * * { page } - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. @@ -1016,28 +809,34 @@ static void ext4_mb_generate_buddy(struct super_block *sb, * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 + * + * Locking note: This routine takes the block group lock of all groups + * for this page; do not hold this lock when calling this routine! */ static int ext4_mb_init_cache(struct page *page, char *incore) { + ext4_group_t ngroups; int blocksize; int blocks_per_page; int groups_per_page; int err = 0; int i; - ext4_group_t first_group; + ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; - struct buffer_head **bh; + struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; + struct ext4_group_info *grinfo; - mb_debug("init page %lu\n", page->index); + mb_debug(1, "init page %lu\n", page->index); inode = page->mapping->host; sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); blocksize = 1 << inode->i_blkbits; blocks_per_page = PAGE_CACHE_SIZE / blocksize; @@ -1047,67 +846,58 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { - err = -ENOMEM; i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, GFP_NOFS); - if (bh == NULL) + if (bh == NULL) { + err = -ENOMEM; goto out; + } } else bh = &bhs; first_group = page->index * blocks_per_page / 2; /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { - struct ext4_group_desc *desc; - - if (first_group + i >= EXT4_SB(sb)->s_groups_count) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (group >= ngroups) break; - err = -EIO; - desc = ext4_get_group_desc(sb, first_group + i, NULL); - if (desc == NULL) - goto out; - - err = -ENOMEM; - bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); - if (bh[i] == NULL) - goto out; - - if (bh_uptodate_or_lock(bh[i])) - continue; - - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh[i], - first_group + i, desc); - set_buffer_uptodate(bh[i]); - unlock_buffer(bh[i]); + grinfo = ext4_get_group_info(sb, group); + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so + * we must skip all initialized uptodate buddies on the page, + * which may be currently in use by an allocating task. + */ + if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + bh[i] = NULL; continue; } - get_bh(bh[i]); - bh[i]->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh[i]); - mb_debug("read bitmap for group %lu\n", first_group + i); + if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { + err = -ENOMEM; + goto out; + } + mb_debug(1, "read bitmap for group %u\n", group); } /* wait for I/O completion */ - for (i = 0; i < groups_per_page && bh[i]; i++) - wait_on_buffer(bh[i]); - - err = -EIO; - for (i = 0; i < groups_per_page && bh[i]; i++) - if (!buffer_uptodate(bh[i])) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + err = -EIO; goto out; + } + } first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { - int group; - struct ext4_group_info *grinfo; - group = (first_block + i) >> 1; - if (group >= EXT4_SB(sb)->s_groups_count) + if (group >= ngroups) break; + if (!bh[group - first_group]) + /* skip initialized uptodate buddy */ + continue; + /* * data carry information regarding this * particular group in the format specified @@ -1124,23 +914,29 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug("put buddy for group %u in page %lu/%x\n", + mb_debug(1, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); - memset(data, 0xff, blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, - sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + sizeof(*grinfo->bb_counters) * + (sb->s_blocksize_bits+2)); /* * incore got set to the group block bitmap below */ + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug("put bitmap for group %u in page %lu/%x\n", + mb_debug(1, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); + trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); @@ -1148,6 +944,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be @@ -1160,7 +957,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) out: if (bh) { - for (i = 0; i < groups_per_page && bh[i]; i++) + for (i = 0; i < groups_per_page; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); @@ -1168,28 +965,175 @@ out: return err; } -static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, - struct ext4_buddy *e4b) +/* + * Lock the buddy and bitmap pages. This make sure other parallel init_group + * on the same buddy page doesn't happen whild holding the buddy page lock. + * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap + * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + */ +static int ext4_mb_get_buddy_page_lock(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct inode *inode = EXT4_SB(sb)->s_buddy_cache; + int block, pnum, poff; + int blocks_per_page; + struct page *page; + + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -ENOMEM; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + + if (blocks_per_page >= 2) { + /* buddy and bitmap are on the same page */ + return 0; + } + + block++; + pnum = block / blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -ENOMEM; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_buddy_page = page; + return 0; +} + +static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) { + unlock_page(e4b->bd_bitmap_page); + page_cache_release(e4b->bd_bitmap_page); + } + if (e4b->bd_buddy_page) { + unlock_page(e4b->bd_buddy_page); + page_cache_release(e4b->bd_buddy_page); + } +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + struct ext4_group_info *this_grp; + struct ext4_buddy e4b; + struct page *page; + int ret = 0; + + might_sleep(); + mb_debug(1, "init group %u\n", group); + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already + * allocating. If we are looking at the buddy cache we would + * have taken a reference using ext4_mb_load_buddy and that + * would have pinned buddy page to page cache. + * The call to ext4_mb_get_buddy_page_lock will mark the + * page accessed. + */ + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + goto err; + } + + page = e4b.bd_bitmap_page; + ret = ext4_mb_init_cache(page, NULL); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + + if (e4b.bd_buddy_page == NULL) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + ret = 0; + goto err; + } + /* init buddy cache */ + page = e4b.bd_buddy_page; + ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } +err: + ext4_mb_put_buddy_page_lock(&e4b); + return ret; +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ +static noinline_for_stack int +ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; int blocks_per_page; int block; int pnum; int poff; struct page *page; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; - mb_debug("load group %lu\n", group); + might_sleep(); + mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; - e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* + * we need full data about the group + * to make a good selection + */ + ret = ext4_mb_init_group(sb, group); + if (ret) + return ret; + } + /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. @@ -1201,49 +1145,79 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, /* we could use find_or_create_page(), but it locks page * what we'd like to avoid in fast path ... */ - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) + /* + * drop the page reference and try + * to get the page with lock. If we + * are not uptodate that implies + * somebody just created the page but + * is yet to initialize the same. So + * wait for it to initialize. + */ page_cache_release(page); page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } mb_cmp_bitmaps(e4b, page_address(page) + (poff * sb->s_blocksize)); } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL) { + ret = -ENOMEM; goto err; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + + /* Pages marked accessed already */ e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { BUG_ON(page->mapping != inode->i_mapping); - if (!PageUptodate(page)) - ext4_mb_init_cache(page, e4b->bd_bitmap); - + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL) { + ret = -ENOMEM; goto err; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + + /* Pages marked accessed already */ e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); @@ -1251,16 +1225,18 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, return 0; err: + if (page) + page_cache_release(page); if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; - return -EIO; + return ret; } -static void ext4_mb_release_desc(struct ext4_buddy *e4b) +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); @@ -1274,10 +1250,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) int order = 1; void *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); - bb = EXT4_MB_BUDDY(e4b); + bb = e4b->bd_buddy; while (order <= e4b->bd_blkbits + 1) { block = block >> 1; if (!mb_test_bit(block, bb)) { @@ -1290,7 +1266,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) return 0; } -static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1303,12 +1279,39 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - mb_clear_bit_atomic(lock, cur, bm); + mb_clear_bit(cur, bm); cur++; } } -static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) +/* clear bits in given range + * will return first found zero bit if any, -1 otherwise + */ +static int mb_test_and_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + int zero_bit = -1; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + if (*addr != (__u32)(-1) && zero_bit == -1) + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); + *addr = 0; + cur += 32; + continue; + } + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) + zero_bit = cur; + cur++; + } + + return zero_bit; +} + +void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1321,23 +1324,100 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - mb_set_bit_atomic(lock, cur, bm); + mb_set_bit(cur, bm); cur++; } } -static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, - int first, int count) +/* + * _________________________________________________________________ */ + +static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { - int block = 0; - int max = 0; - int order; - void *buddy; - void *buddy2; + if (mb_test_bit(*bit + side, bitmap)) { + mb_clear_bit(*bit, bitmap); + (*bit) -= side; + return 1; + } + else { + (*bit) += side; + mb_set_bit(*bit, bitmap); + return -1; + } +} + +static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) +{ + int max; + int order = 1; + void *buddy = mb_find_buddy(e4b, order, &max); + + while (buddy) { + void *buddy2; + + /* Bits in range [first; last] are known to be set since + * corresponding blocks were allocated. Bits in range + * (first; last) will stay set because they form buddies on + * upper layer. We just deal with borders if they don't + * align with upper layer and then go up. + * Releasing entire group is all about clearing + * single bit of highest order buddy. + */ + + /* Example: + * --------------------------------- + * | 1 | 1 | 1 | 1 | + * --------------------------------- + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------- + * 0 1 2 3 4 5 6 7 + * \_____________________/ + * + * Neither [1] nor [6] is aligned to above layer. + * Left neighbour [0] is free, so mark it busy, + * decrease bb_counters and extend range to + * [0; 6] + * Right neighbour [7] is busy. It can't be coaleasced with [6], so + * mark [6] free, increase bb_counters and shrink range to + * [0; 5]. + * Then shift range to [0; 2], go up and do the same. + */ + + + if (first & 1) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); + if (!(last & 1)) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); + if (first > last) + break; + order++; + + if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + mb_clear_bits(buddy, first, last - first + 1); + e4b->bd_info->bb_counters[order - 1] += last - first + 1; + break; + } + first >>= 1; + last >>= 1; + buddy = buddy2; + } +} + +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int left_is_free = 0; + int right_is_free = 0; + int block; + int last = first + count - 1; struct super_block *sb = e4b->bd_sb; - BUG_ON(first + count > (sb->s_blocksize << 3)); - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + BUG_ON(last >= (sb->s_blocksize << 3)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + /* Don't bother if the block group is corrupt. */ + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return; + mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1345,85 +1425,77 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; - /* let's maintain fragments counter */ + /* access memory sequentially: check left neighbour, + * clear range and then check right neighbour + */ if (first != 0) - block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); - if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); - if (block && max) + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); + + if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), block); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u); block bitmap corrupt.", + block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); + /* Mark the block group as corrupt. */ + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &e4b->bd_info->bb_state); + mb_regenerate_buddy(e4b); + goto done; + } + + /* let's maintain fragments counter */ + if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; - else if (!block && !max) + else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; - /* let's maintain buddy itself */ - while (count-- > 0) { - block = first++; - order = 0; - - if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { - ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); - blocknr += block; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - - ext4_error(sb, __FUNCTION__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)\n", - inode ? inode->i_ino : 0, blocknr, block, - e4b->bd_group); - } - mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); - e4b->bd_info->bb_counters[order]++; - - /* start of the buddy */ - buddy = mb_find_buddy(e4b, order, &max); - - do { - block &= ~1UL; - if (mb_test_bit(block, buddy) || - mb_test_bit(block + 1, buddy)) - break; - - /* both the buddies are free, try to coalesce them */ - buddy2 = mb_find_buddy(e4b, order + 1, &max); - - if (!buddy2) - break; - - if (order > 0) { - /* for special purposes, we don't set - * free bits in bitmap */ - mb_set_bit(block, buddy); - mb_set_bit(block + 1, buddy); - } - e4b->bd_info->bb_counters[order]--; - e4b->bd_info->bb_counters[order]--; + /* buddy[0] == bd_bitmap is a special case, so handle + * it right away and let mb_buddy_mark_free stay free of + * zero order checks. + * Check if neighbours are to be coaleasced, + * adjust bitmap bb_counters and borders appropriately. + */ + if (first & 1) { + first += !left_is_free; + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; + } + if (!(last & 1)) { + last -= !right_is_free; + e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; + } - block = block >> 1; - order++; - e4b->bd_info->bb_counters[order]++; + if (first <= last) + mb_buddy_mark_free(e4b, first >> 1, last >> 1); - mb_clear_bit(block, buddy2); - buddy = buddy2; - } while (1); - } +done: + mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); - - return 0; } -static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, +static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int next = block; - int max; - int ord; + int max, order; void *buddy; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); - buddy = mb_find_buddy(e4b, order, &max); + buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { @@ -1433,12 +1505,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, return 0; } - /* FIXME dorp order completely ? */ - if (likely(order == 0)) { - /* find actual order */ - order = mb_find_order_for_block(e4b, block); - block = block >> order; - } + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + block = block >> order; ex->fe_len = 1 << order; ex->fe_start = block << order; @@ -1450,18 +1519,17 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, ex->fe_start += next; while (needed > ex->fe_len && - (buddy = mb_find_buddy(e4b, order, &max))) { + mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); - if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) + if (mb_test_bit(next, e4b->bd_bitmap)) break; - ord = mb_find_order_for_block(e4b, next); + order = mb_find_order_for_block(e4b, next); - order = ord; block = next >> order; ex->fe_len += 1 << order; } @@ -1484,7 +1552,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); @@ -1494,9 +1562,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain fragments counter */ if (start != 0) - mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); + mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) - max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); + max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) @@ -1537,9 +1605,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) e4b->bd_info->bb_counters[ord]++; e4b->bd_info->bb_counters[ord]++; } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), - EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -1569,15 +1637,19 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; - /* XXXXXXX: SUCH A HORRIBLE **CK */ - /*FIXME!! Why ? */ + /* + * take the page reference. We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. The reference is dropped + * in ext4_mb_release_context + */ ac->ac_bitmap_page = e4b->bd_bitmap_page; get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); - /* store last allocated for subsequent stream allocation */ - if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); sbi->s_mb_last_group = ac->ac_f_ex.fe_group; sbi->s_mb_last_start = ac->ac_f_ex.fe_start; @@ -1599,6 +1671,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_free_extent ex; int max; + if (ac->ac_status == AC_STATUS_FOUND) + return; /* * We don't want to scan for a whole year */ @@ -1619,7 +1693,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, /* recheck chunk's availability - we don't know * when it was found (within this lock-unlock * period or not) */ - max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); + max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); if (max >= gex->fe_len) { ext4_mb_use_best_found(ac, e4b); return; @@ -1645,8 +1719,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); - BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; @@ -1696,7 +1770,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, ext4_mb_check_limits(ac, e4b, 0); } -static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, +static noinline_for_stack +int ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; @@ -1710,7 +1785,7 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, return err; ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; @@ -1718,37 +1793,46 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, } ext4_unlock_group(ac->ac_sb, group); - ext4_mb_release_desc(e4b); + ext4_mb_unload_buddy(e4b); return 0; } -static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, +static noinline_for_stack +int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_super_block *es = sbi->s_es; + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) return 0; + if (grp->bb_free == 0) + return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { + ext4_mb_unload_buddy(e4b); + return 0; + } + ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; - start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + - ex.fe_start + le32_to_cpu(es->s_first_data_block); + start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + + ex.fe_start; /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; @@ -1773,7 +1857,7 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); } ext4_unlock_group(ac->ac_sb, group); - ext4_mb_release_desc(e4b); + ext4_mb_unload_buddy(e4b); return 0; } @@ -1782,7 +1866,8 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ -static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; @@ -1825,11 +1910,12 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ -static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i; int free; @@ -1841,24 +1927,26 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, - EXT4_BLOCKS_PER_GROUP(sb), i); - if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { + EXT4_CLUSTERS_PER_GROUP(sb), i); + if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * we have free blocks */ - ext4_error(sb, __FUNCTION__, "%d free blocks as per " - "group info. But bitmap says 0\n", + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " + "group info. But bitmap says 0", free); break; } - mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_error(sb, __FUNCTION__, "%d free blocks as per " - "group info. But got %d blocks\n", + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " + "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. This mostly @@ -1867,7 +1955,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, */ break; } - + ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; @@ -1879,15 +1967,15 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, /* * This is a special case for storages like raid5 - * we try to find stripe-aligned chunks for stripe-size requests - * XXX should do so at least for multiples of stripe size as well + * we try to find stripe-aligned chunks for stripe-size-multiple requests */ -static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; @@ -1897,17 +1985,18 @@ static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ - first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) - + le32_to_cpu(sbi->s_es->s_first_data_block); + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - while (i < EXT4_BLOCKS_PER_GROUP(sb)) { + while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { - max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); + max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; + ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; @@ -1917,37 +2006,54 @@ static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } +/* This is now called BEFORE we load the buddy bitmap. */ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { unsigned free, fragments; - unsigned i, bits; - struct ext4_group_desc *desc; + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); - BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); free = grp->bb_free; - fragments = grp->bb_fragments; if (free == 0) return 0; + if (cr <= 2 && free < ac->ac_g_ex.fe_len) + return 0; + + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return 0; + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + int ret = ext4_mb_init_group(ac->ac_sb, group); + if (ret) + return 0; + } + + fragments = grp->bb_fragments; if (fragments == 0) return 0; switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); - /* If this group is uninitialized, skip it initially */ - desc = ext4_get_group_desc(ac->ac_sb, group, NULL); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) + + /* Avoid using the first bg of a flexgroup for data files */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA) && + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && + ((group % flex_size) == 0)) return 0; - bits = ac->ac_sb->s_blocksize_bits + 1; - for (i = ac->ac_2order; i <= bits; i++) - if (grp->bb_counters[i] > 0) - return 1; - break; + if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || + (free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + + return 1; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) return 1; @@ -1965,20 +2071,23 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } -static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t group; - ext4_group_t i; + ext4_group_t ngroups, group, i; int cr; int err = 0; - int bsbits; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; - loff_t size, isize; sb = ac->ac_sb; sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ @@ -1999,7 +2108,7 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs - * You can tune it via /proc/fs/ext4/<partition>/order2_req + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req */ if (i >= sbi->s_mb_order2_reqs) { /* @@ -2009,15 +2118,8 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_2order = i - 1; } - bsbits = ac->ac_sb->s_blocksize_bits; /* if stream allocation is enabled, use global goal */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; - isize = i_size_read(ac->ac_inode) >> bsbits; - if (size < isize) - size = isize; - - if (size < sbi->s_mb_stream_request && - (ac->ac_flags & EXT4_MB_HINT_DATA)) { + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ spin_lock(&sbi->s_md_lock); ac->ac_g_ex.fe_group = sbi->s_mb_last_group; @@ -2025,9 +2127,6 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) spin_unlock(&sbi->s_md_lock); } - /* searching for the right group start from the goal value specified */ - group = ac->ac_g_ex.fe_group; - /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* @@ -2037,37 +2136,22 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) repeat: for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { - struct ext4_group_info *grp; - struct ext4_group_desc *desc; - - if (group == EXT4_SB(sb)->s_groups_count) - group = 0; - - /* quick check to skip empty groups */ - grp = ext4_get_group_info(ac->ac_sb, group); - if (grp->bb_free == 0) - continue; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + for (i = 0; i < ngroups; group++, i++) { + cond_resched(); /* - * if the group is already init we check whether it is - * a good group and if not we don't load the buddy + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. */ - if (EXT4_MB_GRP_NEED_INIT(grp)) { - /* - * we need full data about the group - * to make a good selection - */ - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) - goto out; - ext4_mb_release_desc(&e4b); - } + if (group >= ngroups) + group = 0; - /* - * If the particular group doesn't satisfy our - * criteria we continue with the next group - */ + /* This now checks without needing the buddy page */ if (!ext4_mb_good_group(ac, group, cr)) continue; @@ -2076,27 +2160,28 @@ repeat: goto out; ext4_lock_group(sb, group); + + /* + * We need to check again after locking the + * block group + */ if (!ext4_mb_good_group(ac, group, cr)) { - /* someone did allocation from this group */ ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); continue; } ac->ac_groups_scanned++; - desc = ext4_get_group_desc(sb, group, NULL); - if (cr == 0 || (desc->bg_flags & - cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && - ac->ac_2order != 0)) + if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && - ac->ac_g_ex.fe_len == sbi->s_stripe) + else if (cr == 1 && sbi->s_stripe && + !(ac->ac_g_ex.fe_len % sbi->s_stripe)) ext4_mb_scan_aligned(ac, &e4b); else ext4_mb_complex_scan_group(ac, &e4b); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); if (ac->ac_status != AC_STATUS_CONTINUE) break; @@ -2132,241 +2217,40 @@ out: return err; } -#ifdef EXT4_MB_HISTORY -struct ext4_mb_proc_session { - struct ext4_mb_history *history; - struct super_block *sb; - int start; - int max; -}; - -static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, - struct ext4_mb_history *hs, - int first) -{ - if (hs == s->history + s->max) - hs = s->history; - if (!first && hs == s->history + s->start) - return NULL; - while (hs->orig.fe_len == 0) { - hs++; - if (hs == s->history + s->max) - hs = s->history; - if (hs == s->history + s->start) - return NULL; - } - return hs; -} - -static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -{ - struct ext4_mb_proc_session *s = seq->private; - struct ext4_mb_history *hs; - int l = *pos; - - if (l == 0) - return SEQ_START_TOKEN; - hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); - if (!hs) - return NULL; - while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); - return hs; -} - -static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, - loff_t *pos) -{ - struct ext4_mb_proc_session *s = seq->private; - struct ext4_mb_history *hs = v; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ext4_mb_history_skip_empty(s, s->history + s->start, 1); - else - return ext4_mb_history_skip_empty(s, ++hs, 0); -} - -static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) -{ - char buf[25], buf2[25], buf3[25], *fmt; - struct ext4_mb_history *hs = v; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " - "%-5s %-2s %-5s %-5s %-5s %-6s\n", - "pid", "inode", "original", "goal", "result", "found", - "grps", "cr", "flags", "merge", "tail", "broken"); - return 0; - } - - if (hs->op == EXT4_MB_HISTORY_ALLOC) { - fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " - "%-5u %-5s %-5u %-6u\n"; - sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len, - hs->result.fe_logical); - sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, - hs->orig.fe_start, hs->orig.fe_len, - hs->orig.fe_logical); - sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, - hs->goal.fe_start, hs->goal.fe_len, - hs->goal.fe_logical); - seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, - hs->found, hs->groups, hs->cr, hs->flags, - hs->merged ? "M" : "", hs->tail, - hs->buddy ? 1 << hs->buddy : 0); - } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { - fmt = "%-5u %-8u %-23s %-23s %-23s\n"; - sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len, - hs->result.fe_logical); - sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, - hs->orig.fe_start, hs->orig.fe_len, - hs->orig.fe_logical); - seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); - } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { - sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s discard\n", - hs->pid, hs->ino, buf2); - } else if (hs->op == EXT4_MB_HISTORY_FREE) { - sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s free\n", - hs->pid, hs->ino, buf2); - } - return 0; -} - -static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) -{ -} - -static struct seq_operations ext4_mb_seq_history_ops = { - .start = ext4_mb_seq_history_start, - .next = ext4_mb_seq_history_next, - .stop = ext4_mb_seq_history_stop, - .show = ext4_mb_seq_history_show, -}; - -static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) -{ - struct super_block *sb = PDE(inode)->data; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_mb_proc_session *s; - int rc; - int size; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) - return -ENOMEM; - s->sb = sb; - size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; - s->history = kmalloc(size, GFP_KERNEL); - if (s->history == NULL) { - kfree(s); - return -ENOMEM; - } - - spin_lock(&sbi->s_mb_history_lock); - memcpy(s->history, sbi->s_mb_history, size); - s->max = sbi->s_mb_history_max; - s->start = sbi->s_mb_history_cur % s->max; - spin_unlock(&sbi->s_mb_history_lock); - - rc = seq_open(file, &ext4_mb_seq_history_ops); - if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = s; - } else { - kfree(s->history); - kfree(s); - } - return rc; - -} - -static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = (struct seq_file *)file->private_data; - struct ext4_mb_proc_session *s = seq->private; - kfree(s->history); - kfree(s); - return seq_release(inode, file); -} - -static ssize_t ext4_mb_seq_history_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct seq_file *seq = (struct seq_file *)file->private_data; - struct ext4_mb_proc_session *s = seq->private; - struct super_block *sb = s->sb; - char str[32]; - int value; - - if (count >= sizeof(str)) { - printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", - "mb_history", (int)sizeof(str)); - return -EOVERFLOW; - } - - if (copy_from_user(str, buffer, count)) - return -EFAULT; - - value = simple_strtol(str, NULL, 0); - if (value < 0) - return -ERANGE; - EXT4_SB(sb)->s_mb_history_filter = value; - - return count; -} - -static struct file_operations ext4_mb_seq_history_fops = { - .owner = THIS_MODULE, - .open = ext4_mb_seq_history_open, - .read = seq_read, - .write = ext4_mb_seq_history_write, - .llseek = seq_lseek, - .release = ext4_mb_seq_history_release, -}; - static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; - group = *pos + 1; - return (void *) group; + return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ++*pos; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; - return (void *) group;; + return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = seq->private; - long group = (long) v; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; - int err; + int err, buddy_loaded = 0; struct ext4_buddy e4b; + struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; - unsigned short counters[16]; + ext4_grpblk_t counters[16]; } sg; group--; @@ -2380,17 +2264,23 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) { - seq_printf(seq, "#%-5lu: I/O error\n", group); - return 0; + grinfo = ext4_get_group_info(sb, group); + /* Load the group info in memory only if not already loaded. */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } + buddy_loaded = 1; } - ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); - ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); - seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); + + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? @@ -2404,7 +2294,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } -static struct seq_operations ext4_mb_seq_groups_ops = { +static const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, @@ -2413,19 +2303,19 @@ static struct seq_operations ext4_mb_seq_groups_ops = { static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) { - struct super_block *sb = PDE(inode)->data; + struct super_block *sb = PDE_DATA(inode); int rc; rc = seq_open(file, &ext4_mb_seq_groups_ops); if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; m->private = sb; } return rc; } -static struct file_operations ext4_mb_seq_groups_fops = { +static const struct file_operations ext4_mb_seq_groups_fops = { .owner = THIS_MODULE, .open = ext4_mb_seq_groups_open, .read = seq_read, @@ -2433,229 +2323,257 @@ static struct file_operations ext4_mb_seq_groups_fops = { .release = seq_release, }; -static void ext4_mb_history_release(struct super_block *sb) +static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) +{ + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; + + BUG_ON(!cachep); + return cachep; +} + +/* + * Allocate the top-level s_group_info array for the specified number + * of groups + */ +int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned size; + struct ext4_group_info ***new_groupinfo; - remove_proc_entry("mb_groups", sbi->s_mb_proc); - remove_proc_entry("mb_history", sbi->s_mb_proc); + size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + if (size <= sbi->s_group_info_size) + return 0; - kfree(sbi->s_mb_history); + size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); + new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groupinfo) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; + } + if (sbi->s_group_info) { + memcpy(new_groupinfo, sbi->s_group_info, + sbi->s_group_info_size * sizeof(*sbi->s_group_info)); + ext4_kvfree(sbi->s_group_info); + } + sbi->s_group_info = new_groupinfo; + sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + sbi->s_group_info_size); + return 0; } -static void ext4_mb_history_init(struct super_block *sb) +/* Create and initialize ext4_group_info data for the given group. */ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) { - struct ext4_sb_info *sbi = EXT4_SB(sb); int i; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); - if (sbi->s_mb_proc != NULL) { - struct proc_dir_entry *p; - p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); - if (p) { - p->proc_fops = &ext4_mb_seq_history_fops; - p->data = sb; - } - p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); - if (p) { - p->proc_fops = &ext4_mb_seq_groups_fops; - p->data = sb; + /* + * First check if this group is the first of a reserved block. + * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate mem " + "for a buddy group"); + goto exit_meta_group_info; } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; } - sbi->s_mb_history_max = 1000; - sbi->s_mb_history_cur = 0; - spin_lock_init(&sbi->s_mb_history_lock); - i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kmalloc(i, GFP_KERNEL); - if (likely(sbi->s_mb_history != NULL)) - memset(sbi->s_mb_history, 0, i); - /* if we can't allocate history, then we simple won't use it */ -} + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); -static void ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_mb_history h; + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); + goto exit_group_info; + } + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); - if (unlikely(sbi->s_mb_history == NULL)) - return; + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_clusters_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + ext4_free_group_clusters(sb, desc); + } - if (!(ac->ac_op & sbi->s_mb_history_filter)) - return; + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ - h.op = ac->ac_op; - h.pid = current->pid; - h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; - h.orig = ac->ac_o_ex; - h.result = ac->ac_b_ex; - h.flags = ac->ac_flags; - h.found = ac->ac_found; - h.groups = ac->ac_groups_scanned; - h.cr = ac->ac_criteria; - h.tail = ac->ac_tail; - h.buddy = ac->ac_buddy; - h.merged = 0; - if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { - if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && - ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) - h.merged = 1; - h.goal = ac->ac_g_ex; - h.result = ac->ac_f_ex; +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); } +#endif - spin_lock(&sbi->s_mb_history_lock); - memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); - if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) - sbi->s_mb_history_cur = 0; - spin_unlock(&sbi->s_mb_history_lock); -} + return 0; -#else -#define ext4_mb_history_release(sb) -#define ext4_mb_history_init(sb) -#endif +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + } +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */ static int ext4_mb_init_backend(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; - int j, len, metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); - int num_meta_group_infos = - (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> - EXT4_DESC_PER_BLOCK_BITS(sb); - struct ext4_group_info **meta_group_info; + int err; + struct ext4_group_desc *desc; + struct kmem_cache *cachep; + + err = ext4_mb_alloc_groupinfo(sb, ngroups); + if (err) + return err; - /* An 8TB filesystem with 64-bit pointers requires a 4096 byte - * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. - * So a two level scheme suffices for now. */ - sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * - num_meta_group_infos, GFP_KERNEL); - if (sbi->s_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); - return -ENOMEM; - } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { - printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } + /* To avoid potentially colliding with an valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; - - metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); - for (i = 0; i < num_meta_group_infos; i++) { - if ((i + 1) == num_meta_group_infos) - metalen = sizeof(*meta_group_info) * - (sbi->s_groups_count - - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); - meta_group_info = kmalloc(metalen, GFP_KERNEL); - if (meta_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " - "buddy group\n"); - goto err_freemeta; - } - sbi->s_group_info[i] = meta_group_info; - } - - /* - * calculate needed size. if change bb_counters size, - * don't forget about ext4_mb_generate_buddy() - */ - len = sizeof(struct ext4_group_info); - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); - for (i = 0; i < sbi->s_groups_count; i++) { - struct ext4_group_desc *desc; - - meta_group_info = - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); - - meta_group_info[j] = kzalloc(len, GFP_KERNEL); - if (meta_group_info[j] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); - i--; - goto err_freebuddy; - } + for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { - printk(KERN_ERR - "EXT4-fs: can't read descriptor %lu\n", i); + ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } - memset(meta_group_info[j], 0, len); - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, - &(meta_group_info[j]->bb_state)); - - /* - * initialize bb_free to be able to skip - * empty groups without initialization - */ - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - meta_group_info[j]->bb_free = - ext4_free_blocks_after_init(sb, i, desc); - } else { - meta_group_info[j]->bb_free = - le16_to_cpu(desc->bg_free_blocks_count); - } - - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); - -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[j]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); - BUG_ON(meta_group_info[j]->bb_bitmap == NULL); - bh = read_block_bitmap(sb, i); - BUG_ON(bh == NULL); - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, - sb->s_blocksize); - put_bh(bh); - } -#endif - + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; } return 0; err_freebuddy: - while (i >= 0) { - kfree(ext4_get_group_info(sb, i)); - i--; - } - i = num_meta_group_infos; -err_freemeta: - while (--i >= 0) + cachep = get_groupinfo_cache(sb->s_blocksize_bits); + while (i-- > 0) + kmem_cache_free(cachep, ext4_get_group_info(sb, i)); + i = sbi->s_group_info_size; + while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); return -ENOMEM; } -int ext4_mb_init(struct super_block *sb, int needs_recovery) +static void ext4_groupinfo_destroy_slabs(void) +{ + int i; + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + if (ext4_groupinfo_caches[i]) + kmem_cache_destroy(ext4_groupinfo_caches[i]); + ext4_groupinfo_caches[i] = NULL; + } +} + +static int ext4_groupinfo_create_slab(size_t size) +{ + static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); + int slab_size; + int blocksize_bits = order_base_2(size); + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep; + + if (cache_index >= NR_GRPINFO_CACHES) + return -EINVAL; + + if (unlikely(cache_index < 0)) + cache_index = 0; + + mutex_lock(&ext4_grpinfo_slab_create_mutex); + if (ext4_groupinfo_caches[cache_index]) { + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = offsetof(struct ext4_group_info, + bb_counters[blocksize_bits + 2]); + + cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], + slab_size, 0, SLAB_RECLAIM_ACCOUNT, + NULL); + + ext4_groupinfo_caches[cache_index] = cachep; + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { + printk(KERN_EMERG + "EXT4-fs: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + + return 0; +} + +int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned i; + unsigned i, j; unsigned offset; unsigned max; + int ret; - if (!test_opt(sb, MBALLOC)) - return 0; - - i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); - return -ENOMEM; + ret = -ENOMEM; + goto out; } + + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); - kfree(sbi->s_mb_maxs); - return -ENOMEM; + ret = -ENOMEM; + goto out; } + ret = ext4_groupinfo_create_slab(sb->s_blocksize); + if (ret < 0) + goto out; + /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; sbi->s_mb_offsets[0] = 0; @@ -2671,19 +2589,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i++; } while (i <= sb->s_blocksize_bits + 1); - /* init file for buddy data */ - i = ext4_mb_init_backend(sb); - if (i) { - clear_opt(sbi->s_mount_opt, MBALLOC); - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return i; - } - spin_lock_init(&sbi->s_md_lock); - INIT_LIST_HEAD(&sbi->s_active_transaction); - INIT_LIST_HEAD(&sbi->s_closed_transaction); - INIT_LIST_HEAD(&sbi->s_committed_transaction); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; @@ -2691,33 +2597,70 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; - sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file + * systems, this is probably too big (i.e, if the cluster size + * is 1 megabyte, then group preallocation size becomes half a + * gigabyte!). As a default, we will keep a two megabyte + * group pralloc size for cluster sizes up to 64k, and after + * that, we will force a minimum group preallocation size of + * 32 clusters. This translates to 8 megs when the cluster + * size is 256k, and 32 megs when the cluster size is 1 meg, + * which seems reasonable as a default. + */ + sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> + sbi->s_cluster_bits, 32); + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, sbi->s_stripe); + } - i = sizeof(struct ext4_locality_group) * NR_CPUS; - sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - for (i = 0; i < NR_CPUS; i++) { + for_each_possible_cpu(i) { struct ext4_locality_group *lg; - lg = &sbi->s_locality_groups[i]; + lg = per_cpu_ptr(sbi->s_locality_groups, i); mutex_init(&lg->lg_mutex); - INIT_LIST_HEAD(&lg->lg_prealloc_list); + for (j = 0; j < PREALLOC_TB_SIZE; j++) + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); spin_lock_init(&lg->lg_prealloc_lock); } - ext4_mb_init_per_dev_proc(sb); - ext4_mb_history_init(sb); + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) + goto out_free_locality_groups; + + if (sbi->s_proc) + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); - printk("EXT4-fs: mballoc enabled\n"); return 0; + +out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; +out: + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); + sbi->s_mb_maxs = NULL; + return ret; } -/* need to called with ext4 group lock (ext4_lock_group) */ +/* need to called with the ext4 group lock held */ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; @@ -2728,34 +2671,27 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; - kfree(pa); + kmem_cache_free(ext4_pspace_cachep, pa); } if (count) - mb_debug("mballoc: %u PAs left\n", count); + mb_debug(1, "mballoc: %u PAs left\n", count); } int ext4_mb_release(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); - if (!test_opt(sb, MBALLOC)) - return 0; - - /* release freed, non-committed blocks */ - spin_lock(&sbi->s_md_lock); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_committed_transaction); - spin_unlock(&sbi->s_md_lock); - ext4_mb_free_committed_blocks(sb); + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); if (sbi->s_group_info) { - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); #ifdef DOUBLE_CHECK kfree(grinfo->bb_bitmap); @@ -2763,299 +2699,192 @@ int ext4_mb_release(struct super_block *sb) ext4_lock_group(sb, i); ext4_mb_cleanup_pa(grinfo); ext4_unlock_group(sb, i); - kfree(grinfo); + kmem_cache_free(cachep, grinfo); } - num_meta_group_infos = (sbi->s_groups_count + + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); if (sbi->s_buddy_cache) iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { - printk(KERN_INFO - "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); - printk(KERN_INFO - "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " - "%u 2^N hits, %u breaks, %u lost\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); - printk(KERN_INFO - "EXT4-fs: mballoc: %lu generated and it took %Lu\n", - sbi->s_mb_buddies_generated++, + ext4_msg(sb, KERN_INFO, + "mballoc: %lu generated and it took %Lu", + sbi->s_mb_buddies_generated, sbi->s_mb_generation_time); - printk(KERN_INFO - "EXT4-fs: mballoc: %u preallocated, %u discarded\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } - kfree(sbi->s_locality_groups); - - ext4_mb_history_release(sb); - ext4_mb_destroy_per_dev_proc(sb); + free_percpu(sbi->s_locality_groups); return 0; } -static void ext4_mb_free_committed_blocks(struct super_block *sb) +static inline int ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t cluster, int count) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - int i; - int count = 0; - int count2 = 0; - struct ext4_free_metadata *md; - struct ext4_buddy e4b; - - if (list_empty(&sbi->s_committed_transaction)) - return; - - /* there is committed blocks to be freed yet */ - do { - /* get next array of blocks */ - md = NULL; - spin_lock(&sbi->s_md_lock); - if (!list_empty(&sbi->s_committed_transaction)) { - md = list_entry(sbi->s_committed_transaction.next, - struct ext4_free_metadata, list); - list_del(&md->list); - } - spin_unlock(&sbi->s_md_lock); - - if (md == NULL) - break; - - mb_debug("gonna free %u blocks in group %lu (0x%p):", - md->num, md->group, md); - - err = ext4_mb_load_buddy(sb, md->group, &e4b); - /* we expect to find existing buddy because it's pinned */ - BUG_ON(err != 0); - - /* there are blocks to put in buddy to make them really free */ - count += md->num; - count2++; - ext4_lock_group(sb, md->group); - for (i = 0; i < md->num; i++) { - mb_debug(" %u", md->blocks[i]); - err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); - BUG_ON(err != 0); - } - mb_debug("\n"); - ext4_unlock_group(sb, md->group); - - /* balance refcounts from ext4_mb_free_metadata() */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - - kfree(md); - ext4_mb_release_desc(&e4b); - - } while (md); - - mb_debug("freed %u blocks in %u structures\n", count, count2); -} - -#define EXT4_ROOT "ext4" -#define EXT4_MB_STATS_NAME "stats" -#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" -#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" -#define EXT4_MB_ORDER2_REQ "order2_req" -#define EXT4_MB_STREAM_REQ "stream_req" -#define EXT4_MB_GROUP_PREALLOC "group_prealloc" - - - -#define MB_PROC_VALUE_READ(name) \ -static int ext4_mb_read_##name(char *page, char **start, \ - off_t off, int count, int *eof, void *data) \ -{ \ - struct ext4_sb_info *sbi = data; \ - int len; \ - *eof = 1; \ - if (off != 0) \ - return 0; \ - len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ - *start = page; \ - return len; \ + ext4_fsblk_t discard_block; + + discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + + ext4_group_first_block_no(sb, block_group)); + count = EXT4_C2B(EXT4_SB(sb), count); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } -#define MB_PROC_VALUE_WRITE(name) \ -static int ext4_mb_write_##name(struct file *file, \ - const char __user *buf, unsigned long cnt, void *data) \ -{ \ - struct ext4_sb_info *sbi = data; \ - char str[32]; \ - long value; \ - if (cnt >= sizeof(str)) \ - return -EINVAL; \ - if (copy_from_user(str, buf, cnt)) \ - return -EFAULT; \ - value = simple_strtol(str, NULL, 0); \ - if (value <= 0) \ - return -ERANGE; \ - sbi->s_mb_##name = value; \ - return cnt; \ -} - -MB_PROC_VALUE_READ(stats); -MB_PROC_VALUE_WRITE(stats); -MB_PROC_VALUE_READ(max_to_scan); -MB_PROC_VALUE_WRITE(max_to_scan); -MB_PROC_VALUE_READ(min_to_scan); -MB_PROC_VALUE_WRITE(min_to_scan); -MB_PROC_VALUE_READ(order2_reqs); -MB_PROC_VALUE_WRITE(order2_reqs); -MB_PROC_VALUE_READ(stream_request); -MB_PROC_VALUE_WRITE(stream_request); -MB_PROC_VALUE_READ(group_prealloc); -MB_PROC_VALUE_WRITE(group_prealloc); - -#define MB_PROC_HANDLER(name, var) \ -do { \ - proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ - if (proc == NULL) { \ - printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ - goto err_out; \ - } \ - proc->data = sbi; \ - proc->read_proc = ext4_mb_read_##var ; \ - proc->write_proc = ext4_mb_write_##var; \ -} while (0) - -static int ext4_mb_init_per_dev_proc(struct super_block *sb) +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc) { - mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct proc_dir_entry *proc; - char devname[64]; + struct ext4_free_data *entry = (struct ext4_free_data *)jce; + struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); + + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } - snprintf(devname, sizeof(devname) - 1, "%s", - bdevname(sb->s_bdev, devname)); - sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); - MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats); - MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan); - MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan); - MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs); - MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request); - MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc); - return 0; + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + count2++; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); -err_out: - printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); - remove_proc_entry(devname, proc_root_ext4); - sbi->s_mb_proc = NULL; - - return -ENOMEM; -} - -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - char devname[64]; - - if (sbi->s_mb_proc == NULL) - return -EINVAL; + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); - snprintf(devname, sizeof(devname) - 1, "%s", - bdevname(sb->s_bdev, devname)); - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); - remove_proc_entry(devname, proc_root_ext4); + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); + } + ext4_unlock_group(sb, entry->efd_group); + kmem_cache_free(ext4_free_data_cachep, entry); + ext4_mb_unload_buddy(&e4b); - return 0; + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } -int __init init_ext4_mballoc(void) +int __init ext4_init_mballoc(void) { - ext4_pspace_cachep = - kmem_cache_create("ext4_prealloc_space", - sizeof(struct ext4_prealloc_space), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, + SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) return -ENOMEM; - ext4_ac_cachep = - kmem_cache_create("ext4_alloc_context", - sizeof(struct ext4_allocation_context), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, + SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } -#ifdef CONFIG_PROC_FS - proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs); - if (proc_root_ext4 == NULL) - printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT); -#endif + + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } return 0; } -void exit_ext4_mballoc(void) +void ext4_exit_mballoc(void) { - /* XXX: synchronize_rcu(); */ + /* + * Wait for completion of call_rcu()'s on ext4_pspace_cachep + * before destroying the slab cache. + */ + rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); -#ifdef CONFIG_PROC_FS - remove_proc_entry(EXT4_ROOT, proc_root_fs); -#endif + kmem_cache_destroy(ext4_free_data_cachep); + ext4_groupinfo_destroy_slabs(); } /* - * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps + * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ -static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle) +static noinline_for_stack int +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + handle_t *handle, unsigned int reserv_clstrs) { struct buffer_head *bitmap_bh = NULL; - struct ext4_super_block *es; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; - int err; + int err, len; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); sb = ac->ac_sb; sbi = EXT4_SB(sb); - es = sbi->s_es; - - ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, - gdp->bg_free_blocks_count); err = -EIO; - bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (!bitmap_bh) goto out_err; + BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto out_err; @@ -3065,23 +2894,35 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!gdp) goto out_err; + ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, + ext4_free_group_clusters(sb, gdp)); + + BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) goto out_err; - block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + ac->ac_b_ex.fe_start - + le32_to_cpu(es->s_first_data_block); - - if (block == ext4_block_bitmap(sb, gdp) || - block == ext4_inode_bitmap(sb, gdp) || - in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - ext4_error(sb, __FUNCTION__, - "Allocating block in system zone - block = %llu", - block); + len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (!ext4_data_block_valid(sbi, block, len)) { + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " + "fs metadata", block, block+len); + /* File system mounted not to panic on error + * Fix the bitmap and repeat the block allocation + * We leak some of the blocks here. + */ + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!err) + err = -EAGAIN; + goto out_err; } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK { int i; @@ -3091,40 +2932,52 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); - - spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - gdp->bg_free_blocks_count = - cpu_to_le16(ext4_free_blocks_after_init(sb, - ac->ac_b_ex.fe_group, - gdp)); - } - gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - - ac->ac_b_ex.fe_len); - gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); - spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); - - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); + } + len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_group_clusters_set(sb, gdp, len); + ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); + /* + * Now reduce the dirty block count also. Should not go negative + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; - err = ext4_journal_dirty_metadata(handle, gdp_bh); + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: - sb->s_dirt = 1; brelse(bitmap_bh); return err; } /* * here we normalize request for locality group - * Group request are normalized to s_strip size if we set the same via mount - * option. If not we set it to s_mb_group_prealloc which can be configured via - * /proc/fs/ext4/<partition>/group_prealloc + * Group request are normalized to s_mb_group_prealloc, which goes to + * s_strip if we set the same via mount option. + * s_mb_group_prealloc can be configured via + * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? */ @@ -3134,11 +2987,8 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); - if (EXT4_SB(sb)->s_stripe) - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; - else - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; - mb_debug("#%u: goal %lu blocks for locality group\n", + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } @@ -3146,15 +2996,18 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) * Normalization means making request better in terms of * size and alignment */ -static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, +static noinline_for_stack void +ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; - struct list_head *cur; - loff_t size, orig_size, start_off; - ext4_lblk_t start, orig_start; + loff_t size, start_off; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; /* do normalize only data requests, metadata requests do not need preallocation */ @@ -3179,17 +3032,17 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); + orig_size = size; - /* max available blocks in a free group */ - max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 - - EXT4_SB(ac->ac_sb)->s_itb_per_group; + /* max size of free chunks */ + max = 2 << bsbits; -#define NRL_CHECK_SIZE(req, size, max,bits) \ - (req <= (size) || max <= ((size) >> bits)) +#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ + (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ /* XXX: should this table be tunable? */ @@ -3208,16 +3061,16 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = 512 * 1024; } else if (size <= 1024 * 1024) { size = 1024 * 1024; - } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) { + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> - (20 - bsbits)) << 20; - size = 1024 * 1024; - } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) { + (21 - bsbits)) << 21; + size = 2 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, - (8<<20)>>bsbits, max, bsbits)) { + (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; size = 8 * 1024 * 1024; @@ -3225,8 +3078,8 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; size = ac->ac_o_ex.fe_len << bsbits; } - orig_size = size = size >> bsbits; - orig_start = start = start_off >> bsbits; + size = size >> bsbits; + start = start_off >> bsbits; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { @@ -3240,11 +3093,8 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* check we don't cross already preallocated blocks */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - struct ext4_prealloc_space *pa; - unsigned long pa_end; - - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; if (pa->pa_deleted) continue; @@ -3254,29 +3104,25 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, continue; } - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); /* PA must not overlap original request */ BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || ac->ac_o_ex.fe_logical < pa->pa_lstart)); - /* skip PA normalized request doesn't overlap with */ - if (pa->pa_lstart >= end) { - spin_unlock(&pa->pa_lock); - continue; - } - if (pa_end <= start) { + /* skip PAs this normalized request doesn't overlap with */ + if (pa->pa_lstart >= end || pa_end <= start) { spin_unlock(&pa->pa_lock); continue; } BUG_ON(pa->pa_lstart <= start && pa_end >= end); + /* adjust start or end to be adjacent to this pa */ if (pa_end <= ac->ac_o_ex.fe_logical) { BUG_ON(pa_end < start); start = pa_end; - } - - if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { + } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { BUG_ON(pa->pa_lstart > end); end = pa->pa_lstart; } @@ -3287,13 +3133,13 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* XXX: extra loop to check we really don't overlap preallocations */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - struct ext4_prealloc_space *pa; - unsigned long pa_end; - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; + spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); } spin_unlock(&pa->pa_lock); @@ -3302,20 +3148,21 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical) { - printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", - (unsigned long) start, (unsigned long) size, - (unsigned long) ac->ac_o_ex.fe_logical); + ext4_msg(ac->ac_sb, KERN_ERR, + "start %lu, size %lu, fe_logical %lu", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; - ac->ac_g_ex.fe_len = size; + ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size))) { @@ -3333,7 +3180,7 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, + mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, (unsigned) orig_size, (unsigned) start); } @@ -3344,7 +3191,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); - if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && @@ -3354,7 +3201,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) atomic_inc(&sbi->s_bal_breaks); } - ext4_mb_store_history(ac); + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) + trace_ext4_mballoc_alloc(ac); + else + trace_ext4_mballoc_prealloc(ac); +} + +/* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. + */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + + if (pa && pa->pa_type == MB_INODE_PA) + pa->pa_free += ac->ac_b_ex.fe_len; } /* @@ -3363,14 +3227,16 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); - end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); - len = end - start; + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), + start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); + len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; @@ -3378,11 +3244,11 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); - BUG_ON(start + len > pa->pa_pstart + pa->pa_len); + BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa); + mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); } /* @@ -3391,7 +3257,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { - unsigned len = ac->ac_o_ex.fe_len; + unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, @@ -3406,18 +3272,50 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ - mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); + mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); +} + +/* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. + */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance <= new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; } /* * search goal blocks in preallocated space */ -static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa; - struct list_head *cur; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) @@ -3425,13 +3323,19 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* first, try per-file preallocation */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { /* all fields in this condition don't change, * so we can skip locking for them */ if (ac->ac_o_ex.fe_logical < pa->pa_lstart || - ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + ac->ac_o_ex.fe_logical >= (pa->pa_lstart + + EXT4_C2B(sbi, pa->pa_len))) + continue; + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) continue; /* found preallocated blocks, use them */ @@ -3456,32 +3360,70 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) lg = ac->ac_lg; if (lg == NULL) return 0; + order = fls(ac->ac_o_ex.fe_len) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; - rcu_read_lock(); - list_for_each_rcu(cur, &lg->lg_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { - atomic_inc(&pa->pa_count); - ext4_mb_use_group_pa(ac, pa); + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ + for (i = order; i < PREALLOC_TB_SIZE; i++) { + rcu_read_lock(); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && + pa->pa_free >= ac->ac_o_ex.fe_len) { + + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); + } spin_unlock(&pa->pa_lock); - ac->ac_criteria = 20; - rcu_read_unlock(); - return 1; } - spin_unlock(&pa->pa_lock); + rcu_read_unlock(); + } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; } - rcu_read_unlock(); - return 0; } /* + * the function goes through all block freed in the group + * but not yet committed and marks them used in in-core bitmap. + * buddy must be generated from this bitmap + * Need to be called with the ext4 group lock held + */ +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct rb_node *n; + struct ext4_group_info *grp; + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); + n = rb_first(&(grp->bb_free_root)); + + while (n) { + entry = rb_entry(n, struct ext4_free_data, efd_node); + ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); + n = rb_next(n); + } + return; +} + +/* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with ext4 group lock held */ -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, +static noinline_for_stack +void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -3490,7 +3432,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; - int count = 0; int len; /* all form of preallocation discards first load group, @@ -3511,18 +3452,19 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, start, len); + ext4_set_bits(bitmap, start, len); preallocated += len; - count++; } - mb_debug("prellocated %u for group %lu\n", preallocated, group); + mb_debug(1, "prellocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } @@ -3533,13 +3475,16 @@ static void ext4_mb_pa_callback(struct rcu_head *head) static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { - unsigned long grp; - - if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) - return; + ext4_group_t grp; + ext4_fsblk_t grp_blk; /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; @@ -3548,8 +3493,15 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); - /* -1 is to protect from crossing allocation group */ - ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); + grp_blk = pa->pa_pstart; + /* + * If doing group-based preallocation, pa_pstart may be in the + * next group when pa is used up + */ + if (pa->pa_type == MB_GROUP_PA) + grp_blk--; + + grp = ext4_get_group_number(sb, grp_blk); /* * possible race: @@ -3579,9 +3531,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, /* * creates new preallocated space for given inode */ -static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; @@ -3613,16 +3567,18 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; /* also, we should cover whole original request */ - wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; + wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); /* the smallest one defines real window */ win = min(winl, wins); - offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; + offs = ac->ac_o_ex.fe_logical % + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (offs && offs < win) win = offs; - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - + EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } @@ -3637,14 +3593,17 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_linear = 0; + pa->pa_type = MB_INODE_PA; - mb_debug("new inode pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); - atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); @@ -3666,7 +3625,8 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) /* * creates new preallocated space for locality group inodes belongs to */ -static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; @@ -3693,11 +3653,14 @@ static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_linear = 1; + pa->pa_type = MB_GROUP_PA; - mb_debug("new group pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3713,10 +3676,10 @@ static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - spin_lock(pa->pa_obj_lock); - list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); - spin_unlock(pa->pa_obj_lock); - + /* + * We will later add the new pa to the right bucket + * after updating the pa_free in ext4_mb_release_context + */ return 0; } @@ -3739,107 +3702,76 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ -static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, - struct buffer_head *bitmap_bh, - struct ext4_prealloc_space *pa) +static noinline_for_stack int +ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + struct ext4_prealloc_space *pa) { - struct ext4_allocation_context *ac; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned long end; - unsigned long next; + unsigned int end; + unsigned int next; ext4_group_t group; ext4_grpblk_t bit; - sector_t start; + unsigned long long grp_blk_start; int err = 0; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = pa->pa_inode; - ac->ac_op = EXT4_MB_HISTORY_DISCARD; - } - while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - if (next > end) - next = end; - start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + - le32_to_cpu(sbi->s_es->s_first_data_block); - mb_debug(" free preallocated %u/%u in group %u\n", - (unsigned) start, (unsigned) next - bit, - (unsigned) group); + mb_debug(1, " free preallocated %u/%u in group %u\n", + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); free += next - bit; - if (ac) { - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = next - bit; - ac->ac_b_ex.fe_logical = 0; - ext4_mb_store_history(ac); - } - + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + + EXT4_C2B(sbi, bit)), + next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } if (free != pa->pa_free) { - printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", - pa, (unsigned long) pa->pa_lstart, - (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); - ext4_error(sb, __FUNCTION__, "free %u, pa_free %u\n", - free, pa->pa_free); + ext4_msg(e4b->bd_sb, KERN_CRIT, + "pa %p: logic %lu, phys. %lu, len %lu", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", + free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. */ } atomic_add(free, &sbi->s_mb_discarded); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); return err; } -static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, +static noinline_for_stack int +ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { - struct ext4_allocation_context *ac; struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - - if (ac) - ac->ac_op = EXT4_MB_HISTORY_DISCARD; - + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); - - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = NULL; - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = pa->pa_len; - ac->ac_b_ex.fe_logical = 0; - ext4_mb_store_history(ac); - kmem_cache_free(ext4_ac_cachep, ac); - } + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); return 0; } @@ -3853,7 +3785,8 @@ static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, * - how many do we discard * 1) how many requested */ -static int ext4_mb_discard_group_preallocations(struct super_block *sb, +static noinline_for_stack int +ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int needed) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -3865,27 +3798,28 @@ static int ext4_mb_discard_group_preallocations(struct super_block *sb, int busy = 0; int free = 0; - mb_debug("discard preallocation for group %lu\n", group); + mb_debug(1, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) return 0; - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - /* error handling here */ - ext4_mb_release_desc(&e4b); - BUG_ON(bitmap_bh == NULL); + ext4_error(sb, "Error reading block bitmap for %u", group); + return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); - BUG_ON(err != 0); /* error handling here */ + if (err) { + ext4_error(sb, "Error loading buddy information for %u", group); + put_bh(bitmap_bh); + return 0; + } if (needed == 0) - needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; - grp = ext4_get_group_info(sb, group); INIT_LIST_HEAD(&list); - repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, @@ -3917,11 +3851,7 @@ repeat: if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); + cond_resched(); goto repeat; } @@ -3939,7 +3869,7 @@ repeat: list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); - if (pa->pa_linear) + if (pa->pa_type == MB_GROUP_PA) ext4_mb_release_group_pa(&e4b, pa); else ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); @@ -3950,7 +3880,7 @@ repeat: out: ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); return free; } @@ -3964,7 +3894,7 @@ out: * * FIXME!! Make sure it is valid at all the call sites */ -void ext4_mb_discard_inode_preallocations(struct inode *inode) +void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -3975,12 +3905,13 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode) struct ext4_buddy e4b; int err; - if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { + if (!S_ISREG(inode->i_mode)) { /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ return; } - mb_debug("discard preallocation for inode %lu\n", inode->i_ino); + mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); + trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); @@ -3997,7 +3928,8 @@ repeat: * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); - printk(KERN_ERR "uh-oh! used pa while discarding\n"); + ext4_msg(sb, KERN_ERR, + "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; @@ -4033,17 +3965,22 @@ repeat: spin_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { - BUG_ON(pa->pa_linear != 0); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + BUG_ON(pa->pa_type != MB_INODE_PA); + group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy(sb, group, &e4b); - BUG_ON(err != 0); /* error handling here */ + if (err) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - /* error handling here */ - ext4_mb_release_desc(&e4b); - BUG_ON(bitmap_bh == NULL); + ext4_error(sb, "Error reading block bitmap for %u", + group); + ext4_mb_unload_buddy(&e4b); + continue; } ext4_lock_group(sb, group); @@ -4051,7 +3988,7 @@ repeat: ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); @@ -4059,31 +3996,23 @@ repeat: } } -/* - * finds all preallocated spaces and return blocks being freed to them - * if preallocated space becomes full (no block is used from the space) - * then the function frees space in buddy - * XXX: at the moment, truncate (which is the only way to free blocks) - * discards all preallocations - */ -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, - sector_t block, int count) -{ - BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); -} -#ifdef MB_DEBUG +#ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - ext4_group_t i; + ext4_group_t ngroups, i; + + if (!ext4_mballoc_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + return; - printk(KERN_ERR "EXT4-fs: Can't allocate:" - " Allocation context details:\n"); - printk(KERN_ERR "EXT4-fs: status %d flags %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" + " Allocation context details:"); + ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", ac->ac_status, ac->ac_flags); - printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " - "best %lu/%lu/%lu@%lu cr %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, @@ -4097,10 +4026,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, - ac->ac_found); - printk(KERN_ERR "EXT4-fs: groups: \n"); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; @@ -4113,14 +4042,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); - printk(KERN_ERR "PA:%lu:%d:%u \n", i, - start, pa->pa_len); + printk(KERN_ERR "PA:%u:%d:%u \n", i, + start, pa->pa_len); } - ext4_lock_group(sb, i); + ext4_unlock_group(sb, i); if (grp->bb_free == 0) continue; - printk(KERN_ERR "%lu: %d/%d \n", + printk(KERN_ERR "%u: %d/%d \n", i, grp->bb_free, grp->bb_fragments); } printk(KERN_ERR "\n"); @@ -4137,7 +4066,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) * file is determined by the current size or the resulting size after * allocation which ever is larger * - * One can tune this size via /proc/fs/ext4/<partition>/stream_req + * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { @@ -4148,16 +4077,31 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; - isize = i_size_read(ac->ac_inode) >> bsbits; - size = max(size, isize); + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; - /* don't use group allocation for large files */ - if (size >= sbi->s_mb_stream_request) + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + + if ((size == isize) && + !ext4_fs_is_busy(sbi) && + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; + } - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + if (sbi->s_mb_group_prealloc <= 0) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; + } + + /* don't use group allocation for large files */ + size = max(size, isize); + if (size > sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } BUG_ON(ac->ac_lg != NULL); /* @@ -4165,8 +4109,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) * per cpu locality group is to reduce the contention between block * request from multiple CPUs. */ - ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; - put_cpu(); + ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; @@ -4175,23 +4118,24 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) mutex_lock(&ac->ac_lg->lg_mutex); } -static int ext4_mb_initialize_context(struct ext4_allocation_context *ac, +static noinline_for_stack int +ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; - unsigned long len; - unsigned long goal; + unsigned int len; + ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ len = ar->len; /* just a dirty hack to filter too big requests */ - if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) - len = EXT4_BLOCKS_PER_GROUP(sb) - 10; + if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) + len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; @@ -4201,38 +4145,22 @@ static int ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - ac->ac_b_ex.fe_logical = ar->logical; - ac->ac_b_ex.fe_group = 0; - ac->ac_b_ex.fe_start = 0; - ac->ac_b_ex.fe_len = 0; + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; - ac->ac_groups_scanned = 0; - ac->ac_ex_scanned = 0; - ac->ac_found = 0; ac->ac_sb = sb; ac->ac_inode = ar->inode; - ac->ac_o_ex.fe_logical = ar->logical; + ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; - ac->ac_g_ex.fe_logical = ar->logical; - ac->ac_g_ex.fe_group = group; - ac->ac_g_ex.fe_start = block; - ac->ac_g_ex.fe_len = len; - ac->ac_f_ex.fe_len = 0; + ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; - ac->ac_2order = 0; - ac->ac_criteria = 0; - ac->ac_pa = NULL; - ac->ac_bitmap_page = NULL; - ac->ac_buddy_page = NULL; - ac->ac_lg = NULL; /* we have to define context: we'll we work with a file or * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); - mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, @@ -4243,22 +4171,167 @@ static int ext4_mb_initialize_context(struct ext4_allocation_context *ac, } +static noinline_for_stack void +ext4_mb_discard_lg_preallocations(struct super_block *sb, + struct ext4_locality_group *lg, + int order, int total_entries) +{ + ext4_group_t group = 0; + struct ext4_buddy e4b; + struct list_head discard_list; + struct ext4_prealloc_space *pa, *tmp; + + mb_debug(1, "discard locality group preallocation\n"); + + INIT_LIST_HEAD(&discard_list); + + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* + * This is the pa that we just used + * for block allocation. So don't + * free that + */ + spin_unlock(&pa->pa_lock); + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + /* only lg prealloc space */ + BUG_ON(pa->pa_type != MB_GROUP_PA); + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &discard_list); + + total_entries--; + if (total_entries <= 5) { + /* + * we want to keep only 5 entries + * allowing it to grow to 8. This + * mak sure we don't call discard + * soon for this list. + */ + break; + } + } + spin_unlock(&lg->lg_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + + group = ext4_get_group_number(sb, pa->pa_pstart); + if (ext4_mb_load_buddy(sb, group, &e4b)) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +/* + * We have incremented pa_count. So it cannot be freed at this + * point. Also we hold lg_mutex. So no parallel allocation is + * possible from this lg. That means pa_free cannot be updated. + * + * A parallel ext4_mb_discard_group_preallocations is possible. + * which can cause the lg_prealloc_list to be updated. + */ + +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) +{ + int order, added = 0, lg_prealloc_count = 1; + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; + + order = fls(pa->pa_free) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + /* Add the prealloc space to lg */ + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&tmp_pa->pa_lock); + continue; + } + if (!added && pa->pa_free < tmp_pa->pa_free) { + /* Add to the tail of the previous entry */ + list_add_tail_rcu(&pa->pa_inode_list, + &tmp_pa->pa_inode_list); + added = 1; + /* + * we want to count the total + * number of entries in the list + */ + } + spin_unlock(&tmp_pa->pa_lock); + lg_prealloc_count++; + } + if (!added) + list_add_tail_rcu(&pa->pa_inode_list, + &lg->lg_prealloc_list[order]); + spin_unlock(&lg->lg_prealloc_lock); + + /* Now trim the list to be not more than 8 elements */ + if (lg_prealloc_count > 8) { + ext4_mb_discard_lg_preallocations(sb, lg, + order, lg_prealloc_count); + return; + } + return ; +} + /* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { - if (ac->ac_pa) { - if (ac->ac_pa->pa_linear) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *pa = ac->ac_pa; + if (pa) { + if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ - spin_lock(&ac->ac_pa->pa_lock); - ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; - ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; - ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; - ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; - spin_unlock(&ac->ac_pa->pa_lock); + spin_lock(&pa->pa_lock); + pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_free -= ac->ac_b_ex.fe_len; + pa->pa_len -= ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + } + } + if (pa) { + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. + */ + if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); } - ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); + ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); @@ -4272,11 +4345,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0; - for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { + trace_ext4_mb_discard_preallocations(sb, needed); + for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; needed -= ret; @@ -4291,42 +4365,76 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, - struct ext4_allocation_request *ar, int *errp) + struct ext4_allocation_request *ar, int *errp) { + int freed; struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; - int freed; - int inquota; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; + might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); - if (!test_opt(sb, MBALLOC)) { - block = ext4_new_blocks_old(handle, ar->inode, ar->goal, - &(ar->len), errp); - return block; - } + trace_ext4_request_blocks(ar); - while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { - ar->flags |= EXT4_MB_HINT_NOPREALLOC; - ar->len--; - } - if (ar->len == 0) { - *errp = -EDQUOT; - return 0; + /* Allow to use superuser reservation for quota file */ + if (IS_NOQUOTA(ar->inode)) + ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; + + /* + * For delayed allocation, we could skip the ENOSPC and + * EDQUOT check, as blocks and quotas have been already + * reserved when data being copied into pagecache. + */ + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + else { + /* Without delayed allocation we need to verify + * there is enough free blocks to do block allocation + * and verify allocation doesn't exceed the quota limits. + */ + while (ar->len && + ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { + + /* let others to free the space */ + cond_resched(); + ar->len = ar->len >> 1; + } + if (!ar->len) { + *errp = -ENOSPC; + return 0; + } + reserv_clstrs = ar->len; + if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { + dquot_alloc_block_nofail(ar->inode, + EXT4_C2B(sbi, ar->len)); + } else { + while (ar->len && + dquot_alloc_block(ar->inode, + EXT4_C2B(sbi, ar->len))) { + + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } + } + inquota = ar->len; + if (ar->len == 0) { + *errp = -EDQUOT; + goto out; + } } - inquota = ar->len; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { + ar->len = 0; *errp = -ENOMEM; - return 0; + goto out; } - ext4_mb_poll_new_transaction(sb, handle); - *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; @@ -4335,212 +4443,315 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ac->ac_op = EXT4_MB_HISTORY_PREALLOC; if (!ext4_mb_use_preallocated(ac)) { - ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); - repeat: /* allocate space in core */ - ext4_mb_regular_allocator(ac); + *errp = ext4_mb_regular_allocator(ac); + if (*errp) + goto discard_and_exit; /* as we've just preallocated more space than - * user requested orinally, we store allocated + * user requested originally, we store allocated * space in a special descriptor */ if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) - ext4_mb_new_preallocation(ac); + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + *errp = ext4_mb_new_preallocation(ac); + if (*errp) { + discard_and_exit: + ext4_discard_allocated_blocks(ac); + goto errout; + } } - if (likely(ac->ac_status == AC_STATUS_FOUND)) { - ext4_mb_mark_diskspace_used(ac, handle); - *errp = 0; - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - ar->len = ac->ac_b_ex.fe_len; + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); + if (*errp == -EAGAIN) { + /* + * drop the reference that we took + * in ext4_mb_use_best_found + */ + ext4_mb_release_context(ac); + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + goto repeat; + } else if (*errp) { + ext4_discard_allocated_blocks(ac); + goto errout; + } else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; + } } else { freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) goto repeat; *errp = -ENOSPC; + } + +errout: + if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } - ext4_mb_release_context(ac); - out: - if (ar->len < inquota) - DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); + if (inquota && ar->len < inquota) + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); + if (!ar->len) { + if (!ext4_test_inode_state(ar->inode, + EXT4_STATE_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); + } + + trace_ext4_allocate_blocks(ar, (unsigned long long)block); - kmem_cache_free(ext4_ac_cachep, ac); return block; } -static void ext4_mb_poll_new_transaction(struct super_block *sb, - handle_t *handle) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_last_transaction == handle->h_transaction->t_tid) - return; - /* new transaction! time to close last one and free blocks for - * committed transaction. we know that only transaction can be - * active, so previos transaction can be being logged and we - * know that transaction before previous is known to be already - * logged. this means that now we may free blocks freed in all - * transactions before previous one. hope I'm clear enough ... */ - - spin_lock(&sbi->s_md_lock); - if (sbi->s_last_transaction != handle->h_transaction->t_tid) { - mb_debug("new transaction %lu, old %lu\n", - (unsigned long) handle->h_transaction->t_tid, - (unsigned long) sbi->s_last_transaction); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_closed_transaction); - sbi->s_last_transaction = handle->h_transaction->t_tid; - } - spin_unlock(&sbi->s_md_lock); - - ext4_mb_free_committed_blocks(sb); +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. + */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->efd_tid == entry2->efd_tid) && + (entry1->efd_group == entry2->efd_group) && + ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) + return 1; + return 0; } -static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, - ext4_group_t group, ext4_grpblk_t block, int count) +static noinline_for_stack int +ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, + struct ext4_free_data *new_entry) { + ext4_group_t group = e4b->bd_group; + ext4_grpblk_t cluster; + struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_metadata *md; - int i; + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - ext4_lock_group(sb, group); - for (i = 0; i < count; i++) { - md = db->bb_md_cur; - if (md && db->bb_tid != handle->h_transaction->t_tid) { - db->bb_md_cur = NULL; - md = NULL; + new_node = &new_entry->efd_node; + cluster = new_entry->efd_start_cluster; + + if (!*n) { + /* first free block exent. We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) + n = &(*n)->rb_left; + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) + n = &(*n)->rb_right; + else { + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, cluster), + "Block already on to-be-freed list"); + return 0; } + } - if (md == NULL) { - ext4_unlock_group(sb, group); - md = kmalloc(sizeof(*md), GFP_NOFS); - if (md == NULL) - return -ENOMEM; - md->num = 0; - md->group = group; - - ext4_lock_group(sb, group); - if (db->bb_md_cur == NULL) { - spin_lock(&sbi->s_md_lock); - list_add(&md->list, &sbi->s_active_transaction); - spin_unlock(&sbi->s_md_lock); - /* protect buddy cache from being freed, - * otherwise we'll refresh it from - * on-disk bitmap and lose not-yet-available - * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); - db->bb_md_cur = md; - db->bb_tid = handle->h_transaction->t_tid; - mb_debug("new md 0x%p for group %lu\n", - md, md->group); - } else { - kfree(md); - md = db->bb_md_cur; - } + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(entry, new_entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; + rb_erase(node, &(db->bb_free_root)); + kmem_cache_free(ext4_free_data_cachep, entry); } + } - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); - md->blocks[md->num] = block + i; - md->num++; - if (md->num == EXT4_BB_MAX_BLOCKS) { - /* no more space, put full container on a sb's list */ - db->bb_md_cur = NULL; + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(new_entry, entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { + new_entry->efd_count += entry->efd_count; + rb_erase(node, &(db->bb_free_root)); + kmem_cache_free(ext4_free_data_cachep, entry); } } - ext4_unlock_group(sb, group); + /* Add the extent to transaction's private list */ + ext4_journal_callback_add(handle, ext4_free_data_callback, + &new_entry->efd_jce); return 0; } -/* - * Main entry point into mballoc to free blocks +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + * @flags: flags used by ext4_free_blocks */ -void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count, - int metadata, unsigned long *freed) +void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; - struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; - struct ext4_super_block *es; - unsigned long overflow; + unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; + struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_buddy e4b; + unsigned int count_clusters; int err = 0; int ret; - *freed = 0; - - ext4_mb_poll_new_transaction(sb, handle); + might_sleep(); + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } sbi = EXT4_SB(sb); - es = EXT4_SB(sb)->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > ext4_blocks_count(es)) { - ext4_error(sb, __FUNCTION__, - "Freeing blocks not in datazone - " - "block = %lu, count = %lu", block, count); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_data_block_valid(sbi, block, count)) { + ext4_error(sb, "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); goto error_return; } - ext4_debug("freeing block %lu\n", block); + ext4_debug("freeing block %llu\n", block); + trace_ext4_free_blocks(inode, block, count, flags); + + if (flags & EXT4_FREE_BLOCKS_FORGET) { + struct buffer_head *tbh = bh; + int i; + + BUG_ON(bh && (count > 1)); + + for (i = 0; i < count; i++) { + cond_resched(); + if (!bh) + tbh = sb_find_get_block(inode->i_sb, + block + i); + if (!tbh) + continue; + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, tbh, block + i); + } + } + + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + flags |= EXT4_FREE_BLOCKS_METADATA; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_op = EXT4_MB_HISTORY_FREE; - ac->ac_inode = inode; - ac->ac_sb = sb; + /* + * If the extent to be freed does not begin on a cluster + * boundary, we need to deal with partial clusters at the + * beginning and end of the extent. Normally we will free + * blocks at the beginning or the end unless we are explicitly + * requested to avoid doing so. + */ + overflow = EXT4_PBLK_COFF(sbi, block); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { + overflow = sbi->s_cluster_ratio - overflow; + block += overflow; + if (count > overflow) + count -= overflow; + else + return; + } else { + block -= overflow; + count += overflow; + } + } + overflow = EXT4_LBLK_COFF(sbi, count); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { + if (count > overflow) + count -= overflow; + else + return; + } else + count += sbi->s_cluster_ratio - overflow; } do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( + ext4_get_group_info(sb, block_group)))) + return; + /* * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); + if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = EXT4_C2B(sbi, bit) + count - + EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } - bitmap_bh = read_block_bitmap(sb, block_group); - if (!bitmap_bh) + count_clusters = EXT4_NUM_B2C(sbi, count); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; goto error_return; + } gdp = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!gdp) + if (!gdp) { + err = -EIO; goto error_return; + } if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || + EXT4_SB(sb)->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, __FUNCTION__, - "Freeing blocks in system zone - " - "Block = %lu, count = %lu", block, count); + ext4_error(sb, "Freeing blocks in system zone - " + "Block = %llu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_return; } BUFFER_TRACE(bitmap_bh, "getting write access"); @@ -4557,58 +4768,101 @@ do_more: err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; - - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - #ifdef AGGRESSIVE_CHECK { int i; - for (i = 0; i < count; i++) + for (i = 0; i < count_clusters; i++) BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; - if (ac) { - ac->ac_b_ex.fe_group = block_group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = count; - ext4_mb_store_history(ac); - } + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { + struct ext4_free_data *new_entry; + /* + * blocks being freed are metadata. these blocks shouldn't + * be used until this transaction is committed + */ + retry: + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); + if (!new_entry) { + /* + * We use a retry loop because + * ext4_free_blocks() is not allowed to fail. + */ + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; - if (metadata) { - /* blocks being freed are metadata. these blocks shouldn't - * be used until this transaction is committed */ - ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + ext4_mb_free_metadata(handle, &e4b, new_entry); } else { + /* need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, block_group, bit, count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%lu failed" + " with %d", block_group, bit, count, + err); + } else + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); + ext4_lock_group(sb, block_group); - err = mb_free_blocks(inode, &e4b, bit, count); - ext4_mb_return_to_preallocation(inode, &e4b, block, count); - ext4_unlock_group(sb, block_group); - BUG_ON(err != 0); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + mb_free_blocks(inode, &e4b, bit, count_clusters); } - spin_lock(sb_bgl_lock(sbi, block_group)); - gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_add(&sbi->s_freeblocks_counter, count); + ret = ext4_free_group_clusters(sb, gdp) + count_clusters; + ext4_free_group_clusters_set(sb, gdp, ret); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, gdp); + ext4_unlock_group(sb, block_group); - ext4_mb_release_desc(&e4b); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); + } - *freed += count; + if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, + count_clusters); + spin_lock(&ei->i_block_reservation_lock); + if (flags & EXT4_FREE_BLOCKS_METADATA) + ei->i_reserved_meta_blocks += count_clusters; + else + ei->i_reserved_data_blocks += count_clusters; + spin_unlock(&ei->i_block_reservation_lock); + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_reclaim_block(inode, + EXT4_C2B(sbi, count_clusters)); + } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_journal_dirty_metadata(handle, gd_bh); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; @@ -4618,11 +4872,365 @@ do_more: put_bh(bitmap_bh); goto do_more; } - sb->s_dirt = 1; error_return: brelse(bitmap_bh); ext4_std_error(sb, err); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); return; } + +/** + * ext4_group_add_blocks() -- Add given blocks to an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physical block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap and buddy. + */ +int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_buddy e4b; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + if (count == 0) + return 0; + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_warning(sb, "too much blocks added to group %u\n", + block_group); + err = -EINVAL; + goto error_return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; + goto error_return; + } + + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) { + err = -EIO; + goto error_return; + } + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + err = -EINVAL; + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + /* + * need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(NULL, &e4b, bit, count); + blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, blk_free_count); + ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_NUM_B2C(sbi, blocks_freed)); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); + } + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return err; +} + +/** + * ext4_trim_extent -- function to TRIM one single free extent in the group + * @sb: super block for the file system + * @start: starting block of the free extent in the alloc. group + * @count: number of blocks to TRIM + * @group: alloc. group we are working with + * @e4b: ext4 buddy for the group + * + * Trim "count" blocks starting at "start" in the "group". To assure that no + * one will allocate those blocks, mark it as used in buddy bitmap. This must + * be called with under the group lock. + */ +static int ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) +{ + struct ext4_free_extent ex; + int ret = 0; + + trace_ext4_trim_extent(sb, group, start, count); + + assert_spin_locked(ext4_group_lock_ptr(sb, group)); + + ex.fe_start = start; + ex.fe_group = group; + ex.fe_len = count; + + /* + * Mark blocks used, so no one can reuse them while + * being trimmed. + */ + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, group); + ret = ext4_issue_discard(sb, group, start, count); + ext4_lock_group(sb, group); + mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; +} + +/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: group to be trimmed + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through group's buddy bitmap searching for free + * extents. When the free block is found, ext4_trim_extent is called to TRIM + * the extent. + * + * + * ext4_trim_all_free walks through group's block bitmap searching for free + * extents. When the free extent is found, mark it as used in group buddy + * bitmap. Then issue a TRIM command on this extent and free the extent in + * the group buddy bitmap. This is done until whole group is scanned. + */ +static ext4_grpblk_t +ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t start, ext4_grpblk_t max, + ext4_grpblk_t minblocks) +{ + void *bitmap; + ext4_grpblk_t next, count = 0, free_count = 0; + struct ext4_buddy e4b; + int ret = 0; + + trace_ext4_trim_all_free(sb, group, start, max); + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + return ret; + } + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); + if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && + minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) + goto out; + + start = (e4b.bd_info->bb_first_free > start) ? + e4b.bd_info->bb_first_free : start; + + while (start <= max) { + start = mb_find_next_zero_bit(bitmap, max + 1, start); + if (start > max) + break; + next = mb_find_next_bit(bitmap, max + 1, start); + + if ((next - start) >= minblocks) { + ret = ext4_trim_extent(sb, start, + next - start, group, &e4b); + if (ret && ret != -EOPNOTSUPP) + break; + ret = 0; + count += next - start; + } + free_count += next - start; + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if (need_resched()) { + ext4_unlock_group(sb, group); + cond_resched(); + ext4_lock_group(sb, group); + } + + if ((e4b.bd_info->bb_free - free_count) < minblocks) + break; + } + + if (!ret) { + ret = count; + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } +out: + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + ext4_debug("trimmed %d blocks in the group %d\n", + count, group); + + return ret; +} + +/** + * ext4_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * ext4_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext4_trim_all_free function + * is invoked to trim all free space. + */ +int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ext4_group_info *grp; + ext4_group_t group, first_group, last_group; + ext4_grpblk_t cnt = 0, first_cluster, last_cluster; + uint64_t start, end, minlen, trimmed = 0; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + range->minlen >> sb->s_blocksize_bits); + + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) + return -EINVAL; + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) + goto out; + if (start < first_data_blk) + start = first_data_blk; + + /* Determine first and last group to examine based on start and end */ + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, + &first_group, &first_cluster); + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, + &last_group, &last_cluster); + + /* end now represents the last cluster to discard in this group */ + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + + for (group = first_group; group <= last_group; group++) { + grp = ext4_get_group_info(sb, group); + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group); + if (ret) + break; + } + + /* + * For all the groups except the last one, last cluster will + * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_cluster is + * already computed earlier by ext4_get_group_no_and_offset() + */ + if (group == last_group) + end = last_cluster; + + if (grp->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, group, first_cluster, + end, minlen); + if (cnt < 0) { + ret = cnt; + break; + } + trimmed += cnt; + } + + /* + * For every group except the first one, we are sure + * that the first cluster to discard will be cluster #0. + */ + first_cluster = 0; + } + + if (!ret) + atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + +out: + range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; + return ret; +} |
