diff options
Diffstat (limited to 'fs/ext4/mballoc.c')
| -rw-r--r-- | fs/ext4/mballoc.c | 2431 |
1 files changed, 1506 insertions, 925 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d34afad3e13..2dcb936be90 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,10 +21,20 @@ * mballoc.c contains the multiblocks allocation routines */ +#include "ext4_jbd2.h" #include "mballoc.h" -#include <linux/debugfs.h> +#include <linux/log2.h> +#include <linux/module.h> +#include <linux/slab.h> #include <trace/events/ext4.h> +#ifdef CONFIG_EXT4_DEBUG +ushort ext4_mballoc_debug __read_mostly; + +module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); +MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); +#endif + /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -69,13 +79,13 @@ * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space - * pa_len -> lenght for this prealloc space - * pa_free -> free space available in this prealloc space + * pa_len -> length for this prealloc space (in clusters) + * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc - * space we will consume the particular prealloc space. This make sure that - * that the we have contiguous physical blocks representing the file blocks + * space we will consume the particular prealloc space. This makes sure that + * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except @@ -83,7 +93,7 @@ * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group - * prealloc space. These are per CPU prealloc list repreasented as + * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * @@ -91,7 +101,7 @@ * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have - * enough free space (pa_free) withing the prealloc space. + * enough free space (pa_free) within the prealloc space. * * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented @@ -125,14 +135,16 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is + * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is + * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via - * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in + * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the - * stripe value (sbi->s_stripe) + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. * - * The regular allocator(using the buddy cache) supports few tunables. + * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan @@ -151,7 +163,7 @@ * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it - * can used for allocation. ext4_mb_good_group explains how the groups are + * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * Both the prealloc space are getting populated as above. So for the first @@ -336,12 +348,26 @@ */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_ext_cachep; +static struct kmem_cache *ext4_free_data_cachep; + +/* We create slab caches for groupinfo data structures based on the + * superblock block size. There will be one per mounted filesystem for + * each unique s_blocksize_bits */ +#define NR_GRPINFO_CACHES 8 +static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; + +static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", + "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", + "ext4_groupinfo_64k", "ext4_groupinfo_128k" +}; + static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -379,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr) ext4_clear_bit(bit, addr); } +static inline int mb_test_and_clear_bit(int bit, void *addr) +{ + addr = mb_correct_addr_and_bit(&bit, addr); + return ext4_test_and_clear_bit(bit, addr); +} + static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -409,7 +441,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { @@ -418,11 +450,12 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) } /* at order 0 we see each particular block */ - *max = 1 << (e4b->bd_blkbits + 3); - if (order == 0) - return EXT4_MB_BITMAP(e4b); + if (order == 0) { + *max = 1 << (e4b->bd_blkbits + 3); + return e4b->bd_bitmap; + } - bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; @@ -441,15 +474,15 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); - blocknr += first + i; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %u)", - inode ? inode->i_ino : 0, blocknr, - first + i, e4b->bd_group); + inode ? inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -477,10 +510,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk(KERN_ERR "corruption in group %u " - "at byte %u(%u): %x in copy != %x " - "on disk/prealloc\n", - e4b->bd_group, i, i * 8, b1[i], b2[i]); + ext4_msg(e4b->bd_sb, KERN_ERR, + "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc", + e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } @@ -563,14 +597,14 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, continue; } - /* both bits in buddy2 must be 0 */ + /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( - !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); + !mb_test_bit(k, e4b->bd_bitmap)); } count++; } @@ -602,7 +636,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); - buddy = mb_find_buddy(e4b, 0, &max); list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; @@ -621,7 +654,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, #define mb_check_buddy(e4b) #endif -/* FIXME!! need more doc */ +/* + * Divide blocks started from @first with length @len into + * smaller chunks with power of 2 blocks. + * Clear the bits in bitmap which the blocks of the chunk(s) covered, + * then increase bb_counters[] for corresponded chunk size. + */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) @@ -632,7 +670,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t chunk; unsigned short border; - BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; @@ -658,12 +696,34 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } +/* + * Cache the order of the largest free extent we have available in this block + * group. + */ +static void +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) +{ + int i; + int bits; + + grp->bb_largest_free_order = -1; /* uninit */ + + bits = sb->s_blocksize_bits + 1; + for (i = bits; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; + break; + } + } +} + static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); - ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; @@ -691,15 +751,21 @@ void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_grp_locked_error(sb, group, __func__, - "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", - group, free, grp->bb_free); + ext4_grp_locked_error(sb, group, 0, 0, + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", + free, grp->bb_free); /* - * If we intent to continue, we consider group descritor + * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } + mb_set_largest_free_order(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); @@ -710,6 +776,24 @@ void ext4_mb_generate_buddy(struct super_block *sb, spin_unlock(&EXT4_SB(sb)->s_bal_lock); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) { + ext4_set_bits(buddy, 0, count); + } + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group); +} + /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve @@ -725,6 +809,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 + * + * Locking note: This routine takes the block group lock of all groups + * for this page; do not hold this lock when calling this routine! */ static int ext4_mb_init_cache(struct page *page, char *incore) @@ -735,14 +822,15 @@ static int ext4_mb_init_cache(struct page *page, char *incore) int groups_per_page; int err = 0; int i; - ext4_group_t first_group; + ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; - struct buffer_head **bh; + struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; + struct ext4_group_info *grinfo; mb_debug(1, "init page %lu\n", page->index); @@ -758,95 +846,58 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { - err = -ENOMEM; i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, GFP_NOFS); - if (bh == NULL) + if (bh == NULL) { + err = -ENOMEM; goto out; + } } else bh = &bhs; first_group = page->index * blocks_per_page / 2; /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { - struct ext4_group_desc *desc; - - if (first_group + i >= ngroups) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (group >= ngroups) break; - err = -EIO; - desc = ext4_get_group_desc(sb, first_group + i, NULL); - if (desc == NULL) - goto out; - - err = -ENOMEM; - bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); - if (bh[i] == NULL) - goto out; - - if (bitmap_uptodate(bh[i])) - continue; - - lock_buffer(bh[i]); - if (bitmap_uptodate(bh[i])) { - unlock_buffer(bh[i]); - continue; - } - ext4_lock_group(sb, first_group + i); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh[i], - first_group + i, desc); - set_bitmap_uptodate(bh[i]); - set_buffer_uptodate(bh[i]); - ext4_unlock_group(sb, first_group + i); - unlock_buffer(bh[i]); + grinfo = ext4_get_group_info(sb, group); + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so + * we must skip all initialized uptodate buddies on the page, + * which may be currently in use by an allocating task. + */ + if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + bh[i] = NULL; continue; } - ext4_unlock_group(sb, first_group + i); - if (buffer_uptodate(bh[i])) { - /* - * if not uninit if bh is uptodate, - * bitmap is also uptodate - */ - set_bitmap_uptodate(bh[i]); - unlock_buffer(bh[i]); - continue; + if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { + err = -ENOMEM; + goto out; } - get_bh(bh[i]); - /* - * submit the buffer_head for read. We can - * safely mark the bitmap as uptodate now. - * We do it here so the bitmap uptodate bit - * get set with buffer lock held. - */ - set_bitmap_uptodate(bh[i]); - bh[i]->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh[i]); - mb_debug(1, "read bitmap for group %u\n", first_group + i); + mb_debug(1, "read bitmap for group %u\n", group); } /* wait for I/O completion */ - for (i = 0; i < groups_per_page && bh[i]; i++) - wait_on_buffer(bh[i]); - - err = -EIO; - for (i = 0; i < groups_per_page && bh[i]; i++) - if (!buffer_uptodate(bh[i])) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + err = -EIO; goto out; + } + } - err = 0; first_block = page->index * blocks_per_page; - /* init the page */ - memset(page_address(page), 0xff, PAGE_CACHE_SIZE); for (i = 0; i < blocks_per_page; i++) { - int group; - struct ext4_group_info *grinfo; - group = (first_block + i) >> 1; if (group >= ngroups) break; + if (!bh[group - first_group]) + /* skip initialized uptodate buddy */ + continue; + /* * data carry information regarding this * particular group in the format specified @@ -865,6 +916,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) BUG_ON(incore == NULL); mb_debug(1, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, @@ -874,6 +926,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group); ext4_unlock_group(sb, group); incore = NULL; @@ -882,6 +936,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) BUG_ON(incore != NULL); mb_debug(1, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); + trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); @@ -902,7 +957,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) out: if (bh) { - for (i = 0; i < groups_per_page && bh[i]; i++) + for (i = 0; i < groups_per_page; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); @@ -910,100 +965,138 @@ out: return err; } +/* + * Lock the buddy and bitmap pages. This make sure other parallel init_group + * on the same buddy page doesn't happen whild holding the buddy page lock. + * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap + * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + */ +static int ext4_mb_get_buddy_page_lock(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct inode *inode = EXT4_SB(sb)->s_buddy_cache; + int block, pnum, poff; + int blocks_per_page; + struct page *page; + + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -ENOMEM; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + + if (blocks_per_page >= 2) { + /* buddy and bitmap are on the same page */ + return 0; + } + + block++; + pnum = block / blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -ENOMEM; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_buddy_page = page; + return 0; +} + +static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) { + unlock_page(e4b->bd_bitmap_page); + page_cache_release(e4b->bd_bitmap_page); + } + if (e4b->bd_buddy_page) { + unlock_page(e4b->bd_buddy_page); + page_cache_release(e4b->bd_buddy_page); + } +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) { - int ret = 0; - void *bitmap; - int blocks_per_page; - int block, pnum, poff; - int num_grp_locked = 0; struct ext4_group_info *this_grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - struct page *page = NULL, *bitmap_page = NULL; + struct ext4_buddy e4b; + struct page *page; + int ret = 0; + might_sleep(); mb_debug(1, "init group %u\n", group); - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; this_grp = ext4_get_group_info(sb, group); /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that - * would have taken the alloc_sem lock. + * would have pinned buddy page to page cache. + * The call to ext4_mb_get_buddy_page_lock will mark the + * page accessed. */ - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); - if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ - ret = 0; goto err; } - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, NULL); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); - } - if (page == NULL || !PageUptodate(page)) { + + page = e4b.bd_bitmap_page; + ret = ext4_mb_init_cache(page, NULL); + if (ret) + goto err; + if (!PageUptodate(page)) { ret = -EIO; goto err; } - mark_page_accessed(page); - bitmap_page = page; - bitmap = page_address(page) + (poff * sb->s_blocksize); - /* init buddy cache */ - block++; - pnum = block / blocks_per_page; - poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); - if (page == bitmap_page) { + if (e4b.bd_buddy_page == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy */ - unlock_page(page); - } else if (page) { - BUG_ON(page->mapping != inode->i_mapping); - ret = ext4_mb_init_cache(page, bitmap); - if (ret) { - unlock_page(page); - goto err; - } - unlock_page(page); + ret = 0; + goto err; } - if (page == NULL || !PageUptodate(page)) { + /* init buddy cache */ + page = e4b.bd_buddy_page; + ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + if (ret) + goto err; + if (!PageUptodate(page)) { ret = -EIO; goto err; } - mark_page_accessed(page); err: - ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); - if (bitmap_page) - page_cache_release(bitmap_page); - if (page) - page_cache_release(page); + ext4_mb_put_buddy_page_lock(&e4b); return ret; } +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ static noinline_for_stack int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) @@ -1018,35 +1111,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; + might_sleep(); mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; - e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; - e4b->alloc_semp = &grp->alloc_sem; - - /* Take the read lock on the group alloc - * sem. This would make sure a parallel - * ext4_mb_init_group happening on other - * groups mapped by the page is blocked - * till we are done with allocation - */ -repeat_load_buddy: - down_read(e4b->alloc_semp); if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - /* we need to check for group need init flag - * with alloc_semp held so that we can be sure - * that new blocks didn't get added to the group - * when we are loading the buddy cache - */ - up_read(e4b->alloc_semp); /* * we need full data about the group * to make a good selection @@ -1054,7 +1132,6 @@ repeat_load_buddy: ret = ext4_mb_init_group(sb, group); if (ret) return ret; - goto repeat_load_buddy; } /* @@ -1068,7 +1145,7 @@ repeat_load_buddy: /* we could use find_or_create_page(), but it locks page * what we'd like to avoid in fast path ... */ - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) /* @@ -1095,19 +1172,24 @@ repeat_load_buddy: unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } + + /* Pages marked accessed already */ e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); @@ -1124,13 +1206,18 @@ repeat_load_buddy: unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } + + /* Pages marked accessed already */ e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); @@ -1138,27 +1225,23 @@ repeat_load_buddy: return 0; err: + if (page) + page_cache_release(page); if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; - - /* Done with the buddy cache */ - up_read(e4b->alloc_semp); return ret; } -static void ext4_mb_release_desc(struct ext4_buddy *e4b) +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); - /* Done with the buddy cache */ - if (e4b->alloc_semp) - up_read(e4b->alloc_semp); } @@ -1167,10 +1250,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) int order = 1; void *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); - bb = EXT4_MB_BUDDY(e4b); + bb = e4b->bd_buddy; while (order <= e4b->bd_blkbits + 1) { block = block >> 1; if (!mb_test_bit(block, bb)) { @@ -1201,7 +1284,34 @@ static void mb_clear_bits(void *bm, int cur, int len) } } -static void mb_set_bits(void *bm, int cur, int len) +/* clear bits in given range + * will return first found zero bit if any, -1 otherwise + */ +static int mb_test_and_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + int zero_bit = -1; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + if (*addr != (__u32)(-1) && zero_bit == -1) + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); + *addr = 0; + cur += 32; + continue; + } + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) + zero_bit = cur; + cur++; + } + + return zero_bit; +} + +void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1219,18 +1329,95 @@ static void mb_set_bits(void *bm, int cur, int len) } } +/* + * _________________________________________________________________ */ + +static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) +{ + if (mb_test_bit(*bit + side, bitmap)) { + mb_clear_bit(*bit, bitmap); + (*bit) -= side; + return 1; + } + else { + (*bit) += side; + mb_set_bit(*bit, bitmap); + return -1; + } +} + +static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) +{ + int max; + int order = 1; + void *buddy = mb_find_buddy(e4b, order, &max); + + while (buddy) { + void *buddy2; + + /* Bits in range [first; last] are known to be set since + * corresponding blocks were allocated. Bits in range + * (first; last) will stay set because they form buddies on + * upper layer. We just deal with borders if they don't + * align with upper layer and then go up. + * Releasing entire group is all about clearing + * single bit of highest order buddy. + */ + + /* Example: + * --------------------------------- + * | 1 | 1 | 1 | 1 | + * --------------------------------- + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------- + * 0 1 2 3 4 5 6 7 + * \_____________________/ + * + * Neither [1] nor [6] is aligned to above layer. + * Left neighbour [0] is free, so mark it busy, + * decrease bb_counters and extend range to + * [0; 6] + * Right neighbour [7] is busy. It can't be coaleasced with [6], so + * mark [6] free, increase bb_counters and shrink range to + * [0; 5]. + * Then shift range to [0; 2], go up and do the same. + */ + + + if (first & 1) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); + if (!(last & 1)) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); + if (first > last) + break; + order++; + + if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + mb_clear_bits(buddy, first, last - first + 1); + e4b->bd_info->bb_counters[order - 1] += last - first + 1; + break; + } + first >>= 1; + last >>= 1; + buddy = buddy2; + } +} + static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, - int first, int count) + int first, int count) { - int block = 0; - int max = 0; - int order; - void *buddy; - void *buddy2; + int left_is_free = 0; + int right_is_free = 0; + int block; + int last = first + count - 1; struct super_block *sb = e4b->bd_sb; - BUG_ON(first + count > (sb->s_blocksize << 3)); + BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + /* Don't bother if the block group is corrupt. */ + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return; + mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1238,83 +1425,77 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; - /* let's maintain fragments counter */ + /* access memory sequentially: check left neighbour, + * clear range and then check right neighbour + */ if (first != 0) - block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); - if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); - if (block && max) + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); + + if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), block); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u); block bitmap corrupt.", + block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); + /* Mark the block group as corrupt. */ + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &e4b->bd_info->bb_state); + mb_regenerate_buddy(e4b); + goto done; + } + + /* let's maintain fragments counter */ + if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; - else if (!block && !max) + else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; - /* let's maintain buddy itself */ - while (count-- > 0) { - block = first++; - order = 0; - - if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { - ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); - blocknr += block; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %u)", - inode ? inode->i_ino : 0, blocknr, block, - e4b->bd_group); - } - mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); - e4b->bd_info->bb_counters[order]++; - - /* start of the buddy */ - buddy = mb_find_buddy(e4b, order, &max); - - do { - block &= ~1UL; - if (mb_test_bit(block, buddy) || - mb_test_bit(block + 1, buddy)) - break; - - /* both the buddies are free, try to coalesce them */ - buddy2 = mb_find_buddy(e4b, order + 1, &max); - - if (!buddy2) - break; - - if (order > 0) { - /* for special purposes, we don't set - * free bits in bitmap */ - mb_set_bit(block, buddy); - mb_set_bit(block + 1, buddy); - } - e4b->bd_info->bb_counters[order]--; - e4b->bd_info->bb_counters[order]--; + /* buddy[0] == bd_bitmap is a special case, so handle + * it right away and let mb_buddy_mark_free stay free of + * zero order checks. + * Check if neighbours are to be coaleasced, + * adjust bitmap bb_counters and borders appropriately. + */ + if (first & 1) { + first += !left_is_free; + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; + } + if (!(last & 1)) { + last -= !right_is_free; + e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; + } - block = block >> 1; - order++; - e4b->bd_info->bb_counters[order]++; + if (first <= last) + mb_buddy_mark_free(e4b, first >> 1, last >> 1); - mb_clear_bit(block, buddy2); - buddy = buddy2; - } while (1); - } +done: + mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); } -static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, +static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int next = block; - int max; - int ord; + int max, order; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); - buddy = mb_find_buddy(e4b, order, &max); + buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { @@ -1324,12 +1505,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, return 0; } - /* FIXME dorp order completely ? */ - if (likely(order == 0)) { - /* find actual order */ - order = mb_find_order_for_block(e4b, block); - block = block >> order; - } + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + block = block >> order; ex->fe_len = 1 << order; ex->fe_start = block << order; @@ -1341,18 +1519,17 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, ex->fe_start += next; while (needed > ex->fe_len && - (buddy = mb_find_buddy(e4b, order, &max))) { + mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); - if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) + if (mb_test_bit(next, e4b->bd_bitmap)) break; - ord = mb_find_order_for_block(e4b, next); + order = mb_find_order_for_block(e4b, next); - order = ord; block = next >> order; ex->fe_len += 1 << order; } @@ -1385,9 +1562,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain fragments counter */ if (start != 0) - mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); + mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) - max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); + max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) @@ -1428,8 +1605,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) e4b->bd_info->bb_counters[ord]++; e4b->bd_info->bb_counters[ord]++; } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -1470,9 +1648,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); - /* on allocation we use ac to track the held semaphore */ - ac->alloc_semp = e4b->alloc_semp; - e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); @@ -1518,7 +1693,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, /* recheck chunk's availability - we don't know * when it was found (within this lock-unlock * period or not) */ - max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); + max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); if (max >= gex->fe_len) { ext4_mb_use_best_found(ac, e4b); return; @@ -1544,8 +1719,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); - BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; @@ -1610,7 +1785,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, return err; ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; @@ -1618,7 +1793,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac, } ext4_unlock_group(ac->ac_sb, group); - ext4_mb_release_desc(e4b); + ext4_mb_unload_buddy(e4b); return 0; } @@ -1631,25 +1806,33 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_super_block *es = sbi->s_es; + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) return 0; + if (grp->bb_free == 0) + return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { + ext4_mb_unload_buddy(e4b); + return 0; + } + ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; - start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + - ex.fe_start + le32_to_cpu(es->s_first_data_block); + start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + + ex.fe_start; /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; @@ -1674,7 +1857,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); } ext4_unlock_group(ac->ac_sb, group); - ext4_mb_release_desc(e4b); + ext4_mb_unload_buddy(e4b); return 0; } @@ -1732,7 +1915,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i; int free; @@ -1744,25 +1927,25 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, - EXT4_BLOCKS_PER_GROUP(sb), i); - if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { + EXT4_CLUSTERS_PER_GROUP(sb), i); + if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * we have free blocks */ - ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "%d free blocks as per " + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " "group info. But bitmap says 0", free); break; } - mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "%d free blocks as per " + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* @@ -1772,7 +1955,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, */ break; } - + ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; @@ -1784,8 +1967,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, /* * This is a special case for storages like raid5 - * we try to find stripe-aligned chunks for stripe-size requests - * XXX should do so at least for multiples of stripe size as well + * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, @@ -1793,7 +1975,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; @@ -1803,17 +1985,18 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ - first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) - + le32_to_cpu(sbi->s_es->s_first_data_block); + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - while (i < EXT4_BLOCKS_PER_GROUP(sb)) { + while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { - max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); + max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; + ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; @@ -1823,21 +2006,33 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } +/* This is now called BEFORE we load the buddy bitmap. */ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { unsigned free, fragments; - unsigned i, bits; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); - BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); free = grp->bb_free; - fragments = grp->bb_fragments; if (free == 0) return 0; + if (cr <= 2 && free < ac->ac_g_ex.fe_len) + return 0; + + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return 0; + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + int ret = ext4_mb_init_group(ac->ac_sb, group); + if (ret) + return 0; + } + + fragments = grp->bb_fragments; if (fragments == 0) return 0; @@ -1851,11 +2046,14 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ((group % flex_size) == 0)) return 0; - bits = ac->ac_sb->s_blocksize_bits + 1; - for (i = ac->ac_2order; i <= bits; i++) - if (grp->bb_counters[i] > 0) - return 1; - break; + if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || + (free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + + return 1; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) return 1; @@ -1873,91 +2071,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } -/* - * lock the group_info alloc_sem of all the groups - * belonging to the same buddy cache page. This - * make sure other parallel operation on the buddy - * cache doesn't happen whild holding the buddy cache - * lock - */ -int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) -{ - int i; - int block, pnum; - int blocks_per_page; - int groups_per_page; - ext4_group_t ngroups = ext4_get_groups_count(sb); - ext4_group_t first_group; - struct ext4_group_info *grp; - - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - first_group = pnum * blocks_per_page / 2; - - groups_per_page = blocks_per_page >> 1; - if (groups_per_page == 0) - groups_per_page = 1; - /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { - - if ((first_group + i) >= ngroups) - break; - grp = ext4_get_group_info(sb, first_group + i); - /* take all groups write allocation - * semaphore. This make sure there is - * no block allocation going on in any - * of that groups - */ - down_write_nested(&grp->alloc_sem, i); - } - return i; -} - -void ext4_mb_put_buddy_cache_lock(struct super_block *sb, - ext4_group_t group, int locked_group) -{ - int i; - int block, pnum; - int blocks_per_page; - ext4_group_t first_group; - struct ext4_group_info *grp; - - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - /* - * the buddy cache inode stores the block bitmap - * and buddy information in consecutive blocks. - * So for each group we need two blocks. - */ - block = group * 2; - pnum = block / blocks_per_page; - first_group = pnum * blocks_per_page / 2; - /* release locks on all the groups */ - for (i = 0; i < locked_group; i++) { - - grp = ext4_get_group_info(sb, first_group + i); - /* take all groups write allocation - * semaphore. This make sure there is - * no block allocation going on in any - * of that groups - */ - up_write(&grp->alloc_sem); - } - -} - static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t ngroups, group, i; int cr; int err = 0; - int bsbits; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; @@ -1966,7 +2085,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) sbi = EXT4_SB(sb); ngroups = ext4_get_groups_count(sb); /* non-extent files are limited to low blocks/groups */ - if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ngroups = sbi->s_blockfile_groups; BUG_ON(ac->ac_status == AC_STATUS_FOUND); @@ -1999,8 +2118,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_2order = i - 1; } - bsbits = ac->ac_sb->s_blocksize_bits; - /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ @@ -2026,15 +2143,16 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { - struct ext4_group_info *grp; - struct ext4_group_desc *desc; - - if (group == ngroups) + cond_resched(); + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + if (group >= ngroups) group = 0; - /* quick check to skip empty groups */ - grp = ext4_get_group_info(sb, group); - if (grp->bb_free == 0) + /* This now checks without needing the buddy page */ + if (!ext4_mb_good_group(ac, group, cr)) continue; err = ext4_mb_load_buddy(sb, group, &e4b); @@ -2042,25 +2160,28 @@ repeat: goto out; ext4_lock_group(sb, group); + + /* + * We need to check again after locking the + * block group + */ if (!ext4_mb_good_group(ac, group, cr)) { - /* someone did allocation from this group */ ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); continue; } ac->ac_groups_scanned++; - desc = ext4_get_group_desc(sb, group, NULL); - if (cr == 0) + if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && - ac->ac_g_ex.fe_len == sbi->s_stripe) + else if (cr == 1 && sbi->s_stripe && + !(ac->ac_g_ex.fe_len % sbi->s_stripe)) ext4_mb_scan_aligned(ac, &e4b); else ext4_mb_complex_scan_group(ac, &e4b); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); if (ac->ac_status != AC_STATUS_CONTINUE) break; @@ -2124,8 +2245,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct super_block *sb = seq->private; ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; - int err; + int err, buddy_loaded = 0; struct ext4_buddy e4b; + struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; ext4_grpblk_t counters[16]; @@ -2142,15 +2264,21 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) { - seq_printf(seq, "#%-5u: I/O error\n", group); - return 0; + grinfo = ext4_get_group_info(sb, group); + /* Load the group info in memory only if not already loaded. */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } + buddy_loaded = 1; } - ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); - ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); @@ -2175,12 +2303,12 @@ static const struct seq_operations ext4_mb_seq_groups_ops = { static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) { - struct super_block *sb = PDE(inode)->data; + struct super_block *sb = PDE_DATA(inode); int rc; rc = seq_open(file, &ext4_mb_seq_groups_ops); if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; m->private = sb; } return rc; @@ -2195,15 +2323,57 @@ static const struct file_operations ext4_mb_seq_groups_fops = { .release = seq_release, }; +static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) +{ + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; + + BUG_ON(!cachep); + return cachep; +} + +/* + * Allocate the top-level s_group_info array for the specified number + * of groups + */ +int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned size; + struct ext4_group_info ***new_groupinfo; + + size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + if (size <= sbi->s_group_info_size) + return 0; + + size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); + new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groupinfo) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; + } + if (sbi->s_group_info) { + memcpy(new_groupinfo, sbi->s_group_info, + sbi->s_group_info_size * sizeof(*sbi->s_group_info)); + ext4_kvfree(sbi->s_group_info); + } + sbi->s_group_info = new_groupinfo; + sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + sbi->s_group_info_size); + return 0; +} /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { - int i, len; + int i; int metalen = 0; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); /* * First check if this group is the first of a reserved block. @@ -2215,28 +2385,21 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " - "buddy group\n"); + ext4_msg(sb, KERN_ERR, "can't allocate mem " + "for a buddy group"); goto exit_meta_group_info; } sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = meta_group_info; } - /* - * calculate needed size. if change bb_counters size, - * don't forget about ext4_mb_generate_buddy() - */ - len = offsetof(typeof(**meta_group_info), - bb_counters[sb->s_blocksize_bits + 2]); - meta_group_info = sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); - meta_group_info[i] = kzalloc(len, GFP_KERNEL); + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); if (meta_group_info[i] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); + ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, @@ -2248,15 +2411,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, */ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { meta_group_info[i]->bb_free = - ext4_free_blocks_after_init(sb, group, desc); + ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = - ext4_free_blks_count(sb, desc); + ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); - meta_group_info[i]->bb_free_root.rb_node = NULL; + meta_group_info[i]->bb_free_root = RB_ROOT; + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ #ifdef DOUBLE_CHECK { @@ -2276,8 +2440,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, exit_group_info: /* If a meta_group_info table has been allocated, release it now */ - if (group % EXT4_DESC_PER_BLOCK(sb) == 0) + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + } exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ @@ -2287,59 +2453,29 @@ static int ext4_mb_init_backend(struct super_block *sb) ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - int num_meta_group_infos; - int num_meta_group_infos_max; - int array_size; + int err; struct ext4_group_desc *desc; + struct kmem_cache *cachep; - /* This is the number of blocks used by GDT */ - num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); - - /* - * This is the total number of blocks used by GDT including - * the number of reserved blocks for GDT. - * The s_group_info array is allocated with this value - * to allow a clean online resize without a complex - * manipulation of pointer. - * The drawback is the unused memory when no resize - * occurs but it's very low in terms of pages - * (see comments below) - * Need to handle this properly when META_BG resizing is allowed - */ - num_meta_group_infos_max = num_meta_group_infos + - le16_to_cpu(es->s_reserved_gdt_blocks); + err = ext4_mb_alloc_groupinfo(sb, ngroups); + if (err) + return err; - /* - * array_size is the size of s_group_info array. We round it - * to the next power of two because this approximation is done - * internally by kmalloc so we can have some more memory - * for free here (e.g. may be used for META_BG resize). - */ - array_size = 1; - while (array_size < sizeof(*sbi->s_group_info) * - num_meta_group_infos_max) - array_size = array_size << 1; - /* An 8TB filesystem with 64-bit pointers requires a 4096 byte - * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. - * So a two level scheme suffices for now. */ - sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); - if (sbi->s_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); - return -ENOMEM; - } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { - printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } + /* To avoid potentially colliding with an valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { - printk(KERN_ERR - "EXT4-fs: can't read descriptor %u\n", i); + ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) @@ -2349,18 +2485,69 @@ static int ext4_mb_init_backend(struct super_block *sb) return 0; err_freebuddy: + cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) - kfree(ext4_get_group_info(sb, i)); - i = num_meta_group_infos; + kmem_cache_free(cachep, ext4_get_group_info(sb, i)); + i = sbi->s_group_info_size; while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); return -ENOMEM; } -int ext4_mb_init(struct super_block *sb, int needs_recovery) +static void ext4_groupinfo_destroy_slabs(void) +{ + int i; + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + if (ext4_groupinfo_caches[i]) + kmem_cache_destroy(ext4_groupinfo_caches[i]); + ext4_groupinfo_caches[i] = NULL; + } +} + +static int ext4_groupinfo_create_slab(size_t size) +{ + static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); + int slab_size; + int blocksize_bits = order_base_2(size); + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep; + + if (cache_index >= NR_GRPINFO_CACHES) + return -EINVAL; + + if (unlikely(cache_index < 0)) + cache_index = 0; + + mutex_lock(&ext4_grpinfo_slab_create_mutex); + if (ext4_groupinfo_caches[cache_index]) { + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = offsetof(struct ext4_group_info, + bb_counters[blocksize_bits + 2]); + + cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], + slab_size, 0, SLAB_RECLAIM_ACCOUNT, + NULL); + + ext4_groupinfo_caches[cache_index] = cachep; + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { + printk(KERN_EMERG + "EXT4-fs: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + + return 0; +} + +int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; @@ -2372,16 +2559,21 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { - return -ENOMEM; + ret = -ENOMEM; + goto out; } i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - kfree(sbi->s_mb_offsets); - return -ENOMEM; + ret = -ENOMEM; + goto out; } + ret = ext4_groupinfo_create_slab(sb->s_blocksize); + if (ret < 0) + goto out; + /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; sbi->s_mb_offsets[0] = 0; @@ -2397,14 +2589,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i++; } while (i <= sb->s_blocksize_bits + 1); - /* init file for buddy data */ - ret = ext4_mb_init_backend(sb); - if (ret != 0) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return ret; - } - spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); @@ -2413,13 +2597,37 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file + * systems, this is probably too big (i.e, if the cluster size + * is 1 megabyte, then group preallocation size becomes half a + * gigabyte!). As a default, we will keep a two megabyte + * group pralloc size for cluster sizes up to 64k, and after + * that, we will force a minimum group preallocation size of + * 32 clusters. This translates to 8 megs when the cluster + * size is 256k, and 32 megs when the cluster size is 1 meg, + * which seems reasonable as a default. + */ + sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> + sbi->s_cluster_bits, 32); + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. + */ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, sbi->s_stripe); + } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return -ENOMEM; + ret = -ENOMEM; + goto out; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; @@ -2430,13 +2638,26 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) + goto out_free_locality_groups; + if (sbi->s_proc) proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); - if (sbi->s_journal) - sbi->s_journal->j_commit_callback = release_blocks_on_commit; return 0; + +out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; +out: + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); + sbi->s_mb_maxs = NULL; + return ret; } /* need to called with the ext4 group lock held */ @@ -2464,6 +2685,10 @@ int ext4_mb_release(struct super_block *sb) int num_meta_group_infos; struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { @@ -2474,199 +2699,173 @@ int ext4_mb_release(struct super_block *sb) ext4_lock_group(sb, i); ext4_mb_cleanup_pa(grinfo); ext4_unlock_group(sb, i); - kfree(grinfo); + kmem_cache_free(cachep, grinfo); } num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); if (sbi->s_buddy_cache) iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { - printk(KERN_INFO - "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); - printk(KERN_INFO - "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " - "%u 2^N hits, %u breaks, %u lost\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); - printk(KERN_INFO - "EXT4-fs: mballoc: %lu generated and it took %Lu\n", - sbi->s_mb_buddies_generated++, + ext4_msg(sb, KERN_INFO, + "mballoc: %lu generated and it took %Lu", + sbi->s_mb_buddies_generated, sbi->s_mb_generation_time); - printk(KERN_INFO - "EXT4-fs: mballoc: %u preallocated, %u discarded\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } free_percpu(sbi->s_locality_groups); - if (sbi->s_proc) - remove_proc_entry("mb_groups", sbi->s_proc); return 0; } +static inline int ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t cluster, int count) +{ + ext4_fsblk_t discard_block; + + discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + + ext4_group_first_block_no(sb, block_group)); + count = EXT4_C2B(EXT4_SB(sb), count); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); +} + /* * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. */ -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc) { - struct super_block *sb = journal->j_private; + struct ext4_free_data *entry = (struct ext4_free_data *)jce; struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; - struct ext4_free_data *entry; - struct list_head *l, *ltmp; - - list_for_each_safe(l, ltmp, &txn->t_private_list) { - entry = list_entry(l, struct ext4_free_data, list); - - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", - entry->count, entry->group, entry); - - err = ext4_mb_load_buddy(sb, entry->group, &e4b); - /* we expect to find existing buddy because it's pinned */ - BUG_ON(err != 0); - - db = e4b.bd_info; - /* there are blocks to put in buddy to make them really free */ - count += entry->count; - count2++; - ext4_lock_group(sb, entry->group); - /* Take it out of per group rb tree */ - rb_erase(&entry->node, &(db->bb_free_root)); - mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); - - if (!db->bb_free_root.rb_node) { - /* No more items in the per group rb tree - * balance refcounts from ext4_mb_free_metadata() - */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - } - ext4_unlock_group(sb, entry->group); - if (test_opt(sb, DISCARD)) { - ext4_fsblk_t discard_block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - - discard_block = (ext4_fsblk_t)entry->group * - EXT4_BLOCKS_PER_GROUP(sb) - + entry->start_blk - + le32_to_cpu(es->s_first_data_block); - trace_ext4_discard_blocks(sb, - (unsigned long long)discard_block, - entry->count); - sb_issue_discard(sb, discard_block, entry->count); - } - kmem_cache_free(ext4_free_ext_cachep, entry); - ext4_mb_release_desc(&e4b); - } - mb_debug(1, "freed %u blocks in %u structures\n", count, count2); -} + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); -#ifdef CONFIG_EXT4_DEBUG -u8 mb_enable_debug __read_mostly; + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } -static struct dentry *debugfs_dir; -static struct dentry *debugfs_debug; + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); -static void __init ext4_create_debugfs_entry(void) -{ - debugfs_dir = debugfs_create_dir("ext4", NULL); - if (debugfs_dir) - debugfs_debug = debugfs_create_u8("mballoc-debug", - S_IRUGO | S_IWUSR, - debugfs_dir, - &mb_enable_debug); -} -static void ext4_remove_debugfs_entry(void) -{ - debugfs_remove(debugfs_debug); - debugfs_remove(debugfs_dir); -} + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + count2++; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); -#else + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); -static void __init ext4_create_debugfs_entry(void) -{ -} + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); + } + ext4_unlock_group(sb, entry->efd_group); + kmem_cache_free(ext4_free_data_cachep, entry); + ext4_mb_unload_buddy(&e4b); -static void ext4_remove_debugfs_entry(void) -{ + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } -#endif - -int __init init_ext4_mballoc(void) +int __init ext4_init_mballoc(void) { - ext4_pspace_cachep = - kmem_cache_create("ext4_prealloc_space", - sizeof(struct ext4_prealloc_space), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, + SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) return -ENOMEM; - ext4_ac_cachep = - kmem_cache_create("ext4_alloc_context", - sizeof(struct ext4_allocation_context), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, + SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } - ext4_free_ext_cachep = - kmem_cache_create("ext4_free_block_extents", - sizeof(struct ext4_free_data), - 0, SLAB_RECLAIM_ACCOUNT, NULL); - if (ext4_free_ext_cachep == NULL) { + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; } - ext4_create_debugfs_entry(); return 0; } -void exit_ext4_mballoc(void) +void ext4_exit_mballoc(void) { - /* + /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. */ rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); - kmem_cache_destroy(ext4_free_ext_cachep); - ext4_remove_debugfs_entry(); + kmem_cache_destroy(ext4_free_data_cachep); + ext4_groupinfo_destroy_slabs(); } /* - * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps + * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned int reserv_blks) + handle_t *handle, unsigned int reserv_clstrs) { struct buffer_head *bitmap_bh = NULL; - struct ext4_super_block *es; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; @@ -2679,14 +2878,13 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sb = ac->ac_sb; sbi = EXT4_SB(sb); - es = sbi->s_es; - err = -EIO; bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (!bitmap_bh) goto out_err; + BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto out_err; @@ -2697,28 +2895,26 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, goto out_err; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, - ext4_free_blks_count(sb, gdp)); + ext4_free_group_clusters(sb, gdp)); + BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) goto out_err; - block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + ac->ac_b_ex.fe_start - + le32_to_cpu(es->s_first_data_block); + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - len = ac->ac_b_ex.fe_len; + len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_data_block_valid(sbi, block, len)) { - ext4_error(sb, __func__, - "Allocating blocks %llu-%llu which overlap " - "fs metadata\n", block, block+len); + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " + "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. */ ext4_lock_group(sb, ac->ac_b_ex.fe_group); - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) @@ -2736,31 +2932,34 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_blks_set(sb, gdp, - ext4_free_blocks_after_init(sb, - ac->ac_b_ex.fe_group, gdp)); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); } - len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; - ext4_free_blks_set(sb, gdp, len); - gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); + len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_group_clusters_set(sb, gdp, len); + ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_blocks); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -2769,15 +2968,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: - sb->s_dirt = 1; brelse(bitmap_bh); return err; } /* * here we normalize request for locality group - * Group request are normalized to s_strip size if we set the same via mount - * option. If not we set it to s_mb_group_prealloc which can be configured via + * Group request are normalized to s_mb_group_prealloc, which goes to + * s_strip if we set the same via mount option. + * s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? @@ -2788,10 +2987,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); - if (EXT4_SB(sb)->s_stripe) - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; - else - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } @@ -2804,10 +3000,12 @@ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; - loff_t size, orig_size, start_off; - ext4_lblk_t start, orig_start; + loff_t size, start_off; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *pa; @@ -2834,10 +3032,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); + orig_size = size; /* max size of free chunks */ max = 2 << bsbits; @@ -2879,8 +3078,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; size = ac->ac_o_ex.fe_len << bsbits; } - orig_size = size = size >> bsbits; - orig_start = start = start_off >> bsbits; + size = size >> bsbits; + start = start_off >> bsbits; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { @@ -2905,7 +3104,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, continue; } - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); /* PA must not overlap original request */ BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || @@ -2935,9 +3135,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { ext4_lblk_t pa_end; + spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); } spin_unlock(&pa->pa_lock); @@ -2946,9 +3148,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical) { - printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", - (unsigned long) start, (unsigned long) size, - (unsigned long) ac->ac_o_ex.fe_logical); + ext4_msg(ac->ac_sb, KERN_ERR, + "start %lu, size %lu, fe_logical %lu", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); @@ -2959,7 +3162,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; - ac->ac_g_ex.fe_len = size; + ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size))) { @@ -2988,7 +3191,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); - if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && @@ -3013,13 +3216,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; - int len; - - if (pa && pa->pa_type == MB_INODE_PA) { - len = ac->ac_b_ex.fe_len; - pa->pa_free += len; - } + if (pa && pa->pa_type == MB_INODE_PA) + pa->pa_free += ac->ac_b_ex.fe_len; } /* @@ -3028,14 +3227,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); - end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); - len = end - start; + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), + start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); + len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; @@ -3043,7 +3244,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); - BUG_ON(start + len > pa->pa_pstart + pa->pa_len); + BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); pa->pa_free -= len; @@ -3094,7 +3295,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, cur_distance = abs(goal_block - cpa->pa_pstart); new_distance = abs(goal_block - pa->pa_pstart); - if (cur_distance < new_distance) + if (cur_distance <= new_distance) return cpa; /* drop the previous reference */ @@ -3109,6 +3310,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, static noinline_for_stack int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; @@ -3126,12 +3328,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* all fields in this condition don't change, * so we can skip locking for them */ if (ac->ac_o_ex.fe_logical < pa->pa_lstart || - ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + ac->ac_o_ex.fe_logical >= (pa->pa_lstart + + EXT4_C2B(sbi, pa->pa_len))) continue; /* non-extent files can't have physical blocks past 2^32 */ - if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && - pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) continue; /* found preallocated blocks, use them */ @@ -3161,9 +3365,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; - goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + - ac->ac_g_ex.fe_start + - le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. @@ -3208,8 +3410,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, n = rb_first(&(grp->bb_free_root)); while (n) { - entry = rb_entry(n, struct ext4_free_data, node); - mb_set_bits(bitmap, entry->start_blk, entry->count); + entry = rb_entry(n, struct ext4_free_data, efd_node); + ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } return; @@ -3230,7 +3432,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; - int count = 0; int len; /* all form of preallocation discards first load group, @@ -3251,9 +3452,8 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(bitmap, start, len); + ext4_set_bits(bitmap, start, len); preallocated += len; - count++; } mb_debug(1, "prellocated %u for group %u\n", preallocated, group); } @@ -3262,6 +3462,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } @@ -3275,11 +3478,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, ext4_group_t grp; ext4_fsblk_t grp_blk; - if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) - return; - /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; @@ -3289,14 +3494,14 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; - /* + /* * If doing group-based preallocation, pa_pstart may be in the * next group when pa is used up */ if (pa->pa_type == MB_GROUP_PA) grp_blk--; - ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); + grp = ext4_get_group_number(sb, grp_blk); /* * possible race: @@ -3330,6 +3535,7 @@ static noinline_for_stack int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; @@ -3361,16 +3567,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; /* also, we should cover whole original request */ - wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; + wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); /* the smallest one defines real window */ win = min(winl, wins); - offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; + offs = ac->ac_o_ex.fe_logical % + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (offs && offs < win) win = offs; - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - + EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } @@ -3395,7 +3603,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); - atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); @@ -3496,8 +3704,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) */ static noinline_for_stack int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, - struct ext4_prealloc_space *pa, - struct ext4_allocation_context *ac) + struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3506,53 +3713,39 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; - sector_t start; int err = 0; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); - grp_blk_start = pa->pa_pstart - bit; + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = pa->pa_inode; - } - while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + - le32_to_cpu(sbi->s_es->s_first_data_block); mb_debug(1, " free preallocated %u/%u in group %u\n", - (unsigned) start, (unsigned) next - bit, - (unsigned) group); + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); free += next - bit; - if (ac) { - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = next - bit; - ac->ac_b_ex.fe_logical = 0; - trace_ext4_mballoc_discard(ac); - } - - trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } if (free != pa->pa_free) { - printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", - pa, (unsigned long) pa->pa_lstart, - (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); - ext4_grp_locked_error(sb, group, - __func__, "free %u, pa_free %u", + ext4_msg(e4b->bd_sb, KERN_CRIT, + "pa %p: logic %lu, phys. %lu, len %lu", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained @@ -3566,29 +3759,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, static noinline_for_stack int ext4_mb_release_group_pa(struct ext4_buddy *e4b, - struct ext4_prealloc_space *pa, - struct ext4_allocation_context *ac) + struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(ac, pa); + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); - - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = NULL; - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = pa->pa_len; - ac->ac_b_ex.fe_logical = 0; - trace_ext4_mballoc_discard(ac); - } + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); return 0; } @@ -3609,7 +3792,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; struct list_head list; struct ext4_buddy e4b; int err; @@ -3623,26 +3805,21 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - ext4_error(sb, __func__, "Error in reading block " - "bitmap for %u", group); + ext4_error(sb, "Error reading block bitmap for %u", group); return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", group); put_bh(bitmap_bh); return 0; } if (needed == 0) - needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; INIT_LIST_HEAD(&list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) - ac->ac_sb = sb; repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, @@ -3674,11 +3851,7 @@ repeat: if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); + cond_resched(); goto repeat; } @@ -3697,9 +3870,9 @@ repeat: spin_unlock(pa->pa_obj_lock); if (pa->pa_type == MB_GROUP_PA) - ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_mb_release_group_pa(&e4b, pa); else - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); @@ -3707,9 +3880,7 @@ repeat: out: ext4_unlock_group(sb, group); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); return free; } @@ -3729,7 +3900,6 @@ void ext4_discard_preallocations(struct inode *inode) struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; @@ -3745,11 +3915,6 @@ void ext4_discard_preallocations(struct inode *inode) INIT_LIST_HEAD(&list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = inode; - } repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); @@ -3763,7 +3928,8 @@ repeat: * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); - printk(KERN_ERR "uh-oh! used pa while discarding\n"); + ext4_msg(sb, KERN_ERR, + "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; @@ -3800,63 +3966,53 @@ repeat: list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", + group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - ext4_error(sb, __func__, "Error in reading block " - "bitmap for %u", group); - ext4_mb_release_desc(&e4b); + ext4_error(sb, "Error reading block bitmap for %u", + group); + ext4_mb_unload_buddy(&e4b); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); } -/* - * finds all preallocated spaces and return blocks being freed to them - * if preallocated space becomes full (no block is used from the space) - * then the function frees space in buddy - * XXX: at the moment, truncate (which is the only way to free blocks) - * discards all preallocations - */ -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, - sector_t block, int count) -{ - BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); -} #ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; - printk(KERN_ERR "EXT4-fs: Can't allocate:" - " Allocation context details:\n"); - printk(KERN_ERR "EXT4-fs: status %d flags %d\n", + if (!ext4_mballoc_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + return; + + ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" + " Allocation context details:"); + ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", ac->ac_status, ac->ac_flags); - printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " - "best %lu/%lu/%lu@%lu cr %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, @@ -3870,9 +4026,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, - ac->ac_found); - printk(KERN_ERR "EXT4-fs: groups: \n"); + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); @@ -3925,7 +4080,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; @@ -3936,9 +4091,14 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) return; } + if (sbi->s_mb_group_prealloc <= 0) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } + /* don't use group allocation for large files */ size = max(size, isize); - if (size >= sbi->s_mb_stream_request) { + if (size > sbi->s_mb_stream_request) { ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; } @@ -3974,8 +4134,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, len = ar->len; /* just a dirty hack to filter too big requests */ - if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) - len = EXT4_BLOCKS_PER_GROUP(sb) - 10; + if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) + len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; @@ -3985,19 +4145,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - memset(ac, 0, sizeof(struct ext4_allocation_context)); - ac->ac_b_ex.fe_logical = ar->logical; + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; - ac->ac_o_ex.fe_logical = ar->logical; + ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; - ac->ac_g_ex.fe_logical = ar->logical; - ac->ac_g_ex.fe_group = group; - ac->ac_g_ex.fe_start = block; - ac->ac_g_ex.fe_len = len; + ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; /* we have to define context: we'll we work with a file or @@ -4024,14 +4180,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_buddy e4b; struct list_head discard_list; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; mb_debug(1, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) - ac->ac_sb = sb; spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], @@ -4075,23 +4227,21 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + group = ext4_get_group_number(sb, pa->pa_pstart); if (ext4_mb_load_buddy(sb, group, &e4b)) { - ext4_error(sb, __func__, "Error in loading buddy " - "information for %u", group); + ext4_error(sb, "Error loading buddy information for %u", + group); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); - ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); } /* @@ -4115,7 +4265,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ - rcu_read_lock(); + spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_inode_list) { spin_lock(&tmp_pa->pa_lock); @@ -4139,12 +4289,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) if (!added) list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list[order]); - rcu_read_unlock(); + spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) { ext4_mb_discard_lg_preallocations(sb, lg, - order, lg_prealloc_count); + order, lg_prealloc_count); return; } return ; @@ -4155,27 +4305,25 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); - pa->pa_pstart += ac->ac_b_ex.fe_len; - pa->pa_lstart += ac->ac_b_ex.fe_len; + pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); } } - if (ac->alloc_semp) - up_read(ac->alloc_semp); if (pa) { /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding - * doesn't grow big. We need to release - * alloc_semp before calling ext4_mb_add_n_trim() + * doesn't grow big. */ if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { spin_lock(pa->pa_obj_lock); @@ -4217,7 +4365,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, - struct ext4_allocation_request *ar, int *errp) + struct ext4_allocation_request *ar, int *errp) { int freed; struct ext4_allocation_context *ac = NULL; @@ -4225,57 +4373,72 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; - unsigned int reserv_blks = 0; + unsigned int reserv_clstrs = 0; + might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); trace_ext4_request_blocks(ar); + /* Allow to use superuser reservation for quota file */ + if (IS_NOQUOTA(ar->inode)) + ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; + /* * For delayed allocation, we could skip the ENOSPC and * EDQUOT check, as blocks and quotas have been already * reserved when data being copied into pagecache. */ - if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) ar->flags |= EXT4_MB_DELALLOC_RESERVED; else { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. */ - while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { + while (ar->len && + ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { + /* let others to free the space */ - yield(); + cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { *errp = -ENOSPC; return 0; } - reserv_blks = ar->len; - while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) { - ar->flags |= EXT4_MB_HINT_NOPREALLOC; - ar->len--; + reserv_clstrs = ar->len; + if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { + dquot_alloc_block_nofail(ar->inode, + EXT4_C2B(sbi, ar->len)); + } else { + while (ar->len && + dquot_alloc_block(ar->inode, + EXT4_C2B(sbi, ar->len))) { + + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } } inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; - goto out3; + goto out; } } - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; - goto out1; + goto out; } *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; - goto out2; + goto out; } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; @@ -4284,18 +4447,25 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_mb_normalize_request(ac, ar); repeat: /* allocate space in core */ - ext4_mb_regular_allocator(ac); + *errp = ext4_mb_regular_allocator(ac); + if (*errp) + goto discard_and_exit; /* as we've just preallocated more space than - * user requested orinally, we store allocated + * user requested originally, we store allocated * space in a special descriptor */ if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) - ext4_mb_new_preallocation(ac); + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + *errp = ext4_mb_new_preallocation(ac); + if (*errp) { + discard_and_exit: + ext4_discard_allocated_blocks(ac); + goto errout; + } } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); - if (*errp == -EAGAIN) { + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); + if (*errp == -EAGAIN) { /* * drop the reference that we took * in ext4_mb_use_best_found @@ -4308,9 +4478,7 @@ repeat: goto repeat; } else if (*errp) { ext4_discard_allocated_blocks(ac); - ac->ac_b_ex.fe_len = 0; - ar->len = 0; - ext4_mb_show_ac(ac); + goto errout; } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; @@ -4320,24 +4488,26 @@ repeat: if (freed) goto repeat; *errp = -ENOSPC; + } + +errout: + if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } - ext4_mb_release_context(ac); - -out2: - kmem_cache_free(ext4_ac_cachep, ac); -out1: +out: + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) - vfs_dq_free_block(ar->inode, inquota - ar->len); -out3: + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { - if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) + if (!ext4_test_inode_state(ar->inode, + EXT4_STATE_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, - reserv_blks); + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); } trace_ext4_allocate_blocks(ar, (unsigned long long)block); @@ -4353,9 +4523,9 @@ out3: static int can_merge(struct ext4_free_data *entry1, struct ext4_free_data *entry2) { - if ((entry1->t_tid == entry2->t_tid) && - (entry1->group == entry2->group) && - ((entry1->start_blk + entry1->count) == entry2->start_blk)) + if ((entry1->efd_tid == entry2->efd_tid) && + (entry1->efd_group == entry2->efd_group) && + ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) return 1; return 0; } @@ -4364,7 +4534,8 @@ static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { - ext4_grpblk_t block; + ext4_group_t group = e4b->bd_group; + ext4_grpblk_t cluster; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; @@ -4376,8 +4547,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - new_node = &new_entry->node; - block = new_entry->start_blk; + new_node = &new_entry->efd_node; + cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. We need to @@ -4390,15 +4561,16 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } while (*n) { parent = *n; - entry = rb_entry(parent, struct ext4_free_data, node); - if (block < entry->start_blk) + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; - else if (block >= (entry->start_blk + entry->count)) + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { - ext4_grp_locked_error(sb, e4b->bd_group, __func__, - "Double free of blocks %d (%d %d)", - block, entry->start_blk, entry->count); + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, cluster), + "Block already on to-be-freed list"); return 0; } } @@ -4409,34 +4581,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, /* Now try to see the extent can be merged to left and right */ node = rb_prev(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); - if (can_merge(entry, new_entry)) { - new_entry->start_blk = entry->start_blk; - new_entry->count += entry->count; + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(entry, new_entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + kmem_cache_free(ext4_free_data_cachep, entry); } } node = rb_next(new_node); if (node) { - entry = rb_entry(node, struct ext4_free_data, node); - if (can_merge(new_entry, entry)) { - new_entry->count += entry->count; + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(new_entry, entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { + new_entry->efd_count += entry->efd_count; rb_erase(node, &(db->bb_free_root)); - spin_lock(&sbi->s_md_lock); - list_del(&entry->list); - spin_unlock(&sbi->s_md_lock); - kmem_cache_free(ext4_free_ext_cachep, entry); + kmem_cache_free(ext4_free_data_cachep, entry); } } /* Add the extent to transaction's private list */ - spin_lock(&sbi->s_md_lock); - list_add(&new_entry->list, &handle->h_transaction->t_private_list); - spin_unlock(&sbi->s_md_lock); + ext4_journal_callback_add(handle, ext4_free_data_callback, + &new_entry->efd_jce); return 0; } @@ -4446,7 +4613,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * @inode: inode * @block: start physical block to free * @count: number of blocks to count - * @metadata: Are these metadata blocks + * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, @@ -4454,19 +4621,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; - struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; - struct ext4_super_block *es; - unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; + struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_buddy e4b; + unsigned int count_clusters; int err = 0; int ret; + might_sleep(); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); @@ -4475,11 +4642,10 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } sbi = EXT4_SB(sb); - es = EXT4_SB(sb)->s_es; - if (!ext4_data_block_valid(sbi, block, count)) { - ext4_error(sb, __func__, - "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_data_block_valid(sbi, block, count)) { + ext4_error(sb, "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); goto error_return; } @@ -4493,15 +4659,18 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, BUG_ON(bh && (count > 1)); for (i = 0; i < count; i++) { + cond_resched(); if (!bh) tbh = sb_find_get_block(inode->i_sb, block + i); - ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + if (!tbh) + continue; + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, tbh, block + i); } } - /* + /* * We need to make sure we don't reuse the freed block until * after the transaction is committed, which we can do by * treating the block as metadata, below. We make an @@ -4511,24 +4680,56 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!ext4_should_writeback_data(inode)) flags |= EXT4_FREE_BLOCKS_METADATA; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_inode = inode; - ac->ac_sb = sb; + /* + * If the extent to be freed does not begin on a cluster + * boundary, we need to deal with partial clusters at the + * beginning and end of the extent. Normally we will free + * blocks at the beginning or the end unless we are explicitly + * requested to avoid doing so. + */ + overflow = EXT4_PBLK_COFF(sbi, block); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { + overflow = sbi->s_cluster_ratio - overflow; + block += overflow; + if (count > overflow) + count -= overflow; + else + return; + } else { + block -= overflow; + count += overflow; + } + } + overflow = EXT4_LBLK_COFF(sbi, count); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { + if (count > overflow) + count -= overflow; + else + return; + } else + count += sbi->s_cluster_ratio - overflow; } do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( + ext4_get_group_info(sb, block_group)))) + return; + /* * Check to see if we are freeing blocks across a group * boundary. */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); + if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = EXT4_C2B(sbi, bit) + count - + EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } + count_clusters = EXT4_NUM_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) { err = -EIO; @@ -4543,12 +4744,11 @@ do_more: if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || + EXT4_SB(sb)->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, __func__, - "Freeing blocks in system zone - " + ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_return; @@ -4571,16 +4771,11 @@ do_more: #ifdef AGGRESSIVE_CHECK { int i; - for (i = 0; i < count; i++) + for (i = 0; i < count_clusters; i++) BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - if (ac) { - ac->ac_b_ex.fe_group = block_group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = count; - trace_ext4_mballoc_free(ac); - } + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) @@ -4592,40 +4787,74 @@ do_more: * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed */ - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); - new_entry->start_blk = bit; - new_entry->group = block_group; - new_entry->count = count; - new_entry->t_tid = handle->h_transaction->t_tid; + retry: + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); + if (!new_entry) { + /* + * We use a retry loop because + * ext4_free_blocks() is not allowed to fail. + */ + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { /* need to update group_info->bb_free and bitmap * with group lock held. generate_buddy look at * them with group lock_held */ + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, block_group, bit, count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%lu failed" + " with %d", block_group, bit, count, + err); + } else + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); + ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count); - mb_free_blocks(inode, &e4b, bit, count); - ext4_mb_return_to_preallocation(inode, &e4b, block, count); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + mb_free_blocks(inode, &e4b, bit, count_clusters); } - ret = ext4_free_blks_count(sb, gdp) + count; - ext4_free_blks_set(sb, gdp, ret); - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); + ret = ext4_free_group_clusters(sb, gdp) + count_clusters; + ext4_free_group_clusters_set(sb, gdp, ret); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeblocks_counter, count); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } - ext4_mb_release_desc(&e4b); + if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, + count_clusters); + spin_lock(&ei->i_block_reservation_lock); + if (flags & EXT4_FREE_BLOCKS_METADATA) + ei->i_reserved_meta_blocks += count_clusters; + else + ei->i_reserved_data_blocks += count_clusters; + spin_unlock(&ei->i_block_reservation_lock); + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_reclaim_block(inode, + EXT4_C2B(sbi, count_clusters)); + } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); - freed += count; + ext4_mb_unload_buddy(&e4b); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -4643,13 +4872,365 @@ do_more: put_bh(bitmap_bh); goto do_more; } - sb->s_dirt = 1; error_return: - if (freed) - vfs_dq_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); return; } + +/** + * ext4_group_add_blocks() -- Add given blocks to an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physical block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap and buddy. + */ +int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_buddy e4b; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + if (count == 0) + return 0; + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_warning(sb, "too much blocks added to group %u\n", + block_group); + err = -EINVAL; + goto error_return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; + goto error_return; + } + + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) { + err = -EIO; + goto error_return; + } + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + err = -EINVAL; + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + /* + * need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(NULL, &e4b, bit, count); + blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, blk_free_count); + ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_NUM_B2C(sbi, blocks_freed)); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); + } + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return err; +} + +/** + * ext4_trim_extent -- function to TRIM one single free extent in the group + * @sb: super block for the file system + * @start: starting block of the free extent in the alloc. group + * @count: number of blocks to TRIM + * @group: alloc. group we are working with + * @e4b: ext4 buddy for the group + * + * Trim "count" blocks starting at "start" in the "group". To assure that no + * one will allocate those blocks, mark it as used in buddy bitmap. This must + * be called with under the group lock. + */ +static int ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) +{ + struct ext4_free_extent ex; + int ret = 0; + + trace_ext4_trim_extent(sb, group, start, count); + + assert_spin_locked(ext4_group_lock_ptr(sb, group)); + + ex.fe_start = start; + ex.fe_group = group; + ex.fe_len = count; + + /* + * Mark blocks used, so no one can reuse them while + * being trimmed. + */ + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, group); + ret = ext4_issue_discard(sb, group, start, count); + ext4_lock_group(sb, group); + mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; +} + +/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: group to be trimmed + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through group's buddy bitmap searching for free + * extents. When the free block is found, ext4_trim_extent is called to TRIM + * the extent. + * + * + * ext4_trim_all_free walks through group's block bitmap searching for free + * extents. When the free extent is found, mark it as used in group buddy + * bitmap. Then issue a TRIM command on this extent and free the extent in + * the group buddy bitmap. This is done until whole group is scanned. + */ +static ext4_grpblk_t +ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t start, ext4_grpblk_t max, + ext4_grpblk_t minblocks) +{ + void *bitmap; + ext4_grpblk_t next, count = 0, free_count = 0; + struct ext4_buddy e4b; + int ret = 0; + + trace_ext4_trim_all_free(sb, group, start, max); + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_error(sb, "Error in loading buddy " + "information for %u", group); + return ret; + } + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); + if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && + minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) + goto out; + + start = (e4b.bd_info->bb_first_free > start) ? + e4b.bd_info->bb_first_free : start; + + while (start <= max) { + start = mb_find_next_zero_bit(bitmap, max + 1, start); + if (start > max) + break; + next = mb_find_next_bit(bitmap, max + 1, start); + + if ((next - start) >= minblocks) { + ret = ext4_trim_extent(sb, start, + next - start, group, &e4b); + if (ret && ret != -EOPNOTSUPP) + break; + ret = 0; + count += next - start; + } + free_count += next - start; + start = next + 1; + + if (fatal_signal_pending(current)) { + count = -ERESTARTSYS; + break; + } + + if (need_resched()) { + ext4_unlock_group(sb, group); + cond_resched(); + ext4_lock_group(sb, group); + } + + if ((e4b.bd_info->bb_free - free_count) < minblocks) + break; + } + + if (!ret) { + ret = count; + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } +out: + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + + ext4_debug("trimmed %d blocks in the group %d\n", + count, group); + + return ret; +} + +/** + * ext4_trim_fs() -- trim ioctl handle function + * @sb: superblock for filesystem + * @range: fstrim_range structure + * + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + * ext4_trim_fs goes through all allocation groups containing Bytes from + * start to start+len. For each such a group ext4_trim_all_free function + * is invoked to trim all free space. + */ +int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ext4_group_info *grp; + ext4_group_t group, first_group, last_group; + ext4_grpblk_t cnt = 0, first_cluster, last_cluster; + uint64_t start, end, minlen, trimmed = 0; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + range->minlen >> sb->s_blocksize_bits); + + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) + return -EINVAL; + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) + goto out; + if (start < first_data_blk) + start = first_data_blk; + + /* Determine first and last group to examine based on start and end */ + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, + &first_group, &first_cluster); + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, + &last_group, &last_cluster); + + /* end now represents the last cluster to discard in this group */ + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + + for (group = first_group; group <= last_group; group++) { + grp = ext4_get_group_info(sb, group); + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group); + if (ret) + break; + } + + /* + * For all the groups except the last one, last cluster will + * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_cluster is + * already computed earlier by ext4_get_group_no_and_offset() + */ + if (group == last_group) + end = last_cluster; + + if (grp->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, group, first_cluster, + end, minlen); + if (cnt < 0) { + ret = cnt; + break; + } + trimmed += cnt; + } + + /* + * For every group except the first one, we are sure + * that the first cluster to discard will be cluster #0. + */ + first_cluster = 0; + } + + if (!ret) + atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + +out: + range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; + return ret; +} |
