aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4/mballoc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r--fs/ext4/mballoc.c1992
1 files changed, 1178 insertions, 814 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5b4d4e3a4d5..2dcb936be90 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,11 +21,20 @@
* mballoc.c contains the multiblocks allocation routines
*/
+#include "ext4_jbd2.h"
#include "mballoc.h"
-#include <linux/debugfs.h>
+#include <linux/log2.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>
+#ifdef CONFIG_EXT4_DEBUG
+ushort ext4_mballoc_debug __read_mostly;
+
+module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
+MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
+#endif
+
/*
* MUSTDO:
* - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -70,13 +79,13 @@
*
* pa_lstart -> the logical start block for this prealloc space
* pa_pstart -> the physical start block for this prealloc space
- * pa_len -> length for this prealloc space
- * pa_free -> free space available in this prealloc space
+ * pa_len -> length for this prealloc space (in clusters)
+ * pa_free -> free space available in this prealloc space (in clusters)
*
* The inode preallocation space is used looking at the _logical_ start
* block. If only the logical file block falls within the range of prealloc
- * space we will consume the particular prealloc space. This make sure that
- * that the we have contiguous physical blocks representing the file blocks
+ * space we will consume the particular prealloc space. This makes sure that
+ * we have contiguous physical blocks representing the file blocks
*
* The important thing to be noted in case of inode prealloc space is that
* we don't modify the values associated to inode prealloc space except
@@ -84,7 +93,7 @@
*
* If we are not able to find blocks in the inode prealloc space and if we
* have the group allocation flag set then we look at the locality group
- * prealloc space. These are per CPU prealloc list repreasented as
+ * prealloc space. These are per CPU prealloc list represented as
*
* ext4_sb_info.s_locality_groups[smp_processor_id()]
*
@@ -92,7 +101,7 @@
* between CPUs. It is possible to get scheduled at this point.
*
* The locality group prealloc space is used looking at whether we have
- * enough free space (pa_free) withing the prealloc space.
+ * enough free space (pa_free) within the prealloc space.
*
* If we can't allocate blocks via inode prealloc or/and locality group
* prealloc then we look at the buddy cache. The buddy cache is represented
@@ -126,14 +135,16 @@
* list. In case of inode preallocation we follow a list of heuristics
* based on file size. This can be found in ext4_mb_normalize_request. If
* we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
+ * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
+ * dependent on the cluster size; for non-bigalloc file systems, it is
* 512 blocks. This can be tuned via
- * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in
+ * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
* stripe=<value> option the group prealloc request is normalized to the
- * stripe value (sbi->s_stripe)
+ * the smallest multiple of the stripe value (sbi->s_stripe) which is
+ * greater than the default mb_group_prealloc.
*
- * The regular allocator(using the buddy cache) supports few tunables.
+ * The regular allocator (using the buddy cache) supports a few tunables.
*
* /sys/fs/ext4/<partition>/mb_min_to_scan
* /sys/fs/ext4/<partition>/mb_max_to_scan
@@ -152,7 +163,7 @@
* best extent in the found extents. Searching for the blocks starts with
* the group specified as the goal value in allocation context via
* ac_g_ex. Each group is first checked based on the criteria whether it
- * can used for allocation. ext4_mb_good_group explains how the groups are
+ * can be used for allocation. ext4_mb_good_group explains how the groups are
* checked.
*
* Both the prealloc space are getting populated as above. So for the first
@@ -337,20 +348,26 @@
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
+static struct kmem_cache *ext4_free_data_cachep;
/* We create slab caches for groupinfo data structures based on the
* superblock block size. There will be one per mounted filesystem for
* each unique s_blocksize_bits */
-#define NR_GRPINFO_CACHES \
- (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+#define NR_GRPINFO_CACHES 8
static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+ "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
+ "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
+ "ext4_groupinfo_64k", "ext4_groupinfo_128k"
+};
+
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce, int rc);
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -388,6 +405,12 @@ static inline void mb_clear_bit(int bit, void *addr)
ext4_clear_bit(bit, addr);
}
+static inline int mb_test_and_clear_bit(int bit, void *addr)
+{
+ addr = mb_correct_addr_and_bit(&bit, addr);
+ return ext4_test_and_clear_bit(bit, addr);
+}
+
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
int fix = 0, ret, tmpmax;
@@ -418,7 +441,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
{
char *bb;
- BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+ BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(max == NULL);
if (order > e4b->bd_blkbits + 1) {
@@ -427,11 +450,12 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
}
/* at order 0 we see each particular block */
- *max = 1 << (e4b->bd_blkbits + 3);
- if (order == 0)
- return EXT4_MB_BITMAP(e4b);
+ if (order == 0) {
+ *max = 1 << (e4b->bd_blkbits + 3);
+ return e4b->bd_bitmap;
+ }
- bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
+ bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
return bb;
@@ -452,7 +476,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += first + i;
+ blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
@@ -486,10 +510,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
b2 = (unsigned char *) bitmap;
for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
if (b1[i] != b2[i]) {
- printk(KERN_ERR "corruption in group %u "
- "at byte %u(%u): %x in copy != %x "
- "on disk/prealloc\n",
- e4b->bd_group, i, i * 8, b1[i], b2[i]);
+ ext4_msg(e4b->bd_sb, KERN_ERR,
+ "corruption in group %u "
+ "at byte %u(%u): %x in copy != %x "
+ "on disk/prealloc",
+ e4b->bd_group, i, i * 8, b1[i], b2[i]);
BUG();
}
}
@@ -572,14 +597,14 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
continue;
}
- /* both bits in buddy2 must be 0 */
+ /* both bits in buddy2 must be 1 */
MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
for (j = 0; j < (1 << order); j++) {
k = (i * (1 << order)) + j;
MB_CHECK_ASSERT(
- !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
+ !mb_test_bit(k, e4b->bd_bitmap));
}
count++;
}
@@ -611,7 +636,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
grp = ext4_get_group_info(sb, e4b->bd_group);
- buddy = mb_find_buddy(e4b, 0, &max);
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
@@ -630,7 +654,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
#define mb_check_buddy(e4b)
#endif
-/* FIXME!! need more doc */
+/*
+ * Divide blocks started from @first with length @len into
+ * smaller chunks with power of 2 blocks.
+ * Clear the bits in bitmap which the blocks of the chunk(s) covered,
+ * then increase bb_counters[] for corresponded chunk size.
+ */
static void ext4_mb_mark_free_simple(struct super_block *sb,
void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
struct ext4_group_info *grp)
@@ -641,7 +670,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
ext4_grpblk_t chunk;
unsigned short border;
- BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
+ BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
border = 2 << sb->s_blocksize_bits;
@@ -693,7 +722,8 @@ void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
- ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
ext4_grpblk_t first;
ext4_grpblk_t len;
@@ -722,13 +752,18 @@ void ext4_mb_generate_buddy(struct super_block *sb,
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, 0, 0,
- "%u blocks in bitmap, %u in gd",
+ "block bitmap and bg descriptor "
+ "inconsistent: %u vs %u free clusters",
free, grp->bb_free);
/*
- * If we intent to continue, we consider group descritor
+ * If we intend to continue, we consider group descriptor
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
}
mb_set_largest_free_order(sb, grp);
@@ -741,6 +776,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
spin_unlock(&EXT4_SB(sb)->s_bal_lock);
}
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+ int count;
+ int order = 1;
+ void *buddy;
+
+ while ((buddy = mb_find_buddy(e4b, order++, &count))) {
+ ext4_set_bits(buddy, 0, count);
+ }
+ e4b->bd_info->bb_fragments = 0;
+ memset(e4b->bd_info->bb_counters, 0,
+ sizeof(*e4b->bd_info->bb_counters) *
+ (e4b->bd_sb->s_blocksize_bits + 2));
+
+ ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+ e4b->bd_bitmap, e4b->bd_group);
+}
+
/* The buddy information is attached the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
@@ -769,14 +822,15 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
int groups_per_page;
int err = 0;
int i;
- ext4_group_t first_group;
+ ext4_group_t first_group, group;
int first_block;
struct super_block *sb;
struct buffer_head *bhs;
- struct buffer_head **bh;
+ struct buffer_head **bh = NULL;
struct inode *inode;
char *data;
char *bitmap;
+ struct ext4_group_info *grinfo;
mb_debug(1, "init page %lu\n", page->index);
@@ -792,95 +846,58 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* allocate buffer_heads to read bitmaps */
if (groups_per_page > 1) {
- err = -ENOMEM;
i = sizeof(struct buffer_head *) * groups_per_page;
bh = kzalloc(i, GFP_NOFS);
- if (bh == NULL)
+ if (bh == NULL) {
+ err = -ENOMEM;
goto out;
+ }
} else
bh = &bhs;
first_group = page->index * blocks_per_page / 2;
/* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
- struct ext4_group_desc *desc;
-
- if (first_group + i >= ngroups)
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (group >= ngroups)
break;
- err = -EIO;
- desc = ext4_get_group_desc(sb, first_group + i, NULL);
- if (desc == NULL)
- goto out;
-
- err = -ENOMEM;
- bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
- if (bh[i] == NULL)
- goto out;
-
- if (bitmap_uptodate(bh[i]))
- continue;
-
- lock_buffer(bh[i]);
- if (bitmap_uptodate(bh[i])) {
- unlock_buffer(bh[i]);
- continue;
- }
- ext4_lock_group(sb, first_group + i);
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- ext4_init_block_bitmap(sb, bh[i],
- first_group + i, desc);
- set_bitmap_uptodate(bh[i]);
- set_buffer_uptodate(bh[i]);
- ext4_unlock_group(sb, first_group + i);
- unlock_buffer(bh[i]);
+ grinfo = ext4_get_group_info(sb, group);
+ /*
+ * If page is uptodate then we came here after online resize
+ * which added some new uninitialized group info structs, so
+ * we must skip all initialized uptodate buddies on the page,
+ * which may be currently in use by an allocating task.
+ */
+ if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+ bh[i] = NULL;
continue;
}
- ext4_unlock_group(sb, first_group + i);
- if (buffer_uptodate(bh[i])) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh[i]);
- unlock_buffer(bh[i]);
- continue;
+ if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
+ err = -ENOMEM;
+ goto out;
}
- get_bh(bh[i]);
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh[i]);
- bh[i]->b_end_io = end_buffer_read_sync;
- submit_bh(READ, bh[i]);
- mb_debug(1, "read bitmap for group %u\n", first_group + i);
+ mb_debug(1, "read bitmap for group %u\n", group);
}
/* wait for I/O completion */
- for (i = 0; i < groups_per_page && bh[i]; i++)
- wait_on_buffer(bh[i]);
-
- err = -EIO;
- for (i = 0; i < groups_per_page && bh[i]; i++)
- if (!buffer_uptodate(bh[i]))
+ for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+ if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
+ err = -EIO;
goto out;
+ }
+ }
- err = 0;
first_block = page->index * blocks_per_page;
- /* init the page */
- memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
for (i = 0; i < blocks_per_page; i++) {
- int group;
- struct ext4_group_info *grinfo;
-
group = (first_block + i) >> 1;
if (group >= ngroups)
break;
+ if (!bh[group - first_group])
+ /* skip initialized uptodate buddy */
+ continue;
+
/*
* data carry information regarding this
* particular group in the format specified
@@ -909,6 +926,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
* incore got set to the group block bitmap below
*/
ext4_lock_group(sb, group);
+ /* init the buddy */
+ memset(data, 0xff, blocksize);
ext4_mb_generate_buddy(sb, data, incore, group);
ext4_unlock_group(sb, group);
incore = NULL;
@@ -938,7 +957,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
out:
if (bh) {
- for (i = 0; i < groups_per_page && bh[i]; i++)
+ for (i = 0; i < groups_per_page; i++)
brelse(bh[i]);
if (bh != &bhs)
kfree(bh);
@@ -947,22 +966,21 @@ out:
}
/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen whild holding the buddy cache
- * lock
+ * Lock the buddy and bitmap pages. This make sure other parallel init_group
+ * on the same buddy page doesn't happen whild holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
*/
-static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
- ext4_group_t group)
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+ ext4_group_t group, struct ext4_buddy *e4b)
{
- int i;
- int block, pnum;
+ struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+ int block, pnum, poff;
int blocks_per_page;
- int groups_per_page;
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- ext4_group_t first_group;
- struct ext4_group_info *grp;
+ struct page *page;
+
+ e4b->bd_buddy_page = NULL;
+ e4b->bd_bitmap_page = NULL;
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
/*
@@ -972,57 +990,39 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
*/
block = group * 2;
pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
-
- groups_per_page = blocks_per_page >> 1;
- if (groups_per_page == 0)
- groups_per_page = 1;
- /* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ BUG_ON(page->mapping != inode->i_mapping);
+ e4b->bd_bitmap_page = page;
+ e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- if ((first_group + i) >= ngroups)
- break;
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- down_write_nested(&grp->alloc_sem, i);
+ if (blocks_per_page >= 2) {
+ /* buddy and bitmap are on the same page */
+ return 0;
}
- return i;
+
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ BUG_ON(page->mapping != inode->i_mapping);
+ e4b->bd_buddy_page = page;
+ return 0;
}
-static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
- ext4_group_t group, int locked_group)
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
- int i;
- int block, pnum;
- int blocks_per_page;
- ext4_group_t first_group;
- struct ext4_group_info *grp;
-
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
- /* release locks on all the groups */
- for (i = 0; i < locked_group; i++) {
-
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- up_write(&grp->alloc_sem);
+ if (e4b->bd_bitmap_page) {
+ unlock_page(e4b->bd_bitmap_page);
+ page_cache_release(e4b->bd_bitmap_page);
+ }
+ if (e4b->bd_buddy_page) {
+ unlock_page(e4b->bd_buddy_page);
+ page_cache_release(e4b->bd_buddy_page);
}
-
}
/*
@@ -1034,93 +1034,61 @@ static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{
- int ret = 0;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
+ struct ext4_buddy e4b;
+ struct page *page;
+ int ret = 0;
+ might_sleep();
mb_debug(1, "init group %u\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
this_grp = ext4_get_group_info(sb, group);
/*
* This ensures that we don't reinit the buddy cache
* page which map to the group from which we are already
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
- * would have taken the alloc_sem lock.
+ * would have pinned buddy page to page cache.
+ * The call to ext4_mb_get_buddy_page_lock will mark the
+ * page accessed.
*/
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
/*
* somebody initialized the group
* return without doing anything
*/
- ret = 0;
goto err;
}
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
+
+ page = e4b.bd_bitmap_page;
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret)
+ goto err;
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
+ if (e4b.bd_buddy_page == NULL) {
/*
* If both the bitmap and buddy are in
* the same page we don't need to force
* init the buddy
*/
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
+ ret = 0;
+ goto err;
}
- if (page == NULL || !PageUptodate(page)) {
+ /* init buddy cache */
+ page = e4b.bd_buddy_page;
+ ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+ if (ret)
+ goto err;
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
+ ext4_mb_put_buddy_page_lock(&e4b);
return ret;
}
@@ -1143,35 +1111,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct inode *inode = sbi->s_buddy_cache;
+ might_sleep();
mb_debug(1, "load group %u\n", group);
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
e4b->bd_blkbits = sb->s_blocksize_bits;
- e4b->bd_info = ext4_get_group_info(sb, group);
+ e4b->bd_info = grp;
e4b->bd_sb = sb;
e4b->bd_group = group;
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
- e4b->alloc_semp = &grp->alloc_sem;
-
- /* Take the read lock on the group alloc
- * sem. This would make sure a parallel
- * ext4_mb_init_group happening on other
- * groups mapped by the page is blocked
- * till we are done with allocation
- */
-repeat_load_buddy:
- down_read(e4b->alloc_semp);
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- /* we need to check for group need init flag
- * with alloc_semp held so that we can be sure
- * that new blocks didn't get added to the group
- * when we are loading the buddy cache
- */
- up_read(e4b->alloc_semp);
/*
* we need full data about the group
* to make a good selection
@@ -1179,7 +1132,6 @@ repeat_load_buddy:
ret = ext4_mb_init_group(sb, group);
if (ret)
return ret;
- goto repeat_load_buddy;
}
/*
@@ -1193,7 +1145,7 @@ repeat_load_buddy:
/* we could use find_or_create_page(), but it locks page
* what we'd like to avoid in fast path ... */
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
/*
@@ -1220,19 +1172,24 @@ repeat_load_buddy:
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
page_cache_release(page);
@@ -1249,13 +1206,18 @@ repeat_load_buddy:
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
@@ -1263,15 +1225,14 @@ repeat_load_buddy:
return 0;
err:
+ if (page)
+ page_cache_release(page);
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
-
- /* Done with the buddy cache */
- up_read(e4b->alloc_semp);
return ret;
}
@@ -1281,9 +1242,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
- /* Done with the buddy cache */
- if (e4b->alloc_semp)
- up_read(e4b->alloc_semp);
}
@@ -1292,10 +1250,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
int order = 1;
void *bb;
- BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+ BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
- bb = EXT4_MB_BUDDY(e4b);
+ bb = e4b->bd_buddy;
while (order <= e4b->bd_blkbits + 1) {
block = block >> 1;
if (!mb_test_bit(block, bb)) {
@@ -1326,7 +1284,34 @@ static void mb_clear_bits(void *bm, int cur, int len)
}
}
-static void mb_set_bits(void *bm, int cur, int len)
+/* clear bits in given range
+ * will return first found zero bit if any, -1 otherwise
+ */
+static int mb_test_and_clear_bits(void *bm, int cur, int len)
+{
+ __u32 *addr;
+ int zero_bit = -1;
+
+ len = cur + len;
+ while (cur < len) {
+ if ((cur & 31) == 0 && (len - cur) >= 32) {
+ /* fast path: clear whole word at once */
+ addr = bm + (cur >> 3);
+ if (*addr != (__u32)(-1) && zero_bit == -1)
+ zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
+ *addr = 0;
+ cur += 32;
+ continue;
+ }
+ if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
+ zero_bit = cur;
+ cur++;
+ }
+
+ return zero_bit;
+}
+
+void ext4_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1344,18 +1329,95 @@ static void mb_set_bits(void *bm, int cur, int len)
}
}
+/*
+ * _________________________________________________________________ */
+
+static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
+{
+ if (mb_test_bit(*bit + side, bitmap)) {
+ mb_clear_bit(*bit, bitmap);
+ (*bit) -= side;
+ return 1;
+ }
+ else {
+ (*bit) += side;
+ mb_set_bit(*bit, bitmap);
+ return -1;
+ }
+}
+
+static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
+{
+ int max;
+ int order = 1;
+ void *buddy = mb_find_buddy(e4b, order, &max);
+
+ while (buddy) {
+ void *buddy2;
+
+ /* Bits in range [first; last] are known to be set since
+ * corresponding blocks were allocated. Bits in range
+ * (first; last) will stay set because they form buddies on
+ * upper layer. We just deal with borders if they don't
+ * align with upper layer and then go up.
+ * Releasing entire group is all about clearing
+ * single bit of highest order buddy.
+ */
+
+ /* Example:
+ * ---------------------------------
+ * | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
+ * ---------------------------------
+ * 0 1 2 3 4 5 6 7
+ * \_____________________/
+ *
+ * Neither [1] nor [6] is aligned to above layer.
+ * Left neighbour [0] is free, so mark it busy,
+ * decrease bb_counters and extend range to
+ * [0; 6]
+ * Right neighbour [7] is busy. It can't be coaleasced with [6], so
+ * mark [6] free, increase bb_counters and shrink range to
+ * [0; 5].
+ * Then shift range to [0; 2], go up and do the same.
+ */
+
+
+ if (first & 1)
+ e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
+ if (!(last & 1))
+ e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
+ if (first > last)
+ break;
+ order++;
+
+ if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+ mb_clear_bits(buddy, first, last - first + 1);
+ e4b->bd_info->bb_counters[order - 1] += last - first + 1;
+ break;
+ }
+ first >>= 1;
+ last >>= 1;
+ buddy = buddy2;
+ }
+}
+
static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
- int first, int count)
+ int first, int count)
{
- int block = 0;
- int max = 0;
- int order;
- void *buddy;
- void *buddy2;
+ int left_is_free = 0;
+ int right_is_free = 0;
+ int block;
+ int last = first + count - 1;
struct super_block *sb = e4b->bd_sb;
- BUG_ON(first + count > (sb->s_blocksize << 3));
+ BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
+ /* Don't bother if the block group is corrupt. */
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ return;
+
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
@@ -1363,83 +1425,77 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first < e4b->bd_info->bb_first_free)
e4b->bd_info->bb_first_free = first;
- /* let's maintain fragments counter */
+ /* access memory sequentially: check left neighbour,
+ * clear range and then check right neighbour
+ */
if (first != 0)
- block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
- if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
- max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
- if (block && max)
+ left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
+ block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
+ if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
+ right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
+
+ if (unlikely(block != -1)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t blocknr;
+
+ blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
+ blocknr += EXT4_C2B(EXT4_SB(sb), block);
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0,
+ blocknr,
+ "freeing already freed block "
+ "(bit %u); block bitmap corrupt.",
+ block);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ e4b->bd_info->bb_free);
+ /* Mark the block group as corrupt. */
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+ &e4b->bd_info->bb_state);
+ mb_regenerate_buddy(e4b);
+ goto done;
+ }
+
+ /* let's maintain fragments counter */
+ if (left_is_free && right_is_free)
e4b->bd_info->bb_fragments--;
- else if (!block && !max)
+ else if (!left_is_free && !right_is_free)
e4b->bd_info->bb_fragments++;
- /* let's maintain buddy itself */
- while (count-- > 0) {
- block = first++;
- order = 0;
-
- if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
- ext4_fsblk_t blocknr;
-
- blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
- blocknr += block;
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block "
- "(bit %u)", block);
- }
- mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
- e4b->bd_info->bb_counters[order]++;
-
- /* start of the buddy */
- buddy = mb_find_buddy(e4b, order, &max);
-
- do {
- block &= ~1UL;
- if (mb_test_bit(block, buddy) ||
- mb_test_bit(block + 1, buddy))
- break;
-
- /* both the buddies are free, try to coalesce them */
- buddy2 = mb_find_buddy(e4b, order + 1, &max);
-
- if (!buddy2)
- break;
-
- if (order > 0) {
- /* for special purposes, we don't set
- * free bits in bitmap */
- mb_set_bit(block, buddy);
- mb_set_bit(block + 1, buddy);
- }
- e4b->bd_info->bb_counters[order]--;
- e4b->bd_info->bb_counters[order]--;
+ /* buddy[0] == bd_bitmap is a special case, so handle
+ * it right away and let mb_buddy_mark_free stay free of
+ * zero order checks.
+ * Check if neighbours are to be coaleasced,
+ * adjust bitmap bb_counters and borders appropriately.
+ */
+ if (first & 1) {
+ first += !left_is_free;
+ e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
+ }
+ if (!(last & 1)) {
+ last -= !right_is_free;
+ e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
+ }
- block = block >> 1;
- order++;
- e4b->bd_info->bb_counters[order]++;
+ if (first <= last)
+ mb_buddy_mark_free(e4b, first >> 1, last >> 1);
- mb_clear_bit(block, buddy2);
- buddy = buddy2;
- } while (1);
- }
+done:
mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
-static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
+static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
int next = block;
- int max;
- int ord;
+ int max, order;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
BUG_ON(ex == NULL);
- buddy = mb_find_buddy(e4b, order, &max);
+ buddy = mb_find_buddy(e4b, 0, &max);
BUG_ON(buddy == NULL);
BUG_ON(block >= max);
if (mb_test_bit(block, buddy)) {
@@ -1449,12 +1505,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
return 0;
}
- /* FIXME dorp order completely ? */
- if (likely(order == 0)) {
- /* find actual order */
- order = mb_find_order_for_block(e4b, block);
- block = block >> order;
- }
+ /* find actual order */
+ order = mb_find_order_for_block(e4b, block);
+ block = block >> order;
ex->fe_len = 1 << order;
ex->fe_start = block << order;
@@ -1466,18 +1519,17 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
ex->fe_start += next;
while (needed > ex->fe_len &&
- (buddy = mb_find_buddy(e4b, order, &max))) {
+ mb_find_buddy(e4b, order, &max)) {
if (block + 1 >= max)
break;
next = (block + 1) * (1 << order);
- if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
+ if (mb_test_bit(next, e4b->bd_bitmap))
break;
- ord = mb_find_order_for_block(e4b, next);
+ order = mb_find_order_for_block(e4b, next);
- order = ord;
block = next >> order;
ex->fe_len += 1 << order;
}
@@ -1510,9 +1562,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
/* let's maintain fragments counter */
if (start != 0)
- mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
+ mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
- max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
+ max = !mb_test_bit(start + len, e4b->bd_bitmap);
if (mlen && max)
e4b->bd_info->bb_fragments++;
else if (!mlen && !max)
@@ -1555,7 +1607,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
}
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
- mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+ ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -1596,9 +1648,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page;
get_page(ac->ac_buddy_page);
- /* on allocation we use ac to track the held semaphore */
- ac->alloc_semp = e4b->alloc_semp;
- e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
@@ -1644,7 +1693,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
/* recheck chunk's availability - we don't know
* when it was found (within this lock-unlock
* period or not) */
- max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
+ max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
if (max >= gex->fe_len) {
ext4_mb_use_best_found(ac, e4b);
return;
@@ -1670,8 +1719,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
struct ext4_free_extent *gex = &ac->ac_g_ex;
BUG_ON(ex->fe_len <= 0);
- BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
- BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+ BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
ac->ac_found++;
@@ -1736,7 +1785,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return err;
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
+ max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
ac->ac_b_ex = ex;
@@ -1757,18 +1806,27 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
int max;
int err;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
return 0;
+ if (grp->bb_free == 0)
+ return 0;
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
if (err)
return err;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
+ ext4_mb_unload_buddy(e4b);
+ return 0;
+ }
+
ext4_lock_group(ac->ac_sb, group);
- max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
+ max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
+ ex.fe_logical = 0xDEADFA11; /* debug value */
if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
ext4_fsblk_t start;
@@ -1857,7 +1915,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct super_block *sb = ac->ac_sb;
- void *bitmap = EXT4_MB_BITMAP(e4b);
+ void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
int i;
int free;
@@ -1869,25 +1927,25 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
while (free && ac->ac_status == AC_STATUS_CONTINUE) {
i = mb_find_next_zero_bit(bitmap,
- EXT4_BLOCKS_PER_GROUP(sb), i);
- if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
+ EXT4_CLUSTERS_PER_GROUP(sb), i);
+ if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
/*
* IF we have corrupt bitmap, we won't find any
* free blocks even though group info says we
* we have free blocks
*/
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
- "%d free blocks as per "
+ "%d free clusters as per "
"group info. But bitmap says 0",
free);
break;
}
- mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
+ mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
- "%d free blocks as per "
+ "%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
/*
@@ -1897,7 +1955,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
*/
break;
}
-
+ ex.fe_logical = 0xDEADC0DE; /* debug value */
ext4_mb_measure_extent(ac, &ex, e4b);
i += ex.fe_len;
@@ -1917,7 +1975,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
{
struct super_block *sb = ac->ac_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- void *bitmap = EXT4_MB_BITMAP(e4b);
+ void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
ext4_fsblk_t first_group_block;
ext4_fsblk_t a;
@@ -1933,11 +1991,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
- while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
+ while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
- max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
+ max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
ac->ac_found++;
+ ex.fe_logical = 0xDEADF00D; /* debug value */
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
break;
@@ -1957,6 +2016,15 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < 0 || cr >= 4);
+ free = grp->bb_free;
+ if (free == 0)
+ return 0;
+ if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ return 0;
+
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ return 0;
+
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
int ret = ext4_mb_init_group(ac->ac_sb, group);
@@ -1964,10 +2032,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}
- free = grp->bb_free;
fragments = grp->bb_fragments;
- if (free == 0)
- return 0;
if (fragments == 0)
return 0;
@@ -1975,15 +2040,19 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
case 0:
BUG_ON(ac->ac_2order == 0);
- if (grp->bb_largest_free_order < ac->ac_2order)
- return 0;
-
/* Avoid using the first bg of a flexgroup for data files */
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
return 0;
+ if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
+ (free / fragments) >= ac->ac_g_ex.fe_len)
+ return 1;
+
+ if (grp->bb_largest_free_order < ac->ac_2order)
+ return 0;
+
return 1;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
@@ -2074,7 +2143,12 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
- if (group == ngroups)
+ cond_resched();
+ /*
+ * Artificially restricted ngroups for non-extent
+ * files makes group > ngroups possible on first loop.
+ */
+ if (group >= ngroups)
group = 0;
/* This now checks without needing the buddy page */
@@ -2098,7 +2172,7 @@ repeat:
}
ac->ac_groups_scanned++;
- if (cr == 0)
+ if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 && sbi->s_stripe &&
!(ac->ac_g_ex.fe_len % sbi->s_stripe))
@@ -2171,8 +2245,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
struct super_block *sb = seq->private;
ext4_group_t group = (ext4_group_t) ((unsigned long) v);
int i;
- int err;
+ int err, buddy_loaded = 0;
struct ext4_buddy e4b;
+ struct ext4_group_info *grinfo;
struct sg {
struct ext4_group_info info;
ext4_grpblk_t counters[16];
@@ -2189,15 +2264,21 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
sizeof(struct ext4_group_info);
- err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err) {
- seq_printf(seq, "#%-5u: I/O error\n", group);
- return 0;
+ grinfo = ext4_get_group_info(sb, group);
+ /* Load the group info in memory only if not already loaded. */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
+ err = ext4_mb_load_buddy(sb, group, &e4b);
+ if (err) {
+ seq_printf(seq, "#%-5u: I/O error\n", group);
+ return 0;
+ }
+ buddy_loaded = 1;
}
- ext4_lock_group(sb, group);
+
memcpy(&sg, ext4_get_group_info(sb, group), i);
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
+
+ if (buddy_loaded)
+ ext4_mb_unload_buddy(&e4b);
seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2222,7 +2303,7 @@ static const struct seq_operations ext4_mb_seq_groups_ops = {
static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
{
- struct super_block *sb = PDE(inode)->data;
+ struct super_block *sb = PDE_DATA(inode);
int rc;
rc = seq_open(file, &ext4_mb_seq_groups_ops);
@@ -2251,6 +2332,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
return cachep;
}
+/*
+ * Allocate the top-level s_group_info array for the specified number
+ * of groups
+ */
+int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned size;
+ struct ext4_group_info ***new_groupinfo;
+
+ size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ if (size <= sbi->s_group_info_size)
+ return 0;
+
+ size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
+ new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
+ if (!new_groupinfo) {
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
+ return -ENOMEM;
+ }
+ if (sbi->s_group_info) {
+ memcpy(new_groupinfo, sbi->s_group_info,
+ sbi->s_group_info_size * sizeof(*sbi->s_group_info));
+ ext4_kvfree(sbi->s_group_info);
+ }
+ sbi->s_group_info = new_groupinfo;
+ sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+ ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
+ sbi->s_group_info_size);
+ return 0;
+}
+
/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *desc)
@@ -2271,8 +2385,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
EXT4_DESC_PER_BLOCK_BITS(sb);
meta_group_info = kmalloc(metalen, GFP_KERNEL);
if (meta_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
- "buddy group\n");
+ ext4_msg(sb, KERN_ERR, "can't allocate mem "
+ "for a buddy group");
goto exit_meta_group_info;
}
sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
@@ -2283,12 +2397,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
+ meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
if (meta_group_info[i] == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+ ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
goto exit_group_info;
}
- memset(meta_group_info[i], 0, kmem_cache_size(cachep));
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
&(meta_group_info[i]->bb_state));
@@ -2298,10 +2411,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
*/
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
meta_group_info[i]->bb_free =
- ext4_free_blocks_after_init(sb, group, desc);
+ ext4_free_clusters_after_init(sb, group, desc);
} else {
meta_group_info[i]->bb_free =
- ext4_free_blks_count(sb, desc);
+ ext4_free_group_clusters(sb, desc);
}
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -2327,8 +2440,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
exit_group_info:
/* If a meta_group_info table has been allocated, release it now */
- if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+ }
exit_meta_group_info:
return -ENOMEM;
} /* ext4_mb_add_groupinfo */
@@ -2338,61 +2453,29 @@ static int ext4_mb_init_backend(struct super_block *sb)
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
- int num_meta_group_infos;
- int num_meta_group_infos_max;
- int array_size;
+ int err;
struct ext4_group_desc *desc;
struct kmem_cache *cachep;
- /* This is the number of blocks used by GDT */
- num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
- 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
-
- /*
- * This is the total number of blocks used by GDT including
- * the number of reserved blocks for GDT.
- * The s_group_info array is allocated with this value
- * to allow a clean online resize without a complex
- * manipulation of pointer.
- * The drawback is the unused memory when no resize
- * occurs but it's very low in terms of pages
- * (see comments below)
- * Need to handle this properly when META_BG resizing is allowed
- */
- num_meta_group_infos_max = num_meta_group_infos +
- le16_to_cpu(es->s_reserved_gdt_blocks);
+ err = ext4_mb_alloc_groupinfo(sb, ngroups);
+ if (err)
+ return err;
- /*
- * array_size is the size of s_group_info array. We round it
- * to the next power of two because this approximation is done
- * internally by kmalloc so we can have some more memory
- * for free here (e.g. may be used for META_BG resize).
- */
- array_size = 1;
- while (array_size < sizeof(*sbi->s_group_info) *
- num_meta_group_infos_max)
- array_size = array_size << 1;
- /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
- * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
- * So a two level scheme suffices for now. */
- sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
- if (sbi->s_group_info == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
- return -ENOMEM;
- }
sbi->s_buddy_cache = new_inode(sb);
if (sbi->s_buddy_cache == NULL) {
- printk(KERN_ERR "EXT4-fs: can't get new inode\n");
+ ext4_msg(sb, KERN_ERR, "can't get new inode");
goto err_freesgi;
}
- sbi->s_buddy_cache->i_ino = get_next_ino();
+ /* To avoid potentially colliding with an valid on-disk inode number,
+ * use EXT4_BAD_INO for the buddy cache inode number. This inode is
+ * not in the inode hash, so it should never be found by iget(), but
+ * this will avoid confusion if it ever shows up during debugging. */
+ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
- printk(KERN_ERR
- "EXT4-fs: can't read descriptor %u\n", i);
+ ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
goto err_freebuddy;
}
if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2405,25 +2488,72 @@ err_freebuddy:
cachep = get_groupinfo_cache(sb->s_blocksize_bits);
while (i-- > 0)
kmem_cache_free(cachep, ext4_get_group_info(sb, i));
- i = num_meta_group_infos;
+ i = sbi->s_group_info_size;
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
err_freesgi:
- kfree(sbi->s_group_info);
+ ext4_kvfree(sbi->s_group_info);
return -ENOMEM;
}
-int ext4_mb_init(struct super_block *sb, int needs_recovery)
+static void ext4_groupinfo_destroy_slabs(void)
+{
+ int i;
+
+ for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+ if (ext4_groupinfo_caches[i])
+ kmem_cache_destroy(ext4_groupinfo_caches[i]);
+ ext4_groupinfo_caches[i] = NULL;
+ }
+}
+
+static int ext4_groupinfo_create_slab(size_t size)
+{
+ static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
+ int slab_size;
+ int blocksize_bits = order_base_2(size);
+ int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+ struct kmem_cache *cachep;
+
+ if (cache_index >= NR_GRPINFO_CACHES)
+ return -EINVAL;
+
+ if (unlikely(cache_index < 0))
+ cache_index = 0;
+
+ mutex_lock(&ext4_grpinfo_slab_create_mutex);
+ if (ext4_groupinfo_caches[cache_index]) {
+ mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+ return 0; /* Already created */
+ }
+
+ slab_size = offsetof(struct ext4_group_info,
+ bb_counters[blocksize_bits + 2]);
+
+ cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
+ slab_size, 0, SLAB_RECLAIM_ACCOUNT,
+ NULL);
+
+ ext4_groupinfo_caches[cache_index] = cachep;
+
+ mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+ if (!cachep) {
+ printk(KERN_EMERG
+ "EXT4-fs: no memory for groupinfo slab cache\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned i, j;
unsigned offset;
unsigned max;
int ret;
- int cache_index;
- struct kmem_cache *cachep;
- char *namep = NULL;
i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
@@ -2440,30 +2570,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
goto out;
}
- cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
- cachep = ext4_groupinfo_caches[cache_index];
- if (!cachep) {
- char name[32];
- int len = offsetof(struct ext4_group_info,
- bb_counters[sb->s_blocksize_bits + 2]);
-
- sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
- namep = kstrdup(name, GFP_KERNEL);
- if (!namep) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* Need to free the kmem_cache_name() when we
- * destroy the slab */
- cachep = kmem_cache_create(namep, len, 0,
- SLAB_RECLAIM_ACCOUNT, NULL);
- if (!cachep) {
- ret = -ENOMEM;
- goto out;
- }
- ext4_groupinfo_caches[cache_index] = cachep;
- }
+ ret = ext4_groupinfo_create_slab(sb->s_blocksize);
+ if (ret < 0)
+ goto out;
/* order 0 is regular bitmap */
sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
@@ -2480,12 +2589,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
i++;
} while (i <= sb->s_blocksize_bits + 1);
- /* init file for buddy data */
- ret = ext4_mb_init_backend(sb);
- if (ret != 0) {
- goto out;
- }
-
spin_lock_init(&sbi->s_md_lock);
spin_lock_init(&sbi->s_bal_lock);
@@ -2494,7 +2597,32 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
- sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
+ /*
+ * The default group preallocation is 512, which for 4k block
+ * sizes translates to 2 megabytes. However for bigalloc file
+ * systems, this is probably too big (i.e, if the cluster size
+ * is 1 megabyte, then group preallocation size becomes half a
+ * gigabyte!). As a default, we will keep a two megabyte
+ * group pralloc size for cluster sizes up to 64k, and after
+ * that, we will force a minimum group preallocation size of
+ * 32 clusters. This translates to 8 megs when the cluster
+ * size is 256k, and 32 megs when the cluster size is 1 meg,
+ * which seems reasonable as a default.
+ */
+ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
+ sbi->s_cluster_bits, 32);
+ /*
+ * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
+ * to the lowest multiple of s_stripe which is bigger than
+ * the s_mb_group_prealloc as determined above. We want
+ * the preallocation size to be an exact multiple of the
+ * RAID stripe size so that preallocations don't fragment
+ * the stripes.
+ */
+ if (sbi->s_stripe > 1) {
+ sbi->s_mb_group_prealloc = roundup(
+ sbi->s_mb_group_prealloc, sbi->s_stripe);
+ }
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
@@ -2510,18 +2638,25 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
spin_lock_init(&lg->lg_prealloc_lock);
}
+ /* init file for buddy data */
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0)
+ goto out_free_locality_groups;
+
if (sbi->s_proc)
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
- if (sbi->s_journal)
- sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+ return 0;
+
+out_free_locality_groups:
+ free_percpu(sbi->s_locality_groups);
+ sbi->s_locality_groups = NULL;
out:
- if (ret) {
- kfree(sbi->s_mb_offsets);
- kfree(sbi->s_mb_maxs);
- kfree(namep);
- }
+ kfree(sbi->s_mb_offsets);
+ sbi->s_mb_offsets = NULL;
+ kfree(sbi->s_mb_maxs);
+ sbi->s_mb_maxs = NULL;
return ret;
}
@@ -2552,6 +2687,9 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
+ if (sbi->s_proc)
+ remove_proc_entry("mb_groups", sbi->s_proc);
+
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2568,145 +2706,119 @@ int ext4_mb_release(struct super_block *sb)
EXT4_DESC_PER_BLOCK_BITS(sb);
for (i = 0; i < num_meta_group_infos; i++)
kfree(sbi->s_group_info[i]);
- kfree(sbi->s_group_info);
+ ext4_kvfree(sbi->s_group_info);
}
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
if (sbi->s_buddy_cache)
iput(sbi->s_buddy_cache);
if (sbi->s_mb_stats) {
- printk(KERN_INFO
- "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u blocks %u reqs (%u success)",
atomic_read(&sbi->s_bal_allocated),
atomic_read(&sbi->s_bal_reqs),
atomic_read(&sbi->s_bal_success));
- printk(KERN_INFO
- "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
- "%u 2^N hits, %u breaks, %u lost\n",
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u extents scanned, %u goal hits, "
+ "%u 2^N hits, %u breaks, %u lost",
atomic_read(&sbi->s_bal_ex_scanned),
atomic_read(&sbi->s_bal_goals),
atomic_read(&sbi->s_bal_2orders),
atomic_read(&sbi->s_bal_breaks),
atomic_read(&sbi->s_mb_lost_chunks));
- printk(KERN_INFO
- "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
- sbi->s_mb_buddies_generated++,
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %lu generated and it took %Lu",
+ sbi->s_mb_buddies_generated,
sbi->s_mb_generation_time);
- printk(KERN_INFO
- "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
+ ext4_msg(sb, KERN_INFO,
+ "mballoc: %u preallocated, %u discarded",
atomic_read(&sbi->s_mb_preallocated),
atomic_read(&sbi->s_mb_discarded));
}
free_percpu(sbi->s_locality_groups);
- if (sbi->s_proc)
- remove_proc_entry("mb_groups", sbi->s_proc);
return 0;
}
static inline int ext4_issue_discard(struct super_block *sb,
- ext4_group_t block_group, ext4_grpblk_t block, int count)
+ ext4_group_t block_group, ext4_grpblk_t cluster, int count)
{
- int ret;
ext4_fsblk_t discard_block;
- discard_block = block + ext4_group_first_block_no(sb, block_group);
+ discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
+ ext4_group_first_block_no(sb, block_group));
+ count = EXT4_C2B(EXT4_SB(sb), count);
trace_ext4_discard_blocks(sb,
(unsigned long long) discard_block, count);
- ret = sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
- if (ret == -EOPNOTSUPP) {
- ext4_warning(sb, "discard not supported, disabling");
- clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
- }
- return ret;
+ return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
}
/*
* This function is called by the jbd2 layer once the commit has finished,
* so we know we can free the blocks that were released with that commit.
*/
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
+static void ext4_free_data_callback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int rc)
{
- struct super_block *sb = journal->j_private;
+ struct ext4_free_data *entry = (struct ext4_free_data *)jce;
struct ext4_buddy e4b;
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
- struct ext4_free_data *entry;
- struct list_head *l, *ltmp;
-
- list_for_each_safe(l, ltmp, &txn->t_private_list) {
- entry = list_entry(l, struct ext4_free_data, list);
-
- mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
- entry->count, entry->group, entry);
-
- if (test_opt(sb, DISCARD))
- ext4_issue_discard(sb, entry->group,
- entry->start_blk, entry->count);
-
- err = ext4_mb_load_buddy(sb, entry->group, &e4b);
- /* we expect to find existing buddy because it's pinned */
- BUG_ON(err != 0);
-
- db = e4b.bd_info;
- /* there are blocks to put in buddy to make them really free */
- count += entry->count;
- count2++;
- ext4_lock_group(sb, entry->group);
- /* Take it out of per group rb tree */
- rb_erase(&entry->node, &(db->bb_free_root));
- mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
-
- if (!db->bb_free_root.rb_node) {
- /* No more items in the per group rb tree
- * balance refcounts from ext4_mb_free_metadata()
- */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
- }
- ext4_unlock_group(sb, entry->group);
- kmem_cache_free(ext4_free_ext_cachep, entry);
- ext4_mb_unload_buddy(&e4b);
- }
- mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
-}
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ entry->efd_count, entry->efd_group, entry);
-#ifdef CONFIG_EXT4_DEBUG
-u8 mb_enable_debug __read_mostly;
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%d failed"
+ " with %d", entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count, err);
+ }
-static struct dentry *debugfs_dir;
-static struct dentry *debugfs_debug;
+ err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
+ /* we expect to find existing buddy because it's pinned */
+ BUG_ON(err != 0);
-static void __init ext4_create_debugfs_entry(void)
-{
- debugfs_dir = debugfs_create_dir("ext4", NULL);
- if (debugfs_dir)
- debugfs_debug = debugfs_create_u8("mballoc-debug",
- S_IRUGO | S_IWUSR,
- debugfs_dir,
- &mb_enable_debug);
-}
-static void ext4_remove_debugfs_entry(void)
-{
- debugfs_remove(debugfs_debug);
- debugfs_remove(debugfs_dir);
-}
+ db = e4b.bd_info;
+ /* there are blocks to put in buddy to make them really free */
+ count += entry->efd_count;
+ count2++;
+ ext4_lock_group(sb, entry->efd_group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->efd_node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
-#else
+ /*
+ * Clear the trimmed flag for the group so that the next
+ * ext4_trim_fs can trim it.
+ * If the volume is mounted with -o discard, online discard
+ * is supported and the free blocks will be trimmed online.
+ */
+ if (!test_opt(sb, DISCARD))
+ EXT4_MB_GRP_CLEAR_TRIMMED(db);
-static void __init ext4_create_debugfs_entry(void)
-{
-}
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
+ }
+ ext4_unlock_group(sb, entry->efd_group);
+ kmem_cache_free(ext4_free_data_cachep, entry);
+ ext4_mb_unload_buddy(&e4b);
-static void ext4_remove_debugfs_entry(void)
-{
+ mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
}
-#endif
-
int __init ext4_init_mballoc(void)
{
ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
@@ -2721,20 +2833,18 @@ int __init ext4_init_mballoc(void)
return -ENOMEM;
}
- ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
- SLAB_RECLAIM_ACCOUNT);
- if (ext4_free_ext_cachep == NULL) {
+ ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
+ SLAB_RECLAIM_ACCOUNT);
+ if (ext4_free_data_cachep == NULL) {
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
}
- ext4_create_debugfs_entry();
return 0;
}
void ext4_exit_mballoc(void)
{
- int i;
/*
* Wait for completion of call_rcu()'s on ext4_pspace_cachep
* before destroying the slab cache.
@@ -2742,17 +2852,8 @@ void ext4_exit_mballoc(void)
rcu_barrier();
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
- kmem_cache_destroy(ext4_free_ext_cachep);
-
- for (i = 0; i < NR_GRPINFO_CACHES; i++) {
- struct kmem_cache *cachep = ext4_groupinfo_caches[i];
- if (cachep) {
- char *name = (char *)kmem_cache_name(cachep);
- kmem_cache_destroy(cachep);
- kfree(name);
- }
- }
- ext4_remove_debugfs_entry();
+ kmem_cache_destroy(ext4_free_data_cachep);
+ ext4_groupinfo_destroy_slabs();
}
@@ -2762,7 +2863,7 @@ void ext4_exit_mballoc(void)
*/
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle, unsigned int reserv_blks)
+ handle_t *handle, unsigned int reserv_clstrs)
{
struct buffer_head *bitmap_bh = NULL;
struct ext4_group_desc *gdp;
@@ -2783,6 +2884,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!bitmap_bh)
goto out_err;
+ BUFFER_TRACE(bitmap_bh, "getting write access");
err = ext4_journal_get_write_access(handle, bitmap_bh);
if (err)
goto out_err;
@@ -2793,25 +2895,26 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
goto out_err;
ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
- ext4_free_blks_count(sb, gdp));
+ ext4_free_group_clusters(sb, gdp));
+ BUFFER_TRACE(gdp_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdp_bh);
if (err)
goto out_err;
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
- len = ac->ac_b_ex.fe_len;
+ len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (!ext4_data_block_valid(sbi, block, len)) {
ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
- "fs metadata\n", block, block+len);
+ "fs metadata", block, block+len);
/* File system mounted not to panic on error
* Fix the bitmap and repeat the block allocation
* We leak some of the blocks here.
*/
ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
+ ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (!err)
@@ -2829,31 +2932,34 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
}
#endif
- mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
+ ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_blks_set(sb, gdp,
- ext4_free_blocks_after_init(sb,
- ac->ac_b_ex.fe_group, gdp));
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb,
+ ac->ac_b_ex.fe_group, gdp));
}
- len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
- ext4_free_blks_set(sb, gdp, len);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+ len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
+ ext4_free_group_clusters_set(sb, gdp, len);
+ ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+ percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
/*
* Now reduce the dirty block count also. Should not go negative
*/
if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
- atomic_sub(ac->ac_b_ex.fe_len,
- &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic64_sub(ac->ac_b_ex.fe_len,
+ &sbi->s_flex_groups[flex_group].free_clusters);
}
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -2862,15 +2968,15 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
out_err:
- ext4_mark_super_dirty(sb);
brelse(bitmap_bh);
return err;
}
/*
* here we normalize request for locality group
- * Group request are normalized to s_strip size if we set the same via mount
- * option. If not we set it to s_mb_group_prealloc which can be configured via
+ * Group request are normalized to s_mb_group_prealloc, which goes to
+ * s_strip if we set the same via mount option.
+ * s_mb_group_prealloc can be configured via
* /sys/fs/ext4/<partition>/mb_group_prealloc
*
* XXX: should we try to preallocate more than the group has now?
@@ -2881,10 +2987,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
struct ext4_locality_group *lg = ac->ac_lg;
BUG_ON(lg == NULL);
- if (EXT4_SB(sb)->s_stripe)
- ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
- else
- ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
+ ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
mb_debug(1, "#%u: goal %u blocks for locality group\n",
current->pid, ac->ac_g_ex.fe_len);
}
@@ -2897,9 +3000,11 @@ static noinline_for_stack void
ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits, max;
ext4_lblk_t end;
- loff_t size, orig_size, start_off;
+ loff_t size, start_off;
+ loff_t orig_size __maybe_unused;
ext4_lblk_t start;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_prealloc_space *pa;
@@ -2927,7 +3032,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* first, let's learn actual file size
* given current request is allocated */
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
size = size << bsbits;
if (size < i_size_read(ac->ac_inode))
size = i_size_read(ac->ac_inode);
@@ -2999,7 +3104,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
continue;
}
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
/* PA must not overlap original request */
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -3029,9 +3135,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
rcu_read_lock();
list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
ext4_lblk_t pa_end;
+
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0) {
- pa_end = pa->pa_lstart + pa->pa_len;
+ pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
+ pa->pa_len);
BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
}
spin_unlock(&pa->pa_lock);
@@ -3040,9 +3148,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
if (start + size <= ac->ac_o_ex.fe_logical &&
start > ac->ac_o_ex.fe_logical) {
- printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
- (unsigned long) start, (unsigned long) size,
- (unsigned long) ac->ac_o_ex.fe_logical);
+ ext4_msg(ac->ac_sb, KERN_ERR,
+ "start %lu, size %lu, fe_logical %lu",
+ (unsigned long) start, (unsigned long) size,
+ (unsigned long) ac->ac_o_ex.fe_logical);
}
BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
start > ac->ac_o_ex.fe_logical);
@@ -3053,7 +3162,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* XXX: is it better to align blocks WRT to logical
* placement or satisfy big request as is */
ac->ac_g_ex.fe_logical = start;
- ac->ac_g_ex.fe_len = size;
+ ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
/* define goal start in order to merge */
if (ar->pright && (ar->lright == (start + size))) {
@@ -3107,13 +3216,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
{
struct ext4_prealloc_space *pa = ac->ac_pa;
- int len;
-
- if (pa && pa->pa_type == MB_INODE_PA) {
- len = ac->ac_b_ex.fe_len;
- pa->pa_free += len;
- }
+ if (pa && pa->pa_type == MB_INODE_PA)
+ pa->pa_free += ac->ac_b_ex.fe_len;
}
/*
@@ -3122,14 +3227,16 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
ext4_fsblk_t start;
ext4_fsblk_t end;
int len;
/* found preallocated blocks, use them */
start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
- end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
- len = end - start;
+ end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
+ start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
+ len = EXT4_NUM_B2C(sbi, end - start);
ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start);
ac->ac_b_ex.fe_len = len;
@@ -3137,7 +3244,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
ac->ac_pa = pa;
BUG_ON(start < pa->pa_pstart);
- BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
+ BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
BUG_ON(pa->pa_free < len);
pa->pa_free -= len;
@@ -3188,7 +3295,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
cur_distance = abs(goal_block - cpa->pa_pstart);
new_distance = abs(goal_block - pa->pa_pstart);
- if (cur_distance < new_distance)
+ if (cur_distance <= new_distance)
return cpa;
/* drop the previous reference */
@@ -3203,6 +3310,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
static noinline_for_stack int
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
@@ -3220,12 +3328,14 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* all fields in this condition don't change,
* so we can skip locking for them */
if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
- ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+ ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
+ EXT4_C2B(sbi, pa->pa_len)))
continue;
/* non-extent files can't have physical blocks past 2^32 */
if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
- pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS))
continue;
/* found preallocated blocks, use them */
@@ -3300,8 +3410,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
n = rb_first(&(grp->bb_free_root));
while (n) {
- entry = rb_entry(n, struct ext4_free_data, node);
- mb_set_bits(bitmap, entry->start_blk, entry->count);
+ entry = rb_entry(n, struct ext4_free_data, efd_node);
+ ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
n = rb_next(n);
}
return;
@@ -3322,7 +3432,6 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t groupnr;
ext4_grpblk_t start;
int preallocated = 0;
- int count = 0;
int len;
/* all form of preallocation discards first load group,
@@ -3343,9 +3452,8 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
if (unlikely(len == 0))
continue;
BUG_ON(groupnr != group);
- mb_set_bits(bitmap, start, len);
+ ext4_set_bits(bitmap, start, len);
preallocated += len;
- count++;
}
mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
}
@@ -3354,6 +3462,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
{
struct ext4_prealloc_space *pa;
pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+ BUG_ON(atomic_read(&pa->pa_count));
+ BUG_ON(pa->pa_deleted == 0);
kmem_cache_free(ext4_pspace_cachep, pa);
}
@@ -3367,11 +3478,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
ext4_group_t grp;
ext4_fsblk_t grp_blk;
- if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
- return;
-
/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
+ if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+ spin_unlock(&pa->pa_lock);
+ return;
+ }
+
if (pa->pa_deleted == 1) {
spin_unlock(&pa->pa_lock);
return;
@@ -3388,7 +3501,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
if (pa->pa_type == MB_GROUP_PA)
grp_blk--;
- ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
+ grp = ext4_get_group_number(sb, grp_blk);
/*
* possible race:
@@ -3422,6 +3535,7 @@ static noinline_for_stack int
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_prealloc_space *pa;
struct ext4_group_info *grp;
struct ext4_inode_info *ei;
@@ -3453,16 +3567,18 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
/* also, we should cover whole original request */
- wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
+ wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
/* the smallest one defines real window */
win = min(winl, wins);
- offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
+ offs = ac->ac_o_ex.fe_logical %
+ EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (offs && offs < win)
win = offs;
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
+ ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
+ EXT4_NUM_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
}
@@ -3487,7 +3603,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
trace_ext4_mb_new_inode_pa(ac, pa);
ext4_mb_use_inode_pa(ac, pa);
- atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+ atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
ei = EXT4_I(ac->ac_inode);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
@@ -3602,7 +3718,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- grp_blk_start = pa->pa_pstart - bit;
+ grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
end = bit + pa->pa_len;
@@ -3617,16 +3733,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
free += next - bit;
trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
- trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
- grp_blk_start + bit, next - bit);
+ trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
+ EXT4_C2B(sbi, bit)),
+ next - bit);
mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
bit = next + 1;
}
if (free != pa->pa_free) {
- printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
- pa, (unsigned long) pa->pa_lstart,
- (unsigned long) pa->pa_pstart,
- (unsigned long) pa->pa_len);
+ ext4_msg(e4b->bd_sb, KERN_CRIT,
+ "pa %p: logic %lu, phys. %lu, len %lu",
+ pa, (unsigned long) pa->pa_lstart,
+ (unsigned long) pa->pa_pstart,
+ (unsigned long) pa->pa_len);
ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
free, pa->pa_free);
/*
@@ -3699,7 +3817,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
}
if (needed == 0)
- needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
+ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
INIT_LIST_HEAD(&list);
repeat:
@@ -3733,11 +3851,7 @@ repeat:
if (free < needed && busy) {
busy = 0;
ext4_unlock_group(sb, group);
- /*
- * Yield the CPU here so that we don't get soft lockup
- * in non preempt case.
- */
- yield();
+ cond_resched();
goto repeat;
}
@@ -3814,7 +3928,8 @@ repeat:
* use preallocation while we're discarding it */
spin_unlock(&pa->pa_lock);
spin_unlock(&ei->i_prealloc_lock);
- printk(KERN_ERR "uh-oh! used pa while discarding\n");
+ ext4_msg(sb, KERN_ERR,
+ "uh-oh! used pa while discarding");
WARN_ON(1);
schedule_timeout_uninterruptible(HZ);
goto repeat;
@@ -3851,7 +3966,7 @@ repeat:
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
BUG_ON(pa->pa_type != MB_INODE_PA);
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
@@ -3881,34 +3996,23 @@ repeat:
}
}
-/*
- * finds all preallocated spaces and return blocks being freed to them
- * if preallocated space becomes full (no block is used from the space)
- * then the function frees space in buddy
- * XXX: at the moment, truncate (which is the only way to free blocks)
- * discards all preallocations
- */
-static void ext4_mb_return_to_preallocation(struct inode *inode,
- struct ext4_buddy *e4b,
- sector_t block, int count)
-{
- BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
-}
#ifdef CONFIG_EXT4_DEBUG
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
ext4_group_t ngroups, i;
- if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+ if (!ext4_mballoc_debug ||
+ (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
return;
- printk(KERN_ERR "EXT4-fs: Can't allocate:"
- " Allocation context details:\n");
- printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
+ ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
+ " Allocation context details:");
+ ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
ac->ac_status, ac->ac_flags);
- printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
- "best %lu/%lu/%lu@%lu cr %d\n",
+ ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
+ "goal %lu/%lu/%lu@%lu, "
+ "best %lu/%lu/%lu@%lu cr %d",
(unsigned long)ac->ac_o_ex.fe_group,
(unsigned long)ac->ac_o_ex.fe_start,
(unsigned long)ac->ac_o_ex.fe_len,
@@ -3922,9 +4026,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_len,
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
- printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
- ac->ac_found);
- printk(KERN_ERR "EXT4-fs: groups: \n");
+ ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
+ ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
ngroups = ext4_get_groups_count(sb);
for (i = 0; i < ngroups; i++) {
struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -3977,7 +4080,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
@@ -3988,6 +4091,11 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
return;
}
+ if (sbi->s_mb_group_prealloc <= 0) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+ }
+
/* don't use group allocation for large files */
size = max(size, isize);
if (size > sbi->s_mb_stream_request) {
@@ -4026,8 +4134,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
len = ar->len;
/* just a dirty hack to filter too big requests */
- if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
- len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
+ if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
+ len = EXT4_CLUSTERS_PER_GROUP(sb);
/* start searching from the goal */
goal = ar->goal;
@@ -4037,19 +4145,15 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ext4_get_group_no_and_offset(sb, goal, &group, &block);
/* set up allocation goals */
- memset(ac, 0, sizeof(struct ext4_allocation_context));
- ac->ac_b_ex.fe_logical = ar->logical;
+ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
ac->ac_inode = ar->inode;
- ac->ac_o_ex.fe_logical = ar->logical;
+ ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
ac->ac_o_ex.fe_group = group;
ac->ac_o_ex.fe_start = block;
ac->ac_o_ex.fe_len = len;
- ac->ac_g_ex.fe_logical = ar->logical;
- ac->ac_g_ex.fe_group = group;
- ac->ac_g_ex.fe_start = block;
- ac->ac_g_ex.fe_len = len;
+ ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
/* we have to define context: we'll we work with a file or
@@ -4123,7 +4227,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
- ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ group = ext4_get_group_number(sb, pa->pa_pstart);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
ext4_error(sb, "Error loading buddy information for %u",
group);
@@ -4161,7 +4265,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;
/* Add the prealloc space to lg */
- rcu_read_lock();
+ spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
pa_inode_list) {
spin_lock(&tmp_pa->pa_lock);
@@ -4185,12 +4289,12 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
if (!added)
list_add_tail_rcu(&pa->pa_inode_list,
&lg->lg_prealloc_list[order]);
- rcu_read_unlock();
+ spin_unlock(&lg->lg_prealloc_lock);
/* Now trim the list to be not more than 8 elements */
if (lg_prealloc_count > 8) {
ext4_mb_discard_lg_preallocations(sb, lg,
- order, lg_prealloc_count);
+ order, lg_prealloc_count);
return;
}
return ;
@@ -4201,27 +4305,25 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
if (pa->pa_type == MB_GROUP_PA) {
/* see comment in ext4_mb_use_group_pa() */
spin_lock(&pa->pa_lock);
- pa->pa_pstart += ac->ac_b_ex.fe_len;
- pa->pa_lstart += ac->ac_b_ex.fe_len;
+ pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
pa->pa_free -= ac->ac_b_ex.fe_len;
pa->pa_len -= ac->ac_b_ex.fe_len;
spin_unlock(&pa->pa_lock);
}
}
- if (ac->alloc_semp)
- up_read(ac->alloc_semp);
if (pa) {
/*
* We want to add the pa to the right bucket.
* Remove it from the list and while adding
* make sure the list to which we are adding
- * doesn't grow big. We need to release
- * alloc_semp before calling ext4_mb_add_n_trim()
+ * doesn't grow big.
*/
if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
spin_lock(pa->pa_obj_lock);
@@ -4271,38 +4373,53 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct super_block *sb;
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
- unsigned int reserv_blks = 0;
+ unsigned int reserv_clstrs = 0;
+ might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
+ /* Allow to use superuser reservation for quota file */
+ if (IS_NOQUOTA(ar->inode))
+ ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
+
/*
* For delayed allocation, we could skip the ENOSPC and
* EDQUOT check, as blocks and quotas have been already
* reserved when data being copied into pagecache.
*/
- if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
ar->flags |= EXT4_MB_DELALLOC_RESERVED;
else {
/* Without delayed allocation we need to verify
* there is enough free blocks to do block allocation
* and verify allocation doesn't exceed the quota limits.
*/
- while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+ while (ar->len &&
+ ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
+
/* let others to free the space */
- yield();
+ cond_resched();
ar->len = ar->len >> 1;
}
if (!ar->len) {
*errp = -ENOSPC;
return 0;
}
- reserv_blks = ar->len;
- while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
- ar->flags |= EXT4_MB_HINT_NOPREALLOC;
- ar->len--;
+ reserv_clstrs = ar->len;
+ if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
+ dquot_alloc_block_nofail(ar->inode,
+ EXT4_C2B(sbi, ar->len));
+ } else {
+ while (ar->len &&
+ dquot_alloc_block(ar->inode,
+ EXT4_C2B(sbi, ar->len))) {
+
+ ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+ ar->len--;
+ }
}
inquota = ar->len;
if (ar->len == 0) {
@@ -4311,7 +4428,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
}
- ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+ ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
ar->len = 0;
*errp = -ENOMEM;
@@ -4332,17 +4449,22 @@ repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
if (*errp)
- goto errout;
+ goto discard_and_exit;
/* as we've just preallocated more space than
- * user requested orinally, we store allocated
+ * user requested originally, we store allocated
* space in a special descriptor */
if (ac->ac_status == AC_STATUS_FOUND &&
- ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
- ext4_mb_new_preallocation(ac);
+ ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+ *errp = ext4_mb_new_preallocation(ac);
+ if (*errp) {
+ discard_and_exit:
+ ext4_discard_allocated_blocks(ac);
+ goto errout;
+ }
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
+ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
if (*errp == -EAGAIN) {
/*
* drop the reference that we took
@@ -4354,10 +4476,10 @@ repeat:
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
goto repeat;
- } else if (*errp)
- errout:
+ } else if (*errp) {
ext4_discard_allocated_blocks(ac);
- else {
+ goto errout;
+ } else {
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
ar->len = ac->ac_b_ex.fe_len;
}
@@ -4368,6 +4490,7 @@ repeat:
*errp = -ENOSPC;
}
+errout:
if (*errp) {
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
@@ -4378,12 +4501,13 @@ out:
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
if (inquota && ar->len < inquota)
- dquot_free_block(ar->inode, inquota - ar->len);
+ dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
if (!ar->len) {
- if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ if (!ext4_test_inode_state(ar->inode,
+ EXT4_STATE_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
- reserv_blks);
+ percpu_counter_sub(&sbi->s_dirtyclusters_counter,
+ reserv_clstrs);
}
trace_ext4_allocate_blocks(ar, (unsigned long long)block);
@@ -4399,9 +4523,9 @@ out:
static int can_merge(struct ext4_free_data *entry1,
struct ext4_free_data *entry2)
{
- if ((entry1->t_tid == entry2->t_tid) &&
- (entry1->group == entry2->group) &&
- ((entry1->start_blk + entry1->count) == entry2->start_blk))
+ if ((entry1->efd_tid == entry2->efd_tid) &&
+ (entry1->efd_group == entry2->efd_group) &&
+ ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
return 1;
return 0;
}
@@ -4411,7 +4535,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_free_data *new_entry)
{
ext4_group_t group = e4b->bd_group;
- ext4_grpblk_t block;
+ ext4_grpblk_t cluster;
struct ext4_free_data *entry;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
@@ -4423,8 +4547,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
- new_node = &new_entry->node;
- block = new_entry->start_blk;
+ new_node = &new_entry->efd_node;
+ cluster = new_entry->efd_start_cluster;
if (!*n) {
/* first free block exent. We need to
@@ -4437,14 +4561,15 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
while (*n) {
parent = *n;
- entry = rb_entry(parent, struct ext4_free_data, node);
- if (block < entry->start_blk)
+ entry = rb_entry(parent, struct ext4_free_data, efd_node);
+ if (cluster < entry->efd_start_cluster)
n = &(*n)->rb_left;
- else if (block >= (entry->start_blk + entry->count))
+ else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
n = &(*n)->rb_right;
else {
ext4_grp_locked_error(sb, group, 0,
- ext4_group_first_block_no(sb, group) + block,
+ ext4_group_first_block_no(sb, group) +
+ EXT4_C2B(sbi, cluster),
"Block already on to-be-freed list");
return 0;
}
@@ -4456,34 +4581,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
/* Now try to see the extent can be merged to left and right */
node = rb_prev(new_node);
if (node) {
- entry = rb_entry(node, struct ext4_free_data, node);
- if (can_merge(entry, new_entry)) {
- new_entry->start_blk = entry->start_blk;
- new_entry->count += entry->count;
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
+ if (can_merge(entry, new_entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
+ new_entry->efd_start_cluster = entry->efd_start_cluster;
+ new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- spin_lock(&sbi->s_md_lock);
- list_del(&entry->list);
- spin_unlock(&sbi->s_md_lock);
- kmem_cache_free(ext4_free_ext_cachep, entry);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}
node = rb_next(new_node);
if (node) {
- entry = rb_entry(node, struct ext4_free_data, node);
- if (can_merge(new_entry, entry)) {
- new_entry->count += entry->count;
+ entry = rb_entry(node, struct ext4_free_data, efd_node);
+ if (can_merge(new_entry, entry) &&
+ ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
+ new_entry->efd_count += entry->efd_count;
rb_erase(node, &(db->bb_free_root));
- spin_lock(&sbi->s_md_lock);
- list_del(&entry->list);
- spin_unlock(&sbi->s_md_lock);
- kmem_cache_free(ext4_free_ext_cachep, entry);
+ kmem_cache_free(ext4_free_data_cachep, entry);
}
}
/* Add the extent to transaction's private list */
- spin_lock(&sbi->s_md_lock);
- list_add(&new_entry->list, &handle->h_transaction->t_private_list);
- spin_unlock(&sbi->s_md_lock);
+ ext4_journal_callback_add(handle, ext4_free_data_callback,
+ &new_entry->efd_jce);
return 0;
}
@@ -4493,7 +4613,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
* @inode: inode
* @block: start physical block to free
* @count: number of blocks to count
- * @metadata: Are these metadata blocks
+ * @flags: flags used by ext4_free_blocks
*/
void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
@@ -4502,16 +4622,18 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
struct ext4_group_desc *gdp;
- unsigned long freed = 0;
unsigned int overflow;
ext4_grpblk_t bit;
struct buffer_head *gd_bh;
ext4_group_t block_group;
struct ext4_sb_info *sbi;
+ struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_buddy e4b;
+ unsigned int count_clusters;
int err = 0;
int ret;
+ might_sleep();
if (bh) {
if (block)
BUG_ON(block != bh->b_blocknr);
@@ -4537,10 +4659,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
BUG_ON(bh && (count > 1));
for (i = 0; i < count; i++) {
+ cond_resched();
if (!bh)
tbh = sb_find_get_block(inode->i_sb,
block + i);
- if (unlikely(!tbh))
+ if (!tbh)
continue;
ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
inode, tbh, block + i);
@@ -4557,18 +4680,56 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
if (!ext4_should_writeback_data(inode))
flags |= EXT4_FREE_BLOCKS_METADATA;
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ }
+
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
+ ext4_get_group_info(sb, block_group))))
+ return;
+
/*
* Check to see if we are freeing blocks across a group
* boundary.
*/
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ overflow = EXT4_C2B(sbi, bit) + count -
+ EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
+ count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh) {
err = -EIO;
@@ -4583,9 +4744,9 @@ do_more:
if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
in_range(block, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group) ||
+ EXT4_SB(sb)->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group)) {
+ EXT4_SB(sb)->s_itb_per_group)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
@@ -4610,11 +4771,11 @@ do_more:
#ifdef AGGRESSIVE_CHECK
{
int i;
- for (i = 0; i < count; i++)
+ for (i = 0; i < count_clusters; i++)
BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
}
#endif
- trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
+ trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
err = ext4_mb_load_buddy(sb, block_group, &e4b);
if (err)
@@ -4626,40 +4787,74 @@ do_more:
* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed
*/
- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
- new_entry->start_blk = bit;
- new_entry->group = block_group;
- new_entry->count = count;
- new_entry->t_tid = handle->h_transaction->t_tid;
+ retry:
+ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
+ if (!new_entry) {
+ /*
+ * We use a retry loop because
+ * ext4_free_blocks() is not allowed to fail.
+ */
+ cond_resched();
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry;
+ }
+ new_entry->efd_start_cluster = bit;
+ new_entry->efd_group = block_group;
+ new_entry->efd_count = count_clusters;
+ new_entry->efd_tid = handle->h_transaction->t_tid;
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
ext4_mb_free_metadata(handle, &e4b, new_entry);
} else {
/* need to update group_info->bb_free and bitmap
* with group lock held. generate_buddy look at
* them with group lock_held
*/
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, block_group, bit, count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%lu failed"
+ " with %d", block_group, bit, count,
+ err);
+ } else
+ EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
+
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
- mb_free_blocks(inode, &e4b, bit, count);
- ext4_mb_return_to_preallocation(inode, &e4b, block, count);
+ mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
+ mb_free_blocks(inode, &e4b, bit, count_clusters);
}
- ret = ext4_free_blks_count(sb, gdp) + count;
- ext4_free_blks_set(sb, gdp, ret);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
+ ext4_free_group_clusters_set(sb, gdp, ret);
+ ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic64_add(count_clusters,
+ &sbi->s_flex_groups[flex_group].free_clusters);
}
- ext4_mb_unload_buddy(&e4b);
+ if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) {
+ percpu_counter_add(&sbi->s_dirtyclusters_counter,
+ count_clusters);
+ spin_lock(&ei->i_block_reservation_lock);
+ if (flags & EXT4_FREE_BLOCKS_METADATA)
+ ei->i_reserved_meta_blocks += count_clusters;
+ else
+ ei->i_reserved_data_blocks += count_clusters;
+ spin_unlock(&ei->i_block_reservation_lock);
+ if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_reclaim_block(inode,
+ EXT4_C2B(sbi, count_clusters));
+ } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
+ dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
+ percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
- freed += count;
+ ext4_mb_unload_buddy(&e4b);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4677,16 +4872,147 @@ do_more:
put_bh(bitmap_bh);
goto do_more;
}
- ext4_mark_super_dirty(sb);
error_return:
- if (freed)
- dquot_free_block(inode, freed);
brelse(bitmap_bh);
ext4_std_error(sb, err);
return;
}
/**
+ * ext4_group_add_blocks() -- Add given blocks to an existing group
+ * @handle: handle to this transaction
+ * @sb: super block
+ * @block: start physical block to add to the block group
+ * @count: number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *gd_bh;
+ ext4_group_t block_group;
+ ext4_grpblk_t bit;
+ unsigned int i;
+ struct ext4_group_desc *desc;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_buddy e4b;
+ int err = 0, ret, blk_free_count;
+ ext4_grpblk_t blocks_freed;
+
+ ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+ if (count == 0)
+ return 0;
+
+ ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ */
+ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ ext4_warning(sb, "too much blocks added to group %u\n",
+ block_group);
+ err = -EINVAL;
+ goto error_return;
+ }
+
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+ if (!bitmap_bh) {
+ err = -EIO;
+ goto error_return;
+ }
+
+ desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+ if (!desc) {
+ err = -EIO;
+ goto error_return;
+ }
+
+ if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+ in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+ in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+ in_range(block + count - 1, ext4_inode_table(sb, desc),
+ sbi->s_itb_per_group)) {
+ ext4_error(sb, "Adding blocks in system zones - "
+ "Block = %llu, count = %lu",
+ block, count);
+ err = -EINVAL;
+ goto error_return;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (err)
+ goto error_return;
+
+ /*
+ * We are about to modify some metadata. Call the journal APIs
+ * to unshare ->b_data if a currently-committing transaction is
+ * using it
+ */
+ BUFFER_TRACE(gd_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gd_bh);
+ if (err)
+ goto error_return;
+
+ for (i = 0, blocks_freed = 0; i < count; i++) {
+ BUFFER_TRACE(bitmap_bh, "clear bit");
+ if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+ ext4_error(sb, "bit already cleared for block %llu",
+ (ext4_fsblk_t)(block + i));
+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ } else {
+ blocks_freed++;
+ }
+ }
+
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
+
+ /*
+ * need to update group_info->bb_free and bitmap
+ * with group lock held. generate_buddy look at
+ * them with group lock_held
+ */
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_free_blocks(NULL, &e4b, bit, count);
+ blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
+ ext4_free_group_clusters_set(sb, desc, blk_free_count);
+ ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
+ ext4_group_desc_csum_set(sb, block_group, desc);
+ ext4_unlock_group(sb, block_group);
+ percpu_counter_add(&sbi->s_freeclusters_counter,
+ EXT4_NUM_B2C(sbi, blocks_freed));
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+ &sbi->s_flex_groups[flex_group].free_clusters);
+ }
+
+ ext4_mb_unload_buddy(&e4b);
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+ /* And the group descriptor block */
+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+ ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+ if (!err)
+ err = ret;
+
+error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, err);
+ return err;
+}
+
+/**
* ext4_trim_extent -- function to TRIM one single free extent in the group
* @sb: super block for the file system
* @start: starting block of the free extent in the alloc. group
@@ -4699,11 +5025,15 @@ error_return:
* be called with under the group lock.
*/
static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+ ext4_group_t group, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
{
struct ext4_free_extent ex;
int ret = 0;
+ trace_ext4_trim_extent(sb, group, start, count);
+
assert_spin_locked(ext4_group_lock_ptr(sb, group));
ex.fe_start = start;
@@ -4716,11 +5046,7 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
-
ret = ext4_issue_discard(sb, group, start, count);
- if (ret)
- ext4_std_error(sb, ret);
-
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
return ret;
@@ -4729,7 +5055,7 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
/**
* ext4_trim_all_free -- function to trim all free space in alloc. group
* @sb: super block for file system
- * @e4b: ext4 buddy
+ * @group: group to be trimmed
* @start: first group block to examine
* @max: last group block to examine
* @minblocks: minimum extent block count
@@ -4744,35 +5070,49 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
* bitmap. Then issue a TRIM command on this extent and free the extent in
* the group buddy bitmap. This is done until whole group is scanned.
*/
-ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
- ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+ ext4_grpblk_t start, ext4_grpblk_t max,
+ ext4_grpblk_t minblocks)
{
void *bitmap;
- ext4_grpblk_t next, count = 0;
- ext4_group_t group;
+ ext4_grpblk_t next, count = 0, free_count = 0;
+ struct ext4_buddy e4b;
int ret = 0;
- BUG_ON(e4b == NULL);
+ trace_ext4_trim_all_free(sb, group, start, max);
+
+ ret = ext4_mb_load_buddy(sb, group, &e4b);
+ if (ret) {
+ ext4_error(sb, "Error in loading buddy "
+ "information for %u", group);
+ return ret;
+ }
+ bitmap = e4b.bd_bitmap;
- bitmap = e4b->bd_bitmap;
- group = e4b->bd_group;
- start = (e4b->bd_info->bb_first_free > start) ?
- e4b->bd_info->bb_first_free : start;
ext4_lock_group(sb, group);
+ if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
+ minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
+ goto out;
- while (start < max) {
- start = mb_find_next_zero_bit(bitmap, max, start);
- if (start >= max)
+ start = (e4b.bd_info->bb_first_free > start) ?
+ e4b.bd_info->bb_first_free : start;
+
+ while (start <= max) {
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
+ if (start > max)
break;
- next = mb_find_next_bit(bitmap, max, start);
+ next = mb_find_next_bit(bitmap, max + 1, start);
if ((next - start) >= minblocks) {
ret = ext4_trim_extent(sb, start,
- next - start, group, e4b);
- if (ret < 0)
+ next - start, group, &e4b);
+ if (ret && ret != -EOPNOTSUPP)
break;
+ ret = 0;
count += next - start;
}
+ free_count += next - start;
start = next + 1;
if (fatal_signal_pending(current)) {
@@ -4786,18 +5126,22 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
ext4_lock_group(sb, group);
}
- if ((e4b->bd_info->bb_free - count) < minblocks)
+ if ((e4b.bd_info->bb_free - free_count) < minblocks)
break;
}
+
+ if (!ret) {
+ ret = count;
+ EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ }
+out:
ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
ext4_debug("trimmed %d blocks in the group %d\n",
count, group);
- if (ret < 0)
- count = ret;
-
- return count;
+ return ret;
}
/**
@@ -4814,59 +5158,79 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
- struct ext4_buddy e4b;
- ext4_group_t first_group, last_group;
- ext4_group_t group, ngroups = ext4_get_groups_count(sb);
- ext4_grpblk_t cnt = 0, first_block, last_block;
- uint64_t start, len, minlen, trimmed;
+ struct ext4_group_info *grp;
+ ext4_group_t group, first_group, last_group;
+ ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
+ uint64_t start, end, minlen, trimmed = 0;
+ ext4_fsblk_t first_data_blk =
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+ ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
int ret = 0;
start = range->start >> sb->s_blocksize_bits;
- len = range->len >> sb->s_blocksize_bits;
- minlen = range->minlen >> sb->s_blocksize_bits;
- trimmed = 0;
+ end = start + (range->len >> sb->s_blocksize_bits) - 1;
+ minlen = EXT4_NUM_B2C(EXT4_SB(sb),
+ range->minlen >> sb->s_blocksize_bits);
- if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+ if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
+ start >= max_blks ||
+ range->len < sb->s_blocksize)
return -EINVAL;
+ if (end >= max_blks)
+ end = max_blks - 1;
+ if (end <= first_data_blk)
+ goto out;
+ if (start < first_data_blk)
+ start = first_data_blk;
- /* Determine first and last group to examine based on start and len */
+ /* Determine first and last group to examine based on start and end */
ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
- &first_group, &first_block);
- ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
- &last_group, &last_block);
- last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
- last_block = EXT4_BLOCKS_PER_GROUP(sb);
+ &first_group, &first_cluster);
+ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
+ &last_group, &last_cluster);
- if (first_group > last_group)
- return -EINVAL;
+ /* end now represents the last cluster to discard in this group */
+ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
for (group = first_group; group <= last_group; group++) {
- ret = ext4_mb_load_buddy(sb, group, &e4b);
- if (ret) {
- ext4_error(sb, "Error in loading buddy "
- "information for %u", group);
- break;
+ grp = ext4_get_group_info(sb, group);
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ break;
}
- if (len >= EXT4_BLOCKS_PER_GROUP(sb))
- len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
- else
- last_block = len;
+ /*
+ * For all the groups except the last one, last cluster will
+ * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
+ * change it for the last group, note that last_cluster is
+ * already computed earlier by ext4_get_group_no_and_offset()
+ */
+ if (group == last_group)
+ end = last_cluster;
- if (e4b.bd_info->bb_free >= minlen) {
- cnt = ext4_trim_all_free(sb, &e4b, first_block,
- last_block, minlen);
+ if (grp->bb_free >= minlen) {
+ cnt = ext4_trim_all_free(sb, group, first_cluster,
+ end, minlen);
if (cnt < 0) {
ret = cnt;
- ext4_mb_unload_buddy(&e4b);
break;
}
+ trimmed += cnt;
}
- ext4_mb_unload_buddy(&e4b);
- trimmed += cnt;
- first_block = 0;
+
+ /*
+ * For every group except the first one, we are sure
+ * that the first cluster to discard will be cluster #0.
+ */
+ first_cluster = 0;
}
- range->len = trimmed * sb->s_blocksize;
+ if (!ret)
+ atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+
+out:
+ range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
return ret;
}