aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-07-15 08:36:38 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2008-07-15 08:36:38 -0700
commit8d2567a620ae8c24968a2bdc1c906c724fac1f6a (patch)
tree8e228abbadbe760e3f015d30c2e1180a67eeb8f9
parentbcf559e385ba099996c90469c817d2eb38aba418 (diff)
parent49f1487b2e41bd8439ea39a4f15b4064e823cc54 (diff)
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (61 commits) ext4: Documention update for new ordered mode and delayed allocation ext4: do not set extents feature from the kernel ext4: Don't allow nonextenst mount option for large filesystem ext4: Enable delalloc by default. ext4: delayed allocation i_blocks fix for stat ext4: fix delalloc i_disksize early update issue ext4: Handle page without buffers in ext4_*_writepage() ext4: Add ordered mode support for delalloc ext4: Invert lock ordering of page_lock and transaction start in delalloc mm: Add range_cont mode for writeback ext4: delayed allocation ENOSPC handling percpu_counter: new function percpu_counter_sum_and_set ext4: Add delayed allocation support in data=writeback mode vfs: add hooks for ext4's delayed allocation support jbd2: Remove data=ordered mode support using jbd buffer heads ext4: Use new framework for data=ordered mode in JBD2 jbd2: Implement data=ordered mode handling via inodes vfs: export filemap_fdatawrite_range() ext4: Fix lock inversion in ext4_ext_truncate() ext4: Invert the locking order of page_lock and transaction start ...
-rw-r--r--Documentation/filesystems/ext4.txt125
-rw-r--r--fs/buffer.c19
-rw-r--r--fs/ext4/balloc.c209
-rw-r--r--fs/ext4/dir.c17
-rw-r--r--fs/ext4/ext4.h61
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h10
-rw-r--r--fs/ext4/ext4_jbd2.h21
-rw-r--r--fs/ext4/ext4_sb.h5
-rw-r--r--fs/ext4/extents.c111
-rw-r--r--fs/ext4/file.c20
-rw-r--r--fs/ext4/fsync.c4
-rw-r--r--fs/ext4/group.h2
-rw-r--r--fs/ext4/ialloc.c113
-rw-r--r--fs/ext4/inode.c1591
-rw-r--r--fs/ext4/mballoc.c451
-rw-r--r--fs/ext4/namei.c45
-rw-r--r--fs/ext4/resize.c52
-rw-r--r--fs/ext4/super.c142
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr_trusted.c4
-rw-r--r--fs/ext4/xattr_user.c4
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/jbd2/commit.c294
-rw-r--r--fs/jbd2/journal.c53
-rw-r--r--fs/jbd2/transaction.c365
-rw-r--r--fs/mpage.c14
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/jbd2.h73
-rw-r--r--include/linux/mpage.h10
-rw-r--r--include/linux/percpu_counter.h12
-rw-r--r--include/linux/writeback.h1
-rw-r--r--lib/percpu_counter.c7
-rw-r--r--mm/filemap.c3
-rw-r--r--mm/page-writeback.c3
35 files changed, 2805 insertions, 1042 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0c5086db835..80e193d82e2 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -13,72 +13,93 @@ Mailing list: linux-ext4@vger.kernel.org
1. Quick usage instructions:
===========================
- - Grab updated e2fsprogs from
- ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
- This is a patchset on top of e2fsprogs-1.39, which can be found at
+ - Compile and install the latest version of e2fsprogs (as of this
+ writing version 1.41) from:
+
+ http://sourceforge.net/project/showfiles.php?group_id=2406
+
+ or
+
ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
- - It's still mke2fs -j /dev/hda1
+ or grab the latest git repository from:
+
+ git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+
+ - Create a new filesystem using the ext4dev filesystem type:
+
+ # mke2fs -t ext4dev /dev/hda1
+
+ Or configure an existing ext3 filesystem to support extents and set
+ the test_fs flag to indicate that it's ok for an in-development
+ filesystem to touch this filesystem:
- - mount /dev/hda1 /wherever -t ext4dev
+ # tune2fs -O extents -E test_fs /dev/hda1
- - To enable extents,
+ If the filesystem was created with 128 byte inodes, it can be
+ converted to use 256 byte for greater efficiency via:
- mount /dev/hda1 /wherever -t ext4dev -o extents
+ # tune2fs -I 256 /dev/hda1
- - The filesystem is compatible with the ext3 driver until you add a file
- which has extents (ie: `mount -o extents', then create a file).
+ (Note: we currently do not have tools to convert an ext4dev
+ filesystem back to ext3; so please do not do try this on production
+ filesystems.)
- NOTE: The "extents" mount flag is temporary. It will soon go away and
- extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
+ - Mounting:
+
+ # mount -t ext4dev /dev/hda1 /wherever
- When comparing performance with other filesystems, remember that
- ext3/4 by default offers higher data integrity guarantees than most. So
- when comparing with a metadata-only journalling filesystem, use `mount -o
- data=writeback'. And you might as well use `mount -o nobh' too along
- with it. Making the journal larger than the mke2fs default often helps
- performance with metadata-intensive workloads.
+ ext3/4 by default offers higher data integrity guarantees than most.
+ So when comparing with a metadata-only journalling filesystem, such
+ as ext3, use `mount -o data=writeback'. And you might as well use
+ `mount -o nobh' too along with it. Making the journal larger than
+ the mke2fs default often helps performance with metadata-intensive
+ workloads.
2. Features
===========
2.1 Currently available
-* ability to use filesystems > 16TB
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
* extent format reduces metadata overhead (RAM, IO for access, transactions)
* extent format more robust in face of on-disk corruption due to magics,
* internal redunancy in tree
-
-2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
-
-* dir_index and resize inode will be on by default
-* large inodes will be used by default for fast EAs, nsec timestamps, etc
+* improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
+* nsec timestamps for mtime, atime, ctime, create time
+* inode version field on disk (NFSv4, Lustre)
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+ flex_bg feature
+* large file support
+* Inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
+ the ordering)
2.2 Candidate features for future inclusion
-There are several under discussion, whether they all make it in is
-partly a function of how much time everyone has to work on them:
+* Online defrag (patches available but not well tested)
+* reduced mke2fs time via lazy itable initialization in conjuction with
+ the uninit_bg feature (capability to do this is available in e2fsprogs
+ but a kernel thread to do lazy zeroing of unused inode table blocks
+ after filesystem is first mounted is required for safety)
-* improved file allocation (multi-block alloc, delayed alloc; basically done)
-* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
-* nsec timestamps for mtime, atime, ctime, create time (patch exists,
- needs some e2fsck work)
-* inode version field on disk (NFSv4, Lustre; prototype exists)
-* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
-* journal checksumming for robustness, performance (prototype exists)
-* persistent file preallocation (e.g for streaming media, databases)
+There are several others under discussion, whether they all make it in is
+partly a function of how much time everyone has to work on them. Features like
+metadata checksumming have been discussed and planned for a bit but no patches
+exist yet so I'm not sure they're in the near-term roadmap.
-Features like metadata checksumming have been discussed and planned for
-a bit but no patches exist yet so I'm not sure they're in the near-term
-roadmap.
+The big performance win will come with mballoc, delalloc and flex_bg
+grouping of bitmaps and inode tables. Some test results available here:
-The big performance win will come with mballoc and delalloc. CFS has
-been using mballoc for a few years already with Lustre, and IBM + Bull
-did a lot of benchmarking on it. The reason it isn't in the first set of
-patches is partly a manageability issue, and partly because it doesn't
-directly affect the on-disk format (outside of much better allocation)
-so it isn't critical to get into the first round of changes. I believe
-Alex is working on a new set of patches right now.
+ - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
+ - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
3. Options
==========
@@ -222,9 +243,11 @@ stripe=n Number of filesystem blocks that mballoc will try
to use for allocation size and alignment. For RAID5/6
systems this should be the number of data
disks * RAID chunk size in file system blocks.
-
+delalloc (*) Deferring block allocation until write-out time.
+nodelalloc Disable delayed allocation. Blocks are allocation
+ when data is copied from user to page cache.
Data Mode
----------
+=========
There are 3 different data modes:
* writeback mode
@@ -236,10 +259,10 @@ typically provide the best ext4 performance.
* ordered mode
In data=ordered mode, ext4 only officially journals metadata, but it logically
-groups metadata and data blocks into a single unit called a transaction. When
-it's time to write the new metadata out to disk, the associated data blocks
-are written first. In general, this mode performs slightly slower than
-writeback but significantly faster than journal mode.
+groups metadata information related to data changes with the data blocks into a
+single unit called a transaction. When it's time to write the new metadata
+out to disk, the associated data blocks are written first. In general,
+this mode performs slightly slower than writeback but significantly faster than journal mode.
* journal mode
data=journal mode provides full data and metadata journaling. All new data is
@@ -247,7 +270,8 @@ written to the journal first, and then to its final location.
In the event of a crash, the journal can be replayed, bringing both data and
metadata into a consistent state. This mode is the slowest except when data
needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all others modes. Curently ext4 does not have delayed
+allocation support if this data journalling mode is selected.
References
==========
@@ -256,7 +280,8 @@ kernel source: <file:fs/ext4/>
<file:fs/jbd2/>
programs: http://e2fsprogs.sourceforge.net/
- http://ext2resize.sourceforge.net
useful links: http://fedoraproject.org/wiki/ext3-devel
http://www.bullopensource.org/ext4/
+ http://ext4.wiki.kernel.org/index.php/Main_Page
+ http://fedoraproject.org/wiki/Features/Ext4
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c26..5fa1512cd9a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+ buffer_dirty(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
goto recover;
+ clear_buffer_delay(bh);
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
+ !buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ int i_size_changed = 0;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
*/
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
+ i_size_changed = 1;
}
unlock_page(page);
page_cache_release(page);
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
return copied;
}
EXPORT_SYMBOL(generic_write_end);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d..495ab21b983 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
ext4_group_t block_group)
{
ext4_group_t actual_group;
- ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
if (actual_group == block_group)
return 1;
return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
- int group_rel = (block_group -
- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
- EXT4_DESC_PER_BLOCK(sb);
- if (group_rel == 0 || group_rel == 1 ||
- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
- bit_max += 1;
+ bit_max += ext4_bg_num_gdb(sb, block_group);
}
if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
return 0;
}
/**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
* @sb: super block
* @block_group: given block group
*
@@ -305,7 +300,7 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc * desc;
struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
prev = rsv;
}
printk("Window map complete.\n");
- if (bad)
- BUG();
+ BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
__rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
count -= overflow;
}
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
/**
* ext4_has_free_blocks()
- * @sbi: in-core super block structure.
+ * @sbi: in-core super block structure.
+ * @nblocks: number of neeed blocks
*
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks avaible for allocation for this request
+ * On success, return nblocks
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks)
{
- ext4_fsblk_t free_blocks, root_blocks;
+ ext4_fsblk_t free_blocks;
+ ext4_fsblk_t root_blocks = 0;
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+ if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- return 0;
- }
- return 1;
-}
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+ if (free_blocks - root_blocks < FBC_BATCH)
+ free_blocks =
+ percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+ if (free_blocks - root_blocks < nblocks)
+ return free_blocks - root_blocks;
+ return nblocks;
+ }
+
/**
* ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
}
/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
* @count: target number of blocks to allocate
* @errp: error code
*
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and look up
+ * the block bitmap directly to do block allocation. It tries to
+ * allocate block(s) from the block group contains the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when -o nomballoc mount option is enabled
*
*/
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
ext4_group_t ngroups;
unsigned long num = *count;
- *errp = -ENOSPC;
sb = inode->i_sb;
if (!sb) {
+ *errp = -ENODEV;
printk("ext4_new_block: nonexistent device");
return 0;
}
+ sbi = EXT4_SB(sb);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ *count = ext4_has_free_blocks(sbi, *count);
+ }
+ if (*count == 0) {
+ *errp = -ENOSPC;
+ return 0; /*return with ENOSPC error */
+ }
+ num = *count;
+
/*
* Check quota for allocation of this block.
*/
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
my_rsv = &block_i->rsv_window_node;
- if (!ext4_has_free_blocks(sbi)) {
- *errp = -ENOSPC;
- goto out;
- }
-
/*
* First, test whether the goal block is free.
*/
@@ -1734,7 +1759,7 @@ retry_alloc:
my_rsv = NULL;
if (free_blocks > 0) {
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
continue;
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
/*
@@ -1882,7 +1907,15 @@ allocated:
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= num;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
return 0;
}
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK 0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp, int flags)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
+ return ext4_old_new_blocks(handle, inode, goal, count, errp);
}
memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
+
ar.inode = inode;
ar.goal = goal;
- ar.len = 1;
+ ar.len = *count;
+ ar.logical = iblock;
+
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+ /* enable in-core preallocation for data block allocation */
+ ar.flags = EXT4_MB_HINT_DATA;
+ else
+ /* disable in-core preallocation for non-regular files */
+ ar.flags = 0;
+
ret = ext4_mb_new_blocks(handle, &ar, errp);
+ *count = ar.len;
return ret;
}
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block numberon success, *count stores total account
+ * error stores in errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
- struct ext4_allocation_request ar;
ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
+ ret = do_blk_alloc(handle, inode, 0, goal,
+ count, errp, EXT4_META_BLOCK);
+ /*
+ * Account for the allocated meta blocks
+ */
+ if (!(*errp)) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += *count;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
return ret;
}
+/*
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @errp: error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+ return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block numberon success, *count stores total account
+ * error stores in errp pointer
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
/**
* ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
continue;
desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, i);
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea19..d3d23d73c08 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+ 0, 0, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
while (n) {
/* Do the node's children first */
- if ((n)->rb_left) {
+ if (n->rb_left) {
n = n->rb_left;
continue;
}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
parent->rb_right = NULL;
n = parent;
}
- root->rb_node = NULL;
}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
p->curr_hash = pos2maj_hash(pos);
p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
return p;
}
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac..303e41cf7b1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
#include "ext4_i.h"
/*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
*/
/*
@@ -45,7 +45,7 @@
#define ext4_debug(f, a...) \
do { \
printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
- __FILE__, __LINE__, __FUNCTION__); \
+ __FILE__, __LINE__, __func__); \
printk (KERN_DEBUG f, ## a); \
} while (0)
#else
@@ -74,6 +74,9 @@
#define EXT4_MB_HINT_GOAL_ONLY 256
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL 512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED 1024
+
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
__u32 bg_reserved2[3];
};
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ __u32 free_inodes;
+ __u32 free_blocks;
+};
+
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);