diff options
-rw-r--r-- | Documentation/ABI/testing/sysfs-fs-ext4 | 81 | ||||
-rw-r--r-- | Documentation/filesystems/ext4.txt | 30 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 21 | ||||
-rw-r--r-- | fs/ext4/balloc.c | 14 | ||||
-rw-r--r-- | fs/ext4/dir.c | 16 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 91 | ||||
-rw-r--r-- | fs/ext4/ext4_extents.h | 1 | ||||
-rw-r--r-- | fs/ext4/ext4_i.h | 6 | ||||
-rw-r--r-- | fs/ext4/ext4_sb.h | 14 | ||||
-rw-r--r-- | fs/ext4/extents.c | 127 | ||||
-rw-r--r-- | fs/ext4/file.c | 7 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 273 | ||||
-rw-r--r-- | fs/ext4/inode.c | 424 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 17 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 158 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 8 | ||||
-rw-r--r-- | fs/ext4/namei.c | 164 | ||||
-rw-r--r-- | fs/ext4/resize.c | 8 | ||||
-rw-r--r-- | fs/ext4/super.c | 327 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 5 | ||||
-rw-r--r-- | fs/jbd2/revoke.c | 24 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 2 | ||||
-rw-r--r-- | include/linux/jbd2.h | 6 |
23 files changed, 1224 insertions, 600 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4 new file mode 100644 index 00000000000..4e79074de28 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-ext4 @@ -0,0 +1,81 @@ +What: /sys/fs/ext4/<disk>/mb_stats +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + Controls whether the multiblock allocator should + collect statistics, which are shown during the unmount. + 1 means to collect statistics, 0 means not to collect + statistics + +What: /sys/fs/ext4/<disk>/mb_group_prealloc +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + The multiblock allocator will round up allocation + requests to a multiple of this tuning parameter if the + stripe size is not set in the ext4 superblock + +What: /sys/fs/ext4/<disk>/mb_max_to_scan +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + The maximum number of extents the multiblock allocator + will search to find the best extent + +What: /sys/fs/ext4/<disk>/mb_min_to_scan +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + The minimum number of extents the multiblock allocator + will search to find the best extent + +What: /sys/fs/ext4/<disk>/mb_order2_req +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + Tuning parameter which controls the minimum size for + requests (as a power of 2) where the buddy cache is + used + +What: /sys/fs/ext4/<disk>/mb_stream_req +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + Files which have fewer blocks than this tunable + parameter will have their blocks allocated out of a + block group specific preallocation pool, so that small + files are packed closely together. Each large file + will have its blocks allocated out of its own unique + preallocation pool. 
+ +What: /sys/fs/ext4/<disk>/inode_readahead +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + Tuning parameter which controls the maximum number of + inode table blocks that ext4's inode table readahead + algorithm will pre-read into the buffer cache + +What: /sys/fs/ext4/<disk>/delayed_allocation_blocks +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + This file is read-only and shows the number of blocks + that are dirty in the page cache, but which do not + have their location in the filesystem allocated yet. + +What: /sys/fs/ext4/<disk>/lifetime_write_kbytes +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + This file is read-only and shows the number of kilobytes + of data that have been written to this filesystem since it was + created. + +What: /sys/fs/ext4/<disk>/session_write_kbytes +Date: March 2008 +Contact: "Theodore Ts'o" <tytso@mit.edu> +Description: + This file is read-only and shows the number of + kilobytes of data that have been written to this + filesystem since it was mounted. 
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index cec829bc729..97882df0486 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be * extent format more robust in face of on-disk corruption due to magics, * internal redundancy in tree * improved file allocation (multi-block alloc) -* fix 32000 subdirectory limit +* lift 32000 subdirectory limit imposed by i_links_count[1] * nsec timestamps for mtime, atime, ctime, create time * inode version field on disk (NFSv4, Lustre) * reduced e2fsck time via uninit_bg feature @@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force the ordering) +[1] Filesystems with a block size of 1k may see a limit imposed by the +directory hash tree having a maximum depth of two. + 2.2 Candidate features for future inclusion * Online defrag (patches available but not well tested) @@ -180,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata performance. barrier=<0|1(*)> This enables/disables the use of write barriers in - the jbd code. barrier=0 disables, barrier=1 enables. - This also requires an IO stack which can support +barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. +nobarrier This also requires an IO stack which can support barriers, and if jbd gets an error on a barrier write, it will disable again with a warning. Write barriers enforce proper on-disk ordering @@ -189,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in safe to use, at some performance penalty. If your disks are battery-backed in one way or another, disabling barriers may safely improve performance. 
+ The mount options "barrier" and "nobarrier" can + also be used to enable or disable barriers, for + consistency with other ext4 mount options. inode_readahead=n This tuning parameter controls the maximum number of inode table blocks that ext4's inode @@ -310,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the a slightly higher priority than the default I/O priority. +auto_da_alloc(*) Many broken applications don't use fsync() when +noauto_da_alloc replacing existing files via patterns such as + fd = open("foo.new")/write(fd,..)/close(fd)/ + rename("foo.new", "foo"), or worse yet, + fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). + If auto_da_alloc is enabled, ext4 will detect + the replace-via-rename and replace-via-truncate + patterns and force that any delayed allocation + blocks are allocated such that at the next + journal commit, in the default data=ordered + mode, the data blocks of the new file are forced + to disk before the rename() operation is + committed. This provides roughly the same level + of guarantees as ext3, and avoids the + "zero-length" problem that can happen when a + system crashes before the delayed allocation + blocks are forced to disk. 
+ Data Mode ========= There are 3 different data modes: diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 830bad7cce0..efc4fd9f40c 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname> File Content mb_groups details of multiblock allocator buddy cache of free blocks mb_history multiblock allocation history - stats controls whether the multiblock allocator should start - collecting statistics, which are shown during the unmount - group_prealloc the multiblock allocator will round up allocation - requests to a multiple of this tuning parameter if the - stripe size is not set in the ext4 superblock - max_to_scan The maximum number of extents the multiblock allocator - will search to find the best extent - min_to_scan The minimum number of extents the multiblock allocator - will search to find the best extent - order2_req Tuning parameter which controls the minimum size for - requests (as a power of 2) where the buddy cache is - used - stream_req Files which have fewer blocks than this tunable - parameter will have their blocks allocated out of a - block group specific preallocation pool, so that small - files are packed closely together. Each large file - will have its blocks allocated out of its own unique - preallocation pool. -inode_readahead Tuning parameter which controls the maximum number of - inode table blocks that ext4's inode table readahead - algorithm will pre-read into the buffer cache .............................................................................. 
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 38f40d55899..53c72ad8587 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, } static int ext4_group_used_meta_blocks(struct super_block *sb, - ext4_group_t block_group) + ext4_group_t block_group, + struct ext4_group_desc *gdp) { ext4_fsblk_t tmp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb, int used_blocks = sbi->s_itb_per_group + 2; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { - struct ext4_group_desc *gdp; - struct buffer_head *bh; - - gdp = ext4_get_group_desc(sb, block_group, &bh); if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) used_blocks--; @@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); } - return free_blocks - ext4_group_used_meta_blocks(sb, block_group); + return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp); } @@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + atomic_add(blocks_freed, + &sbi->s_flex_groups[flex_group].free_blocks); } /* * request to reload the buddy with the diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2df2e40b01a..b64789929a6 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, unsigned int offset) { const char *error_msg = NULL; - const int rlen = ext4_rec_len_from_disk(de->rec_len); + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); if (rlen 
< EXT4_DIR_REC_LEN(1)) error_msg = "rec_len is smaller than minimal"; @@ -178,10 +179,11 @@ revalidate: * least that it is non-zero. A * failure will be detected in the * dirent test below. */ - if (ext4_rec_len_from_disk(de->rec_len) - < EXT4_DIR_REC_LEN(1)) + if (ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) break; - i += ext4_rec_len_from_disk(de->rec_len); + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); } offset = i; filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) @@ -203,7 +205,8 @@ revalidate: ret = stored; goto out; } - offset += ext4_rec_len_from_disk(de->rec_len); + offset += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); if (le32_to_cpu(de->inode)) { /* We might block in the next section * if the data destination is @@ -225,7 +228,8 @@ revalidate: goto revalidate; stored++; } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len); + filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); } offset = 0; brelse(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 990c9400092..d0f15ef56de 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -33,14 +33,6 @@ #undef EXT4FS_DEBUG /* - * Define EXT4_RESERVATION to reserve data blocks for expanding files - */ -#define EXT4_DEFAULT_RESERVE_BLOCKS 8 -/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ -#define EXT4_MAX_RESERVE_BLOCKS 1027 -#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0 - -/* * Debug code */ #ifdef EXT4FS_DEBUG @@ -54,8 +46,6 @@ #define ext4_debug(f, a...) do {} while (0) #endif -#define EXT4_MULTIBLOCK_ALLOCATOR 1 - /* prefer goal again. 
length */ #define EXT4_MB_HINT_MERGE 1 /* blocks already reserved */ @@ -180,8 +170,9 @@ struct ext4_group_desc */ struct flex_groups { - __u32 free_inodes; - __u32 free_blocks; + atomic_t free_inodes; + atomic_t free_blocks; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ @@ -249,6 +240,30 @@ struct flex_groups { #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ #define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\ + EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. 
*/ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + /* * Inode dynamic state flags */ @@ -256,6 +271,7 @@ struct flex_groups { #define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ +#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { @@ -303,7 +319,9 @@ struct ext4_new_group_data { #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) #define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) /* * ioctl commands in 32 bit emulation @@ -531,7 +549,7 @@ do { \ #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ -#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ #define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ @@ -666,7 +684,8 @@ struct ext4_super_block { __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_reserved_char_pad2; __le16 s_reserved_pad; - __u32 s_reserved[162]; /* Padding to the end of the block */ + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __u32 s_reserved[160]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ 
-814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ /* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* * Structure of a directory entry */ #define EXT4_NAME_LEN 255 @@ -865,24 +890,6 @@ struct ext4_dir_entry_2 { ~EXT4_DIR_ROUND) #define EXT4_MAX_REC_LEN ((1<<16)-1) -static inline unsigned ext4_rec_len_from_disk(__le16 dlen) -{ - unsigned len = le16_to_cpu(dlen); - - if (len == EXT4_MAX_REC_LEN || len == 0) - return 1 << 16; - return len; -} - -static inline __le16 ext4_rec_len_to_disk(unsigned len) -{ - if (len == (1 << 16)) - return cpu_to_le16(EXT4_MAX_REC_LEN); - else if (len > (1 << 16)) - BUG(); - return cpu_to_le16(len); -} - /* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 @@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, extern struct proc_dir_entry *ext4_proc_root; -#ifdef CONFIG_PROC_FS -extern const struct file_operations ext4_ui_proc_fops; - -#define EXT4_PROC_HANDLER(name, var) \ -do { \ - proc = proc_create_data(name, mode, sbi->s_proc, \ - &ext4_ui_proc_fops, &sbi->s_##var); \ - if (proc == NULL) { \ - printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ - goto err_out; \ - } \ -} while (0) -#else -#define EXT4_PROC_HANDLER(name, var) -#endif - /* * Function prototypes */ @@ -1092,6 +1083,7 @@ extern int ext4_can_truncate(struct inode *inode); extern void ext4_truncate(struct inode *); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); @@ -1107,7 +1099,10 @@ extern long 
ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); + /* namei.c */ +extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize); +extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 18cb67b2cbb..f0c3ec85bd4 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, ext4_lblk_t *, ext4_fsblk_t *); extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index e69acc16f5c..4ce2187123a 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; -#define rsv_start rsv_window._rsv_start -#define rsv_end rsv_window._rsv_end - /* * storage for cached extent */ @@ -125,6 +122,9 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + /* ialloc */ + ext4_group_t i_last_alloc_group; + /* allocation reservation info for delalloc */ unsigned int i_reserved_data_blocks; unsigned int i_reserved_meta_blocks; diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 039b6ea1a04..57b71fefbcc 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -62,12 +62,10 @@ struct ext4_sb_info { struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyblocks_counter; - struct blockgroup_lock s_blockgroup_lock; + struct blockgroup_lock 
*s_blockgroup_lock; struct proc_dir_entry *s_proc; - - /* root of the per fs reservation window tree */ - spinlock_t s_rsv_window_lock; - struct rb_root s_rsv_window_root; + struct kobject s_kobj; + struct completion s_kobj_unregister; /* Journaling */ struct inode *s_journal_inode; @@ -146,6 +144,10 @@ struct ext4_sb_info { /* locality groups */ struct ext4_locality_group *s_locality_groups; + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + unsigned int s_log_groups_per_flex; struct flex_groups *s_flex_groups; }; @@ -153,7 +155,7 @@ struct ext4_sb_info { static inline spinlock_t * sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) { - return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); } #endif /* _EXT4_SB */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e0aa4fe4f59..ac77d8b8251 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, ext4_fsblk_t bg_start; ext4_fsblk_t last_block; ext4_grpblk_t colour; + ext4_group_t block_group; + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); int depth; if (path) { @@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + + block_group = ei->i_block_group; + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { + /* + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME + * block groups per flexgroup, reserve the first block + * group for directories and special files. Regular + * files will start at the second block group. This + * tends to speed up directory access and improves + * fsck times. 
+ */ + block_group &= ~(flex_size-1); + if (S_ISREG(inode->i_mode)) + block_group++; + } + bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; + /* + * If we are doing delayed allocation, we don't need take + * colour into account. + */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); @@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth) return max; } -static int __ext4_ext_check_header(const char *function, struct inode *inode, +static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) +{ + ext4_fsblk_t block = ext_pblock(ext); + int len = ext4_ext_get_actual_len(ext); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + if (unlikely(block < le32_to_cpu(es->s_first_data_block) || + ((block + len) > ext4_blocks_count(es)))) + return 0; + else + return 1; +} + +static int ext4_valid_extent_idx(struct inode *inode, + struct ext4_extent_idx *ext_idx) +{ + ext4_fsblk_t block = idx_pblock(ext_idx); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + if (unlikely(block < le32_to_cpu(es->s_first_data_block) || + (block > ext4_blocks_count(es)))) + return 0; + else + return 1; +} + +static int ext4_valid_extent_entries(struct inode *inode, + struct ext4_extent_header *eh, + int depth) +{ + struct ext4_extent *ext; + struct ext4_extent_idx *ext_idx; + unsigned short entries; + if (eh->eh_entries == 0) + return 1; + + entries = le16_to_cpu(eh->eh_entries); + + if (depth == 0) { + /* leaf entries */ + ext = EXT_FIRST_EXTENT(eh); + while (entries) { + if (!ext4_valid_extent(inode, ext)) + return 0; + ext++; + entries--; + } + } else { + ext_idx = EXT_FIRST_INDEX(eh); + while (entries) { + if (!ext4_valid_extent_idx(inode, ext_idx)) + return 
0; + ext_idx++; + entries--; + } + } + return 1; +} + +static int __ext4_ext_check(const char *function, struct inode *inode, struct ext4_extent_header *eh, int depth) { @@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode, error_msg = "invalid eh_entries"; goto corrupted; } + if (!ext4_valid_extent_entries(inode, eh, depth)) { + error_msg = "invalid extent entries"; + goto corrupted; + } return 0; corrupted: ext4_error(inode->i_sb, function, - "bad header in inode #%lu: %s - magic %x, " + "bad header/extent in inode #%lu: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), @@ -342,8 +426,13 @@ corrupted: return -EIO; } -#define ext4_ext_check_header(inode, eh, depth) \ - __ext4_ext_check_header(__func__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth) \ + __ext4_ext_check(__func__, inode, eh, depth) + +int ext4_ext_check_inode(struct inode *inode) +{ + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode)); +} #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); - if (ext4_ext_check_header(inode, eh, depth)) - return ERR_PTR(-EIO); - /* account possible depth increase */ if (!path) { @@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, i = depth; /* walk through the tree */ while (i) { + int need_to_validate = 0; + ext_debug("depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = sb_bread(inode->i_sb, path[ppos].p_block); - if (!bh) + bh = sb_getblk(inode->i_sb, path[ppos].p_block); + if (unlikely(!bh)) goto 
err; - + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto err; + } + /* validate the extent entries */ + need_to_validate = 1; + } eh = ext_block_hdr(bh); ppos++; BUG_ON(ppos > depth); @@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, path[ppos].p_hdr = eh; i--; - if (ext4_ext_check_header(inode, eh, i)) + if (need_to_validate && ext4_ext_check(inode, eh, i)) goto err; } @@ -1181,7 +1276,7 @@ got_index: return -EIO; eh = ext_block_hdr(bh); /* subtract from p_depth to get proper eh_depth */ - if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -1194,7 +1289,7 @@ got_index: if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { + if (ext4_ext_check(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } @@ -2137,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) return -ENOMEM; } path[0].p_hdr = ext_inode_hdr(inode); - if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { err = -EIO; goto out; } @@ -2191,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) err = -EIO; break; } - if (ext4_ext_check_header(inode, ext_block_hdr(bh), + if (ext4_ext_check(inode, ext_block_hdr(bh), depth - i - 1)) { err = -EIO; break; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f731cb545a0..588af8c7724 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -33,9 +33,14 @@ */ static int ext4_release_file(struct inode *inode, struct file *filp) { + if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { + ext4_alloc_da_blocks(inode); + EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; + } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && - 
(atomic_read(&inode->i_writecount) == 1)) + (atomic_read(&inode->i_writecount) == 1) && + !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index fb51b40e3e8..47b84e8df56 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err, count, cleared; - ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", @@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (is_directory) { count = ext4_used_dirs_count(sb, gdp) - 1; ext4_used_dirs_set(sb, gdp, count); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f; + + f = ext4_flex_group(sbi, block_group); + atomic_dec(&sbi->s_flex_groups[f].free_inodes); + } + } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); @@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) percpu_counter_dec(&sbi->s_dirs_counter); if (sbi->s_log_groups_per_flex) { - flex_group = ext4_flex_group(sbi, block_group); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_inodes++; - spin_unlock(sb_bgl_lock(sbi, flex_group)); + ext4_group_t f; + + f = ext4_flex_group(sbi, block_group); + atomic_inc(&sbi->s_flex_groups[f].free_inodes); } } BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); @@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, sbi->s_log_groups_per_flex; find_close_to_parent: - flexbg_free_blocks = flex_group[best_flex].free_blocks; + flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks); flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; - if (flex_group[best_flex].free_inodes && + if (atomic_read(&flex_group[best_flex].free_inodes) && 
flex_freeb_ratio > free_block_ratio) goto found_flexbg; @@ -375,24 +381,24 @@ find_close_to_parent: if (i == parent_fbg_group || i == parent_fbg_group - 1) continue; - flexbg_free_blocks = flex_group[i].free_blocks; + flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks); flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; if (flex_freeb_ratio > free_block_ratio && - flex_group[i].free_inodes) { + (atomic_read(&flex_group[i].free_inodes))) { best_flex = i; goto found_flexbg; } - if (flex_group[best_flex].free_inodes == 0 || - (flex_group[i].free_blocks > - flex_group[best_flex].free_blocks && - flex_group[i].free_inodes)) + if ((atomic_re |