diff options
Diffstat (limited to 'fs/f2fs')
| -rw-r--r-- | fs/f2fs/Kconfig | 20 | ||||
| -rw-r--r-- | fs/f2fs/Makefile | 2 | ||||
| -rw-r--r-- | fs/f2fs/acl.c | 182 | ||||
| -rw-r--r-- | fs/f2fs/acl.h | 14 | ||||
| -rw-r--r-- | fs/f2fs/checkpoint.c | 597 | ||||
| -rw-r--r-- | fs/f2fs/data.c | 880 | ||||
| -rw-r--r-- | fs/f2fs/debug.c | 111 | ||||
| -rw-r--r-- | fs/f2fs/dir.c | 388 | ||||
| -rw-r--r-- | fs/f2fs/f2fs.h | 603 | ||||
| -rw-r--r-- | fs/f2fs/file.c | 410 | ||||
| -rw-r--r-- | fs/f2fs/gc.c | 258 | ||||
| -rw-r--r-- | fs/f2fs/gc.h | 38 | ||||
| -rw-r--r-- | fs/f2fs/inline.c | 250 | ||||
| -rw-r--r-- | fs/f2fs/inode.c | 175 | ||||
| -rw-r--r-- | fs/f2fs/namei.c | 124 | ||||
| -rw-r--r-- | fs/f2fs/node.c | 1056 | ||||
| -rw-r--r-- | fs/f2fs/node.h | 156 | ||||
| -rw-r--r-- | fs/f2fs/recovery.c | 300 | ||||
| -rw-r--r-- | fs/f2fs/segment.c | 959 | ||||
| -rw-r--r-- | fs/f2fs/segment.h | 227 | ||||
| -rw-r--r-- | fs/f2fs/super.c | 749 | ||||
| -rw-r--r-- | fs/f2fs/xattr.c | 451 | ||||
| -rw-r--r-- | fs/f2fs/xattr.h | 45 |
23 files changed, 5370 insertions, 2625 deletions
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index fd27e7e6326..214fe1054fc 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -51,3 +51,23 @@ config F2FS_FS_POSIX_ACL Linux website <http://acl.bestbits.at/>. If you don't know what Access Control Lists are, say N + +config F2FS_FS_SECURITY + bool "F2FS Security Labels" + depends on F2FS_FS_XATTR + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the f2fs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. + +config F2FS_CHECK_FS + bool "F2FS consistency checking feature" + depends on F2FS_FS + help + Enables BUG_ONs which check the file system consistency in runtime. + + If you want to improve the performance, say N. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 27a0820340b..2e35da12d29 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o -f2fs-y := dir.o file.o inode.o namei.o hash.o super.o +f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 137af4255da..dbe2141d10a 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -17,9 +17,6 @@ #include "xattr.h" #include "acl.h" -#define get_inode_mode(i) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ - (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) - static inline size_t f2fs_acl_size(int count) { if (count <= 4) { @@ -167,25 +164,17 @@ fail: struct posix_acl *f2fs_get_acl(struct inode *inode, int type) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; void *value = NULL; struct posix_acl *acl; int retval; - if (!test_opt(sbi, POSIX_ACL)) - return NULL; - - acl = get_cached_acl(inode, type); - if (acl != ACL_NOT_CACHED) - return acl; - if (type == ACL_TYPE_ACCESS) name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; retval = f2fs_getxattr(inode, name_index, "", NULL, 0); if (retval > 0) { - value = kmalloc(retval, GFP_KERNEL); + value = kmalloc(retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, retval); @@ -205,19 +194,20 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return acl; } -static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +static int __f2fs_set_acl(struct inode *inode, int type, + struct posix_acl *acl, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; int error; - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; + if (acl) { + error = posix_acl_valid(acl); + if (error < 0) + return error; + } switch (type) { case ACL_TYPE_ACCESS: @@ -250,7 +240,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } } - error = f2fs_setxattr(inode, name_index, "", value, size); + error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); kfree(value); if (!error) @@ -260,153 +250,31 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -int f2fs_init_acl(struct inode *inode, struct inode *dir) +int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = NULL; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(sbi, POSIX_ACL)) { - acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current_umask(); - } - - if (test_opt(sbi, POSIX_ACL) && acl) { - - if (S_ISDIR(inode->i_mode)) { - error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); - if (error < 0) - return error; - if (error > 0) - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); - } -cleanup: - posix_acl_release(acl); - return error; + return __f2fs_set_acl(inode, type, acl, NULL); } -int f2fs_acl_chmod(struct inode *inode) +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct posix_acl *acl; - int error; - mode_t mode = get_inode_mode(inode); - - if (!test_opt(sbi, POSIX_ACL)) - return 0; - if (S_ISLNK(mode)) - return -EOPNOTSUPP; - - acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); + struct posix_acl *default_acl, *acl; + int error = 0; - error = posix_acl_chmod(&acl, GFP_KERNEL, mode); + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; - error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl); - posix_acl_release(acl); - return error; -} - -static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - const char *xname = POSIX_ACL_XATTR_DEFAULT; - size_t size; - if (!test_opt(sbi, POSIX_ACL)) - return 0; - - if (type == ACL_TYPE_ACCESS) - xname = POSIX_ACL_XATTR_ACCESS; - - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; -} - -static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name, - void *buffer, size_t size, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct posix_acl *acl; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = f2fs_get_acl(dentry->d_inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (!acl) - return -ENODATA; - error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - struct inode *inode = dentry->d_inode; - struct posix_acl *acl = NULL; - int error; - - if (strcmp(name, "") != 0) - return -EINVAL; - if (!test_opt(sbi, POSIX_ACL)) - return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) - return -EPERM; - - if (value) { - acl = posix_acl_from_xattr(&init_user_ns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else { - acl = NULL; + if (default_acl) { + error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, + ipage); + posix_acl_release(default_acl); + } + if (acl) { + if (error) + error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, + ipage); + posix_acl_release(acl); } - error = f2fs_set_acl(inode, type, acl); - -release_and_out: - posix_acl_release(acl); return error; } - -const struct xattr_handler f2fs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .flags = ACL_TYPE_DEFAULT, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; - -const struct xattr_handler f2fs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .flags = ACL_TYPE_ACCESS, - .list = f2fs_xattr_list_acl, - .get = f2fs_xattr_get_acl, - .set = f2fs_xattr_set_acl, -}; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 80f43067441..e0864651cdc 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -36,20 +36,16 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL -extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type); -extern int f2fs_acl_chmod(struct inode *inode); -extern int f2fs_init_acl(struct inode *inode, struct inode *dir); +extern struct posix_acl *f2fs_get_acl(struct inode *, int); +extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int f2fs_init_acl(struct inode *, struct inode *, struct page *); #else #define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL -static inline int f2fs_acl_chmod(struct inode *inode) -{ - return 0; -} - -static inline int f2fs_init_acl(struct inode *inode, struct inode *dir) +static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, + struct page *page) { return 0; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2b6fc131e2c..0b4710c1d37 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -20,6 +20,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> static struct kmem_cache *orphan_entry_slab; static struct kmem_cache *inode_entry_slab; @@ -29,7 +30,7 @@ static struct kmem_cache *inode_entry_slab; */ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: page = grab_cache_page(mapping, index); @@ -37,9 +38,7 @@ repeat: cond_resched(); goto repeat; } - - /* We wait writeback only inside grab_meta_page() */ - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, META); SetPageUptodate(page); return page; } @@ -49,7 +48,7 @@ repeat: */ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); struct page *page; repeat: page = grab_cache_page(mapping, index); @@ -57,64 +56,157 @@ repeat: cond_resched(); goto repeat; } - if (f2fs_readpage(sbi, page, index, READ_SYNC)) { + if (PageUptodate(page)) + goto out; + + if (f2fs_submit_page_bio(sbi, page, index, + READ_SYNC | REQ_META | REQ_PRIO)) + goto repeat; + + lock_page(page); + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } - mark_page_accessed(page); - - /* We do not allow returning an errorneous page */ +out: return page; } +static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) +{ + switch (type) { + case META_NAT: + return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK; + case META_SIT: + return SIT_BLK_CNT(sbi); + case META_SSA: + case META_CP: + return 0; + default: + BUG(); + } +} + +/* + * Readahead CP/NAT/SIT/SSA pages + */ +int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) +{ + block_t prev_blk_addr = 0; + struct page *page; + int blkno = start; + int max_blks = get_max_meta_blks(sbi, type); + + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (; nrpages-- > 0; blkno++) { + block_t blk_addr; + + switch (type) { + case META_NAT: + /* get nat block addr */ + if (unlikely(blkno >= max_blks)) + blkno = 0; + blk_addr = current_nat_addr(sbi, + blkno * NAT_ENTRY_PER_BLOCK); + break; + case META_SIT: + /* get sit block addr */ + if (unlikely(blkno >= max_blks)) + goto out; + blk_addr = current_sit_addr(sbi, + blkno * SIT_ENTRY_PER_BLOCK); + if (blkno != start && prev_blk_addr + 1 != blk_addr) + goto out; + prev_blk_addr = blk_addr; + break; + case META_SSA: + case META_CP: + /* get ssa/cp block addr */ + blk_addr = blkno; + break; + default: + BUG(); + } + + page = grab_cache_page(META_MAPPING(sbi), blk_addr); + if (!page) + continue; + if (PageUptodate(page)) { + f2fs_put_page(page, 1); + continue; + } + + f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); + f2fs_put_page(page, 0); + } +out: + f2fs_submit_merged_bio(sbi, META, READ); + return blkno - start; +} + static int f2fs_write_meta_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - /* Should not write any meta pages, if any IO error was occurred */ - if (wbc->for_reclaim || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) { - dec_page_count(sbi, F2FS_DIRTY_META); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } + trace_f2fs_writepage(page, META); + + if (unlikely(sbi->por_doing)) + goto redirty_out; + if (wbc->for_reclaim) + goto redirty_out; - wait_on_page_writeback(page); + /* Should not write any meta pages, if any IO error was occurred */ + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + goto no_write; + f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); +no_write: dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; } static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - struct block_device *bdev = sbi->sb->s_bdev; - long written; + long diff, written; - if (wbc->for_kupdate) - return 0; + trace_f2fs_writepages(mapping->host, wbc, META); - if (get_pages(sbi, F2FS_DIRTY_META) == 0) - return 0; + /* collect a number of dirty meta pages and write together */ + if (wbc->for_kupdate || + get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + goto skip_write; /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); - written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev)); + diff = nr_pages_to_write(sbi, META, wbc); + written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); - wbc->nr_to_write -= written; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); return 0; } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { - struct address_space *mapping = sbi->meta_inode->i_mapping; + struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = LONG_MAX; struct pagevec pvec; long nwritten = 0; @@ -129,20 +221,33 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) + if (unlikely(nr_pages == 0)) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + lock_page(page); - BUG_ON(page->mapping != mapping); - BUG_ON(!PageDirty(page)); - clear_page_dirty_for_io(page); + + if (unlikely(page->mapping != mapping)) { +continue_unlock: + unlock_page(page); + continue; + } + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + if (f2fs_write_meta_page(page, &wbc)) { unlock_page(page); break; } - if (nwritten++ >= nr_to_write) + nwritten++; + if (unlikely(nwritten >= nr_to_write)) break; } pagevec_release(&pvec); @@ -150,7 +255,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, } if (nwritten) - f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX); + f2fs_submit_merged_bio(sbi, type, WRITE); return nwritten; } @@ -160,6 +265,8 @@ static int f2fs_set_meta_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + trace_f2fs_set_page_dirty(page, META); + SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); @@ -175,99 +282,99 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, }; -int check_orphan_space(struct f2fs_sb_info *sbi) +int acquire_orphan_inode(struct f2fs_sb_info *sbi) { - unsigned int max_orphans; int err = 0; - /* - * considering 512 blocks in a segment 5 blocks are needed for cp - * and log segment summaries. Remaining blocks are used to keep - * orphan entries with the limitation one reserved segment - * for cp pack we can have max 1020*507 orphan entries - */ - max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK; - mutex_lock(&sbi->orphan_inode_mutex); - if (sbi->n_orphans >= max_orphans) + spin_lock(&sbi->orphan_inode_lock); + if (unlikely(sbi->n_orphans >= sbi->max_orphans)) err = -ENOSPC; - mutex_unlock(&sbi->orphan_inode_mutex); + else + sbi->n_orphans++; + spin_unlock(&sbi->orphan_inode_lock); + return err; } +void release_orphan_inode(struct f2fs_sb_info *sbi) +{ + spin_lock(&sbi->orphan_inode_lock); + f2fs_bug_on(sbi->n_orphans == 0); + sbi->n_orphans--; + spin_unlock(&sbi->orphan_inode_lock); +} + void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *head, *this; - struct orphan_inode_entry *new = NULL, *orphan = NULL; + struct list_head *head; + struct orphan_inode_entry *new, *orphan; + + new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); + new->ino = ino; - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; - list_for_each(this, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); - if (orphan->ino == ino) - goto out; + list_for_each_entry(orphan, head, list) { + if (orphan->ino == ino) { + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, new); + return; + } + if (orphan->ino > ino) break; - orphan = NULL; - } -retry: - new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); - if (!new) { - cond_resched(); - goto retry; } - new->ino = ino; - /* add new_oentry into list which is sorted by inode number */ - if (orphan) - list_add(&new->list, this->prev); - else - list_add_tail(&new->list, head); - - sbi->n_orphans++; -out: - mutex_unlock(&sbi->orphan_inode_mutex); + /* add new orphan entry into list which is sorted by inode number */ + list_add_tail(&new->list, &orphan->list); + spin_unlock(&sbi->orphan_inode_lock); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { - struct list_head *this, *next, *head; + struct list_head *head; struct orphan_inode_entry *orphan; - mutex_lock(&sbi->orphan_inode_mutex); + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; - list_for_each_safe(this, next, head) { - orphan = list_entry(this, struct orphan_inode_entry, list); + list_for_each_entry(orphan, head, list) { if (orphan->ino == ino) { list_del(&orphan->list); - kmem_cache_free(orphan_entry_slab, orphan); + f2fs_bug_on(sbi->n_orphans == 0); sbi->n_orphans--; - break; + spin_unlock(&sbi->orphan_inode_lock); + kmem_cache_free(orphan_entry_slab, orphan); + return; } } - mutex_unlock(&sbi->orphan_inode_mutex); + spin_unlock(&sbi->orphan_inode_lock); } static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode = f2fs_iget(sbi->sb, ino); - BUG_ON(IS_ERR(inode)); + f2fs_bug_on(IS_ERR(inode)); clear_nlink(inode); /* truncate all the data during iput */ iput(inode); } -int recover_orphan_inodes(struct f2fs_sb_info *sbi) +void recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blkaddr, i, j; if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) - return 0; + return; - sbi->por_doing = 1; - start_blk = __start_cp_addr(sbi) + 1; + sbi->por_doing = true; + + start_blk = __start_cp_addr(sbi) + 1 + + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); orphan_blkaddr = __start_sum_addr(sbi) - 1; + ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); + for (i = 0; i < orphan_blkaddr; i++) { struct page *page = get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; @@ -281,30 +388,40 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) } /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); - sbi->por_doing = 0; - return 0; + sbi->por_doing = false; + return; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) { - struct list_head *head, *this, *next; + struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; - struct page *page = NULL; unsigned int nentries = 0; - unsigned short index = 1; - unsigned short orphan_blocks; - - orphan_blocks = (unsigned short)((sbi->n_orphans + + unsigned short index; + unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); + struct page *page = NULL; + struct orphan_inode_entry *orphan = NULL; + + for (index = 0; index < orphan_blocks; index++) + grab_meta_page(sbi, start_blk + index); - mutex_lock(&sbi->orphan_inode_mutex); + index = 1; + spin_lock(&sbi->orphan_inode_lock); head = &sbi->orphan_inode_list; /* loop for each orphan inode entry and write them in Jornal block */ - list_for_each_safe(this, next, head) { - struct orphan_inode_entry *orphan; + list_for_each_entry(orphan, head, list) { + if (!page) { + page = find_get_page(META_MAPPING(sbi), start_blk++); + f2fs_bug_on(!page); + orphan_blk = + (struct f2fs_orphan_block *)page_address(page); + memset(orphan_blk, 0, sizeof(*orphan_blk)); + f2fs_put_page(page, 0); + } - orphan = list_entry(this, struct orphan_inode_entry, list); + orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); if (nentries == F2FS_ORPHANS_PER_BLOCK) { /* @@ -318,29 +435,20 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) set_page_dirty(page); f2fs_put_page(page, 1); index++; - start_blk++; nentries = 0; page = NULL; } - if (page) - goto page_exist; + } - page = grab_meta_page(sbi, start_blk); - orphan_blk = (struct f2fs_orphan_block *)page_address(page); - memset(orphan_blk, 0, sizeof(*orphan_blk)); -page_exist: - orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); + if (page) { + orphan_blk->blk_addr = cpu_to_le16(index); + orphan_blk->blk_count = cpu_to_le16(orphan_blocks); + orphan_blk->entry_count = cpu_to_le32(nentries); + set_page_dirty(page); + f2fs_put_page(page, 1); } - if (!page) - goto end; - - orphan_blk->blk_addr = cpu_to_le16(index); - orphan_blk->blk_count = cpu_to_le16(orphan_blocks); - orphan_blk->entry_count = cpu_to_le32(nentries); - set_page_dirty(page); - f2fs_put_page(page, 1); -end: - mutex_unlock(&sbi->orphan_inode_mutex); + + spin_unlock(&sbi->orphan_inode_lock); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -350,8 +458,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, unsigned long blk_size = sbi->blocksize; struct f2fs_checkpoint *cp_block; unsigned long long cur_version = 0, pre_version = 0; - unsigned int crc = 0; size_t crc_offset; + __u32 crc = 0; /* Read the 1st cp block in this CP pack */ cp_page_1 = get_meta_page(sbi, cp_addr); @@ -362,11 +470,11 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (crc_offset >= blk_size) goto invalid_cp1; - crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); + crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp1; - pre_version = le64_to_cpu(cp_block->checkpoint_ver); + pre_version = cur_cp_version(cp_block); /* Read the 2nd cp block in this CP pack */ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; @@ -377,11 +485,11 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (crc_offset >= blk_size) goto invalid_cp2; - crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset); + crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp2; - cur_version = le64_to_cpu(cp_block->checkpoint_ver); + cur_version = cur_cp_version(cp_block); if (cur_version == pre_version) { *version = cur_version; @@ -403,8 +511,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) unsigned long blk_size = sbi->blocksize; unsigned long long cp1_version = 0, cp2_version = 0; unsigned long long cp_start_blk_no; + unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + block_t cp_blk_no; + int i; - sbi->ckpt = kzalloc(blk_size, GFP_KERNEL); + sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* @@ -415,7 +526,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); /* The second checkpoint pack should start at the next segment */ - cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + cp_start_blk_no += ((unsigned long long)1) << + le32_to_cpu(fsb->log_blocks_per_seg); cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); if (cp1 && cp2) { @@ -434,6 +546,23 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); + if (cp_blks <= 1) + goto done; + + cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); + if (cur_page == cp2) + cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); + + for (i = 1; i < cp_blks; i++) { + void *sit_bitmap_ptr; + unsigned char *ckpt = (unsigned char *)sbi->ckpt; + + cur_page = get_meta_page(sbi, cp_blk_no + i); + sit_bitmap_ptr = page_address(cur_page); + memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); + f2fs_put_page(cur_page, 1); + } +done: f2fs_put_page(cp1, 1); f2fs_put_page(cp2, 1); return 0; @@ -443,79 +572,100 @@ fail_no_cp: return -EINVAL; } +static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) + return -EEXIST; + + set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); + F2FS_I(inode)->dirty_dir = new; + list_add_tail(&new->list, &sbi->dir_inode_list); + stat_inc_dirty_dir(sbi); + return 0; +} + void set_dirty_dir_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head = &sbi->dir_inode_list; struct dir_inode_entry *new; - struct list_head *this; + int ret = 0; if (!S_ISDIR(inode->i_mode)) return; -retry: - new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - if (!new) { - cond_resched(); - goto retry; - } + + new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); new->inode = inode; INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) { - kmem_cache_free(inode_entry_slab, new); - goto out; - } - } - list_add_tail(&new->list, head); - sbi->n_dirty_dirs++; - - BUG_ON(!S_ISDIR(inode->i_mode)); -out: - inc_page_count(sbi, F2FS_DIRTY_DENTS); + ret = __add_dirty_inode(inode, new); inode_inc_dirty_dents(inode); SetPagePrivate(page); + spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); +} + +void add_dirty_dir_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct dir_inode_entry *new = + f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + int ret = 0; + new->inode = inode; + INIT_LIST_HEAD(&new->list); + + spin_lock(&sbi->dir_inode_lock); + ret = __add_dirty_inode(inode, new); spin_unlock(&sbi->dir_inode_lock); + + if (ret) + kmem_cache_free(inode_entry_slab, new); } void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct list_head *head = &sbi->dir_inode_list; - struct list_head *this; + struct dir_inode_entry *entry; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&sbi->dir_inode_lock); - if (atomic_read(&F2FS_I(inode)->dirty_dents)) - goto out; - - list_for_each(this, head) { - struct dir_inode_entry *entry; - entry = list_entry(this, struct dir_inode_entry, list); - if (entry->inode == inode) { - list_del(&entry->list); - kmem_cache_free(inode_entry_slab, entry); - sbi->n_dirty_dirs--; - break; - } + if (get_dirty_dents(inode) || + !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { + spin_unlock(&sbi->dir_inode_lock); + return; } -out: + + entry = F2FS_I(inode)->dirty_dir; + list_del(&entry->list); + F2FS_I(inode)->dirty_dir = NULL; + clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); + stat_dec_dirty_dir(sbi); spin_unlock(&sbi->dir_inode_lock); + kmem_cache_free(inode_entry_slab, entry); + + /* Only from the recovery routine */ + if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { + clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); + iput(inode); + } } void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) { - struct list_head *head = &sbi->dir_inode_list; + struct list_head *head; struct dir_inode_entry *entry; struct inode *inode; retry: spin_lock(&sbi->dir_inode_lock); + + head = &sbi->dir_inode_list; if (list_empty(head)) { spin_unlock(&sbi->dir_inode_lock); return; @@ -524,14 +674,14 @@ retry: inode = igrab(entry->inode); spin_unlock(&sbi->dir_inode_lock); if (inode) { - filemap_flush(inode->i_mapping); + filemap_fdatawrite(inode->i_mapping); iput(inode); } else { /* * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_bio(sbi, DATA, true); + f2fs_submit_merged_bio(sbi, DATA, WRITE); } goto retry; } @@ -541,54 +691,58 @@ retry: */ static void block_operations(struct f2fs_sb_info *sbi) { - int t; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; + struct blk_plug plug; - /* Stop renaming operation */ - mutex_lock_op(sbi, RENAME); - mutex_lock_op(sbi, DENTRY_OPS); + blk_start_plug(&plug); -retry_dents: +retry_flush_dents: + f2fs_lock_all(sbi); /* write all the dirty dentry pages */ - sync_dirty_dir_inodes(sbi); - - mutex_lock_op(sbi, DATA_WRITE); if (get_pages(sbi, F2FS_DIRTY_DENTS)) { - mutex_unlock_op(sbi, DATA_WRITE); - goto retry_dents; + f2fs_unlock_all(sbi); + sync_dirty_dir_inodes(sbi); + goto retry_flush_dents; } - /* block all the operations */ - for (t = DATA_NEW; t <= NODE_TRUNC; t++) - mutex_lock_op(sbi, t); - - mutex_lock(&sbi->write_inode); - /* * POR: we should ensure that there is no dirty node pages * until finishing nat/sit flush. */ -retry: - sync_node_pages(sbi, 0, &wbc); - - mutex_lock_op(sbi, NODE_WRITE); +retry_flush_nodes: + mutex_lock(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - mutex_unlock_op(sbi, NODE_WRITE); - goto retry; + mutex_unlock(&sbi->node_write); + sync_node_pages(sbi, 0, &wbc); + goto retry_flush_nodes; } - mutex_unlock(&sbi->write_inode); + blk_finish_plug(&plug); } static void unblock_operations(struct f2fs_sb_info *sbi) { - int t; - for (t = NODE_WRITE; t >= RENAME; t--) - mutex_unlock_op(sbi, t); + mutex_unlock(&sbi->node_write); + f2fs_unlock_all(sbi); +} + +static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +{ + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); + + if (!get_pages(sbi, F2FS_WRITEBACK)) + break; + + io_schedule(); + } + finish_wait(&sbi->cp_wait, &wait); } static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) @@ -598,9 +752,16 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) block_t start_blk; struct page *cp_page; unsigned int data_sum_blocks, orphan_blocks; - unsigned int crc32 = 0; + __u32 crc32 = 0; void *kaddr; int i; + int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + + /* + * This avoids to conduct wrong roll-forward operations and uses + * metapages, so should be called prior to sync_meta_pages below. + */ + discard_next_dnode(sbi); /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) @@ -645,16 +806,19 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) / F2FS_ORPHANS_PER_BLOCK; - ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks); + ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + + orphan_blocks); if (is_umount) { set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); ckpt->cp_pack_total_block_count = cpu_to_le32(2 + - data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); + cp_payload_blks + data_sum_blocks + + orphan_blocks + NR_CURSEG_NODE_TYPE); } else { clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); ckpt->cp_pack_total_block_count = cpu_to_le32(2 + - data_sum_blocks + orphan_blocks); + cp_payload_blks + data_sum_blocks + + orphan_blocks); } if (sbi->n_orphans) @@ -667,8 +831,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); - *(__le32 *)((unsigned char *)ckpt + - le32_to_cpu(ckpt->checksum_offset)) + *((__le32 *)((unsigned char *)ckpt + + le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); start_blk = __start_cp_addr(sbi); @@ -680,6 +844,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) set_page_dirty(cp_page); f2fs_put_page(cp_page, 1); + for (i = 1; i < 1 + cp_payload_blks; i++) { + cp_page = grab_meta_page(sbi, start_blk++); + kaddr = page_address(cp_page); + memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, + (1 << sbi->log_blocksize)); + set_page_dirty(cp_page); + f2fs_put_page(cp_page, 1); + } + if (sbi->n_orphans) { write_orphan_inodes(sbi, start_blk); start_blk += orphan_blocks; @@ -700,11 +873,10 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) f2fs_put_page(cp_page, 1); /* wait for previous submitted node/meta pages writeback */ - while (get_pages(sbi, F2FS_WRITEBACK)) - congestion_wait(BLK_RW_ASYNC, HZ / 50); + wait_on_all_pages_writeback(sbi); - filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); - filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX); + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; @@ -713,7 +885,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { clear_prefree_segments(sbi); F2FS_RESET_SB_DIRT(sbi); } @@ -727,50 +899,63 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); + mutex_lock(&sbi->cp_mutex); block_operations(sbi); - f2fs_submit_bio(sbi, DATA, true); - f2fs_submit_bio(sbi, NODE, true); - f2fs_submit_bio(sbi, META, true); + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); + + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); /* * update checkpoint pack index * Increase the version number so that * SIT entries and seg summaries are written at correct place */ - ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver); + ckpt_ver = cur_cp_version(ckpt); ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ flush_nat_entries(sbi); flush_sit_entries(sbi); - reset_victim_segmap(sbi); - /* unlock all the fs_lock[] in do_checkpoint() */ do_checkpoint(sbi, is_umount); unblock_operations(sbi); mutex_unlock(&sbi->cp_mutex); + + stat_inc_cp_count(sbi->stat_info); + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } void init_orphan_info(struct f2fs_sb_info *sbi) { - mutex_init(&sbi->orphan_inode_mutex); + spin_lock_init(&sbi->orphan_inode_lock); INIT_LIST_HEAD(&sbi->orphan_inode_list); sbi->n_orphans = 0; + /* + * considering 512 blocks in a segment 8 blocks are needed for cp + * and log segment summaries. Remaining blocks are used to keep + * orphan entries with the limitation one reserved segment + * for cp pack we can have max 1020*504 orphan entries + */ + sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) + * F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", - sizeof(struct orphan_inode_entry), NULL); - if (unlikely(!orphan_entry_slab)) + sizeof(struct orphan_inode_entry)); + if (!orphan_entry_slab) return -ENOMEM; inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", - sizeof(struct dir_inode_entry), NULL); - if (unlikely(!inode_entry_slab)) { + sizeof(struct dir_inode_entry)); + if (!inode_entry_slab) { kmem_cache_destroy(orphan_entry_slab); return -ENOMEM; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7bd22a20112..f8cf619edb5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,6 +12,7 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> @@ -21,6 +22,191 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> + +static void f2fs_read_end_io(struct bio *bio, int err) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + bio_put(bio); +} + +static void f2fs_write_end_io(struct bio *bio, int err) +{ + struct f2fs_sb_info *sbi = bio->bi_private; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + if (unlikely(err)) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + f2fs_stop_checkpoint(sbi); + } + end_page_writeback(page); + dec_page_count(sbi, F2FS_WRITEBACK); + } + + if (sbi->wait_io) { + complete(sbi->wait_io); + sbi->wait_io = NULL; + } + + if (!get_pages(sbi, F2FS_WRITEBACK) && + !list_empty(&sbi->cp_wait.task_list)) + wake_up(&sbi->cp_wait); + + bio_put(bio); +} + +/* + * Low-level block read/write IO operations. + */ +static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, + int npages, bool is_read) +{ + struct bio *bio; + + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + + bio->bi_bdev = sbi->sb->s_bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; + bio->bi_private = sbi; + + return bio; +} + +static void __submit_merged_bio(struct f2fs_bio_info *io) +{ + struct f2fs_io_info *fio = &io->fio; + int rw; + + if (!io->bio) + return; + + rw = fio->rw; + + if (is_read_io(rw)) { + trace_f2fs_submit_read_bio(io->sbi->sb, rw, + fio->type, io->bio); + submit_bio(rw, io->bio); + } else { + trace_f2fs_submit_write_bio(io->sbi->sb, rw, + fio->type, io->bio); + /* + * META_FLUSH is only from the checkpoint procedure, and we + * should wait this metadata bio for FS consistency. + */ + if (fio->type == META_FLUSH) { + DECLARE_COMPLETION_ONSTACK(wait); + io->sbi->wait_io = &wait; + submit_bio(rw, io->bio); + wait_for_completion(&wait); + } else { + submit_bio(rw, io->bio); + } + } + + io->bio = NULL; +} + +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + enum page_type type, int rw) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io; + + io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + + down_write(&io->io_rwsem); + + /* change META to META_FLUSH in the checkpoint procedure */ + if (type >= META_FLUSH) { + io->fio.type = META_FLUSH; + io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + } + __submit_merged_bio(io); + up_write(&io->io_rwsem); +} + +/* + * Fill the locked page with data located in the block address. + * Return unlocked page. + */ +int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, int rw) +{ + struct bio *bio; + + trace_f2fs_submit_page_bio(page, blk_addr, rw); + + /* Allocate a new bio */ + bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw)); + + if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + bio_put(bio); + f2fs_put_page(page, 1); + return -EFAULT; + } + + submit_bio(rw, bio); + return 0; +} + +void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, + block_t blk_addr, struct f2fs_io_info *fio) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); + struct f2fs_bio_info *io; + bool is_read = is_read_io(fio->rw); + + io = is_read ? &sbi->read_io : &sbi->write_io[btype]; + + verify_block_addr(sbi, blk_addr); + + down_write(&io->io_rwsem); + + if (!is_read) + inc_page_count(sbi, F2FS_WRITEBACK); + + if (io->bio && (io->last_block_in_bio != blk_addr - 1 || + io->fio.rw != fio->rw)) + __submit_merged_bio(io); +alloc_new: + if (io->bio == NULL) { + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); + io->fio = *fio; + } + + if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + __submit_merged_bio(io); + goto alloc_new; + } + + io->last_block_in_bio = blk_addr; + + up_write(&io->io_rwsem); + trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); +} /* * Lock ordering for the change of data block address: @@ -35,9 +221,9 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) struct page *node_page = dn->node_page; unsigned int ofs_in_node = dn->ofs_in_node; - wait_on_page_writeback(node_page); + f2fs_wait_on_page_writeback(node_page, NODE); - rn = (struct f2fs_node *)page_address(node_page); + rn = F2FS_NODE(node_page); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); @@ -49,32 +235,57 @@ int reserve_new_block(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; - if (!inc_valid_block_count(sbi, dn->inode, 1)) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) return -ENOSPC; + trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + __set_data_blkaddr(dn, NEW_ADDR); dn->data_blkaddr = NEW_ADDR; + mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; } +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) +{ + bool need_put = dn->inode_page ? false : true; + int err; + + /* if inode_page exists, index should be zero */ + f2fs_bug_on(!need_put && index); + + err = get_dnode_of_data(dn, index, ALLOC_NODE); + if (err) + return err; + + if (dn->data_blkaddr == NULL_ADDR) + err = reserve_new_block(dn); + if (err || need_put) + f2fs_put_dnode(dn); + return err; +} + static int check_extent_cache(struct inode *inode, pgoff_t pgofs, struct buffer_head *bh_result) { struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); pgoff_t start_fofs, end_fofs; block_t start_blkaddr; + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return 0; + read_lock(&fi->ext.ext_lock); if (fi->ext.len == 0) { read_unlock(&fi->ext.ext_lock); return 0; } - sbi->total_hit_ext++; + stat_inc_total_hit(inode->i_sb); + start_fofs = fi->ext.fofs; end_fofs = fi->ext.fofs + fi->ext.len - 1; start_blkaddr = fi->ext.blk_addr; @@ -92,7 +303,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, else bh_result->b_size = UINT_MAX; - sbi->read_hit_ext++; + stat_inc_read_hit(inode->i_sb); read_unlock(&fi->ext.ext_lock); return 1; } @@ -105,13 +316,18 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs, start_fofs, end_fofs; block_t start_blkaddr, end_blkaddr; + int need_update = true; - BUG_ON(blk_addr == NEW_ADDR); - fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node; + f2fs_bug_on(blk_addr == NEW_ADDR); + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; /* Update the page address in the parent node */ __set_data_blkaddr(dn, blk_addr); + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return; + write_lock(&fi->ext.ext_lock); start_fofs = fi->ext.fofs; @@ -133,7 +349,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) goto end_update; } - /* Frone merge */ + /* Front merge */ if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) { fi->ext.fofs--; fi->ext.blk_addr--; @@ -158,18 +374,24 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) fofs - start_fofs + 1; fi->ext.len -= fofs - start_fofs + 1; } - goto end_update; + } else { + need_update = false; } - write_unlock(&fi->ext.ext_lock); - return; + /* Finally, if the extent is very fragmented, let's drop the cache. */ + if (fi->ext.len < F2FS_MIN_EXTENT_LEN) { + fi->ext.len = 0; + set_inode_flag(fi, FI_NO_EXTENT); + need_update = true; + } end_update: write_unlock(&fi->ext.ext_lock); - sync_inode_page(dn); + if (need_update) + sync_inode_page(dn); return; } -struct page *find_data_page(struct inode *inode, pgoff_t index) +struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -183,7 +405,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) f2fs_put_page(page, 0); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) return ERR_PTR(err); f2fs_put_dnode(&dn); @@ -192,19 +414,30 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return ERR_PTR(-ENOENT); /* By fallocate(), there is no cached page, but with NEW_ADDR */ - if (dn.data_blkaddr == NEW_ADDR) + if (unlikely(dn.data_blkaddr == NEW_ADDR)) return ERR_PTR(-EINVAL); page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + sync ? READ_SYNC : READA); + if (err) return ERR_PTR(err); + + if (sync) { + wait_on_page_locked(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 0); + return ERR_PTR(-EIO); + } } - unlock_page(page); return page; } @@ -221,29 +454,51 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) struct page *page; int err; +repeat: + page = grab_cache_page(mapping, index); + if (!page) + return ERR_PTR(-ENOMEM); + set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); - if (err) + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) { + f2fs_put_page(page, 1); return ERR_PTR(err); + } f2fs_put_dnode(&dn); - if (dn.data_blkaddr == NULL_ADDR) + if (unlikely(dn.data_blkaddr == NULL_ADDR)) { + f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); - - page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + } if (PageUptodate(page)) return page; - BUG_ON(dn.data_blkaddr == NEW_ADDR); - BUG_ON(dn.data_blkaddr == NULL_ADDR); + /* + * A new dentry page is allocated but not able to be written, since its + * new inode page couldn't be allocated due to -ENOSPC. + * In such the case, its blkaddr can be remained as NEW_ADDR. + * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. + */ + if (dn.data_blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return page; + } - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); + if (err) return ERR_PTR(err); + + lock_page(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; } return page; } @@ -251,9 +506,13 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) /* * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. + * + * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + * Note that, ipage is set only by make_empty_dir. */ -struct page *get_new_data_page(struct inode *inode, pgoff_t index, - bool new_i_size) +struct page *get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -261,196 +520,258 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, struct dnode_of_data dn; int err; - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, 0); + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, index); if (err) return ERR_PTR(err); - - if (dn.data_blkaddr == NULL_ADDR) { - if (reserve_new_block(&dn)) { - f2fs_put_dnode(&dn); - return ERR_PTR(-ENOSPC); - } - } - f2fs_put_dnode(&dn); - +repeat: page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + if (!page) { + err = -ENOMEM; + goto put_err; + } if (PageUptodate(page)) return page; if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); + if (err) + goto put_err; + + lock_page(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + err = -EIO; + goto put_err; + } + if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); - return ERR_PTR(err); + goto repeat; } } - SetPageUptodate(page); if (new_i_size && i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); - mark_inode_dirty_sync(inode); + /* Only the directory inode sets new_i_size */ + set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); } return page; -} - -static void read_end_io(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (uptodate) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - kfree(bio->bi_private); - bio_put(bio); +put_err: + f2fs_put_dnode(&dn); + return ERR_PTR(err); } -/* - * Fill the locked page with data located in the block address. - * Read operation is synchronous, and caller must unlock the page. - */ -int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, int type) +static int __allocate_data_block(struct dnode_of_data *dn) { - struct block_device *bdev = sbi->sb->s_bdev; - bool sync = (type == READ_SYNC); - struct bio *bio; + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_summary sum; + block_t new_blkaddr; + struct node_info ni; + int type; - /* This page can be already read by other threads */ - if (PageUptodate(page)) { - if (!sync) - unlock_page(page); - return 0; - } + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + return -EPERM; + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + return -ENOSPC; - down_read(&sbi->bio_sem); + __set_data_blkaddr(dn, NEW_ADDR); + dn->data_blkaddr = NEW_ADDR; - /* Allocate a new bio */ - bio = f2fs_bio_alloc(bdev, 1); + get_node_info(sbi, dn->nid, &ni); + set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - /* Initialize the bio */ - bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - bio->bi_end_io = read_end_io; + type = CURSEG_WARM_DATA; - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { - kfree(bio->bi_private); - bio_put(bio); - up_read(&sbi->bio_sem); - return -EFAULT; - } + allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type); - submit_bio(type, bio); - up_read(&sbi->bio_sem); + /* direct IO doesn't use extent cache to maximize the performance */ + set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + update_extent_cache(new_blkaddr, dn); + clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); - /* wait for read completion if sync */ - if (sync) { - lock_page(page); - if (PageError(page)) - return -EIO; - } + dn->data_blkaddr = new_blkaddr; return 0; } /* - * This function should be used by the data read flow only where it - * does not check the "create" flag that indicates block allocation. - * The reason for this special functionality is to exploit VFS readahead - * mechanism. + * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh. + * If original data blocks are allocated, then give them to blockdev. + * Otherwise, + * a. preallocate requested block addresses + * b. do not use extent cache for better performance + * c. give the block addresses to blockdev */ -static int get_data_block_ro(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create, bool fiemap) { + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; unsigned maxblocks = bh_result->b_size >> blkbits; struct dnode_of_data dn; - pgoff_t pgofs; - int err; + int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; + pgoff_t pgofs, end_offset; + int err = 0, ofs = 1; + bool allocated = false; /* Get the page offset from the block offset(iblock) */ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); if (check_extent_cache(inode, pgofs, bh_result)) - return 0; + goto out; + + if (create) + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE); - if (err) - return (err == -ENOENT) ? 0 : err; + err = get_dnode_of_data(&dn, pgofs, mode); + if (err) { + if (err == -ENOENT) + err = 0; + goto unlock_out; + } + if (dn.data_blkaddr == NEW_ADDR && !fiemap) + goto put_out; - /* It does not support data allocation */ - BUG_ON(create); + if (dn.data_blkaddr != NULL_ADDR) { + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else if (create) { + err = __allocate_data_block(&dn); + if (err) + goto put_out; + allocated = true; + map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + } else { + goto put_out; + } - if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) { - int i; - unsigned int end_offset; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + bh_result->b_size = (((size_t)1) << blkbits); + dn.ofs_in_node++; + pgofs++; - end_offset = IS_INODE(dn.node_page) ? - ADDRS_PER_INODE : - ADDRS_PER_BLOCK; +get_next: + if (dn.ofs_in_node >= end_offset) { + if (allocated) + sync_inode_page(&dn); + allocated = false; + f2fs_put_dnode(&dn); - clear_buffer_new(bh_result); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, mode); + if (err) { + if (err == -ENOENT) + err = 0; + goto unlock_out; + } + if (dn.data_blkaddr == NEW_ADDR && !fiemap) + goto put_out; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + } + + if (maxblocks > (bh_result->b_size >> blkbits)) { + block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (blkaddr == NULL_ADDR && create) { + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + blkaddr = dn.data_blkaddr; + } /* Give more consecutive addresses for the read ahead */ - for (i = 0; i < end_offset - dn.ofs_in_node; i++) - if (((datablock_addr(dn.node_page, - dn.ofs_in_node + i)) - != (dn.data_blkaddr + i)) || maxblocks == i) - break; - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); - bh_result->b_size = (i << blkbits); + if (blkaddr == (bh_result->b_blocknr + ofs)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + bh_result->b_size += (((size_t)1) << blkbits); + goto get_next; + } } +sync_out: + if (allocated) + sync_inode_page(&dn); +put_out: f2fs_put_dnode(&dn); - return 0; +unlock_out: + if (create) + f2fs_unlock_op(sbi); +out: + trace_f2fs_get_data_block(inode, iblock, bh_result, err); + return err; +} + +static int get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, false); +} + +static int get_data_block_fiemap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, true); +} + +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, + start, len, get_data_block_fiemap); } static int f2fs_read_data_page(struct file *file, struct page *page) { - return mpage_readpage(page, get_data_block_ro); + struct inode *inode = page->mapping->host; + int ret; + + trace_f2fs_readpage(page, DATA); + + /* If the file has inline data, try to read it directlly */ + if (f2fs_has_inline_data(inode)) + ret = f2fs_read_inline_data(inode, page); + else + ret = mpage_readpage(page, get_data_block); + + return ret; } static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro); + struct inode *inode = file->f_mapping->host; + + /* If the file has inline data, skip readpages */ + if (f2fs_has_inline_data(inode)) + return 0; + + return mpage_readpages(mapping, pages, nr_pages, get_data_block); } -int do_write_data_page(struct page *page) +int do_write_data_page(struct page *page, struct f2fs_io_info *fio) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - block_t old_blk_addr, new_blk_addr; + block_t old_blkaddr, new_blkaddr; struct dnode_of_data dn; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, RDONLY_NODE); + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) return err; - old_blk_addr = dn.data_blkaddr; + old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (old_blk_addr == NULL_ADDR) + if (old_blkaddr == NULL_ADDR) goto out_writepage; set_page_writeback(page); @@ -459,16 +780,13 @@ int do_write_data_page(struct page *page) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (old_blk_addr != NEW_ADDR && !is_cold_data(page) && - need_inplace_update(inode)) { - rewrite_data_page(F2FS_SB(inode->i_sb), page, - old_blk_addr); + if (unlikely(old_blkaddr != NEW_ADDR && + !is_cold_data(page) && + need_inplace_update(inode))) { + rewrite_data_page(page, old_blkaddr, fio); } else { - write_data_page(inode, page, &dn, - old_blk_addr, &new_blk_addr); - update_extent_cache(new_blk_addr, &dn); - F2FS_I(inode)->data_version = - le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); + write_data_page(page, &dn, &new_blkaddr, fio); + update_extent_cache(new_blkaddr, &dn); } out_writepage: f2fs_put_dnode(&dn); @@ -483,70 +801,68 @@ static int f2fs_write_data_page(struct page *page, loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; - unsigned offset; + unsigned offset = 0; + bool need_balance_fs = false; int err = 0; + struct f2fs_io_info fio = { + .type = DATA, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; + + trace_f2fs_writepage(page, DATA); if (page->index < end_index) - goto out; + goto write; /* * If the offset is out-of-range of file size, * this page does not have to be written to disk. */ offset = i_size & (PAGE_CACHE_SIZE - 1); - if ((page->index >= end_index + 1) || !offset) { - if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); - } - goto unlock_out; - } + if ((page->index >= end_index + 1) || !offset) + goto out; zero_user_segment(page, offset, PAGE_CACHE_SIZE); -out: - if (sbi->por_doing) - goto redirty_out; - - if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page)) +write: + if (unlikely(sbi->por_doing)) goto redirty_out; - mutex_lock_op(sbi, DATA_WRITE); + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); - inode_dec_dirty_dents(inode); - } - err = do_write_data_page(page); - if (err && err != -ENOENT) { - wbc->pages_skipped++; - set_page_dirty(page); + err = do_write_data_page(page, &fio); + goto done; } - mutex_unlock_op(sbi, DATA_WRITE); - if (wbc->for_reclaim) - f2fs_submit_bio(sbi, DATA, true); + if (!wbc->for_reclaim) + need_balance_fs = true; + else if (has_not_enough_free_secs(sbi, 0)) + goto redirty_out; - if (err == -ENOENT) - goto unlock_out; + f2fs_lock_op(sbi); + if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) + err = f2fs_write_inline_data(inode, page, offset); + else + err = do_write_data_page(page, &fio); + f2fs_unlock_op(sbi); +done: + if (err && err != -ENOENT) + goto redirty_out; clear_cold_data(page); +out: + inode_dec_dirty_dents(inode); unlock_page(page); - - if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode)) + if (need_balance_fs) f2fs_balance_fs(sbi); + if (wbc->for_reclaim) + f2fs_submit_merged_bio(sbi, DATA, WRITE); return 0; -unlock_out: - unlock_page(page); - return (err == -ENOENT) ? 0 : err; - redirty_out: - wbc->pages_skipped++; - set_page_dirty(page); + redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; } -#define MAX_DESIRED_PAGES_WP 4096 - static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -561,26 +877,41 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + bool locked = false; int ret; - long excess_nrtw = 0, desired_nrtw; + long diff; - if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { - desired_nrtw = MAX_DESIRED_PAGES_WP; - excess_nrtw = desired_nrtw - wbc->nr_to_write; - wbc->nr_to_write = desired_nrtw; - } + trace_f2fs_writepages(mapping->host, wbc, DATA); + + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; - if (!S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) && + available_free_memory(sbi, DIRTY_DENTS)) + goto skip_write; + + diff = nr_pages_to_write(sbi, DATA, wbc); + + if (!S_ISDIR(inode->i_mode)) { mutex_lock(&sbi->writepages); + locked = true; + } ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - if (!S_ISDIR(inode->i_mode)) + if (locked) mutex_unlock(&sbi->writepages); - f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); + + f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_dir_inode(inode); - wbc->nr_to_write -= excess_nrtw; + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; + +skip_write: + wbc->pages_skipped += get_dirty_dents(inode); + return 0; } static int f2fs_write_begin(struct file *file, struct address_space *mapping, @@ -594,38 +925,43 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct dnode_of_data dn; int err = 0; - /* for nobh_write_end */ - *fsdata = NULL; + trace_f2fs_write_begin(inode, pos, len, flags); f2fs_balance_fs(sbi); +repeat: + err = f2fs_convert_inline_data(inode, pos + len); + if (err) + return err; page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; + + /* to avoid latency during memory pressure */ + unlock_page(page); + *pagep = page; - mutex_lock_op(sbi, DATA_NEW); + if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA) + goto inline_data; + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, 0); + err = f2fs_reserve_block(&dn, index); + f2fs_unlock_op(sbi); + if (err) { - mutex_unlock_op(sbi, DATA_NEW); - f2fs_put_page(page, 1); + f2fs_put_page(page, 0); return err; } - - if (dn.data_blkaddr == NULL_ADDR) { - err = reserve_new_block(&dn); - if (err) { - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); - f2fs_put_page(page, 1); - return err; - } +inline_data: + lock_page(page); + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; } - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); + f2fs_wait_on_page_writeback(page, DATA); if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -636,52 +972,113 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, /* Reading beyond i_size is simple: memset to zero */ zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - return 0; + goto out; } if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { + if (f2fs_has_inline_data(inode)) { + err = f2fs_read_inline_data(inode, page); + if (err) { + page_cache_release(page); + return err; + } + } else { + err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + READ_SYNC); + if (err) + return err; + } + + lock_page(page); + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - return err; + return -EIO; + } + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; } } +out: SetPageUptodate(page); clear_cold_data(page); return 0; } +static int f2fs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + trace_f2fs_write_end(inode, pos, len, copied); + + SetPageUptodate(page); + set_page_dirty(page); + + if (pos + copied > i_size_read(inode)) { + i_size_write(inode, pos + copied); + mark_inode_dirty(inode); + update_inode_page(inode); + } + + f2fs_put_page(page, 1); + return copied; +} + +static int check_direct_IO(struct inode *inode, int rw, + struct iov_iter *iter, loff_t offset) +{ + unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; + + if (rw == READ) + return 0; + + if (offset & blocksize_mask) + return -EINVAL; + + if (iov_iter_alignment(iter) & blocksize_mask) + return -EINVAL; + + return 0; +} + static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - if (rw == WRITE) + /* Let buffer I/O handle the inline data case. */ + if (f2fs_has_inline_data(inode)) + return 0; + + if (check_direct_IO(inode, rw, iter, offset)) return 0; - /* Needs synchronization with the cleaner */ - return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs, - get_data_block_ro); + /* clear fsync mark to recover these blocks */ + fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); + + return blockdev_direct_IO(rw, iocb, inode, iter, offset, + get_data_block); } -static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) +static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - if (S_ISDIR(inode->i_mode) && PageDirty(page)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + if (PageDirty(page)) inode_dec_dirty_dents(inode); - } ClearPagePrivate(page); } static int f2fs_release_data_page(struct page *page, gfp_t wait) { ClearPagePrivate(page); - return 0; + return 1; } static int f2fs_set_data_page_dirty(struct page *page) @@ -689,7 +1086,11 @@ static int f2fs_set_data_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; + trace_f2fs_set_page_dirty(page, DATA); + SetPageUptodate(page); + mark_inode_dirty(inode); + if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); set_dirty_dir_page(inode, page); @@ -700,7 +1101,12 @@ static int f2fs_set_data_page_dirty(struct page *page) static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { - return generic_block_bmap(mapping, block, get_data_block_ro); + struct inode *inode = mapping->host; + + if (f2fs_has_inline_data(inode)) + return 0; + + return generic_block_bmap(mapping, block, get_data_block); } const struct address_space_operations f2fs_dblock_aops = { @@ -709,7 +1115,7 @@ const struct address_space_operations f2fs_dblock_aops = { .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, - .write_end = nobh_write_end, + .write_end = f2fs_write_end, .set_page_dirty = f2fs_set_data_page_dirty, .invalidatepage = f2fs_invalidate_data_page, .releasepage = f2fs_release_data_page, diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 025b9e2f935..b52c12cf587 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -13,7 +13,6 @@ #include <linux/fs.h> #include <linux/backing-dev.h> -#include <linux/proc_fs.h> #include <linux/f2fs_fs.h> #include <linux/blkdev.h> #include <linux/debugfs.h> @@ -25,12 +24,12 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static struct dentry *debugfs_root; +static struct dentry *f2fs_debugfs_root; static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); int i; /* valid check of the segment numbers */ @@ -46,14 +45,15 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_count = valid_user_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); + si->inline_inode = sbi->inline_inode; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = sbi->node_inode->i_mapping->nrpages; - si->meta_pages = sbi->meta_inode->i_mapping->nrpages; + si->node_pages = NODE_MAPPING(sbi)->nrpages; + si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->sits = SIT_I(sbi)->dirty_sentries; si->fnids = NM_I(sbi)->fcnt; @@ -84,9 +84,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) */ static void update_sit_info(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; - struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, vblocks; int ndirty = 0; @@ -94,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) total_vblocks = 0; blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); hblks_per_sec = blks_per_sec / 2; - mutex_lock(&sit_i->sentry_lock); for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); dist = abs(vblocks - hblks_per_sec); @@ -105,8 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) ndirty++; } } - mutex_unlock(&sit_i->sentry_lock); - dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100; + dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) si->avg_vblocks = total_vblocks / ndirty; @@ -119,7 +116,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) */ static void update_mem_info(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned npages; if (si->base_mem) @@ -138,14 +135,13 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); if (sbi->segs_per_sec > 1) - si->base_mem += sbi->total_sections * - sizeof(struct sec_entry); + si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry); si->base_mem += __bitmap_size(sbi, SIT_BITMAP); /* build free segmap */ si->base_mem += sizeof(struct free_segmap_info); si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += f2fs_bitmap_size(sbi->total_sections); + si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; @@ -154,7 +150,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); /* buld nm */ si->base_mem += sizeof(struct f2fs_nm_info); @@ -167,9 +163,9 @@ get_cache: /* free nids */ si->cache_mem = NM_I(sbi)->fcnt; si->cache_mem += NM_I(sbi)->nat_cnt; - npages = sbi->node_inode->i_mapping->nrpages; + npages = NODE_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; - npages = sbi->meta_inode->i_mapping->nrpages; + npages = META_MAPPING(sbi)->nrpages; si->cache_mem += npages << PAGE_CACHE_SHIFT; si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); @@ -177,12 +173,12 @@ get_cache: static int stat_show(struct seq_file *s, void *v) { - struct f2fs_stat_info *si, *next; + struct f2fs_stat_info *si; int i = 0; int j; mutex_lock(&f2fs_stat_mutex); - list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { + list_for_each_entry(si, &f2fs_stat_list, stat_list) { char devname[BDEVNAME_SIZE]; update_general_status(si->sbi); @@ -202,6 +198,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "Other: %u)\n - Data: %u\n", si->valid_node_count - si->valid_inode_count, si->valid_count - si->valid_node_count); + seq_printf(s, " - Inline_data Inode: %u\n", + si->inline_inode); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -235,6 +233,7 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); + seq_printf(s, "CP calls: %d\n", si->cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d\n", si->data_segs); @@ -244,32 +243,32 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - node blocks : %d\n", si->node_blks); seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", si->hit_ext, si->total_ext); - seq_printf(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - nodes %4d in %4d\n", + seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents %4d in dirs:%4d\n", + seq_printf(s, " - dents: %4d in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta %4d in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs %5d > %lu\n", - si->nats, NM_WOUT_THRESHOLD); - seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n", - si->sits, si->fnids); - seq_printf(s, "\nDistribution of User Blocks:"); - seq_printf(s, " [ valid | invalid | free ]\n"); - seq_printf(s, " ["); + seq_printf(s, " - NATs: %9d\n - SITs: %9d\n", + si->nats, si->sits); + seq_printf(s, " - free_nids: %9d\n", + si->fnids); + seq_puts(s, "\nDistribution of User Blocks:"); + seq_puts(s, " [ valid | invalid | free ]\n"); + seq_puts(s, " ["); for (j = 0; j < si->util_valid; j++) - seq_printf(s, "-"); - seq_printf(s, "|"); + seq_putc(s, '-'); + seq_putc(s, '|'); for (j = 0; j < si->util_invalid; j++) - seq_printf(s, "-"); - seq_printf(s, "|"); + seq_putc(s, '-'); + seq_putc(s, '|'); for (j = 0; j < si->util_free; j++) - seq_printf(s, "-"); - seq_printf(s, "]\n\n"); + seq_putc(s, '-'); + seq_puts(s, "]\n\n"); seq_printf(s, "SSR: %u blocks in %u segments\n", si->block_count[SSR], si->segment_count[SSR]); seq_printf(s, "LFS: %u blocks in %u segments\n", @@ -307,11 +306,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; - sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); - if (!sbi->stat_info) + si = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL); + if (!si) return -ENOMEM; - si = sbi->stat_info; si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -321,6 +319,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); si->sbi = sbi; + sbi->stat_info = si; mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); @@ -331,25 +330,43 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { - struct f2fs_stat_info *si = sbi->stat_info; + struct f2fs_stat_info *si = F2FS_STAT(sbi); mutex_lock(&f2fs_stat_mutex); list_del(&si->stat_list); mutex_unlock(&f2fs_stat_mutex); - kfree(sbi->stat_info); + kfree(si); } void __init f2fs_create_root_stats(void) { - debugfs_root = debugfs_create_dir("f2fs", NULL); - if (debugfs_root) - debugfs_create_file("status", S_IRUGO, debugfs_root, - NULL, &stat_fops); + struct dentry *file; + + f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); + if (!f2fs_debugfs_root) + goto bail; + + file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, + NULL, &stat_fops); + if (!file) + goto free_debugfs_dir; + + return; + +free_debugfs_dir: + debugfs_remove(f2fs_debugfs_root); + +bail: + f2fs_debugfs_root = NULL; + return; } void f2fs_destroy_root_stats(void) { - debugfs_remove_recursive(debugfs_root); - debugfs_root = NULL; + if (!f2fs_debugfs_root) + return; + + debugfs_remove_recursive(f2fs_debugfs_root); + f2fs_debugfs_root = NULL; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a1f38443ece..a4addd72ebb 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -13,6 +13,7 @@ #include "f2fs.h" #include "node.h" #include "acl.h" +#include "xattr.h" static unsigned long dir_blocks(struct inode *inode) { @@ -20,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode) >> PAGE_CACHE_SHIFT; } -static unsigned int dir_buckets(unsigned int level) +static unsigned int dir_buckets(unsigned int level, int dir_level) { - if (level < MAX_DIR_HASH_DEPTH / 2) - return 1 << level; + if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) + return 1 << (level + dir_level); else - return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1); + return MAX_DIR_BUCKETS; } static unsigned int bucket_blocks(unsigned int level) @@ -60,17 +61,18 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } -static unsigned long dir_block_index(unsigned int level, unsigned int idx) +static unsigned long dir_block_index(unsigned int level, + int dir_level, unsigned int idx) { unsigned long i; unsigned long bidx = 0; for (i = 0; i < level; i++) - bidx += dir_buckets(i) * bucket_blocks(i); + bidx += dir_buckets(i, dir_level) * bucket_blocks(i); bidx += idx * bucket_blocks(level); return bidx; } @@ -92,16 +94,21 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; - unsigned long bit_pos, end_pos, next_pos; + unsigned long bit_pos = 0; struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); - int slots; + const void *dentry_bits = &dentry_blk->dentry_bitmap; + int max_len = 0; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, 0); while (bit_pos < NR_DENTRY_IN_BLOCK) { + if (!test_bit_le(bit_pos, dentry_bits)) { + if (bit_pos == 0) + max_len = 1; + else if (!test_bit_le(bit_pos - 1, dentry_bits)) + max_len++; + bit_pos++; + continue; + } de = &dentry_blk->dentry[bit_pos]; - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - if (early_match_name(name, namelen, namehash, de)) { if (!memcmp(dentry_blk->filename[bit_pos], name, namelen)) { @@ -109,20 +116,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, goto found; } } - next_pos = bit_pos + slots; - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, next_pos); - if (bit_pos >= NR_DENTRY_IN_BLOCK) - end_pos = NR_DENTRY_IN_BLOCK; - else - end_pos = bit_pos; - if (*max_slots < end_pos - next_pos) - *max_slots = end_pos - next_pos; + if (max_len > *max_slots) { + *max_slots = max_len; + max_len = 0; + } + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; kunmap(dentry_page); found: + if (max_len > *max_slots) + *max_slots = max_len; return de; } @@ -138,17 +143,18 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, bool room = false; int max_slots = 0; - BUG_ON(level > MAX_DIR_HASH_DEPTH); + f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + le32_to_cpu(namehash) % nbucket); end_block = bidx + nblock; for (; bidx < end_block; bidx++) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = find_data_page(dir, bidx); + dentry_page = find_data_page(dir, bidx, true); if (IS_ERR(dentry_page)) { room = true; continue; @@ -212,9 +218,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct page *page = NULL; - struct f2fs_dir_entry *de = NULL; - struct f2fs_dentry_block *dentry_blk = NULL; + struct page *page; + struct f2fs_dir_entry *de; + struct f2fs_dentry_block *dentry_blk; page = get_lock_data_page(dir, 0); if (IS_ERR(page)) @@ -246,11 +252,8 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - - mutex_lock_op(sbi, DENTRY_OPS); lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); kunmap(page); @@ -258,90 +261,148 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, dir->i_mtime = dir->i_ctime = CURRENT_TIME; mark_inode_dirty(dir); - /* update parent inode number before releasing dentry page */ - F2FS_I(inode)->i_pino = dir->i_ino; - f2fs_put_page(page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); } -void init_dent_inode(const struct qstr *name, struct page *ipage) +static void init_dent_inode(const struct qstr *name, struct page *ipage) { - struct f2fs_node *rn; - - if (IS_ERR(ipage)) - return; + struct f2fs_inode *ri; - wait_on_page_writeback(ipage); + f2fs_wait_on_page_writeback(ipage, NODE); /* copy name info. to this inode page */ - rn = (struct f2fs_node *)page_address(ipage); - rn->i.i_namelen = cpu_to_le32(name->len); - memcpy(rn->i.i_name, name->name, name->len); + ri = F2FS_INODE(ipage); + ri->i_namelen = cpu_to_le32(name->len); + memcpy(ri->i_name, name->name, name->len); set_page_dirty(ipage); } -static int init_inode_metadata(struct inode *inode, +int update_dent_inode(struct inode *inode, const struct qstr *name) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *page; + + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + init_dent_inode(name, page); + f2fs_put_page(page, 1); + + return 0; +} + +static int make_empty_dir(struct inode *inode, + struct inode *parent, struct page *page) +{ + struct page *dentry_page; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dir_entry *de; + void *kaddr; + + dentry_page = get_new_data_page(inode, page, 0, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + kaddr = kmap_atomic(dentry_page); + dentry_blk = (struct f2fs_dentry_block *)kaddr; + + de = &dentry_blk->dentry[0]; + de->name_len = cpu_to_le16(1); + de->hash_code = 0; + de->ino = cpu_to_le32(inode->i_ino); + memcpy(dentry_blk->filename[0], ".", 1); + set_de_type(de, inode); + + de = &dentry_blk->dentry[1]; + de->hash_code = 0; + de->name_len = cpu_to_le16(2); + de->ino = cpu_to_le32(parent->i_ino); + memcpy(dentry_blk->filename[1], "..", 2); + set_de_type(de, inode); + + test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); + test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); + kunmap_atomic(kaddr); + + set_page_dirty(dentry_page); + f2fs_put_page(dentry_page, 1); + return 0; +} + +static struct page *init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *name) { + struct page *page; + int err; + if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - int err; - err = new_inode_page(inode, name); - if (err) - return err; + page = new_inode_page(inode, name); + if (IS_ERR(page)) + return page; if (S_ISDIR(inode->i_mode)) { - err = f2fs_make_empty(inode, dir); - if (err) { - remove_inode_page(inode); - return err; - } + err = make_empty_dir(inode, dir, page); + if (err) + goto error; } - err = f2fs_init_acl(inode, dir); - if (err) { - remove_inode_page(inode); - return err; - } + err = f2fs_init_acl(inode, dir, page); + if (err) + goto put_error; + + err = f2fs_init_security(inode, dir, name, page); + if (err) + goto put_error; } else { - struct page *ipage; - ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); - set_cold_node(inode, ipage); - init_dent_inode(name, ipage); - f2fs_put_page(ipage, 1); + page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); + if (IS_ERR(page)) + return page; + + set_cold_node(inode, page); } + + init_dent_inode(name, page); + + /* + * This file should be checkpointed during fsync. + * We lost i_pino from now on. + */ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + file_lost_pino(inode); inc_nlink(inode); - f2fs_write_inode(inode, NULL); } - return 0; + return page; + +put_error: + f2fs_put_page(page, 1); +error: + /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ + truncate_inode_pages(&inode->i_data, 0); + truncate_blocks(inode, 0); + remove_dirty_dir_inode(inode); + remove_inode_page(inode); + return ERR_PTR(err); } static void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - bool need_dir_update = false; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { if (S_ISDIR(inode->i_mode)) { inc_nlink(dir); - need_dir_update = true; + set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + if (F2FS_I(dir)->i_current_depth != current_depth) { F2FS_I(dir)->i_current_depth = current_depth; - need_dir_update = true; + set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } - if (need_dir_update) - f2fs_write_inode(dir, NULL); - else - mark_inode_dirty(dir); - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); } @@ -370,7 +431,12 @@ next: goto next; } -int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode) { unsigned int bit_pos; unsigned int level; @@ -379,11 +445,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in f2fs_hash_t dentry_hash; struct f2fs_dir_entry *de; unsigned int nbucket, nblock; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); size_t namelen = name->len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; int slots = GET_DENTRY_SLOTS(namelen); + struct page *page; int err = 0; int i; @@ -396,25 +462,23 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in } start: - if (current_depth == MAX_DIR_HASH_DEPTH) + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) ++current_depth; - nbucket = dir_buckets(level); + nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); - bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); + bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level, + (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { - mutex_lock_op(sbi, DENTRY_OPS); - dentry_page = get_new_data_page(dir, block, true); - if (IS_ERR(dentry_page)) { - mutex_unlock_op(sbi, DENTRY_OPS); + dentry_page = get_new_data_page(dir, NULL, block, true); + if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - } dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(dentry_blk, slots); @@ -423,19 +487,20 @@ start: kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); } /* Move to next level to find the empty slot for new dentry */ ++level; goto start; add_dentry: - err = init_inode_metadata(inode, dir, name); - if (err) - goto fail; - - wait_on_page_writeback(dentry_page); + f2fs_wait_on_page_writeback(dentry_page, DATA); + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, name); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } de = &dentry_blk->dentry[bit_pos]; de->hash_code = dentry_hash; de->name_len = cpu_to_le16(namelen); @@ -446,14 +511,21 @@ add_dentry: test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); set_page_dirty(dentry_page); - update_parent_metadata(dir, inode, current_depth); - - /* update parent inode number before releasing dentry page */ + /* we don't need to mark_inode_dirty now */ F2FS_I(inode)->i_pino = dir->i_ino; + update_inode(inode, page); + f2fs_put_page(page, 1); + + update_parent_metadata(dir, inode, current_depth); fail: + up_write(&F2FS_I(inode)->i_sem); + + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { + update_inode_page(dir); + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); return err; } @@ -468,15 +540,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, unsigned int bit_pos; struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); void *kaddr = page_address(page); int i; - mutex_lock_op(sbi, DENTRY_OPS); - lock_page(page); - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); dentry_blk = (struct f2fs_dentry_block *)kaddr; bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; @@ -492,72 +561,37 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; - if (inode && S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - f2fs_write_inode(dir, NULL); - } else { - mark_inode_dirty(dir); - } - if (inode) { + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + + down_write(&F2FS_I(inode)->i_sem); + + if (S_ISDIR(inode->i_mode)) { + drop_nlink(dir); + update_inode_page(dir); + } inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { drop_nlink(inode); i_size_write(inode, 0); } - f2fs_write_inode(inode, NULL); + up_write(&F2FS_I(inode)->i_sem); + update_inode_page(inode); + if (inode->i_nlink == 0) add_orphan_inode(sbi, inode->i_ino); + else + release_orphan_inode(sbi); } if (bit_pos == NR_DENTRY_IN_BLOCK) { truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(dir); } f2fs_put_page(page, 1); - - mutex_unlock_op(sbi, DENTRY_OPS); -} - -int f2fs_make_empty(struct inode *inode, struct inode *parent) -{ - struct page *dentry_page; - struct f2fs_dentry_block *dentry_blk; - struct f2fs_dir_entry *de; - void *kaddr; - - dentry_page = get_new_data_page(inode, 0, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); - - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; - - de = &dentry_blk->dentry[0]; - de->name_len = cpu_to_le16(1); - de->hash_code = f2fs_dentry_hash(".", 1); - de->ino = cpu_to_le32(inode->i_ino); - memcpy(dentry_blk->filename[0], ".", 1); - set_de_type(de, inode); - - de = &dentry_blk->dentry[1]; - de->hash_code = f2fs_dentry_hash("..", 2); - de->name_len = cpu_to_le16(2); - de->ino = cpu_to_le32(parent->i_ino); - memcpy(dentry_blk->filename[1], "..", 2); - set_de_type(de, inode); - - test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); - test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); - kunmap_atomic(kaddr); - - set_page_dirty(dentry_page); - f2fs_put_page(dentry_page, 1); - return 0; } bool f2fs_empty_dir(struct inode *dir) @@ -597,34 +631,32 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) +static int f2fs_readdir(struct file *file, struct dir_context *ctx) { - unsigned long pos = file->f_pos; struct inode *inode = file_inode(file); unsigned long npages = dir_blocks(inode); - unsigned char *types = NULL; - unsigned int bit_pos = 0, start_bit_pos = 0; - int over = 0; + unsigned int bit_pos = 0; struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dir_entry *de = NULL; struct page *dentry_page = NULL; - unsigned int n = 0; + struct file_ra_state *ra = &file->f_ra; + unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); unsigned char d_type = DT_UNKNOWN; - int slots; - types = f2fs_filetype_table; - bit_pos = (pos % NR_DENTRY_IN_BLOCK); - n = (pos / NR_DENTRY_IN_BLOCK); + bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); + + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, + min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - for ( ; n < npages; n++) { + for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n); if (IS_ERR(dentry_page)) continue; - start_bit_pos = bit_pos; dentry_blk = kmap(dentry_page); while (bit_pos < NR_DENTRY_IN_BLOCK) { - d_type = DT_UNKNOWN; bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, bit_pos); @@ -632,28 +664,26 @@ static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir) break; de = &dentry_blk->dentry[bit_pos]; - if (types && de->file_type < F2FS_FT_MAX) - d_type = types[de->file_type]; - - over = filldir(dirent, + if (de->file_type < F2FS_FT_MAX) + d_type = f2fs_filetype_table[de->file_type]; + else + d_type = DT_UNKNOWN; + if (!dir_emit(ctx, dentry_blk->filename[bit_pos], le16_to_cpu(de->name_len), - (n * NR_DENTRY_IN_BLOCK) + bit_pos, - le32_to_cpu(de->ino), d_type); - if (over) { - file->f_pos += bit_pos - start_bit_pos; - goto success; - } - slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - bit_pos += slots; + le32_to_cpu(de->ino), d_type)) + goto stop; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos; } bit_pos = 0; - file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK; + ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); dentry_page = NULL; } -success: +stop: if (dentry_page && !IS_ERR(dentry_page)) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); @@ -665,7 +695,7 @@ success: const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = f2fs_readdir, + .iterate = f2fs_readdir, .fsync = f2fs_sync_file, .unlocked_ioctl = f2fs_ioctl, }; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cc2213afdcc..58df97e174d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -17,6 +17,16 @@ #include <linux/slab.h> #include <linux/crc32.h> #include <linux/magic.h> +#include <linux/kobject.h> +#include <linux/sched.h> + +#ifdef CONFIG_F2FS_CHECK_FS +#define f2fs_bug_on(condition) BUG_ON(condition) +#define f2fs_down_write(x, y) down_write_nest_lock(x, y) +#else +#define f2fs_bug_on(condition) +#define f2fs_down_write(x, y) down_write(x) +#endif /* * For mount options @@ -28,6 +38,9 @@ #define F2FS_MOUNT_XATTR_USER 0x00000010 #define F2FS_MOUNT_POSIX_ACL 0x00000020 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 +#define F2FS_MOUNT_INLINE_XATTR 0x00000080 +#define F2FS_MOUNT_INLINE_DATA 0x00000100 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -37,21 +50,35 @@ typecheck(unsigned long long, b) && \ ((long long)((a) - (b)) > 0)) -typedef u64 block_t; +typedef u32 block_t; /* + * should not change u32, since it is the on-disk block + * address format, __le32. + */ typedef u32 nid_t; struct f2fs_mount_info { unsigned int opt; }; -static inline __u32 f2fs_crc32(void *buff, size_t len) +#define CRCPOLY_LE 0xedb88320 + +static inline __u32 f2fs_crc32(void *buf, size_t len) { - return crc32_le(F2FS_SUPER_MAGIC, buff, len); + unsigned char *p = (unsigned char *)buf; + __u32 crc = F2FS_SUPER_MAGIC; + int i; + + while (len--) { + crc ^= *p++; + for (i = 0; i < 8; i++) + crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); + } + return crc; } -static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size) +static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) { - return f2fs_crc32(buff, buff_size) == blk_crc; + return f2fs_crc32(buf, buf_size) == blk_crc; } /* @@ -62,6 +89,16 @@ enum { SIT_BITMAP }; +/* + * For CP/NAT/SIT/SSA readahead + */ +enum { + META_CP, + META_NAT, + META_SIT, + META_SSA +}; + /* for the list of orphan inodes */ struct orphan_inode_entry { struct list_head list; /* list head */ @@ -74,6 +111,13 @@ struct dir_inode_entry { struct inode *inode; /* vfs inode pointer */ }; +/* for the list of blockaddresses to be discarded */ +struct discard_entry { + struct list_head list; /* list head */ + block_t blkaddr; /* block address to be discarded */ + int len; /* # of consecutive blocks of the discard */ +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ @@ -120,47 +164,63 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) /* * For INODE and NODE manager */ -#define XATTR_NODE_OFFSET (-1) /* - * store xattrs to one node block per - * file keeping -1 as its node offset to - * distinguish from index node blocks. - */ -#define RDONLY_NODE 1 /* - * specify a read-only mode when getting - * a node block. 0 is read-write mode. - * used by get_dnode_of_data(). +/* + * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1 + * as its node offset to distinguish from index node blocks. + * But some bits are used to mark the node block. + */ +#define XATTR_NODE_OFFSET ((((unsigned int)-1) << OFFSET_BIT_SHIFT) \ + >> OFFSET_BIT_SHIFT) +enum { + ALLOC_NODE, /* allocate a new node page if needed */ + LOOKUP_NODE, /* look up a node without readahead */ + LOOKUP_NODE_RA, /* + * look up a node with readahead called + * by get_data_block. */ +}; + #define F2FS_LINK_MAX 32000 /* maximum link count per file */ +#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ + /* for in-memory extent cache entry */ +#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */ + struct extent_info { rwlock_t ext_lock; /* rwlock for consistency */ unsigned int fofs; /* start offset in a file */ u32 blk_addr; /* start block address of the extent */ - unsigned int len; /* lenth of the extent */ + unsigned int len; /* length of the extent */ }; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. */ #define FADVISE_COLD_BIT 0x01 +#define FADVISE_LOST_PINO_BIT 0x02 + +#define DEF_DIR_LEVEL 0 struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ unsigned char i_advise; /* use to give file attribute hints */ + unsigned char i_dir_level; /* use for dentry level for large dir */ unsigned int i_current_depth; /* use only in directory structure */ unsigned int i_pino; /* parent inode number */ umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ - unsigned long long data_version;/* latest version of data for fsync */ + struct rw_semaphore i_sem; /* protect fi info */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ + unsigned long long xattr_ver; /* cp version of xattr modification */ struct extent_info ext; /* in-memory extent cache entry */ + struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */ }; static inline void get_extent_info(struct extent_info *ext, @@ -186,8 +246,9 @@ static inline void set_raw_extent(struct extent_info *ext, struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t init_scan_nid; /* the first nid to be scanned */ + nid_t available_nids; /* maximum available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ + unsigned int ram_thresh; /* control the memory footprint */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -197,6 +258,7 @@ struct f2fs_nm_info { struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ /* free node ids management */ + struct radix_tree_root free_nid_root;/* root of the free_nid cache */ struct list_head free_nid_list; /* a list for free nids */ spinlock_t free_nid_list_lock; /* protect free nid list */ unsigned int fcnt; /* the number of free node id */ @@ -259,15 +321,27 @@ enum { NO_CHECK_TYPE }; +struct flush_cmd { + struct flush_cmd *next; + struct completion wait; + int ret; +}; + +struct flush_cmd_control { + struct task_struct *f2fs_issue_flush; /* flush thread */ + wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + struct flush_cmd *issue_list; /* list for command issue */ + struct flush_cmd *dispatch_list; /* list for command dispatch */ + spinlock_t issue_lock; /* for issue list lock */ + struct flush_cmd *issue_tail; /* list tail of issue list */ +}; + struct f2fs_sm_info { struct sit_info *sit_info; /* whole segment information */ struct free_segmap_info *free_info; /* free segment information */ struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ - struct list_head wblist_head; /* list of under-writeback pages */ - spinlock_t wblist_lock; /* lock for checkpoint */ - block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ @@ -276,16 +350,22 @@ struct f2fs_sm_info { unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ unsigned int ovp_segments; /* # of overprovision segments */ -}; -/* - * For directory operation - */ -#define NODE_DIR1_BLOCK (ADDRS_PER_INODE + 1) -#define NODE_DIR2_BLOCK (ADDRS_PER_INODE + 2) -#define NODE_IND1_BLOCK (ADDRS_PER_INODE + 3) -#define NODE_IND2_BLOCK (ADDRS_PER_INODE + 4) -#define NODE_DIND_BLOCK (ADDRS_PER_INODE + 5) + /* a threshold to reclaim prefree segments */ + unsigned int rec_prefree_segments; + + /* for small discard management */ + struct list_head discard_list; /* 4KB discard list */ + int nr_discards; /* # of discards in the list */ + int max_discards; /* max. discards to be issued */ + + unsigned int ipu_policy; /* in-place-update policy */ + unsigned int min_ipu_util; /* in-place-update threshold */ + + /* for flush command control */ + struct flush_cmd_control *cmd_control_info; + +}; /* * For superblock @@ -305,25 +385,6 @@ enum count_type { }; /* - * FS_LOCK nesting subclasses for the lock validator: - * - * The locking order between these classes is - * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW - * -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC - */ -enum lock_type { - RENAME, /* for renaming operations */ - DENTRY_OPS, /* for directory operations */ - DATA_WRITE, /* for data write */ - DATA_NEW, /* for data allocation */ - DATA_TRUNC, /* for data truncate */ - NODE_NEW, /* for node allocation */ - NODE_TRUNC, /* for node truncate */ - NODE_WRITE, /* for node write */ - NR_LOCK_TYPE, -}; - -/* * The below are the page types of bios used in submti_bio(). * The available types are: * DATA User data pages. It operates as async mode. @@ -334,6 +395,7 @@ enum lock_type { * with waiting the bio's completion * ... Only can be used with META. */ +#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type)) enum page_type { DATA, NODE, @@ -342,8 +404,23 @@ enum page_type { META_FLUSH, }; +struct f2fs_io_info { + enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ +}; + +#define is_read_io(rw) (((rw) & 1) == READ) +struct f2fs_bio_info { + struct f2fs_sb_info *sbi; /* f2fs superblock */ + struct bio *bio; /* bios to merge */ + sector_t last_block_in_bio; /* last block number */ + struct f2fs_io_info fio; /* store buffered io info. */ + struct rw_semaphore io_rwsem; /* blocking op for bio */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ + struct proc_dir_entry *s_proc; /* proc entry */ struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ int s_dirty; /* dirty flag for checkpoint */ @@ -354,28 +431,31 @@ struct f2fs_sb_info { /* for segment-related operations */ struct f2fs_sm_info *sm_info; /* segment manager */ - struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */ - sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */ - struct rw_semaphore bio_sem; /* IO semaphore */ + + /* for bio operations */ + struct f2fs_bio_info read_io; /* for read bios */ + struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct completion *wait_io; /* for completion bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ - struct mutex cp_mutex; /* for checkpoint procedure */ - struct mutex fs_lock[NR_LOCK_TYPE]; /* for blocking FS operations */ - struct mutex write_inode; /* mutex for write inode */ + struct mutex cp_mutex; /* checkpoint procedure lock */ + struct rw_semaphore cp_rwsem; /* blocking FS operations */ + struct mutex node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ - int por_doing; /* recovery is doing or not */ + bool por_doing; /* recovery is doing or not */ + wait_queue_head_t cp_wait; /* for orphan inode management */ struct list_head orphan_inode_list; /* orphan inode list */ - struct mutex orphan_inode_mutex; /* for orphan inode list */ + spinlock_t orphan_inode_lock; /* for orphan inode list */ unsigned int n_orphans; /* # of orphan inodes */ + unsigned int max_orphans; /* max orphan inodes */ /* for directory inode management */ struct list_head dir_inode_list; /* dir inode list */ spinlock_t dir_inode_lock; /* for dir inode list lock */ - unsigned int n_dirty_dirs; /* # of dir inodes */ /* basic file system units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ @@ -393,6 +473,7 @@ struct f2fs_sb_info { unsigned int total_valid_node_count; /* valid node block count */ unsigned int total_valid_inode_count; /* valid inode count */ int active_logs; /* # of active logs */ + int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ @@ -406,18 +487,30 @@ struct f2fs_sb_info { /* for cleaning operations */ struct mutex gc_mutex; /* mutex for GC */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ + unsigned int cur_victim_sec; /* current victim section num */ + + /* maximum # of trials to find a victim segment for SSR and GC */ + unsigned int max_victim_search; /* * for stat information. * one is for the LFS mode, and the other is for the SSR mode. */ +#ifdef CONFIG_F2FS_STAT_FS struct f2fs_stat_info *stat_info; /* FS status information */ unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ - unsigned int last_victim[2]; /* last victim segment # */ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ + int inline_inode; /* # of inline_data inodes */ int bg_gc; /* background gc calls */ + unsigned int n_dirty_dirs; /* # of dir inodes */ +#endif + unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ + + /* For sysfs suppport */ + struct kobject s_kobj; + struct completion s_kobj_unregister; }; /* @@ -443,6 +536,16 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) return (struct f2fs_checkpoint *)(sbi->ckpt); } +static inline struct f2fs_node *F2FS_NODE(struct page *page) +{ + return (struct f2fs_node *)page_address(page); +} + +static inline struct f2fs_inode *F2FS_INODE(struct page *page) +{ + return &((struct f2fs_node *)page_address(page))->i; +} + static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi) { return (struct f2fs_nm_info *)(sbi->nm_info); @@ -468,6 +571,16 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi) return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info); } +static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->meta_inode->i_mapping; +} + +static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->node_inode->i_mapping; +} + static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) { sbi->s_dirty = 1; @@ -478,6 +591,11 @@ static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi) sbi->s_dirty = 0; } +static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) +{ + return le64_to_cpu(cp->checkpoint_ver); +} + static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); @@ -498,22 +616,36 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t) +static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - mutex_lock_nested(&sbi->fs_lock[t], t); + down_read(&sbi->cp_rwsem); } -static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t) +static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - mutex_unlock(&sbi->fs_lock[t]); + up_read(&sbi->cp_rwsem); +} + +static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) +{ + f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); +} + +static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) +{ + up_write(&sbi->cp_rwsem); } /* * Check whether the given nid is within node id range. */ -static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - BUG_ON((nid >= NM_I(sbi)->max_nid)); + if (unlikely(nid < F2FS_ROOT_INO(sbi))) + return -EINVAL; + if (unlikely(nid >= NM_I(sbi)->max_nid)) + return -EINVAL; + return 0; } #define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 @@ -524,9 +656,14 @@ static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) static inline int F2FS_HAS_BLOCKS(struct inode *inode) { if (F2FS_I(inode)->i_xattr_nid) - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1); + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; else - return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS); + return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; +} + +static inline bool f2fs_has_xattr_block(unsigned int ofs) +{ + return ofs == XATTR_NODE_OFFSET; } static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, @@ -537,7 +674,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + (block_t)count; - if (valid_block_count > sbi->user_block_count) { + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } @@ -548,17 +685,16 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, return true; } -static inline int dec_valid_block_count(struct f2fs_sb_info *sbi, +static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t count) { spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_block_count < (block_t) count); - BUG_ON(inode->i_blocks < count); + f2fs_bug_on(sbi->total_valid_block_count < (block_t) count); + f2fs_bug_on(inode->i_blocks < count); inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - return 0; } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -569,6 +705,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_dents(struct inode *inode) { + inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_inc(&F2FS_I(inode)->dirty_dents); } @@ -579,6 +716,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_dec_dirty_dents(struct inode *inode) { + if (!S_ISDIR(inode->i_mode)) + return; + + dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); atomic_dec(&F2FS_I(inode)->dirty_dents); } @@ -587,6 +728,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } +static inline int get_dirty_dents(struct inode *inode) +{ + return atomic_read(&F2FS_I(inode)->dirty_dents); +} + static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { unsigned int pages_per_sec = sbi->segs_per_sec * @@ -597,11 +743,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) { - block_t ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_block_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_block_count; } static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) @@ -620,16 +762,25 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - int offset = (flag == NAT_BITMAP) ? + int offset; + + if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) { + if (flag == NAT_BITMAP) + return &ckpt->sit_nat_version_bitmap; + else + return ((unsigned char *)ckpt + F2FS_BLKSIZE); + } else { + offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; - return &ckpt->sit_nat_version_bitmap + offset; + return &ckpt->sit_nat_version_bitmap + offset; + } } static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { block_t start_addr; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver); + unsigned long long ckpt_version = cur_cp_version(ckpt); start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); @@ -649,96 +800,85 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) } static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode) { block_t valid_block_count; unsigned int valid_node_count; spin_lock(&sbi->stat_lock); - valid_block_count = sbi->total_valid_block_count + (block_t)count; - sbi->alloc_valid_block_count += (block_t)count; - valid_node_count = sbi->total_valid_node_count + count; - - if (valid_block_count > sbi->user_block_count) { + valid_block_count = sbi->total_valid_block_count + 1; + if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); return false; } - if (valid_node_count > sbi->total_node_count) { + valid_node_count = sbi->total_valid_node_count + 1; + if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); return false; } if (inode) - inode->i_blocks += count; - sbi->total_valid_node_count = valid_node_count; - sbi->total_valid_block_count = valid_block_count; + inode->i_blocks++; + + sbi->alloc_valid_block_count++; + sbi->total_valid_node_count++; + sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); return true; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode, - unsigned int count) + struct inode *inode) { spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_block_count < count); - BUG_ON(sbi->total_valid_node_count < count); - BUG_ON(inode->i_blocks < count); + f2fs_bug_on(!sbi->total_valid_block_count); + f2fs_bug_on(!sbi->total_valid_node_count); + f2fs_bug_on(!inode->i_blocks); - inode->i_blocks -= count; - sbi->total_valid_node_count -= count; - sbi->total_valid_block_count -= (block_t)count; + inode->i_blocks--; + sbi->total_valid_node_count--; + sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_node_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_node_count; } static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); - BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count); + f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count); sbi->total_valid_inode_count++; spin_unlock(&sbi->stat_lock); } -static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi) +static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); - BUG_ON(!sbi->total_valid_inode_count); + f2fs_bug_on(!sbi->total_valid_inode_count); sbi->total_valid_inode_count--; spin_unlock(&sbi->stat_lock); - return 0; } static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) { - unsigned int ret; - spin_lock(&sbi->stat_lock); - ret = sbi->total_valid_inode_count; - spin_unlock(&sbi->stat_lock); - return ret; + return sbi->total_valid_inode_count; } static inline void f2fs_put_page(struct page *page, int unlock) { - if (!page || IS_ERR(page)) + if (!page) return; if (unlock) { - BUG_ON(!PageLocked(page)); + f2fs_bug_on(!PageLocked(page)); unlock_page(page); } page_cache_release(page); @@ -755,16 +895,30 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn) } static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, - size_t size, void (*ctor)(void *)) + size_t size) { - return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor); + return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); +} + +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags) +{ + void *entry; +retry: + entry = kmem_cache_alloc(cachep, flags); + if (!entry) { + cond_resched(); + goto retry; + } + + return entry; } #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) static inline bool IS_INODE(struct page *page) { - struct f2fs_node *p = (struct f2fs_node *)page_address(page); + struct f2fs_node *p = F2FS_NODE(page); return RAW_IS_INODE(p); } @@ -778,7 +932,7 @@ static inline block_t datablock_addr(struct page *node_page, { struct f2fs_node *raw_node; __le32 *addr_array; - raw_node = (struct f2fs_node *)page_address(node_page); + raw_node = F2FS_NODE(node_page); addr_array = blkaddr_in_node(raw_node); return le32_to_cpu(addr_array[offset]); } @@ -819,10 +973,16 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr) /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ - FI_NEED_CP, /* need to do checkpoint during fsync */ + FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ + FI_UPDATE_DIR, /* should update inode block for consistency */ + FI_DELAY_IPUT, /* used for the recovery */ + FI_NO_EXTENT, /* not to use the extent cache */ + FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -855,14 +1015,96 @@ static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag) return 0; } +static inline void get_inline_info(struct f2fs_inode_info *fi, + struct f2fs_inode *ri) +{ + if (ri->i_inline & F2FS_INLINE_XATTR) + set_inode_flag(fi, FI_INLINE_XATTR); + if (ri->i_inline & F2FS_INLINE_DATA) + set_inode_flag(fi, FI_INLINE_DATA); +} + +static inline void set_raw_inline(struct f2fs_inode_info *fi, + struct f2fs_inode *ri) +{ + ri->i_inline = 0; + + if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + ri->i_inline |= F2FS_INLINE_XATTR; + if (is_inode_flag_set(fi, FI_INLINE_DATA)) + ri->i_inline |= F2FS_INLINE_DATA; +} + +static inline int f2fs_has_inline_xattr(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); +} + +static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) +{ + if (f2fs_has_inline_xattr(&fi->vfs_inode)) + return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; + return DEF_ADDRS_PER_INODE; +} + +static inline void *inline_xattr_addr(struct page *page) +{ + struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - + F2FS_INLINE_XATTR_ADDRS]); +} + +static inline int inline_xattr_size(struct inode *inode) +{ + if (f2fs_has_inline_xattr(inode)) + return F2FS_INLINE_XATTR_ADDRS << 2; + else + return 0; +} + +static inline int f2fs_has_inline_data(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); +} + +static inline void *inline_data_addr(struct page *page) +{ + struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[1]); +} + +static inline int f2fs_readonly(struct super_block *sb) +{ + return sb->s_flags & MS_RDONLY; +} + +static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) +{ + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; +} + +#define get_inode_mode(i) \ + ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) + +/* get offset of first page in next direct node */ +#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ + ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \ + (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ + ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) + /* * file.c */ int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); +int truncate_blocks(struct inode *, u64); void f2fs_truncate(struct inode *); +int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); +int truncate_data_blocks_range(struct dnode_of_data *, int); long f2fs_ioctl(struct file *, unsigned int, unsigned long); long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); @@ -871,7 +1113,9 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); */ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); +int try_to_free_nats(struct f2fs_sb_info *, int); void update_inode(struct inode *, struct page *); +void update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); @@ -889,7 +1133,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); ino_t f2fs_inode_by_name(struct inode *, struct qstr *); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); -void init_dent_inode(const struct qstr *, struct page *); +int update_dent_inode(struct inode *, const struct qstr *); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); @@ -919,13 +1163,18 @@ f2fs_hash_t f2fs_dentry_hash(const char *, size_t); struct dnode_of_data; struct node_info; +bool available_free_memory(struct f2fs_sb_info *, int); int is_checkpointed_node(struct f2fs_sb_info *, nid_t); +bool fsync_mark_done(struct f2fs_sb_info *, nid_t); +void fsync_mark_clear(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); -int remove_inode_page(struct inode *); -int new_inode_page(struct inode *, const struct qstr *); -struct page *new_node_page(struct dnode_of_data *, unsigned int); +int truncate_xattr_node(struct inode *, struct page *); +int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); +void remove_inode_page(struct inode *); +struct page *new_inode_page(struct inode *, const struct qstr *); +struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); @@ -936,6 +1185,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); void recover_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, struct node_info *, block_t); +bool recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, struct f2fs_summary_block *); @@ -949,45 +1199,55 @@ void destroy_node_manager_caches(void); * segment.c */ void f2fs_balance_fs(struct f2fs_sb_info *); +void f2fs_balance_fs_bg(struct f2fs_sb_info *); +int f2fs_issue_flush(struct f2fs_sb_info *); +int create_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); -void locate_dirty_segment(struct f2fs_sb_info *, unsigned int); +void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); +void discard_next_dnode(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -struct bio *f2fs_bio_alloc(struct block_device *, int); -void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync); void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int, - block_t, block_t *); -void write_data_page(struct inode *, struct page *, struct dnode_of_data*, - block_t, block_t *); -void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t); +void write_node_page(struct f2fs_sb_info *, struct page *, + struct f2fs_io_info *, unsigned int, block_t, block_t *); +void write_data_page(struct page *, struct dnode_of_data *, block_t *, + struct f2fs_io_info *); +void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); void rewrite_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); +void allocate_data_block(struct f2fs_sb_info *, struct page *, + block_t, block_t *, struct f2fs_summary *, int); +void f2fs_wait_on_page_writeback(struct page *, enum page_type); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, int, unsigned int, int); void flush_sit_entries(struct f2fs_sb_info *); int build_segment_manager(struct f2fs_sb_info *); -void reset_victim_segmap(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); +int __init create_segment_manager_caches(void); +void destroy_segment_manager_caches(void); /* * checkpoint.c */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +int ra_meta_pages(struct f2fs_sb_info *, int, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -int check_orphan_space(struct f2fs_sb_info *); +int acquire_orphan_inode(struct f2fs_sb_info *); +void release_orphan_inode(struct f2fs_sb_info *); void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); +void recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void set_dirty_dir_page(struct inode *, struct page *); +void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); void sync_dirty_dir_inodes(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool); @@ -998,20 +1258,25 @@ void destroy_checkpoint_caches(void); /* * data.c */ +void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); +int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int); +void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t, + struct f2fs_io_info *); int reserve_new_block(struct dnode_of_data *); +int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); void update_extent_cache(block_t, struct dnode_of_data *); -struct page *find_data_page(struct inode *, pgoff_t); +struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); -struct page *get_new_data_page(struct inode *, pgoff_t, bool); -int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); -int do_write_data_page(struct page *); +struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); +int do_write_data_page(struct page *, struct f2fs_io_info *); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); /* * gc.c */ int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int); +block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); int __init create_gc_caches(void); @@ -1020,7 +1285,7 @@ void destroy_gc_caches(void); /* * recovery.c */ -void recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *); bool space_for_roll_forward(struct f2fs_sb_info *); /* @@ -1037,13 +1302,13 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, sits, fnids; int total_count, utilization; - int bg_gc; + int bg_gc, inline_inode; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count; + int prefree_count, call_count, cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int tot_blks, data_blks, node_blks; int curseg[NR_CURSEG_TYPE]; @@ -1055,11 +1320,37 @@ struct f2fs_stat_info { unsigned base_mem, cache_mem; }; -#define stat_inc_call_count(si) ((si)->call_count++) +static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) +{ + return (struct f2fs_stat_info *)sbi->stat_info; +} + +#define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_call_count(si) ((si)->call_count++) +#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) +#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++) +#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++) +#define stat_inc_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + ((F2FS_SB(inode->i_sb))->inline_inode++); \ + } while (0) +#define stat_dec_inline_inode(inode) \ + do { \ + if (f2fs_has_inline_data(inode)) \ + ((F2FS_SB(inode->i_sb))->inline_inode--); \ + } while (0) + +#define stat_inc_seg_type(sbi, curseg) \ + ((sbi)->segment_count[(curseg)->alloc_type]++) +#define stat_inc_block_count(sbi, curseg) \ + ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_seg_count(sbi, type) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ (si)->tot_segs++; \ if (type == SUM_TYPE_DATA) \ si->data_segs++; \ @@ -1072,14 +1363,14 @@ struct f2fs_stat_info { #define stat_inc_data_blk_count(sbi, blks) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ } while (0) #define stat_inc_node_blk_count(sbi, blks) \ do { \ - struct f2fs_stat_info *si = sbi->stat_info; \ + struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ } while (0) @@ -1089,7 +1380,17 @@ void f2fs_destroy_stats(struct f2fs_sb_info *); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else +#define stat_inc_cp_count(si) #define stat_inc_call_count(si) +#define stat_inc_bggc_count(si) +#define stat_inc_dirty_dir(sbi) +#define stat_dec_dirty_dir(sbi) +#define stat_inc_total_hit(sb) +#define stat_inc_read_hit(sb) +#define stat_inc_inline_inode(inode) +#define stat_dec_inline_inode(inode) +#define stat_inc_seg_type(sbi, curseg) +#define stat_inc_block_count(sbi, curseg) #define stat_inc_seg_count(si, type) #define stat_inc_tot_blk_count(si, blks) #define stat_inc_data_blk_count(si, blks) @@ -1110,4 +1411,14 @@ extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; + +/* + * inline.c + */ +bool f2fs_may_inline(struct inode *); +int f2fs_read_inline_data(struct inode *, struct page *); +int f2fs_convert_inline_data(struct inode *, pgoff_t); +int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); +void truncate_inline_data(struct inode *, u64); +int recover_inline_data(struct inode *, struct page *); #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 958a46da19a..7d8b9627509 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -13,17 +13,20 @@ #include <linux/stat.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/blkdev.h> #include <linux/falloc.h> #include <linux/types.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/mount.h> +#include <linux/pagevec.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "xattr.h" #include "acl.h" +#include <trace/events/f2fs.h> static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -31,7 +34,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - block_t old_blk_addr; struct dnode_of_data dn; int err; @@ -39,34 +41,19 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, sb_start_pagefault(inode->i_sb); - mutex_lock_op(sbi, DATA_NEW); - /* block allocation */ + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, 0); - if (err) { - mutex_unlock_op(sbi, DATA_NEW); + err = f2fs_reserve_block(&dn, page->index); + f2fs_unlock_op(sbi); + if (err) goto out; - } - - old_blk_addr = dn.data_blkaddr; - - if (old_blk_addr == NULL_ADDR) { - err = reserve_new_block(&dn); - if (err) { - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); - goto out; - } - } - f2fs_put_dnode(&dn); - - mutex_unlock_op(sbi, DATA_NEW); + file_update_time(vma->vm_file); lock_page(page); - if (page->mapping != inode->i_mapping || - page_offset(page) >= i_size_read(inode) || - !PageUptodate(page)) { + if (unlikely(page->mapping != inode->i_mapping || + page_offset(page) > i_size_read(inode) || + !PageUptodate(page))) { unlock_page(page); err = -EFAULT; goto out; @@ -76,10 +63,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, * check to see if the page is mapped already (no holes) */ if (PageMappedToDisk(page)) - goto out; - - /* fill the page */ - wait_on_page_writeback(page); + goto mapped; /* page is wholly or partially inside EOF */ if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { @@ -90,7 +74,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, set_page_dirty(page); SetPageUptodate(page); - file_update_time(vma->vm_file); + trace_f2fs_vm_page_mkwrite(page, DATA); +mapped: + /* fill the page */ + f2fs_wait_on_page_writeback(page, DATA); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(err); @@ -98,32 +85,36 @@ out: static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; -static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) +static int get_parent_ino(struct inode *inode, nid_t *pino) { struct dentry *dentry; - nid_t pino; inode = igrab(inode); dentry = d_find_any_alias(inode); - if (!dentry) { - iput(inode); + iput(inode); + if (!dentry) + return 0; + + if (update_dent_inode(inode, &dentry->d_name)) { + dput(dentry); return 0; } - pino = dentry->d_parent->d_inode->i_ino; + + *pino = parent_ino(dentry); dput(dentry); - iput(inode); - return !is_checkpointed_node(sbi, pino); + return 1; } int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; + struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - unsigned long long cur_version; int ret = 0; bool need_cp = false; struct writeback_control wbc = { @@ -132,58 +123,204 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) .for_reclaim = 0, }; - if (inode->i_sb->s_flags & MS_RDONLY) + if (unlikely(f2fs_readonly(inode->i_sb))) return 0; + trace_f2fs_sync_file_enter(inode); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) + if (ret) { + trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; + } /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); - mutex_lock(&inode->i_mutex); - - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - goto out; - - mutex_lock(&sbi->cp_mutex); - cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); - mutex_unlock(&sbi->cp_mutex); - - if (F2FS_I(inode)->data_version != cur_version && - !(inode->i_state & I_DIRTY)) - goto out; - F2FS_I(inode)->data_version--; + down_read(&fi->i_sem); + /* + * Both of fdatasync() and fsync() are able to be recovered from + * sudden-power-off. + */ if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP)) + else if (file_wrong_pino(inode)) need_cp = true; else if (!space_for_roll_forward(sbi)) need_cp = true; - else if (need_to_sync_dir(sbi, inode)) + else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; + else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) + need_cp = true; + + up_read(&fi->i_sem); if (need_cp) { + nid_t pino; + /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); - clear_inode_flag(F2FS_I(inode), FI_NEED_CP); + + down_write(&fi->i_sem); + F2FS_I(inode)->xattr_ver = 0; + if (file_wrong_pino(inode) && inode->i_nlink == 1 && + get_parent_ino(inode, &pino)) { + F2FS_I(inode)->i_pino = pino; + file_got_pino(inode); + up_write(&fi->i_sem); + mark_inode_dirty_sync(inode); + ret = f2fs_write_inode(inode, NULL); + if (ret) + goto out; + } else { + up_write(&fi->i_sem); + } } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { + if (fsync_mark_done(sbi, inode->i_ino)) + goto out; + mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; } - filemap_fdatawait_range(sbi->node_inode->i_mapping, - 0, LONG_MAX); + ret = wait_on_node_pages_writeback(sbi, inode->i_ino); + if (ret) + goto out; + ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); } out: - mutex_unlock(&inode->i_mutex); + trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } +static pgoff_t __get_first_dirty_index(struct address_space *mapping, + pgoff_t pgofs, int whence) +{ + struct pagevec pvec; + int nr_pages; + + if (whence != SEEK_DATA) + return 0; + + /* find first dirty page index */ + pagevec_init(&pvec, 0); + nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); + pgofs = nr_pages ? pvec.pages[0]->index: LONG_MAX; + pagevec_release(&pvec); + return pgofs; +} + +static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, + int whence) +{ + switch (whence) { + case SEEK_DATA: + if ((blkaddr == NEW_ADDR && dirty == pgofs) || + (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR)) + return true; + break; + case SEEK_HOLE: + if (blkaddr == NULL_ADDR) + return true; + break; + } + return false; +} + +static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + struct dnode_of_data dn; + pgoff_t pgofs, end_offset, dirty; + loff_t data_ofs = offset; + loff_t isize; + int err = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) + goto fail; + + /* handle inline data case */ + if (f2fs_has_inline_data(inode)) { + if (whence == SEEK_HOLE) + data_ofs = isize; + goto found; + } + + pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); + + dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); + + for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + if (err && err != -ENOENT) { + goto fail; + } else if (err == -ENOENT) { + /* direct node is not exist */ + if (whence == SEEK_DATA) { + pgofs = PGOFS_OF_NEXT_DNODE(pgofs, + F2FS_I(inode)); + continue; + } else { + goto found; + } + } + + end_offset = IS_INODE(dn.node_page) ? + ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK; + + /* find data/hole in dnode block */ + for (; dn.ofs_in_node < end_offset; + dn.ofs_in_node++, pgofs++, + data_ofs = pgofs << PAGE_CACHE_SHIFT) { + block_t blkaddr; + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + + if (__found_offset(blkaddr, dirty, pgofs, whence)) { + f2fs_put_dnode(&dn); + goto found; + } + } + f2fs_put_dnode(&dn); + } + + if (whence == SEEK_DATA) + goto fail; +found: + if (whence == SEEK_HOLE && data_ofs > isize) + data_ofs = isize; + mutex_unlock(&inode->i_mutex); + return vfs_setpos(file, data_ofs, maxbytes); +fail: + mutex_unlock(&inode->i_mutex); + return -ENXIO; +} + +static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes = inode->i_sb->s_maxbytes; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + case SEEK_HOLE: + return f2fs_seek_block(file, offset, whence); + } + + return -EINVAL; +} + static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); @@ -191,31 +328,34 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) +int truncate_data_blocks_range(struct dnode_of_data *dn, int count) { int nr_free = 0, ofs = dn->ofs_in_node; struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct f2fs_node *raw_node; __le32 *addr; - raw_node = page_address(dn->node_page); + raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + ofs; - for ( ; count > 0; count--, addr++, dn->ofs_in_node++) { + for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); if (blkaddr == NULL_ADDR) continue; update_extent_cache(NULL_ADDR, dn); invalidate_blocks(sbi, blkaddr); - dec_valid_block_count(sbi, dn->inode, 1); nr_free++; } if (nr_free) { + dec_valid_block_count(sbi, dn->inode, nr_free); set_page_dirty(dn->node_page); sync_inode_page(dn); } dn->ofs_in_node = ofs; + + trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, + dn->ofs_in_node, nr_free); return nr_free; } @@ -229,50 +369,60 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) unsigned offset = from & (PAGE_CACHE_SIZE - 1); struct page *page; + if (f2fs_has_inline_data(inode)) + return truncate_inline_data(inode, from); + if (!offset) return; - page = find_data_page(inode, from >> PAGE_CACHE_SHIFT); + page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false); if (IS_ERR(page)) return; lock_page(page); - wait_on_page_writeback(page); + if (unlikely(page->mapping != inode->i_mapping)) { + f2fs_put_page(page, 1); + return; + } + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); f2fs_put_page(page, 1); } -static int truncate_blocks(struct inode *inode, u64 from) +int truncate_blocks(struct inode *inode, u64 from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; - int count = 0; - int err; + int count = 0, err = 0; + + trace_f2fs_truncate_blocks_enter(inode, from); + + if (f2fs_has_inline_data(inode)) + goto done; free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); - mutex_lock_op(sbi, DATA_TRUNC); + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, free_from, RDONLY_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { if (err == -ENOENT) goto free_next; - mutex_unlock_op(sbi, DATA_TRUNC); + f2fs_unlock_op(sbi); + trace_f2fs_truncate_blocks_exit(inode, err); return err; } - if (IS_INODE(dn.node_page)) - count = ADDRS_PER_INODE; - else - count = ADDRS_PER_BLOCK; + count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); count -= dn.ofs_in_node; - BUG_ON(count < 0); + f2fs_bug_on(count < 0); + if (dn.ofs_in_node || IS_INODE(dn.node_page)) { truncate_data_blocks_range(&dn, count); free_from += count; @@ -281,11 +431,12 @@ static int truncate_blocks(struct inode *inode, u64 from) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); - mutex_unlock_op(sbi, DATA_TRUNC); - + f2fs_unlock_op(sbi); +done: /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); + trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -295,13 +446,15 @@ void f2fs_truncate(struct inode *inode) S_ISLNK(inode->i_mode))) return; + trace_f2fs_truncate(inode); + if (!truncate_blocks(inode, i_size_read(inode))) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } } -static int f2fs_getattr(struct vfsmount *mnt, +int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -353,6 +506,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { + err = f2fs_convert_inline_data(inode, attr->ia_size); + if (err) + return err; + truncate_setsize(inode, attr->ia_size); f2fs_truncate(inode); f2fs_balance_fs(F2FS_SB(inode->i_sb)); @@ -361,7 +518,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) __setattr_copy(inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = f2fs_acl_chmod(inode); + err = posix_acl_chmod(inode, get_inode_mode(inode)); if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { inode->i_mode = fi->i_acl_mode; clear_inode_flag(fi, FI_ACL_MODE); @@ -376,12 +533,14 @@ const struct inode_operations f2fs_file_inode_operations = { .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, #endif + .fiemap = f2fs_fiemap, }; static void fill_zero(struct inode *inode, pgoff_t index, @@ -395,12 +554,12 @@ static void fill_zero(struct inode *inode, pgoff_t index, f2fs_balance_fs(sbi); - mutex_lock_op(sbi, DATA_NEW); - page = get_new_data_page(inode, index, false); - mutex_unlock_op(sbi, DATA_NEW); + f2fs_lock_op(sbi); + page = get_new_data_page(inode, NULL, index, false); + f2fs_unlock_op(sbi); if (!IS_ERR(page)) { - wait_on_page_writeback(page); + f2fs_wait_on_page_writeback(page, DATA); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); @@ -414,15 +573,10 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) for (index = pg_start; index < pg_end; index++) { struct dnode_of_data dn; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - f2fs_balance_fs(sbi); - - mutex_lock_op(sbi, DATA_TRUNC); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { - mutex_unlock_op(sbi, DATA_TRUNC); if (err == -ENOENT) continue; return err; @@ -431,17 +585,20 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) if (dn.data_blkaddr != NULL_ADDR) truncate_data_blocks_range(&dn, 1); f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_TRUNC); } return 0; } -static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) +static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; int ret = 0; + ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); + if (ret) + return ret; + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -461,21 +618,21 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) if (pg_start < pg_end) { struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + f2fs_balance_fs(sbi); blk_start = pg_start << PAGE_CACHE_SHIFT; blk_end = pg_end << PAGE_CACHE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); + + f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); + f2fs_unlock_op(sbi); } } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) <= (offset + len)) { - i_size_write(inode, offset); - mark_inode_dirty(inode); - } - return ret; } @@ -492,36 +649,29 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (ret) return ret; + ret = f2fs_convert_inline_data(inode, offset + len); + if (ret) + return ret; + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; off_start = offset & (PAGE_CACHE_SIZE - 1); off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + f2fs_lock_op(sbi); + for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; - mutex_lock_op(sbi, DATA_NEW); + if (index == pg_end && !off_end) + goto noalloc; set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, 0); - if (ret) { - mutex_unlock_op(sbi, DATA_NEW); + ret = f2fs_reserve_block(&dn, index); + if (ret) break; - } - - if (dn.data_blkaddr == NULL_ADDR) { - ret = reserve_new_block(&dn); - if (ret) { - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); - break; - } - } - f2fs_put_dnode(&dn); - - mutex_unlock_op(sbi, DATA_NEW); - +noalloc: if (pg_start == pg_end) new_size = offset + len; else if (index == pg_start && off_start) @@ -536,7 +686,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset, i_size_read(inode) < new_size) { i_size_write(inode, new_size); mark_inode_dirty(inode); + update_inode_page(inode); } + f2fs_unlock_op(sbi); return ret; } @@ -550,8 +702,10 @@ static long f2fs_fallocate(struct file *file, int mode, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) - ret = punch_hole(inode, offset, len, mode); + ret = punch_hole(inode, offset, len); else ret = expand_inode_data(inode, offset, len, mode); @@ -559,6 +713,10 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } + + mutex_unlock(&inode->i_mutex); + + trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; } @@ -583,14 +741,14 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int ret; switch (cmd) { - case FS_IOC_GETFLAGS: + case F2FS_IOC_GETFLAGS: flags = fi->i_flags & FS_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); - case FS_IOC_SETFLAGS: + case F2FS_IOC_SETFLAGS: { unsigned int oldflags; - ret = mnt_want_write(filp->f_path.mnt); + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -627,7 +785,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); out: - mnt_drop_write(filp->f_path.mnt); + mnt_drop_write_file(filp); return ret; } default: @@ -653,11 +811,11 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) #endif const struct file_operations f2fs_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .llseek = f2fs_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .open = generic_file_open, .mmap = f2fs_file_mmap, .fsync = f2fs_sync_file, @@ -667,5 +825,5 @@ const struct file_operations f2fs_file_operations = { .compat_ioctl = f2fs_compat_ioctl, #endif .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 94b8a0c4845..b90dbe55403 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -11,7 +11,6 @@ #include <linux/fs.h> #include <linux/module.h> #include <linux/backing-dev.h> -#include <linux/proc_fs.h> #include <linux/init.h> #include <linux/f2fs_fs.h> #include <linux/kthread.h> @@ -23,16 +22,18 @@ #include "node.h" #include "segment.h" #include "gc.h" +#include <trace/events/f2fs.h> static struct kmem_cache *winode_slab; static int gc_thread_func(void *data) { struct f2fs_sb_info *sbi = data; + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; long wait_ms; - wait_ms = GC_THREAD_MIN_SLEEP_TIME; + wait_ms = gc_th->min_sleep_time; do { if (try_to_freeze()) @@ -45,7 +46,7 @@ static int gc_thread_func(void *data) break; if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { - wait_ms = GC_THREAD_MAX_SLEEP_TIME; + wait_ms = increase_sleep_time(gc_th, wait_ms); continue; } @@ -66,23 +67,24 @@ static int gc_thread_func(void *data) continue; if (!is_idle(sbi)) { - wait_ms = increase_sleep_time(wait_ms); + wait_ms = increase_sleep_time(gc_th, wait_ms); mutex_unlock(&sbi->gc_mutex); continue; } if (has_enough_invalid_blocks(sbi)) - wait_ms = decrease_sleep_time(wait_ms); + wait_ms = decrease_sleep_time(gc_th, wait_ms); else - wait_ms = increase_sleep_time(wait_ms); + wait_ms = increase_sleep_time(gc_th, wait_ms); - sbi->bg_gc++; + stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi)) - wait_ms = GC_THREAD_NOGC_SLEEP_TIME; - else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) - wait_ms = GC_THREAD_MAX_SLEEP_TIME; + wait_ms = gc_th->no_gc_sleep_time; + + /* balancing f2fs's metadata periodically */ + f2fs_balance_fs_bg(sbi); } while (!kthread_should_stop()); return 0; @@ -92,23 +94,33 @@ int start_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th; dev_t dev = sbi->sb->s_bdev->bd_dev; + int err = 0; if (!test_opt(sbi, BG_GC)) - return 0; + goto out; gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); - if (!gc_th) - return -ENOMEM; + if (!gc_th) { + err = -ENOMEM; + goto out; + } + + gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; + gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; + gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; + + gc_th->gc_idle = 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { + err = PTR_ERR(gc_th->f2fs_gc_task); kfree(gc_th); sbi->gc_thread = NULL; - return -ENOMEM; } - return 0; +out: + return err; } void stop_gc_thread(struct f2fs_sb_info *sbi) @@ -121,9 +133,17 @@ void stop_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = NULL; } -static int select_gc_type(int gc_type) +static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) { - return (gc_type == BG_GC) ? GC_CB : GC_GREEDY; + int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; + + if (gc_th && gc_th->gc_idle) { + if (gc_th->gc_idle == 1) + gc_mode = GC_CB; + else if (gc_th->gc_idle == 2) + gc_mode = GC_GREEDY; + } + return gc_mode; } static void select_policy(struct f2fs_sb_info *sbi, int gc_type, @@ -131,15 +151,21 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - if (p->alloc_mode) { + if (p->alloc_mode == SSR) { p->gc_mode = GC_GREEDY; p->dirty_segmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; p->ofs_unit = 1; } else { - p->gc_mode = select_gc_type(gc_type); + p->gc_mode = select_gc_type(sbi->gc_thread, gc_type); p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; + p->max_search = dirty_i->nr_dirty[DIRTY]; p->ofs_unit = sbi->segs_per_sec; } + + if (p->max_search > sbi->max_victim_search) + p->max_search = sbi->max_victim_search; + p->offset = sbi->last_victim[p->gc_mode]; } @@ -160,18 +186,21 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno; + unsigned int hint = 0; + unsigned int secno; /* * If the gc_type is FG_GC, we can select victim segments * selected by background GC before. * Those segments guarantee they have small valid blocks. */ - segno = find_next_bit(dirty_i->victim_segmap[BG_GC], - TOTAL_SEGS(sbi), 0); - if (segno < TOTAL_SEGS(sbi)) { - clear_bit(segno, dirty_i->victim_segmap[BG_GC]); - return segno; +next: + secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++); + if (secno < TOTAL_SECS(sbi)) { + if (sec_usage_check(sbi, secno)) + goto next; + clear_bit(secno, dirty_i->victim_secmap); + return secno * sbi->segs_per_sec; } return NULL_SEGNO; } @@ -208,8 +237,8 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } -static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, - struct victim_sel_policy *p) +static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, + unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) return get_seg_entry(sbi, segno)->ckpt_valid_blocks; @@ -222,7 +251,7 @@ static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, } /* - * This function is called from two pathes. + * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. * When it is called during GC, it just gets a victim segment * and it does not remove it from dirty seglist. @@ -234,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int segno; + unsigned int secno, max_cost; int nsearched = 0; p.alloc_mode = alloc_mode; select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = get_max_cost(sbi, &p); + p.min_cost = max_cost = get_max_cost(sbi, &p); mutex_lock(&dirty_i->seglist_lock); @@ -253,6 +282,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, while (1) { unsigned long cost; + unsigned int segno; segno = find_next_bit(p.dirty_segmap, TOTAL_SEGS(sbi), p.offset); @@ -264,14 +294,16 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } break; } - p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; - if (test_bit(segno, dirty_i->victim_segmap[FG_GC])) - continue; - if (gc_type == BG_GC && - test_bit(segno, dirty_i->victim_segmap[BG_GC])) + p.offset = segno + p.ofs_unit; + if (p.ofs_unit > 1) + p.offset -= segno % p.ofs_unit; + + secno = GET_SECNO(sbi, segno); + + if (sec_usage_check(sbi, secno)) continue; - if (IS_CURSEC(sbi, GET_SECNO(sbi, segno))) + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) continue; cost = get_gc_cost(sbi, segno, &p); @@ -279,25 +311,29 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (p.min_cost > cost) { p.min_segno = segno; p.min_cost = cost; - } - - if (cost == get_max_cost(sbi, &p)) + } else if (unlikely(cost == max_cost)) { continue; + } - if (nsearched++ >= MAX_VICTIM_SEARCH) { + if (nsearched++ >= p.max_search) { sbi->last_victim[p.gc_mode] = segno; break; } } -got_it: if (p.min_segno != NULL_SEGNO) { - *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; +got_it: if (p.alloc_mode == LFS) { - int i; - for (i = 0; i < p.ofs_unit; i++) - set_bit(*result + i, - dirty_i->victim_segmap[gc_type]); + secno = GET_SECNO(sbi, p.min_segno); + if (gc_type == FG_GC) + sbi->cur_victim_sec = secno; + else + set_bit(secno, dirty_i->victim_secmap); } + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + + trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, + sbi->cur_victim_sec, + prefree_segments(sbi), free_segments(sbi)); } mutex_unlock(&dirty_i->seglist_lock); @@ -310,35 +346,24 @@ static const struct victim_selection default_v_ops = { static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) { - struct list_head *this; struct inode_entry *ie; - list_for_each(this, ilist) { - ie = list_entry(this, struct inode_entry, list); + list_for_each_entry(ie, ilist, list) if (ie->inode->i_ino == ino) return ie->inode; - } return NULL; } static void add_gc_inode(struct inode *inode, struct list_head *ilist) { - struct list_head *this; - struct inode_entry *new_ie, *ie; + struct inode_entry *new_ie; - list_for_each(this, ilist) { - ie = list_entry(this, struct inode_entry, list); - if (ie->inode == inode) { - iput(inode); - return; - } - } -repeat: - new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS); - if (!new_ie) { - cond_resched(); - goto repeat; + if (inode == find_gc_inode(inode->i_ino, ilist)) { + iput(inode); + return; } + + new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS); new_ie->inode = inode; list_add_tail(&new_ie->list, ilist); } @@ -381,6 +406,7 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, next_step: entry = sum; + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; @@ -401,11 +427,17 @@ next_step: continue; /* set page dirty and write it */ - if (!PageWriteback(node_page)) + if (gc_type == FG_GC) { + f2fs_wait_on_page_writeback(node_page, NODE); set_page_dirty(node_page); + } else { + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } f2fs_put_page(node_page, 1); stat_inc_node_blk_count(sbi, 1); } + if (initial) { initial = false; goto next_step; @@ -418,6 +450,13 @@ next_step: .for_reclaim = 0, }; sync_node_pages(sbi, 0, &wbc); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) + goto next_step; } } @@ -428,7 +467,7 @@ next_step: * as indirect or double indirect node blocks, are given, it must be a caller's * bug. */ -block_t start_bidx_of_node(unsigned int node_ofs) +block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -445,7 +484,7 @@ block_t start_bidx_of_node(unsigned int node_ofs) int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } - return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); } static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -481,29 +520,23 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, static void move_data_page(struct inode *inode, struct page *page, int gc_type) { - if (page->mapping != inode->i_mapping) - goto out; - - if (inode != page->mapping->host) - goto out; - - if (PageWriteback(page)) - goto out; + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC, + }; if (gc_type == BG_GC) { + if (PageWriteback(page)) + goto out; set_page_dirty(page); set_cold_data(page); } else { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - mutex_lock_op(sbi, DATA_WRITE); - if (clear_page_dirty_for_io(page) && - S_ISDIR(inode->i_mode)) { - dec_page_count(sbi, F2FS_DIRTY_DENTS); + f2fs_wait_on_page_writeback(page, DATA); + + if (clear_page_dirty_for_io(page)) inode_dec_dirty_dents(inode); - } set_cold_data(page); - do_write_data_page(page); - mutex_unlock_op(sbi, DATA_WRITE); + do_write_data_page(page, &fio); clear_cold_data(page); } out: @@ -530,6 +563,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, next_step: entry = sum; + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { struct page *data_page; struct inode *inode; @@ -558,7 +592,6 @@ next_step: continue; } - start_bidx = start_bidx_of_node(nofs); ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 2) { @@ -566,8 +599,10 @@ next_step: if (IS_ERR(inode)) continue; + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + data_page = find_data_page(inode, - start_bidx + ofs_in_node); + start_bidx + ofs_in_node, false); if (IS_ERR(data_page)) goto next_iput; @@ -576,6 +611,8 @@ next_step: } else { inode = find_gc_inode(dni.ino, ilist); if (inode) { + start_bidx = start_bidx_of_node(nofs, + F2FS_I(inode)); data_page = get_lock_data_page(inode, start_bidx + ofs_in_node); if (IS_ERR(data_page)) @@ -588,11 +625,22 @@ next_step: next_iput: iput(inode); } + if (++phase < 4) goto next_step; - if (gc_type == FG_GC) - f2fs_submit_bio(sbi, DATA, true); + if (gc_type == FG_GC) { + f2fs_submit_merged_bio(sbi, DATA, WRITE); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) { + phase = 2; + goto next_step; + } + } } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -611,18 +659,13 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, { struct page *sum_page; struct f2fs_summary_block *sum; + struct blk_plug plug; /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); - if (IS_ERR(sum_page)) - return; - /* - * CP needs to lock sum_page. In this time, we don't need - * to lock this page, because this summary page is not gone anywhere. - * Also, this page is not gonna be updated before GC is done. - */ - unlock_page(sum_page); + blk_start_plug(&plug); + sum = page_address(sum_page); switch (GET_SUM_TYPE((&sum->footer))) { @@ -633,10 +676,12 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); break; } + blk_finish_plug(&plug); + stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 0); + f2fs_put_page(sum_page, 1); } int f2fs_gc(struct f2fs_sb_info *sbi) @@ -649,22 +694,33 @@ int f2fs_gc(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&ilist); gc_more: - if (!(sbi->sb->s_flags & MS_ACTIVE)) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + goto stop; + if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) goto stop; - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; + write_checkpoint(sbi, false); + } if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) goto stop; ret = 0; + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, + META_SSA); + for (i = 0; i < sbi->segs_per_sec; i++) do_garbage_collect(sbi, segno + i, &ilist, gc_type); - if (gc_type == FG_GC && - get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + if (gc_type == FG_GC) { + sbi->cur_victim_sec = NULL_SEGNO; nfree++; + WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec)); + } if (has_not_enough_free_secs(sbi, nfree)) goto gc_more; @@ -686,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", - sizeof(struct inode_entry), NULL); + sizeof(struct inode_entry)); if (!winode_slab) return -ENOMEM; return 0; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 30b2db003ac..5d5eb6047bf 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,18 +13,26 @@ * whether IO subsystem is idle * or not */ -#define GC_THREAD_MIN_SLEEP_TIME 10000 /* milliseconds */ -#define GC_THREAD_MAX_SLEEP_TIME 30000 -#define GC_THREAD_NOGC_SLEEP_TIME 10000 +#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ +#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 +#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ /* Search max. number of dirty segments to select a victim segment */ -#define MAX_VICTIM_SEARCH 20 +#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */ struct f2fs_gc_kthread { struct task_struct *f2fs_gc_task; wait_queue_head_t gc_wait_queue_head; + + /* for gc sleep time */ + unsigned int min_sleep_time; + unsigned int max_sleep_time; + unsigned int no_gc_sleep_time; + + /* for changing gc mode */ + unsigned int gc_idle; }; struct inode_entry { @@ -56,19 +64,25 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; } -static inline long increase_sleep_time(long wait) +static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) { - wait += GC_THREAD_MIN_SLEEP_TIME; - if (wait > GC_THREAD_MAX_SLEEP_TIME) - wait = GC_THREAD_MAX_SLEEP_TIME; + if (wait == gc_th->no_gc_sleep_time) + return wait; + + wait += gc_th->min_sleep_time; + if (wait > gc_th->max_sleep_time) + wait = gc_th->max_sleep_time; return wait; } -static inline long decrease_sleep_time(long wait) +static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) { - wait -= GC_THREAD_MIN_SLEEP_TIME; - if (wait <= GC_THREAD_MIN_SLEEP_TIME) - wait = GC_THREAD_MIN_SLEEP_TIME; + if (wait == gc_th->no_gc_sleep_time) + wait = gc_th->max_sleep_time; + + wait -= gc_th->min_sleep_time; + if (wait <= gc_th->min_sleep_time) + wait = gc_th->min_sleep_time; return wait; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c new file mode 100644 index 00000000000..1bba5228c19 --- /dev/null +++ b/fs/f2fs/inline.c @@ -0,0 +1,250 @@ +/* + * fs/f2fs/inline.c + * Copyright (c) 2013, Intel Corporation + * Authors: Huajun Li <huajun.li@intel.com> + * Haicheng Li <haicheng.li@intel.com> + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" + +bool f2fs_may_inline(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + block_t nr_blocks; + loff_t i_size; + + if (!test_opt(sbi, INLINE_DATA)) + return false; + + nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2; + if (inode->i_blocks > nr_blocks) + return false; + + i_size = i_size_read(inode); + if (i_size > MAX_INLINE_DATA) + return false; + + return true; +} + +int f2fs_read_inline_data(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *ipage; + void *src_addr, *dst_addr; + + if (page->index) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + goto out; + } + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + unlock_page(page); + return PTR_ERR(ipage); + } + + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap(page); + f2fs_put_page(ipage, 1); + +out: + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + +static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) +{ + int err; + struct page *ipage; + struct dnode_of_data dn; + void *src_addr, *dst_addr; + block_t new_blk_addr; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC | REQ_PRIO, + }; + + f2fs_lock_op(sbi); + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + /* + * i_addr[0] is not used for inline data, + * so reserving new block will not destroy inline data + */ + set_new_dnode(&dn, inode, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, 0); + if (err) + goto out; + + f2fs_wait_on_page_writeback(page, DATA); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap(page); + SetPageUptodate(page); + + /* write data page to try to make data consistent */ + set_page_writeback(page); + write_data_page(page, &dn, &new_blk_addr, &fio); + update_extent_cache(new_blk_addr, &dn); + f2fs_wait_on_page_writeback(page, DATA); + + /* clear inline data and flag after data writeback */ + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + stat_dec_inline_inode(inode); + + sync_inode_page(&dn); + f2fs_put_dnode(&dn); +out: + f2fs_unlock_op(sbi); + return err; +} + +int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) +{ + struct page *page; + int err; + + if (!f2fs_has_inline_data(inode)) + return 0; + else if (to_size <= MAX_INLINE_DATA) + return 0; + + page = grab_cache_page(inode->i_mapping, 0); + if (!page) + return -ENOMEM; + + err = __f2fs_convert_inline_data(inode, page); + f2fs_put_page(page, 1); + return err; +} + +int f2fs_write_inline_data(struct inode *inode, + struct page *page, unsigned size) +{ + void *src_addr, *dst_addr; + struct page *ipage; + struct dnode_of_data dn; + int err; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); + if (err) + return err; + ipage = dn.inode_page; + + f2fs_wait_on_page_writeback(ipage, NODE); + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + src_addr = kmap(page); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, size); + kunmap(page); + + /* Release the first data block if it is allocated */ + if (!f2fs_has_inline_data(inode)) { + truncate_data_blocks_range(&dn, 1); + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + stat_inc_inline_inode(inode); + } + + sync_inode_page(&dn); + f2fs_put_dnode(&dn); + + return 0; +} + +void truncate_inline_data(struct inode *inode, u64 from) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct page *ipage; + + if (from >= MAX_INLINE_DATA) + return; + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) + return; + + f2fs_wait_on_page_writeback(ipage, NODE); + + zero_user_segment(ipage, INLINE_DATA_OFFSET + from, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); +} + +int recover_inline_data(struct inode *inode, struct page *npage) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_inode *ri = NULL; + void *src_addr, *dst_addr; + struct page *ipage; + + /* + * The inline_data recovery policy is as follows. + * [prev.] [next] of inline_data flag + * o o -> recover inline_data + * o x -> remove inline_data, and then recover data blocks + * x o -> remove inline_data, and then recover inline_data + * x x -> recover data blocks + */ + if (IS_INODE(npage)) + ri = F2FS_INODE(npage); + + if (f2fs_has_inline_data(inode) && + ri && ri->i_inline & F2FS_INLINE_DATA) { +process_inline: + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + f2fs_wait_on_page_writeback(ipage, NODE); + + src_addr = inline_data_addr(npage); + dst_addr = inline_data_addr(ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + return -1; + } + + if (f2fs_has_inline_data(inode)) { + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + f2fs_wait_on_page_writeback(ipage, NODE); + zero_user_segment(ipage, INLINE_DATA_OFFSET, + INLINE_DATA_OFFSET + MAX_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); + } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { + truncate_blocks(inode, 0); + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + goto process_inline; + } + return 0; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ddae412d30c..2cf6962f6cc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -12,27 +12,59 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/bitops.h> #include "f2fs.h" #include "node.h" +#include <trace/events/f2fs.h> + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; - - inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | - S_NOATIME | S_DIRSYNC); + unsigned int new_fl = 0; if (flags & FS_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & FS_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & FS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & FS_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + set_mask_bits(&inode->i_flags, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl); +} + +static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + if (ri->i_addr[0]) + inode->i_rdev = + old_decode_dev(le32_to_cpu(ri->i_addr[0])); + else + inode->i_rdev = + new_decode_dev(le32_to_cpu(ri->i_addr[1])); + } +} + +static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) +{ + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + if (old_valid_dev(inode->i_rdev)) { + ri->i_addr[0] = + cpu_to_le32(old_encode_dev(inode->i_rdev)); + ri->i_addr[1] = 0; + } else { + ri->i_addr[0] = 0; + ri->i_addr[1] = + cpu_to_le32(new_encode_dev(inode->i_rdev)); + ri->i_addr[2] = 0; + } + } } static int do_read_inode(struct inode *inode) @@ -40,18 +72,21 @@ static int do_read_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; - struct f2fs_node *rn; struct f2fs_inode *ri; /* Check if ino is within scope */ - check_nid_range(sbi, inode->i_ino); + if (check_nid_range(sbi, inode->i_ino)) { + f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", + (unsigned long) inode->i_ino); + WARN_ON(1); + return -EINVAL; + } node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); - rn = page_address(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_page); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); @@ -67,19 +102,21 @@ static int do_read_inode(struct inode *inode) inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); - if (ri->i_addr[0]) - inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0])); - else - inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1])); fi->i_current_depth = le32_to_cpu(ri->i_current_depth); fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); fi->flags = 0; - fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); + fi->i_dir_level = ri->i_dir_level; + get_extent_info(&fi->ext, ri->i_ext); + get_inline_info(fi, ri); + + /* get rdev by using inline_info */ + __get_inode_rdev(inode, ri); + f2fs_put_page(node_page, 1); return 0; } @@ -88,25 +125,22 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - int ret; + int ret = 0; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + + if (!(inode->i_state & I_NEW)) { + trace_f2fs_iget(inode); return inode; + } if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) goto make_now; ret = do_read_inode(inode); if (ret) goto bad_inode; - - if (!sbi->por_doing && inode->i_nlink == 0) { - ret = -ENOENT; - goto bad_inode; - } - make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; @@ -122,8 +156,7 @@ make_now: inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE | - __GFP_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -136,23 +169,22 @@ make_now: goto bad_inode; } unlock_new_inode(inode); - + trace_f2fs_iget(inode); return inode; bad_inode: iget_failed(inode); + trace_f2fs_iget_exit(inode, ret); return ERR_PTR(ret); } void update_inode(struct inode *inode, struct page *node_page) { - struct f2fs_node *rn; struct f2fs_inode *ri; - wait_on_page_writeback(node_page); + f2fs_wait_on_page_writeback(node_page, NODE); - rn = page_address(node_page); - ri = &(rn->i); + ri = F2FS_INODE(node_page); ri->i_mode = cpu_to_le16(inode->i_mode); ri->i_advise = F2FS_I(inode)->i_advise; @@ -162,6 +194,7 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); + set_raw_inline(F2FS_I(inode), ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); @@ -174,55 +207,57 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + ri->i_dir_level = F2FS_I(inode)->i_dir_level; - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { - if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = - cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; - } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = - cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; - } - } - + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); set_page_dirty(node_page); + + clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); } -int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +void update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *node_page; - bool need_lock = false; +retry: + node_page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(node_page)) { + int err = PTR_ERR(node_page); + if (err == -ENOMEM) { + cond_resched(); + goto retry; + } else if (err != -ENOENT) { + f2fs_stop_checkpoint(sbi); + } + return; + } + update_inode(inode, node_page); + f2fs_put_page(node_page, 1); +} + +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) return 0; + if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + return 0; + + /* + * We need to lock here to prevent from producing dirty node pages + * during the urgent cleaning time when runing out of free sections. + */ + f2fs_lock_op(sbi); + update_inode_page(inode); + f2fs_unlock_op(sbi); + if (wbc) f2fs_balance_fs(sbi); - node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); - - if (!PageDirty(node_page)) { - need_lock = true; - f2fs_put_page(node_page, 1); - mutex_lock(&sbi->write_inode); - node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) { - mutex_unlock(&sbi->write_inode); - return PTR_ERR(node_page); - } - } - update_inode(inode, node_page); - f2fs_put_page(node_page, 1); - if (need_lock) - mutex_unlock(&sbi->write_inode); return 0; } @@ -233,13 +268,14 @@ void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - truncate_inode_pages(&inode->i_data, 0); + trace_f2fs_evict_inode(inode); + truncate_inode_pages_final(&inode->i_data); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) goto no_delete; - BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents)); + f2fs_bug_on(get_dirty_dents(inode)); remove_dirty_dir_inode(inode); if (inode->i_nlink || is_bad_inode(inode)) @@ -252,8 +288,13 @@ void f2fs_evict_inode(struct inode *inode) if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); + f2fs_lock_op(sbi); remove_inode_page(inode); + stat_dec_inline_inode(inode); + f2fs_unlock_op(sbi); + sb_end_intwrite(inode->i_sb); no_delete: clear_inode(inode); + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1a49b881bac..a6bdddc33ce 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,8 +15,10 @@ #include <linux/ctype.h> #include "f2fs.h" +#include "node.h" #include "xattr.h" #include "acl.h" +#include <trace/events/f2fs.h> static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) { @@ -31,26 +33,17 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (!inode) return ERR_PTR(-ENOMEM); - mutex_lock_op(sbi, NODE_NEW); + f2fs_lock_op(sbi); if (!alloc_nid(sbi, &ino)) { - mutex_unlock_op(sbi, NODE_NEW); + f2fs_unlock_op(sbi); err = -ENOSPC; goto fail; } - mutex_unlock_op(sbi, NODE_NEW); + f2fs_unlock_op(sbi); - inode->i_uid = current_fsuid(); - - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - inode->i_gid = current_fsgid(); - } + inode_init_owner(inode, dir, mode); inode->i_ino = ino; - inode->i_mode = mode; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_generation = sbi->s_next_generation++; @@ -61,7 +54,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_free = true; goto out; } - + trace_f2fs_new_inode(inode, 0); mark_inode_dirty(inode); return inode; @@ -69,6 +62,8 @@ out: clear_nlink(inode); unlock_new_inode(inode); fail: + trace_f2fs_new_inode(inode, err); + make_bad_inode(inode); iput(inode); if (nid_free) alloc_nid_failed(sbi, ino); @@ -79,27 +74,17 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); - int ret; if (sublen > slen) - return 1; - - ret = memcmp(s + slen - sublen, sub, sublen); - if (ret) { /* compare upper case */ - int i; - char upper_sub[8]; - for (i = 0; i < sublen && i < sizeof(upper_sub); i++) - upper_sub[i] = toupper(sub[i]); - return memcmp(s + slen - sublen, upper_sub, sublen); - } + return 0; - return ret; + return !strncasecmp(s + slen - sublen, sub, sublen); } /* * Set multimedia files as cold files for hot/cold data separation */ -static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode, +static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { int i; @@ -107,8 +92,8 @@ static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode, int count = le32_to_cpu(sbi->raw_super->extension_count); for (i = 0; i < count; i++) { - if (!is_multimedia_file(name, extlist[i])) { - F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; + if (is_multimedia_file(name, extlist[i])) { + file_set_cold(inode); break; } } @@ -130,26 +115,28 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_cold_file(sbi, inode, dentry->d_name.name); + set_cold_files(sbi, inode, dentry->d_name.name); inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); + f2fs_unlock_op(sbi); if (err) goto out; alloc_nid_done(sbi, ino); - if (!sbi->por_doing) - d_instantiate(dentry, inode); + d_instantiate(dentry, inode); unlock_new_inode(inode); return 0; out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, ino); return err; @@ -166,10 +153,12 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi); inode->i_ctime = CURRENT_TIME; - atomic_inc(&inode->i_count); + ihold(inode); set_inode_flag(F2FS_I(inode), FI_INC_LINK); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); + f2fs_unlock_op(sbi); if (err) goto out; @@ -197,7 +186,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_dir_entry *de; struct page *page; - if (dentry->d_name.len > F2FS_MAX_NAME_LEN) + if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); @@ -209,6 +198,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, inode = f2fs_iget(dir->i_sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + + stat_inc_inline_inode(inode); } return d_splice_alias(inode, dentry); @@ -223,24 +214,28 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; + trace_f2fs_unlink_enter(dir, dentry); f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) goto fail; - err = check_orphan_space(sbi); + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); if (err) { + f2fs_unlock_op(sbi); kunmap(page); f2fs_put_page(page, 0); goto fail; } - f2fs_delete_entry(de, page, inode); + f2fs_unlock_op(sbi); /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); fail: + trace_f2fs_unlink_exit(inode, err); return err; } @@ -262,7 +257,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); + f2fs_unlock_op(sbi); if (err) goto out; @@ -275,6 +272,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -298,7 +296,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); set_inode_flag(F2FS_I(inode), FI_INC_LINK); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); + f2fs_unlock_op(sbi); if (err) goto out_fail; @@ -313,6 +313,7 @@ out_fail: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -346,7 +347,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); + f2fs_unlock_op(sbi); if (err) goto out; @@ -357,6 +360,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -370,7 +374,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct page *old_dir_page; - struct page *old_page; + struct page *old_page, *new_page; struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; @@ -389,10 +393,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } - mutex_lock_op(sbi, RENAME); + f2fs_lock_op(sbi); if (new_inode) { - struct page *new_page; err = -ENOTEMPTY; if (old_dir_entry && !f2fs_empty_dir(new_inode)) @@ -404,15 +407,33 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_dir; + err = acquire_orphan_inode(sbi); + if (err) + goto put_out_dir; + + if (update_dent_inode(old_inode, &new_dentry->d_name)) { + release_orphan_inode(sbi); + goto put_out_dir; + } + f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME; + down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) drop_nlink(new_inode); drop_nlink(new_inode); + up_write(&F2FS_I(new_inode)->i_sem); + + mark_inode_dirty(new_inode); + if (!new_inode->i_nlink) add_orphan_inode(sbi, new_inode->i_ino); - f2fs_write_inode(new_inode, NULL); + else + release_orphan_inode(sbi); + + update_inode_page(old_inode); + update_inode_page(new_inode); } else { err = f2fs_add_link(new_dentry, old_inode); if (err) @@ -420,12 +441,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir_entry) { inc_nlink(new_dir); - f2fs_write_inode(new_dir, NULL); + update_inode_page(new_dir); } } + down_write(&F2FS_I(old_inode)->i_sem); + file_lost_pino(old_inode); + up_write(&F2FS_I(old_inode)->i_sem); + old_inode->i_ctime = CURRENT_TIME; - set_inode_flag(F2FS_I(old_inode), FI_NEED_CP); mark_inode_dirty(old_inode); f2fs_delete_entry(old_entry, old_page, NULL); @@ -434,23 +458,28 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + update_inode_page(old_inode); } else { kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } drop_nlink(old_dir); - f2fs_write_inode(old_dir, NULL); + mark_inode_dirty(old_dir); + update_inode_page(old_dir); } - mutex_unlock_op(sbi, RENAME); + f2fs_unlock_op(sbi); return 0; +put_out_dir: + kunmap(new_page); + f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) { kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } - mutex_unlock_op(sbi, RENAME); + f2fs_unlock_op(sbi); out_old: kunmap(old_page); f2fs_put_page(old_page, 0); @@ -468,8 +497,10 @@ const struct inode_operations f2fs_dir_inode_operations = { .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, .rename = f2fs_rename, + .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -482,6 +513,7 @@ const struct inode_operations f2fs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .getattr = f2fs_getattr, .setattr = f2fs_setattr, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, @@ -492,8 +524,10 @@ const struct inode_operations f2fs_symlink_inode_operations = { }; const struct inode_operations f2fs_special_inode_operations = { + .getattr = f2fs_getattr, .setattr = f2fs_setattr, .get_acl = f2fs_get_acl, + .set_acl = f2fs_set_acl, #ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e275218904e..4b697ccc9b0 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,10 +19,37 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> + +#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; +bool available_free_memory(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct sysinfo val; + unsigned long mem_size = 0; + bool res = false; + + si_meminfo(&val); + /* give 25%, 25%, 50% memory for each components respectively */ + if (type == FREE_NIDS) { + mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> 12; + res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); + } else if (type == NAT_ENTRIES) { + mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12; + res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); + } else if (type == DIRTY_DENTS) { + if (sbi->sb->s_bdi->dirty_exceeded) + return false; + mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); + res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1); + } + return res; +} + static void clear_node_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -81,33 +108,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -/* - * Readahead NAT pages - */ -static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) -{ - struct address_space *mapping = sbi->meta_inode->i_mapping; - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct page *page; - pgoff_t index; - int i; - - for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { - if (nid >= nm_i->max_nid) - nid = 0; - index = current_nat_addr(sbi, nid); - - page = grab_cache_page(mapping, index); - if (!page) - continue; - if (f2fs_readpage(sbi, page, index, READ)) { - f2fs_put_page(page, 1); - continue; - } - f2fs_put_page(page, 0); - } -} - static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -141,6 +141,32 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) return is_cp; } +bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + bool fsync_done = false; + + read_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) + fsync_done = e->fsync_done; + read_unlock(&nm_i->nat_tree_lock); + return fsync_done; +} + +void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct nat_entry *e; + + write_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, nid); + if (e) + e->fsync_done = false; + write_unlock(&nm_i->nat_tree_lock); +} + static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; @@ -154,6 +180,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) } memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); + new->checkpointed = true; list_add_tail(&new->list, &nm_i->nat_entries); nm_i->nat_cnt++; return new; @@ -172,16 +199,13 @@ retry: write_unlock(&nm_i->nat_tree_lock); goto retry; } - nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); - nat_set_ino(e, le32_to_cpu(ne->ino)); - nat_set_version(e, ne->version); - e->checkpointed = true; + node_info_from_raw_nat(&e->ni, ne); } write_unlock(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, - block_t new_blkaddr) + block_t new_blkaddr, bool fsync_done) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; @@ -195,8 +219,7 @@ retry: goto retry; } e->ni = *ni; - e->checkpointed = true; - BUG_ON(ni->blk_addr == NEW_ADDR); + f2fs_bug_on(ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* * when nid is reallocated, @@ -204,19 +227,16 @@ retry: * So, reinitialize it with new information. */ e->ni = *ni; - BUG_ON(ni->blk_addr != NULL_ADDR); + f2fs_bug_on(ni->blk_addr != NULL_ADDR); } - if (new_blkaddr == NEW_ADDR) - e->checkpointed = false; - /* sanity check */ - BUG_ON(nat_get_blkaddr(e) != ni->blk_addr); - BUG_ON(nat_get_blkaddr(e) == NULL_ADDR && + f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); + f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && new_blkaddr == NULL_ADDR); - BUG_ON(nat_get_blkaddr(e) == NEW_ADDR && + f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - BUG_ON(nat_get_blkaddr(e) != NEW_ADDR && + f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR && nat_get_blkaddr(e) != NULL_ADDR && new_blkaddr == NEW_ADDR); @@ -229,14 +249,19 @@ retry: /* change address */ nat_set_blkaddr(e, new_blkaddr); __set_nat_cache_dirty(nm_i, e); + + /* update fsync_mark if its inode nat entry is still alive */ + e = __lookup_nat_cache(nm_i, ni->ino); + if (e) + e->fsync_done = fsync_done; write_unlock(&nm_i->nat_tree_lock); } -static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD) + if (available_free_memory(sbi, NAT_ENTRIES)) return 0; write_lock(&nm_i->nat_tree_lock); @@ -307,9 +332,10 @@ cache: * The maximum depth is four. * Offset[0] will have raw inode offset. */ -static int get_node_path(long block, int offset[4], unsigned int noffset[4]) +static int get_node_path(struct f2fs_inode_info *fi, long block, + int offset[4], unsigned int noffset[4]) { - const long direct_index = ADDRS_PER_INODE; + const long direct_index = ADDRS_PER_INODE(fi); const long direct_blks = ADDRS_PER_BLOCK; const long dptrs_per_blk = NIDS_PER_BLOCK; const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; @@ -320,15 +346,14 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[0] = 0; if (block < direct_index) { - offset[n++] = block; - level = 0; + offset[n] = block; goto got; } block -= direct_index; if (block < direct_blks) { offset[n++] = NODE_DIR1_BLOCK; noffset[n] = 1; - offset[n++] = block; + offset[n] = block; level = 1; goto got; } @@ -336,7 +361,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) if (block < direct_blks) { offset[n++] = NODE_DIR2_BLOCK; noffset[n] = 2; - offset[n++] = block; + offset[n] = block; level = 1; goto got; } @@ -346,7 +371,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 3; offset[n++] = block / direct_blks; noffset[n] = 4 + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 2; goto got; } @@ -356,7 +381,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 4 + dptrs_per_blk; offset[n++] = block / direct_blks; noffset[n] = 5 + dptrs_per_blk + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 2; goto got; } @@ -371,7 +396,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 7 + (dptrs_per_blk * 2) + offset[n - 2] * (dptrs_per_blk + 1) + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 3; goto got; } else { @@ -383,8 +408,11 @@ got: /* * Caller should call f2fs_put_dnode(dn). + * Also, it should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op() only if ro is not set RDONLY_NODE. + * In the case of RDONLY_NODE, we don't need to care about mutex. */ -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct page *npage[4]; @@ -395,15 +423,19 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) int level, i; int err = 0; - level = get_node_path(index, offset, noffset); + level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); nids[0] = dn->inode->i_ino; - npage[0] = get_node_page(sbi, nids[0]); - if (IS_ERR(npage[0])) - return PTR_ERR(npage[0]); + npage[0] = dn->inode_page; + if (!npage[0]) { + npage[0] = get_node_page(sbi, nids[0]); + if (IS_ERR(npage[0])) + return PTR_ERR(npage[0]); + } parent = npage[0]; - nids[1] = get_nid(parent, offset[0], true); + if (level != 0) + nids[1] = get_nid(parent, offset[0], true); dn->inode_page = npage[0]; dn->inode_page_locked = true; @@ -411,30 +443,25 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) for (i = 1; i <= level; i++) { bool done = false; - if (!nids[i] && !ro) { - mutex_lock_op(sbi, NODE_NEW); - + if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ if (!alloc_nid(sbi, &(nids[i]))) { - mutex_unlock_op(sbi, NODE_NEW); err = -ENOSPC; goto release_pages; } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i]); + npage[i] = new_node_page(dn, noffset[i], NULL); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); - mutex_unlock_op(sbi, NODE_NEW); err = PTR_ERR(npage[i]); goto release_pages; } set_nid(parent, offset[i - 1], nids[i], i == 1); alloc_nid_done(sbi, nids[i]); - mutex_unlock_op(sbi, NODE_NEW); done = true; - } else if (ro && i == level && level > 1) { + } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { npage[i] = get_node_page_ra(parent, offset[i - 1]); if (IS_ERR(npage[i])) { err = PTR_ERR(npage[i]); @@ -485,15 +512,15 @@ static void truncate_node(struct dnode_of_data *dn) get_node_info(sbi, dn->nid, &ni); if (dn->inode->i_blocks == 0) { - BUG_ON(ni.blk_addr != NULL_ADDR); + f2fs_bug_on(ni.blk_addr != NULL_ADDR); goto invalidate; } - BUG_ON(ni.blk_addr == NULL_ADDR); + f2fs_bug_on(ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode, 1); - set_node_addr(sbi, &ni, NULL_ADDR); + dec_valid_node_count(sbi, dn->inode); + set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); @@ -506,7 +533,12 @@ invalidate: F2FS_SET_SB_DIRT(sbi); f2fs_put_page(dn->node_page, 1); + + invalidate_mapping_pages(NODE_MAPPING(sbi), + dn->node_page->index, dn->node_page->index); + dn->node_page = NULL; + trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); } static int truncate_dnode(struct dnode_of_data *dn) @@ -547,11 +579,15 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, if (dn->nid == 0) return NIDS_PER_BLOCK + 1; + trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); + page = get_node_page(sbi, dn->nid); - if (IS_ERR(page)) + if (IS_ERR(page)) { + trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); return PTR_ERR(page); + } - rn = (struct f2fs_node *)page_address(page); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { child_nid = le32_to_cpu(rn->in.nid[i]); @@ -591,10 +627,12 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, } else { f2fs_put_page(page, 1); } + trace_f2fs_truncate_nodes_exit(dn->inode, freed); return freed; out_err: f2fs_put_page(page, 1); + trace_f2fs_truncate_nodes_exit(dn->inode, ret); return ret; } @@ -614,19 +652,19 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, return 0; /* get indirect nodes in the path */ - for (i = 0; i < depth - 1; i++) { + for (i = 0; i < idx + 1; i++) { /* refernece count'll be increased */ pages[i] = get_node_page(sbi, nid[i]); if (IS_ERR(pages[i])) { - depth = i + 1; err = PTR_ERR(pages[i]); + idx = i - 1; goto fail; } nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } /* free direct nodes linked to a partial indirect node */ - for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) { + for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); if (!child_nid) continue; @@ -637,7 +675,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, set_nid(pages[idx], i, 0, false); } - if (offset[depth - 1] == 0) { + if (offset[idx + 1] == 0) { dn->node_page = pages[idx]; dn->nid = nid[idx]; truncate_node(dn); @@ -645,10 +683,14 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, f2fs_put_page(pages[idx], 1); } offset[idx]++; - offset[depth - 1] = 0; + offset[idx + 1] = 0; + idx--; fail: - for (i = depth - 3; i >= 0; i--) + for (i = idx; i >= 0; i--) f2fs_put_page(pages[i], 1); + + trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); + return err; } @@ -661,20 +703,24 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; - struct f2fs_node *rn; + struct f2fs_inode *ri; struct dnode_of_data dn; struct page *page; - level = get_node_path(from, offset, noffset); + trace_f2fs_truncate_inode_blocks_enter(inode, from); + level = get_node_path(F2FS_I(inode), from, offset, noffset); +restart: page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); return PTR_ERR(page); + } set_new_dnode(&dn, inode, page, NULL, 0); unlock_page(page); - rn = page_address(page); + ri = F2FS_INODE(page); switch (level) { case 0: case 1: @@ -684,7 +730,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) nofs = noffset[1]; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; nofs += 1 + NIDS_PER_BLOCK; @@ -693,7 +739,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) nofs = 5 + 2 * NIDS_PER_BLOCK; if (!offset[level - 1]) goto skip_partial; - err = truncate_partial_nodes(&dn, &rn->i, offset, level); + err = truncate_partial_nodes(&dn, ri, offset, level); if (err < 0 && err != -ENOENT) goto fail; break; @@ -703,7 +749,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]); + dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -726,10 +772,14 @@ skip_partial: if (err < 0 && err != -ENOENT) goto fail; if (offset[1] == 0 && - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { + ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - wait_on_page_writeback(page); - rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + f2fs_put_page(page, 1); + goto restart; + } + f2fs_wait_on_page_writeback(page, NODE); + ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); } @@ -739,100 +789,115 @@ skip_partial: } fail: f2fs_put_page(page, 0); + trace_f2fs_truncate_inode_blocks_exit(inode, err); return err > 0 ? 0 : err; } -int remove_inode_page(struct inode *inode) +int truncate_xattr_node(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct page *page; - nid_t ino = inode->i_ino; + nid_t nid = F2FS_I(inode)->i_xattr_nid; struct dnode_of_data dn; + struct page *npage; - mutex_lock_op(sbi, NODE_TRUNC); - page = get_node_page(sbi, ino); - if (IS_ERR(page)) { - mutex_unlock_op(sbi, NODE_TRUNC); - return PTR_ERR(page); - } + if (!nid) + return 0; - if (F2FS_I(inode)->i_xattr_nid) { - nid_t nid = F2FS_I(inode)->i_xattr_nid; - struct page *npage = get_node_page(sbi, nid); + npage = get_node_page(sbi, nid); + if (IS_ERR(npage)) + return PTR_ERR(npage); - if (IS_ERR(npage)) { - mutex_unlock_op(sbi, NODE_TRUNC); - return PTR_ERR(npage); - } + F2FS_I(inode)->i_xattr_nid = 0; - F2FS_I(inode)->i_xattr_nid = 0; - set_new_dnode(&dn, inode, page, npage, nid); - dn.inode_page_locked = 1; - truncate_node(&dn); - } + /* need to do checkpoint during fsync */ + F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); - /* 0 is possible, after f2fs_new_inode() is failed */ - BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); - set_new_dnode(&dn, inode, page, page, ino); - truncate_node(&dn); + set_new_dnode(&dn, inode, page, npage, nid); - mutex_unlock_op(sbi, NODE_TRUNC); + if (page) + dn.inode_page_locked = true; + truncate_node(&dn); return 0; } -int new_inode_page(struct inode *inode, const struct qstr *name) +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +void remove_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; + nid_t ino = inode->i_ino; + struct dnode_of_data dn; + + page = get_node_page(sbi, ino); + if (IS_ERR(page)) + return; + + if (truncate_xattr_node(inode, page)) { + f2fs_put_page(page, 1); + return; + } + /* 0 is possible, after f2fs_new_inode() is failed */ + f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); + set_new_dnode(&dn, inode, page, page, ino); + truncate_node(&dn); +} + +struct page *new_inode_page(struct inode *inode, const struct qstr *name) +{ struct dnode_of_data dn; /* allocate inode page for new inode */ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - mutex_lock_op(sbi, NODE_NEW); - page = new_node_page(&dn, 0); - init_dent_inode(name, page); - mutex_unlock_op(sbi, NODE_NEW); - if (IS_ERR(page)) - return PTR_ERR(page); - f2fs_put_page(page, 1); - return 0; + + /* caller should f2fs_put_page(page, 1); */ + return new_node_page(&dn, 0, NULL); } -struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) +struct page *new_node_page(struct dnode_of_data *dn, + unsigned int ofs, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; struct node_info old_ni, new_ni; struct page *page; int err; - if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)) + if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(mapping, dn->nid); + page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); if (!page) return ERR_PTR(-ENOMEM); - get_node_info(sbi, dn->nid, &old_ni); + if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { + err = -ENOSPC; + goto fail; + } - SetPageUptodate(page); - fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); + get_node_info(sbi, dn->nid, &old_ni); /* Reinitialize old_ni with new node page */ - BUG_ON(old_ni.blk_addr != NULL_ADDR); + f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; + set_node_addr(sbi, &new_ni, NEW_ADDR, false); - if (!inc_valid_node_count(sbi, dn->inode, 1)) { - err = -ENOSPC; - goto fail; - } - set_node_addr(sbi, &new_ni, NEW_ADDR); + f2fs_wait_on_page_writeback(page, NODE); + fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); + SetPageUptodate(page); + set_page_dirty(page); + + if (f2fs_has_xattr_block(ofs)) + F2FS_I(dn->inode)->i_xattr_nid = dn->nid; dn->node_page = page; - sync_inode_page(dn); - set_page_dirty(page); + if (ipage) + update_inode(dn->inode, ipage); + else + sync_inode_page(dn); if (ofs == 0) inc_valid_inode_count(sbi); @@ -844,16 +909,28 @@ fail: return ERR_PTR(err); } -static int read_node_page(struct page *page, int type) +/* + * Caller should do after getting the following values. + * 0: f2fs_put_page(page, 0) + * LOCKED_PAGE: f2fs_put_page(page, 1) + * error: nothing + */ +static int read_node_page(struct page *page, int rw) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); struct node_info ni; get_node_info(sbi, page->index, &ni); - if (ni.blk_addr == NULL_ADDR) + if (unlikely(ni.blk_addr == NULL_ADDR)) { + f2fs_put_page(page, 1); return -ENOENT; - return f2fs_readpage(sbi, page, ni.blk_addr, type); + } + + if (PageUptodate(page)) + return LOCKED_PAGE; + + return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw); } /* @@ -861,44 +938,52 @@ static int read_node_page(struct page *page, int type) */ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { - struct address_space *mapping = sbi->node_inode->i_mapping; struct page *apage; + int err; - apage = find_get_page(mapping, nid); - if (apage && PageUptodate(apage)) - goto release_out; + apage = find_get_page(NODE_MAPPING(sbi), nid); + if (apage && PageUptodate(apage)) { + f2fs_put_page(apage, 0); + return; + } f2fs_put_page(apage, 0); - apage = grab_cache_page(mapping, nid); + apage = grab_cache_page(NODE_MAPPING(sbi), nid); if (!apage) return; - if (read_node_page(apage, READA)) - unlock_page(apage); - -release_out: - f2fs_put_page(apage, 0); - return; + err = read_node_page(apage, READA); + if (err == 0) + f2fs_put_page(apage, 0); + else if (err == LOCKED_PAGE) + f2fs_put_page(apage, 1); } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - int err; struct page *page; - struct address_space *mapping = sbi->node_inode->i_mapping; - - page = grab_cache_page(mapping, nid); + int err; +repeat: + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); err = read_node_page(page, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err < 0) return ERR_PTR(err); - } + else if (err == LOCKED_PAGE) + goto got_it; - BUG_ON(nid != nid_of_node(page)); - mark_page_accessed(page); + lock_page(page); + if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + f2fs_put_page(page, 1); + goto repeat; + } +got_it: return page; } @@ -909,32 +994,27 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) struct page *get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); - struct address_space *mapping = sbi->node_inode->i_mapping; - int i, end; - int err = 0; - nid_t nid; + struct blk_plug plug; struct page *page; + int err, i, end; + nid_t nid; /* First, try getting the desired direct node. */ nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); - - page = find_get_page(mapping, nid); - if (page && PageUptodate(page)) - goto page_hit; - f2fs_put_page(page, 0); - repeat: - page = grab_cache_page(mapping, nid); + page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READA); - if (err) { - f2fs_put_page(page, 1); + err = read_node_page(page, READ_SYNC); + if (err < 0) return ERR_PTR(err); - } + else if (err == LOCKED_PAGE) + goto page_hit; + + blk_start_plug(&plug); /* Then, try readahead for siblings of the desired node */ end = start + MAX_RA_NODE; @@ -946,17 +1026,17 @@ repeat: ra_node_page(sbi, nid); } -page_hit: + blk_finish_plug(&plug); + lock_page(page); - if (PageError(page)) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + goto repeat; } - - /* Has the page been truncated? */ - if (page->mapping != mapping) { +page_hit: + if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); - goto repeat; + return ERR_PTR(-EIO); } return page; } @@ -972,14 +1052,13 @@ void sync_inode_page(struct dnode_of_data *dn) if (!dn->inode_page_locked) unlock_page(dn->inode_page); } else { - f2fs_write_inode(dn->inode, NULL); + update_inode_page(dn->inode); } } int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, struct writeback_control *wbc) { - struct address_space *mapping = sbi->node_inode->i_mapping; pgoff_t index, end; struct pagevec pvec; int step = ino ? 2 : 0; @@ -993,7 +1072,7 @@ next_step: while (index <= end) { int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) @@ -1026,7 +1105,7 @@ next_step: else if (!trylock_page(page)) continue; - if (unlikely(page->mapping != mapping)) { + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { continue_unlock: unlock_page(page); continue; @@ -1053,7 +1132,7 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); } - mapping->a_ops->writepage(page, wbc); + NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); wrote++; if (--wbc->nr_to_write == 0) @@ -1074,11 +1153,52 @@ continue_unlock: } if (wrote) - f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL); - + f2fs_submit_merged_bio(sbi, NODE, WRITE); return nwritten; } +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) +{ + pgoff_t index = 0, end = LONG_MAX; + struct pagevec pvec; + int ret2 = 0, ret = 0; + + pagevec_init(&pvec, 0); + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (unlikely(page->index > end)) + continue; + + if (ino && ino_of_node(page) == ino) { + f2fs_wait_on_page_writeback(page, NODE); + if (TestClearPageError(page)) + ret = -EIO; + } + } + pagevec_release(&pvec); + cond_resched(); + } + + if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags))) + ret2 = -ENOSPC; + if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags))) + ret2 = -EIO; + if (!ret) + ret = ret2; + return ret; +} + static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { @@ -1086,68 +1206,71 @@ static int f2fs_write_node_page(struct page *page, nid_t nid; block_t new_addr; struct node_info ni; + struct f2fs_io_info fio = { + .type = NODE, + .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + }; - if (wbc->for_reclaim) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } + trace_f2fs_writepage(page, NODE); - wait_on_page_writeback(page); + if (unlikely(sbi->por_doing)) + goto redirty_out; - mutex_lock_op(sbi, NODE_WRITE); + f2fs_wait_on_page_writeback(page, NODE); /* get old block addr of this node page */ nid = nid_of_node(page); - BUG_ON(page->index != nid); + f2fs_bug_on(page->index != nid); get_node_info(sbi, nid, &ni); /* This page is already truncated */ - if (ni.blk_addr == NULL_ADDR) + if (unlikely(ni.blk_addr == NULL_ADDR)) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); return 0; + } - set_page_writeback(page); + if (wbc->for_reclaim) + goto redirty_out; - /* insert node offset */ - write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); - set_node_addr(sbi, &ni, new_addr); + mutex_lock(&sbi->node_write); + set_page_writeback(page); + write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); + set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); - - mutex_unlock_op(sbi, NODE_WRITE); + mutex_unlock(&sbi->node_write); unlock_page(page); return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; } -/* - * It is very important to gather dirty pages and write at once, so that we can - * submit a big bio without interfering other data writes. - * Be default, 512 pages (2MB), a segment size, is quite reasonable. - */ -#define COLLECT_DIRTY_NODES 512 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - struct block_device *bdev = sbi->sb->s_bdev; - long nr_to_write = wbc->nr_to_write; + long diff; - /* First check balancing cached NAT entries */ - if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { - write_checkpoint(sbi, false); - return 0; - } + trace_f2fs_writepages(mapping->host, wbc, NODE); + + /* balancing f2fs's metadata in background */ + f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) - return 0; + if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = bio_get_nr_vecs(bdev); + diff = nr_pages_to_write(sbi, NODE, wbc); + wbc->sync_mode = WB_SYNC_NONE; sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - - (bio_get_nr_vecs(bdev) - wbc->nr_to_write); + wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + return 0; + +skip_write: + wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); return 0; } @@ -1156,6 +1279,8 @@ static int f2fs_set_node_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + trace_f2fs_set_page_dirty(page, NODE); + SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); @@ -1166,7 +1291,8 @@ static int f2fs_set_node_page_dirty(struct page *page) return 0; } -static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) +static void f2fs_invalidate_node_page(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -1178,7 +1304,7 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) static int f2fs_release_node_page(struct page *page, gfp_t wait) { ClearPagePrivate(page); - return 0; + return 1; } /* @@ -1192,42 +1318,51 @@ const struct address_space_operations f2fs_node_aops = { .releasepage = f2fs_release_node_page, }; -static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) +static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, + nid_t n) { - struct list_head *this; - struct free_nid *i = NULL; - list_for_each(this, head) { - i = list_entry(this, struct free_nid, list); - if (i->nid == n) - break; - i = NULL; - } - return i; + return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct free_nid *i) +static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, + struct free_nid *i) { list_del(&i->list); - kmem_cache_free(free_nid_slab, i); + radix_tree_delete(&nm_i->free_nid_root, i->nid); } -static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; + struct nat_entry *ne; + bool allocated = false; + + if (!available_free_memory(sbi, FREE_NIDS)) + return -1; - if (nm_i->fcnt > 2 * MAX_FREE_NIDS) + /* 0 nid should not be used */ + if (unlikely(nid == 0)) return 0; -retry: - i = kmem_cache_alloc(free_nid_slab, GFP_NOFS); - if (!i) { - cond_resched(); - goto retry; + + if (build) { + /* do not add allocated nids */ + read_lock(&nm_i->nat_tree_lock); + ne = __lookup_nat_cache(nm_i, nid); + if (ne && + (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) + allocated = true; + read_unlock(&nm_i->nat_tree_lock); + if (allocated) + return 0; } + + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; spin_lock(&nm_i->free_nid_list_lock); - if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { + if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { spin_unlock(&nm_i->free_nid_list_lock); kmem_cache_free(free_nid_slab, i); return 0; @@ -1241,72 +1376,75 @@ retry: static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) { struct free_nid *i; + bool need_free = false; + spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __del_from_free_nid_list(i); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + need_free = true; } spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } -static int scan_nat_page(struct f2fs_nm_info *nm_i, +static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; - int fcnt = 0; int i; - /* 0 nid should not be used */ - if (start_nid == 0) - ++start_nid; - i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); - BUG_ON(blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) - fcnt += add_free_nid(nm_i, start_nid); + + if (unlikely(start_nid >= nm_i->max_nid)) + break; + + blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); + f2fs_bug_on(blk_addr == NEW_ADDR); + if (blk_addr == NULL_ADDR) { + if (add_free_nid(sbi, start_nid, true) < 0) + break; + } } - return fcnt; } static void build_free_nids(struct f2fs_sb_info *sbi) { - struct free_nid *fnid, *next_fnid; struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - nid_t nid = 0; - bool is_cycled = false; - int fcnt = 0; - int i; + int i = 0; + nid_t nid = nm_i->next_scan_nid; - nid = nm_i->next_scan_nid; - nm_i->init_scan_nid = nid; + /* Enough entries */ + if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + return; - ra_nat_pages(sbi, nid); + /* readahead nat pages to be scanned */ + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT); while (1) { struct page *page = get_current_nat_page(sbi, nid); - fcnt += scan_nat_page(nm_i, page, nid); + scan_nat_page(sbi, page, nid); f2fs_put_page(page, 1); nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); - - if (nid >= nm_i->max_nid) { + if (unlikely(nid >= nm_i->max_nid)) nid = 0; - is_cycled = true; - } - if (fcnt > MAX_FREE_NIDS) - break; - if (is_cycled && nm_i->init_scan_nid <= nid) + + if (i++ == FREE_NID_PAGES) break; } + /* go to the next free nat pages to find free nids abundantly */ nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ @@ -1315,22 +1453,11 @@ static void build_free_nids(struct f2fs_sb_info *sbi) block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); nid = le32_to_cpu(nid_in_journal(sum, i)); if (addr == NULL_ADDR) - add_free_nid(nm_i, nid); + add_free_nid(sbi, nid, true); else remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); - - /* remove the free nids from current allocated nids */ - list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) { - struct nat_entry *ne; - - read_lock(&nm_i->nat_tree_lock); - ne = __lookup_nat_cache(nm_i, fnid->nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) - remove_free_nid(nm_i, fnid->nid); - read_unlock(&nm_i->nat_tree_lock); - } } /* @@ -1342,43 +1469,33 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; - struct list_head *this; retry: - mutex_lock(&nm_i->build_lock); - if (!nm_i->fcnt) { - /* scan NAT in order to build free nid list */ - build_free_nids(sbi); - if (!nm_i->fcnt) { - mutex_unlock(&nm_i->build_lock); - return false; - } - } - mutex_unlock(&nm_i->build_lock); + if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) + return false; - /* - * We check fcnt again since previous check is racy as - * we didn't hold free_nid_list_lock. So other thread - * could consume all of free nids. - */ spin_lock(&nm_i->free_nid_list_lock); - if (!nm_i->fcnt) { - spin_unlock(&nm_i->free_nid_list_lock); - goto retry; - } - BUG_ON(list_empty(&nm_i->free_nid_list)); - list_for_each(this, &nm_i->free_nid_list) { - i = list_entry(this, struct free_nid, list); - if (i->state == NID_NEW) - break; - } + /* We should not use stale free nids created by build_free_nids */ + if (nm_i->fcnt && !on_build_free_nids(nm_i)) { + f2fs_bug_on(list_empty(&nm_i->free_nid_list)); + list_for_each_entry(i, &nm_i->free_nid_list, list) + if (i->state == NID_NEW) + break; - BUG_ON(i->state != NID_NEW); - *nid = i->nid; - i->state = NID_ALLOC; - nm_i->fcnt--; + f2fs_bug_on(i->state != NID_NEW); + *nid = i->nid; + i->state = NID_ALLOC; + nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + return true; + } spin_unlock(&nm_i->free_nid_list_lock); - return true; + + /* Let's scan nat pages and its caches to get free nids */ + mutex_lock(&nm_i->build_lock); + build_free_nids(sbi); + mutex_unlock(&nm_i->build_lock); + goto retry; } /* @@ -1390,12 +1507,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct free_nid *i; spin_lock(&nm_i->free_nid_list_lock); - i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - if (i) { - BUG_ON(i->state != NID_ALLOC); - __del_from_free_nid_list(i); - } + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(!i || i->state != NID_ALLOC); + __del_from_free_nid_list(nm_i, i); spin_unlock(&nm_i->free_nid_list_lock); + + kmem_cache_free(free_nid_slab, i); } /* @@ -1403,8 +1520,27 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) */ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { - alloc_nid_done(sbi, nid); - add_free_nid(NM_I(sbi), nid); + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + bool need_free = false; + + if (!nid) + return; + + spin_lock(&nm_i->free_nid_list_lock); + i = __lookup_free_nid_list(nm_i, nid); + f2fs_bug_on(!i || i->state != NID_ALLOC); + if (!available_free_memory(sbi, FREE_NIDS)) { + __del_from_free_nid_list(nm_i, i); + need_free = true; + } else { + i->state = NID_NEW; + nm_i->fcnt++; + } + spin_unlock(&nm_i->free_nid_list_lock); + + if (need_free) + kmem_cache_free(free_nid_slab, i); } void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1412,88 +1548,200 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, block_t new_blkaddr) { rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); - set_node_addr(sbi, ni, new_blkaddr); + set_node_addr(sbi, ni, new_blkaddr, false); clear_node_page_dirty(page); } +static void recover_inline_xattr(struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + void *src_addr, *dst_addr; + size_t inline_size; + struct page *ipage; + struct f2fs_inode *ri; + + if (!f2fs_has_inline_xattr(inode)) + return; + + if (!IS_INODE(page)) + return; + + ri = F2FS_INODE(page); + if (!(ri->i_inline & F2FS_INLINE_XATTR)) + return; + + ipage = get_node_page(sbi, inode->i_ino); + f2fs_bug_on(IS_ERR(ipage)); + + dst_addr = inline_xattr_addr(ipage); + src_addr = inline_xattr_addr(page); + inline_size = inline_xattr_size(inode); + + f2fs_wait_on_page_writeback(ipage, NODE); + memcpy(dst_addr, src_addr, inline_size); + + update_inode(inode, ipage); + f2fs_put_page(ipage, 1); +} + +bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; + nid_t new_xnid = nid_of_node(page); + struct node_info ni; + + recover_inline_xattr(inode, page); + + if (!f2fs_has_xattr_block(ofs_of_node(page))) + return false; + + /* 1: invalidate the previous xattr nid */ + if (!prev_xnid) + goto recover_xnid; + + /* Deallocate node address */ + get_node_info(sbi, prev_xnid, &ni); + f2fs_bug_on(ni.blk_addr == NULL_ADDR); + invalidate_blocks(sbi, ni.blk_addr); + dec_valid_node_count(sbi, inode); + set_node_addr(sbi, &ni, NULL_ADDR, false); + +recover_xnid: + /* 2: allocate new xattr nid */ + if (unlikely(!inc_valid_node_count(sbi, inode))) + f2fs_bug_on(1); + + remove_free_nid(NM_I(sbi), new_xnid); + get_node_info(sbi, new_xnid, &ni); + ni.ino = inode->i_ino; + set_node_addr(sbi, &ni, NEW_ADDR, false); + F2FS_I(inode)->i_xattr_nid = new_xnid; + + /* 3: update xattr blkaddr */ + refresh_sit_entry(sbi, NEW_ADDR, blkaddr); + set_node_addr(sbi, &ni, blkaddr, false); + + update_inode_page(inode); + return true; +} + int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) { - struct address_space *mapping = sbi->node_inode->i_mapping; - struct f2fs_node *src, *dst; + struct f2fs_inode *src, *dst; nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; - ipage = grab_cache_page(mapping, ino); + get_node_info(sbi, ino, &old_ni); + + if (unlikely(old_ni.blk_addr != NULL_ADDR)) + return -EINVAL; + + ipage = grab_cache_page(NODE_MAPPING(sbi), ino); if (!ipage) return -ENOMEM; /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); - get_node_info(sbi, ino, &old_ni); SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); - src = (struct f2fs_node *)page_address(page); - dst = (struct f2fs_node *)page_address(ipage); + src = F2FS_INODE(page); + dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i); - dst->i.i_size = 0; - dst->i.i_blocks = cpu_to_le64(1); - dst->i.i_links = cpu_to_le32(1); - dst->i.i_xattr_nid = 0; + memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + dst->i_size = 0; + dst->i_blocks = cpu_to_le64(1); + dst->i_links = cpu_to_le32(1); + dst->i_xattr_nid = 0; new_ni = old_ni; new_ni.ino = ino; - set_node_addr(sbi, &new_ni, NEW_ADDR); + if (unlikely(!inc_valid_node_count(sbi, NULL))) + WARN_ON(1); + set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); - f2fs_put_page(ipage, 1); return 0; } +/* + * ra_sum_pages() merge contiguous pages into one bio and submit. + * these pre-readed pages are alloced in bd_inode's mapping tree. + */ +static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, + int start, int nrpages) +{ + struct inode *inode = sbi->sb->s_bdev->bd_inode; + struct address_space *mapping = inode->i_mapping; + int i, page_idx = start; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO + }; + + for (i = 0; page_idx < start + nrpages; page_idx++, i++) { + /* alloc page in bd_inode for reading node summary info */ + pages[i] = grab_cache_page(mapping, page_idx); + if (!pages[i]) + break; + f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio); + } + + f2fs_submit_merged_bio(sbi, META, READ); + return i; +} + int restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; struct f2fs_summary *sum_entry; - struct page *page; + struct inode *inode = sbi->sb->s_bdev->bd_inode; block_t addr; - int i, last_offset; - - /* alloc temporal page for read node */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (IS_ERR(page)) - return PTR_ERR(page); - lock_page(page); + int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + struct page *pages[bio_blocks]; + int i, idx, last_offset, nrpages, err = 0; /* scan the node segment */ last_offset = sbi->blocks_per_seg; addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; i < last_offset; i++, sum_entry++) { - if (f2fs_readpage(sbi, page, addr, READ_SYNC)) - goto out; + for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { + nrpages = min(last_offset - i, bio_blocks); - rn = (struct f2fs_node *)page_address(page); - sum_entry->nid = rn->footer.nid; - sum_entry->version = 0; - sum_entry->ofs_in_node = 0; - addr++; + /* read ahead node pages */ + nrpages = ra_sum_pages(sbi, pages, addr, nrpages); + if (!nrpages) + return -ENOMEM; - /* - * In order to read next node page, - * we must clear PageUptodate flag. - */ - ClearPageUptodate(page); + for (idx = 0; idx < nrpages; idx++) { + if (err) + goto skip; + + lock_page(pages[idx]); + if (unlikely(!PageUptodate(pages[idx]))) { + err = -EIO; + } else { + rn = F2FS_NODE(pages[idx]); + sum_entry->nid = rn->footer.nid; + sum_entry->version = 0; + sum_entry->ofs_in_node = 0; + sum_entry++; + } + unlock_page(pages[idx]); +skip: + page_cache_release(pages[idx]); + } + + invalidate_mapping_pages(inode->i_mapping, addr, + addr + nrpages); } -out: - unlock_page(page); - __free_pages(page, 0); - return 0; + return err; } static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) @@ -1529,9 +1777,7 @@ retry: write_unlock(&nm_i->nat_tree_lock); goto retry; } - nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr)); - nat_set_ino(ne, le32_to_cpu(raw_ne.ino)); - nat_set_version(ne, raw_ne.version); + node_info_from_raw_nat(&ne->ni, &raw_ne); __set_nat_cache_dirty(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); } @@ -1548,7 +1794,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - struct list_head *cur, *n; + struct nat_entry *ne, *cur; struct page *page = NULL; struct f2fs_nat_block *nat_blk = NULL; nid_t start_nid = 0, end_nid = 0; @@ -1560,18 +1806,16 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) mutex_lock(&curseg->curseg_mutex); /* 1) flush dirty nat caches */ - list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { - struct nat_entry *ne; + list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { nid_t nid; struct f2fs_nat_entry raw_ne; int offset = -1; - block_t new_blkaddr; - - ne = list_entry(cur, struct nat_entry, list); - nid = nat_get_nid(ne); if (nat_get_blkaddr(ne) == NEW_ADDR) continue; + + nid = nat_get_nid(ne); + if (flushed) goto to_nat_page; @@ -1598,14 +1842,10 @@ to_nat_page: nat_blk = page_address(page); } - BUG_ON(!nat_blk); + f2fs_bug_on(!nat_blk); raw_ne = nat_blk->entries[nid - start_nid]; flush_now: - new_blkaddr = nat_get_blkaddr(ne); - - raw_ne.ino = cpu_to_le32(nat_get_ino(ne)); - raw_ne.block_addr = cpu_to_le32(new_blkaddr); - raw_ne.version = nat_get_version(ne); + raw_nat_from_node_info(&raw_ne, &ne->ni); if (offset < 0) { nat_blk->entries[nid - start_nid] = raw_ne; @@ -1614,26 +1854,20 @@ flush_now: nid_in_journal(sum, offset) = cpu_to_le32(nid); } - if (nat_get_blkaddr(ne) == NULL_ADDR) { + if (nat_get_blkaddr(ne) == NULL_ADDR && + add_free_nid(sbi, nid, false) <= 0) { write_lock(&nm_i->nat_tree_lock); __del_from_nat_cache(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); - - /* We can reuse this freed nid at this point */ - add_free_nid(NM_I(sbi), nid); } else { write_lock(&nm_i->nat_tree_lock); __clear_nat_cache_dirty(nm_i, ne); - ne->checkpointed = true; write_unlock(&nm_i->nat_tree_lock); } } if (!flushed) mutex_unlock(&curseg->curseg_mutex); f2fs_put_page(page, 1); - - /* 2) shrink nat caches if necessary */ - try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD); } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -1648,10 +1882,16 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + + /* not used nids: 0, node, meta, (and root counted as valid node) */ + nm_i->available_nids = nm_i->max_nid - 3; nm_i->fcnt = 0; nm_i->nat_cnt = 0; + nm_i->ram_thresh = DEF_RAM_THRESHOLD; + INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -1661,19 +1901,16 @@ static int init_node_manager(struct f2fs_sb_info *sbi) spin_lock_init(&nm_i->free_nid_list_lock); rwlock_init(&nm_i->nat_tree_lock); - nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); - nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); - - nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL); - if (!nm_i->nat_bitmap) - return -ENOMEM; + nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); if (!version_bitmap) return -EFAULT; - /* copy version bitmap */ - memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size); + nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap) + return -ENOMEM; return 0; } @@ -1707,11 +1944,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - BUG_ON(i->state == NID_ALLOC); - __del_from_free_nid_list(i); + f2fs_bug_on(i->state == NID_ALLOC); + __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + kmem_cache_free(free_nid_slab, i); + spin_lock(&nm_i->free_nid_list_lock); } - BUG_ON(nm_i->fcnt); + f2fs_bug_on(nm_i->fcnt); spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ @@ -1719,13 +1959,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; - for (idx = 0; idx < found; idx++) { - struct nat_entry *e = natvec[idx]; - nid = nat_get_nid(e) + 1; - __del_from_nat_cache(nm_i, e); - } + nid = nat_get_nid(natvec[found - 1]) + 1; + for (idx = 0; idx < found; idx++) + __del_from_nat_cache(nm_i, natvec[idx]); } - BUG_ON(nm_i->nat_cnt); + f2fs_bug_on(nm_i->nat_cnt); write_unlock(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); @@ -1736,12 +1974,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", - sizeof(struct nat_entry), NULL); + sizeof(struct nat_entry)); if (!nat_entry_slab) return -ENOMEM; free_nid_slab = f2fs_kmem_cache_create("free_nid", - sizeof(struct free_nid), NULL); + sizeof(struct free_nid)); if (!free_nid_slab) { kmem_cache_destroy(nat_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index afdb130f782..7281112cd1c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -17,18 +17,18 @@ /* # of pages to perform readahead before building free nids */ #define FREE_NID_PAGES 4 -/* maximum # of free node ids to produce during build_free_nids */ -#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) - /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 -/* maximum cached nat entries to manage memory footprint */ -#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) +/* control the memory footprint threshold (10MB per 1GB ram) */ +#define DEF_RAM_THRESHOLD 10 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 +/* return value for read_node_page */ +#define LOCKED_PAGE 1 + /* * For node information */ @@ -42,6 +42,7 @@ struct node_info { struct nat_entry { struct list_head list; /* for clean or dirty nat list */ bool checkpointed; /* whether it is checkpointed or not */ + bool fsync_done; /* whether the latest node has fsync mark */ struct node_info ni; /* in-memory node information */ }; @@ -55,9 +56,15 @@ struct nat_entry { #define nat_set_version(nat, v) (nat->ni.version = v) #define __set_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); + do { \ + ne->checkpointed = false; \ + list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \ + } while (0) #define __clear_nat_cache_dirty(nm_i, ne) \ - list_move_tail(&ne->list, &nm_i->nat_entries); + do { \ + ne->checkpointed = true; \ + list_move_tail(&ne->list, &nm_i->nat_entries); \ + } while (0) #define inc_node_version(version) (++version) static inline void node_info_from_raw_nat(struct node_info *ni, @@ -68,6 +75,20 @@ static inline void node_info_from_raw_nat(struct node_info *ni, ni->version = raw_ne->version; } +static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, + struct node_info *ni) +{ + raw_ne->ino = cpu_to_le32(ni->ino); + raw_ne->block_addr = cpu_to_le32(ni->blk_addr); + raw_ne->version = ni->version; +} + +enum mem_type { + FREE_NIDS, /* indicates the free nid list */ + NAT_ENTRIES, /* indicates the cached nat entry */ + DIRTY_DENTS /* indicates dirty dentry pages */ +}; + /* * For free nid mangement */ @@ -152,8 +173,7 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(page); if (reset) memset(rn, 0, sizeof(*rn)); rn->footer.nid = cpu_to_le32(nid); @@ -163,10 +183,8 @@ static inline void fill_node_footer(struct page *page, nid_t nid, static inline void copy_node_footer(struct page *dst, struct page *src) { - void *src_addr = page_address(src); - void *dst_addr = page_address(dst); - struct f2fs_node *src_rn = (struct f2fs_node *)src_addr; - struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr; + struct f2fs_node *src_rn = F2FS_NODE(src); + struct f2fs_node *dst_rn = F2FS_NODE(dst); memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); } @@ -174,45 +192,40 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(page); + rn->footer.cp_ver = ckpt->checkpoint_ver; rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } static inline nid_t ino_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.ino); } static inline nid_t nid_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.nid); } static inline unsigned int ofs_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); unsigned flag = le32_to_cpu(rn->footer.flag); return flag >> OFFSET_BIT_SHIFT; } static inline unsigned long long cpver_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le64_to_cpu(rn->footer.cp_ver); } static inline block_t next_blkaddr_of_node(struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.next_blkaddr); } @@ -229,17 +242,27 @@ static inline block_t next_blkaddr_of_node(struct page *node_page) * | `- direct node (5 + N => 5 + 2N - 1) * `- double indirect node (5 + 2N) * `- indirect node (6 + 2N) - * `- direct node (x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + x(N + 1)) + * `- direct node + * ...... + * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) + * `- direct node */ static inline bool IS_DNODE(struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); + + if (f2fs_has_xattr_block(ofs)) + return false; + if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) return false; if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { ofs -= 6 + 2 * NIDS_PER_BLOCK; - if ((long int)ofs % (NIDS_PER_BLOCK + 1)) + if (!((long int)ofs % (NIDS_PER_BLOCK + 1))) return false; } return true; @@ -247,9 +270,9 @@ static inline bool IS_DNODE(struct page *node_page) static inline void set_nid(struct page *p, int off, nid_t nid, bool i) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + struct f2fs_node *rn = F2FS_NODE(p); - wait_on_page_writeback(p); + f2fs_wait_on_page_writeback(p, NODE); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); @@ -260,7 +283,8 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i) static inline nid_t get_nid(struct page *p, int off, bool i) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(p); + struct f2fs_node *rn = F2FS_NODE(p); + if (i) return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); return le32_to_cpu(rn->in.nid[off]); @@ -272,11 +296,28 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ -static inline int is_cold_file(struct inode *inode) +static inline int is_file(struct inode *inode, int type) +{ + return F2FS_I(inode)->i_advise & type; +} + +static inline void set_file(struct inode *inode, int type) +{ + F2FS_I(inode)->i_advise |= type; +} + +static inline void clear_file(struct inode *inode, int type) { - return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; + F2FS_I(inode)->i_advise &= ~type; } +#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) +#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) +#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) + static inline int is_cold_data(struct page *page) { return PageChecked(page); @@ -292,33 +333,19 @@ static inline void clear_cold_data(struct page *page) ClearPageChecked(page); } -static inline int is_cold_node(struct page *page) +static inline int is_node(struct page *page, int type) { - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - return flag & (0x1 << COLD_BIT_SHIFT); + struct f2fs_node *rn = F2FS_NODE(page); + return le32_to_cpu(rn->footer.flag) & (1 << type); } -static inline unsigned char is_fsync_dnode(struct page *page) -{ - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - return flag & (0x1 << FSYNC_BIT_SHIFT); -} - -static inline unsigned char is_dent_dnode(struct page *page) -{ - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - return flag & (0x1 << DENT_BIT_SHIFT); -} +#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) +#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) +#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) static inline void set_cold_node(struct inode *inode, struct page *page) { - struct f2fs_node *rn = (struct f2fs_node *)page_address(page); + struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (S_ISDIR(inode->i_mode)) @@ -328,26 +355,15 @@ static inline void set_cold_node(struct inode *inode, struct page *page) rn->footer.flag = cpu_to_le32(flag); } -static inline void set_fsync_mark(struct page *page, int mark) -{ - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; - unsigned int flag = le32_to_cpu(rn->footer.flag); - if (mark) - flag |= (0x1 << FSYNC_BIT_SHIFT); - else - flag &= ~(0x1 << FSYNC_BIT_SHIFT); - rn->footer.flag = cpu_to_le32(flag); -} - -static inline void set_dentry_mark(struct page *page, int mark) +static inline void set_mark(struct page *page, int mark, int type) { - void *kaddr = page_address(page); - struct f2fs_node *rn = (struct f2fs_node *)kaddr; + struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) - flag |= (0x1 << DENT_BIT_SHIFT); + flag |= (0x1 << type); else - flag &= ~(0x1 << DENT_BIT_SHIFT); + flag &= ~(0x1 << type); rn->footer.flag = cpu_to_le32(flag); } +#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) +#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b235215ac13..a112368a4a8 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -27,57 +27,93 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi) static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, nid_t ino) { - struct list_head *this; struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + list_for_each_entry(entry, head, list) if (entry->inode->i_ino == ino) return entry; - } + return NULL; } static int recover_dentry(struct page *ipage, struct inode *inode) { - struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage); - struct f2fs_inode *raw_inode = &(raw_node->i); - struct qstr name; + struct f2fs_inode *raw_inode = F2FS_INODE(ipage); + nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; + struct qstr name; struct page *page; - struct inode *dir; + struct inode *dir, *einode; int err = 0; - if (!is_dent_dnode(ipage)) - goto out; - - dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); + dir = f2fs_iget(inode->i_sb, pino); if (IS_ERR(dir)) { - err = -EINVAL; + err = PTR_ERR(dir); goto out; } name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; + if (unlikely(name.len > F2FS_NAME_LEN)) { + WARN_ON(1); + err = -ENAMETOOLONG; + goto out_err; + } +retry: de = f2fs_find_entry(dir, &name, &page); + if (de && inode->i_ino == le32_to_cpu(de->ino)) + goto out_unmap_put; if (de) { - kunmap(page); - f2fs_put_page(page, 0); + einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + if (IS_ERR(einode)) { + WARN_ON(1); + err = PTR_ERR(einode); + if (err == -ENOENT) + err = -EEXIST; + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_SB(inode->i_sb)); + if (err) { + iput(einode); + goto out_unmap_put; + } + f2fs_delete_entry(de, page, einode); + iput(einode); + goto retry; + } + err = __f2fs_add_link(dir, &name, inode); + if (err) + goto out_err; + + if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { + iput(dir); } else { - err = __f2fs_add_link(dir, &name, inode); + add_dirty_dir_inode(dir); + set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); } + + goto out; + +out_unmap_put: + kunmap(page); + f2fs_put_page(page, 0); +out_err: iput(dir); out: - kunmap(ipage); + f2fs_msg(inode->i_sb, KERN_NOTICE, + "%s: ino = %x, name = %s, dir = %lx, err = %d", + __func__, ino_of_node(ipage), raw_inode->i_name, + IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } static int recover_inode(struct inode *inode, struct page *node_page) { - void *kaddr = page_address(node_page); - struct f2fs_node *raw_node = (struct f2fs_node *)kaddr; - struct f2fs_inode *raw_inode = &(raw_node->i); + struct f2fs_inode *raw_inode = F2FS_INODE(node_page); + + if (!IS_INODE(node_page)) + return 0; inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_size_write(inode, le64_to_cpu(raw_inode->i_size)); @@ -88,12 +124,17 @@ static int recover_inode(struct inode *inode, struct page *node_page) inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - return recover_dentry(node_page, inode); + if (is_dent_dnode(node_page)) + return recover_dentry(node_page, inode); + + f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", + ino_of_node(node_page), raw_inode->i_name); + return 0; } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); + unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page; block_t blkaddr; @@ -101,75 +142,73 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ page = alloc_page(GFP_F2FS_ZERO); - if (IS_ERR(page)) - return PTR_ERR(page); + if (!page) + return -ENOMEM; lock_page(page); while (1) { struct fsync_inode_entry *entry; - if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) - goto out; + err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); + if (err) + return err; + + lock_page(page); if (cp_ver != cpver_of_node(page)) - goto out; + break; if (!is_fsync_dnode(page)) goto next; entry = get_fsync_inode(head, ino_of_node(page)); if (entry) { - entry->blkaddr = blkaddr; if (IS_INODE(page) && is_dent_dnode(page)) set_inode_flag(F2FS_I(entry->inode), FI_INC_LINK); } else { if (IS_INODE(page) && is_dent_dnode(page)) { - if (recover_inode_page(sbi, page)) { - err = -ENOMEM; - goto out; - } + err = recover_inode_page(sbi, page); + if (err) + break; } /* add this fsync inode to the list */ entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); if (!entry) { err = -ENOMEM; - goto out; + break; } entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); kmem_cache_free(fsync_entry_slab, entry); - goto out; + break; } - list_add_tail(&entry->list, head); - entry->blkaddr = blkaddr; - } - if (IS_INODE(page)) { - err = recover_inode(entry->inode, page); - if (err) - goto out; } + entry->blkaddr = blkaddr; + + err = recover_inode(entry->inode, page); + if (err && err != -ENOENT) + break; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); - ClearPageUptodate(page); } -out: + unlock_page(page); __free_pages(page, 0); + return err; } -static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, - struct list_head *head) +static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; @@ -180,81 +219,111 @@ static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, } } -static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, - block_t blkaddr) +static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, + block_t blkaddr, struct dnode_of_data *dn) { struct seg_entry *sentry; unsigned int segno = GET_SEGNO(sbi, blkaddr); - unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & - (sbi->blocks_per_seg - 1); + unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + struct f2fs_summary_block *sum_node; struct f2fs_summary sum; - nid_t ino; - void *kaddr; + struct page *sum_page, *node_page; + nid_t ino, nid; struct inode *inode; - struct page *node_page; + unsigned int offset; block_t bidx; int i; sentry = get_seg_entry(sbi, segno); if (!f2fs_test_bit(blkoff, sentry->cur_valid_map)) - return; + return 0; /* Get the previous summary */ for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; - break; + goto got_it; } } - if (i > CURSEG_COLD_DATA) { - struct page *sum_page = get_sum_page(sbi, segno); - struct f2fs_summary_block *sum_node; - kaddr = page_address(sum_page); - sum_node = (struct f2fs_summary_block *)kaddr; - sum = sum_node->entries[blkoff]; - f2fs_put_page(sum_page, 1); + + sum_page = get_sum_page(sbi, segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum = sum_node->entries[blkoff]; + f2fs_put_page(sum_page, 1); +got_it: + /* Use the locked dnode page and inode */ + nid = le32_to_cpu(sum.nid); + if (dn->inode->i_ino == nid) { + struct dnode_of_data tdn = *dn; + tdn.nid = nid; + tdn.node_page = dn->inode_page; + tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + truncate_data_blocks_range(&tdn, 1); + return 0; + } else if (dn->nid == nid) { + struct dnode_of_data tdn = *dn; + tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); + truncate_data_blocks_range(&tdn, 1); + return 0; } /* Get the node page */ - node_page = get_node_page(sbi, le32_to_cpu(sum.nid)); - bidx = start_bidx_of_node(ofs_of_node(node_page)) + - le16_to_cpu(sum.ofs_in_node); + node_page = get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return PTR_ERR(node_page); + + offset = ofs_of_node(node_page); ino = ino_of_node(node_page); f2fs_put_page(node_page, 1); /* Deallocate previous index in the node page */ inode = f2fs_iget(sbi->sb, ino); if (IS_ERR(inode)) - return; + return PTR_ERR(inode); + + bidx = start_bidx_of_node(offset, F2FS_I(inode)) + + le16_to_cpu(sum.ofs_in_node); truncate_hole(inode, bidx, bidx + 1); iput(inode); + return 0; } -static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, +static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { + struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int start, end; struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; + int err = 0, recovered = 0; - start = start_bidx_of_node(ofs_of_node(page)); - if (IS_INODE(page)) - end = start + ADDRS_PER_INODE; - else - end = start + ADDRS_PER_BLOCK; + if (recover_inline_data(inode, page)) + goto out; + + if (recover_xattr_data(inode, page, blkaddr)) + goto out; + + start = start_bidx_of_node(ofs_of_node(page), fi); + end = start + ADDRS_PER_PAGE(page, fi); + + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, 0)) - return; - wait_on_page_writeback(dn.node_page); + err = get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) { + f2fs_unlock_op(sbi); + goto out; + } + + f2fs_wait_on_page_writeback(dn.node_page, NODE); get_node_info(sbi, dn.nid, &ni); - BUG_ON(ni.ino != ino_of_node(page)); - BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page)); + f2fs_bug_on(ni.ino != ino_of_node(page)); + f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page)); for (; start < end; start++) { block_t src, dest; @@ -264,19 +333,22 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) { if (src == NULL_ADDR) { - int err = reserve_new_block(&dn); + err = reserve_new_block(&dn); /* We should not get -ENOSPC */ - BUG_ON(err); + f2fs_bug_on(err); } /* Check the previous node page having this index */ - check_index_in_prev_nodes(sbi, dest); + err = check_index_in_prev_nodes(sbi, dest, &dn); + if (err) + goto err; set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* write dummy data page */ recover_data_page(sbi, NULL, &sum, src, dest); update_extent_cache(dest, &dn); + recovered++; } dn.ofs_in_node++; } @@ -292,15 +364,23 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, set_page_dirty(dn.node_page); recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); +err: f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); +out: + f2fs_msg(sbi->sb, KERN_NOTICE, + "recover_data: ino = %lx, recovered = %d blocks, err = %d", + inode->i_ino, recovered, err); + return err; } -static void recover_data(struct f2fs_sb_info *sbi, +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head, int type) { - unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); + unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page; + int err = 0; block_t blkaddr; /* get node pages in the current segment */ @@ -308,25 +388,31 @@ static void recover_data(struct f2fs_sb_info *sbi, blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* read node page */ - page = alloc_page(GFP_NOFS | __GFP_ZERO); - if (IS_ERR(page)) - return; + page = alloc_page(GFP_F2FS_ZERO); + if (!page) + return -ENOMEM; + lock_page(page); while (1) { struct fsync_inode_entry *entry; - if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) - goto out; + err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); + if (err) + return err; + + lock_page(page); if (cp_ver != cpver_of_node(page)) - goto out; + break; entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) goto next; - do_recover_data(sbi, entry->inode, page, blkaddr); + err = do_recover_data(sbi, entry->inode, page, blkaddr); + if (err) + break; if (entry->blkaddr == blkaddr) { iput(entry->inode); @@ -336,40 +422,48 @@ static void recover_data(struct f2fs_sb_info *sbi, next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); - ClearPageUptodate(page); } -out: + unlock_page(page); __free_pages(page, 0); - allocate_new_segments(sbi); + if (!err) + allocate_new_segments(sbi); + return err; } -void recover_fsync_data(struct f2fs_sb_info *sbi) +int recover_fsync_data(struct f2fs_sb_info *sbi) { struct list_head inode_list; + int err; + bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry), NULL); - if (unlikely(!fsync_entry_slab)) - return; + sizeof(struct fsync_inode_entry)); + if (!fsync_entry_slab) + return -ENOMEM; INIT_LIST_HEAD(&inode_list); /* step #1: find fsynced inode numbers */ - if (find_fsync_dnodes(sbi, &inode_list)) + sbi->por_doing = true; + err = find_fsync_dnodes(sbi, &inode_list); + if (err) goto out; if (list_empty(&inode_list)) goto out; + need_writecp = true; + /* step #2: recover data */ - sbi->por_doing = 1; - recover_data(sbi, &inode_list, CURSEG_WARM_NODE); - sbi->por_doing = 0; - BUG_ON(!list_empty(&inode_list)); + err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + f2fs_bug_on(!list_empty(&inode_list)); out: - destroy_fsync_dnodes(sbi, &inode_list); + destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); - write_checkpoint(sbi, false); + sbi->por_doing = false; + if (!err && need_writecp) + write_checkpoint(sbi, false); + return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 777f17e496e..d04613df710 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -13,11 +13,164 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/prefetch.h> +#include <linux/kthread.h> #include <linux/vmalloc.h> +#include <linux/swap.h> #include "f2fs.h" #include "segment.h" #include "node.h" +#include <trace/events/f2fs.h> + +#define __reverse_ffz(x) __reverse_ffs(~(x)) + +static struct kmem_cache *discard_entry_slab; + +/* + * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since + * MSB and LSB are reversed in a byte by f2fs_set_bit. + */ +static inline unsigned long __reverse_ffs(unsigned long word) +{ + int num = 0; + +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } +#endif + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + if ((word & 0xf0) == 0) + num += 4; + else + word >>= 4; + if ((word & 0xc) == 0) + num += 2; + else + word >>= 2; + if ((word & 0x2) == 0) + num += 1; + return num; +} + +/* + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue + * f2fs_set_bit makes MSB and LSB reversed in a byte. + * Example: + * LSB <--> MSB + * f2fs_set_bit(0, bitmap) => 0000 0001 + * f2fs_set_bit(7, bitmap) => 1000 0000 + */ +static unsigned long __find_rev_next_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~0UL << quot; + submask = (unsigned char)(0xff << rest) >> rest; + submask <<= quot; + mask &= submask; + tmp &= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG-1)) { + tmp = *(p++); + if (tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffs(tmp); +} + +static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, + unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + unsigned long mask, submask; + unsigned long quot, rest; + + if (offset >= size) + return size; + + size -= result; + offset %= BITS_PER_LONG; + if (!offset) + goto aligned; + + tmp = *(p++); + quot = (offset >> 3) << 3; + rest = offset & 0x7; + mask = ~(~0UL << quot); + submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest); + submask <<= quot; + mask += submask; + tmp |= mask; + if (size < BITS_PER_LONG) + goto found_first; + if (~tmp) + goto found_middle; + + size -= BITS_PER_LONG; + result += BITS_PER_LONG; +aligned: + while (size & ~(BITS_PER_LONG - 1)) { + tmp = *(p++); + if (~tmp) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp |= ~0UL << size; + if (tmp == ~0UL) /* Are any bits zero? */ + return result + size; /* Nope. */ +found_middle: + return result + __reverse_ffz(tmp); +} /* * This function balances dirty node and dentry pages. @@ -35,6 +188,114 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) } } +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) +{ + /* check the # of cached NAT entries and prefree segments */ + if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || + excess_prefree_segs(sbi)) + f2fs_sync_fs(sbi->sb, true); +} + +static int issue_flush_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + wait_queue_head_t *q = &fcc->flush_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + spin_lock(&fcc->issue_lock); + if (fcc->issue_list) { + fcc->dispatch_list = fcc->issue_list; + fcc->issue_list = fcc->issue_tail = NULL; + } + spin_unlock(&fcc->issue_lock); + + if (fcc->dispatch_list) { + struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct flush_cmd *cmd, *next; + int ret; + + bio->bi_bdev = sbi->sb->s_bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + + for (cmd = fcc->dispatch_list; cmd; cmd = next) { + cmd->ret = ret; + next = cmd->next; + complete(&cmd->wait); + } + bio_put(bio); + fcc->dispatch_list = NULL; + } + + wait_event_interruptible(*q, + kthread_should_stop() || fcc->issue_list); + goto repeat; +} + +int f2fs_issue_flush(struct f2fs_sb_info *sbi) +{ + struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd cmd; + + if (!test_opt(sbi, FLUSH_MERGE)) + return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); + + init_completion(&cmd.wait); + cmd.next = NULL; + + spin_lock(&fcc->issue_lock); + if (fcc->issue_list) + fcc->issue_tail->next = &cmd; + else + fcc->issue_list = &cmd; + fcc->issue_tail = &cmd; + spin_unlock(&fcc->issue_lock); + + if (!fcc->dispatch_list) + wake_up(&fcc->flush_wait_queue); + + wait_for_completion(&cmd.wait); + + return cmd.ret; +} + +int create_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct flush_cmd_control *fcc; + int err = 0; + + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); + if (!fcc) + return -ENOMEM; + spin_lock_init(&fcc->issue_lock); + init_waitqueue_head(&fcc->flush_wait_queue); + sbi->sm_info->cmd_control_info = fcc; + fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, + "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(fcc->f2fs_issue_flush)) { + err = PTR_ERR(fcc->f2fs_issue_flush); + kfree(fcc); + sbi->sm_info->cmd_control_info = NULL; + return err; + } + + return err; +} + +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +{ + struct flush_cmd_control *fcc = + sbi->sm_info->cmd_control_info; + + if (fcc && fcc->f2fs_issue_flush) + kthread_stop(fcc->f2fs_issue_flush); + kfree(fcc); + sbi->sm_info->cmd_control_info = NULL; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -49,9 +310,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); - dirty_type = sentry->type; - if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) - dirty_i->nr_dirty[dirty_type]++; + enum dirty_type t = sentry->type; + + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]++; } } @@ -65,12 +327,14 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); - dirty_type = sentry->type; - if (test_and_clear_bit(segno, - dirty_i->dirty_segmap[dirty_type])) - dirty_i->nr_dirty[dirty_type]--; - clear_bit(segno, dirty_i->victim_segmap[FG_GC]); - clear_bit(segno, dirty_i->victim_segmap[BG_GC]); + enum dirty_type t = sentry->type; + + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; + + if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + clear_bit(GET_SECNO(sbi, segno), + dirty_i->victim_secmap); } } @@ -79,7 +343,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, * Adding dirty entry into seglist is not critical operation. * If a given segment is one of current working segments, it won't be added. */ -void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) +static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks; @@ -102,7 +366,69 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } mutex_unlock(&dirty_i->seglist_lock); - return; +} + +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); + sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); +} + +void discard_next_dnode(struct f2fs_sb_info *sbi) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); + block_t blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + + if (f2fs_issue_discard(sbi, blkaddr, 1)) { + struct page *page = grab_meta_page(sbi, blkaddr); + /* zero-filled page */ + set_page_dirty(page); + f2fs_put_page(page, 1); + } +} + +static void add_discard_addrs(struct f2fs_sb_info *sbi, + unsigned int segno, struct seg_entry *se) +{ + struct list_head *head = &SM_I(sbi)->discard_list; + struct discard_entry *new; + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + int max_blocks = sbi->blocks_per_seg; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long dmap[entries]; + unsigned int start = 0, end = -1; + int i; + + if (!test_opt(sbi, DISCARD)) + return; + + /* zero block will be discarded through the prefree list */ + if (!se->valid_blocks || se->valid_blocks == max_blocks) + return; + + /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ + for (i = 0; i < entries; i++) + dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; + + while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + start = __find_rev_next_bit(dmap, max_blocks, end + 1); + if (start >= max_blocks) + break; + + end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); + INIT_LIST_HEAD(&new->list); + new->blkaddr = START_BLOCK(sbi, segno) + start; + new->len = end - start; + + list_add_tail(&new->list, head); + SM_I(sbi)->nr_discards += end - start; + } } /* @@ -111,48 +437,58 @@ void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno, offset = 0; + unsigned int segno = -1; unsigned int total_segs = TOTAL_SEGS(sbi); mutex_lock(&dirty_i->seglist_lock); while (1) { segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - offset); + segno + 1); if (segno >= total_segs) break; __set_test_and_free(sbi, segno); - offset = segno + 1; } mutex_unlock(&dirty_i->seglist_lock); } void clear_prefree_segments(struct f2fs_sb_info *sbi) { + struct list_head *head = &(SM_I(sbi)->discard_list); + struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno, offset = 0; + unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int total_segs = TOTAL_SEGS(sbi); + unsigned int start = 0, end = -1; mutex_lock(&dirty_i->seglist_lock); + while (1) { - segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs, - offset); - if (segno >= total_segs) + int i; + start = find_next_bit(prefree_map, total_segs, end + 1); + if (start >= total_segs) break; + end = find_next_zero_bit(prefree_map, total_segs, start + 1); - offset = segno + 1; - if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE])) - dirty_i->nr_dirty[PRE]--; - - /* Let's use trim */ - if (test_opt(sbi, DISCARD)) - blkdev_issue_discard(sbi->sb->s_bdev, - START_BLOCK(sbi, segno) << - sbi->log_sectors_per_block, - 1 << (sbi->log_sectors_per_block + - sbi->log_blocks_per_seg), - GFP_NOFS, 0); + for (i = start; i < end; i++) + clear_bit(i, prefree_map); + + dirty_i->nr_dirty[PRE] -= end - start; + + if (!test_opt(sbi, DISCARD)) + continue; + + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + (end - start) << sbi->log_blocks_per_seg); } mutex_unlock(&dirty_i->seglist_lock); + + /* send small discards */ + list_for_each_entry_safe(entry, this, head, list) { + f2fs_issue_discard(sbi, entry->blkaddr, entry->len); + list_del(&entry->list); + SM_I(sbi)->nr_discards -= entry->len; + kmem_cache_free(discard_entry_slab, entry); + } } static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -181,9 +517,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; - offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); + offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) || + f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || (new_vblocks > sbi->blocks_per_seg))); se->valid_blocks = new_vblocks; @@ -210,12 +546,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -static void refresh_sit_entry(struct f2fs_sb_info *sbi, - block_t old_blkaddr, block_t new_blkaddr) +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) { - update_sit_entry(sbi, new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) - update_sit_entry(sbi, old_blkaddr, -1); + update_sit_entry(sbi, new, 1); + if (GET_SEGNO(sbi, old) != NULL_SEGNO) + update_sit_entry(sbi, old, -1); + + locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); } void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) @@ -223,7 +561,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); - BUG_ON(addr == NULL_ADDR); + f2fs_bug_on(addr == NULL_ADDR); if (addr == NEW_ADDR) return; @@ -242,13 +580,12 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) * This function should be resided under the curseg_mutex lock */ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, - struct f2fs_summary *sum, unsigned short offset) + struct f2fs_summary *sum) { struct curseg_info *curseg = CURSEG_I(sbi, type); void *addr = curseg->sum_blk; - addr += offset * sizeof(struct f2fs_summary); + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); memcpy(addr, sum, sizeof(struct f2fs_summary)); - return; } /* @@ -256,9 +593,8 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, */ int npages_for_summary_flush(struct f2fs_sb_info *sbi) { - int total_size_bytes = 0; int valid_sum_count = 0; - int i, sum_space; + int i, sum_in_page; for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { if (sbi->ckpt->alloc_type[i] == SSR) @@ -267,13 +603,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi) valid_sum_count += curseg_blkoff(sbi, i); } - total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1) - + sizeof(struct nat_journal) + 2 - + sizeof(struct sit_journal) + 2; - sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE; - if (total_size_bytes < sum_space) + sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - + SUM_FOOTER_SIZE) / SUMMARY_SIZE; + if (valid_sum_count <= sum_in_page) return 1; - else if (total_size_bytes < 2 * sum_space) + else if ((valid_sum_count - sum_in_page) <= + (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -296,48 +631,15 @@ static void write_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, - int ofs_unit, int type) +static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) { - struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE]; - unsigned int segno, next_segno, i; - int ofs = 0; - - /* - * If there is not enough reserved sections, - * we should not reuse prefree segments. - */ - if (has_not_enough_free_secs(sbi, 0)) - return NULL_SEGNO; + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno + 1; + struct free_segmap_info *free_i = FREE_I(sbi); - /* - * NODE page should not reuse prefree segment, - * since those information is used for SPOR. - */ - if (IS_NODESEG(type)) - return NULL_SEGNO; -next: - segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++); - ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit; - if (segno < TOTAL_SEGS(sbi)) { - /* skip intermediate segments in a section */ - if (segno % ofs_unit) - goto next; - - /* skip if whole section is not prefree */ - next_segno = find_next_zero_bit(prefree_segmap, - TOTAL_SEGS(sbi), segno + 1); - if (next_segno - segno < ofs_unit) - goto next; - - /* skip if whole section was not free at the last checkpoint */ - for (i = 0; i < ofs_unit; i++) - if (get_seg_entry(sbi, segno)->ckpt_valid_blocks) - goto next; - return segno; - } - return NULL_SEGNO; + if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, free_i->free_segmap); + return 0; } /* @@ -348,9 +650,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int total_secs = sbi->total_sections; unsigned int segno, secno, zoneno; - unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone; + unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = *newseg / sbi->segs_per_sec; unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); unsigned int left_start = hint; @@ -363,16 +664,17 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, TOTAL_SEGS(sbi), *newseg + 1); - if (segno < TOTAL_SEGS(sbi)) + if (segno - *newseg < sbi->segs_per_sec - + (*newseg % sbi->segs_per_sec)) goto got_it; } find_other_zone: - secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint); - if (secno >= total_secs) { + secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); + if (secno >= TOTAL_SECS(sbi)) { if (dir == ALLOC_RIGHT) { secno = find_next_zero_bit(free_i->free_secmap, - total_secs, 0); - BUG_ON(secno >= total_secs); + TOTAL_SECS(sbi), 0); + f2fs_bug_on(secno >= TOTAL_SECS(sbi)); } else { go_left = 1; left_start = hint - 1; @@ -387,8 +689,8 @@ find_other_zone: continue; } left_start = find_next_zero_bit(free_i->free_secmap, - total_secs, 0); - BUG_ON(left_start >= total_secs); + TOTAL_SECS(sbi), 0); + f2fs_bug_on(left_start >= TOTAL_SECS(sbi)); break; } secno = left_start; @@ -427,7 +729,7 @@ skip_left: } got_it: /* set it as dirty segment in free segmap */ - BUG_ON(test_bit(segno, free_i->free_segmap)); + f2fs_bug_on(test_bit(segno, free_i->free_segmap)); __set_inuse(sbi, segno); *newseg = segno; write_unlock(&free_i->segmap_lock); @@ -463,7 +765,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) int dir = ALLOC_LEFT; write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + GET_SUM_BLOCK(sbi, segno)); if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA) dir = ALLOC_RIGHT; @@ -480,13 +782,18 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg, block_t start) { struct seg_entry *se = get_seg_entry(sbi, seg->segno); - block_t ofs; - for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) { - if (!f2fs_test_bit(ofs, se->ckpt_valid_map) - && !f2fs_test_bit(ofs, se->cur_valid_map)) - break; - } - seg->next_blkoff = ofs; + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long target_map[entries]; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i, pos; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); + + seg->next_blkoff = pos; } /* @@ -561,26 +868,19 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int ofs_unit; - if (force) { + if (force) new_curseg(sbi, type, true); - goto out; - } - - ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec; - curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type); - - if (curseg->next_segno != NULL_SEGNO) - change_curseg(sbi, type, false); else if (type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else new_curseg(sbi, type, false); -out: - sbi->segment_count[curseg->alloc_type]++; + + stat_inc_seg_type(sbi, curseg); } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -601,121 +901,6 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; -static void f2fs_end_io_write(struct bio *bio, int err) -{ - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct bio_private *p = bio->bi_private; - - do { - struct page *page = bvec->bv_page; - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - if (!uptodate) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); - p->sbi->sb->s_flags |= MS_RDONLY; - } - end_page_writeback(page); - dec_page_count(p->sbi, F2FS_WRITEBACK); - } while (bvec >= bio->bi_io_vec); - - if (p->is_sync) - complete(p->wait); - kfree(p); - bio_put(bio); -} - -struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages) -{ - struct bio *bio; - struct bio_private *priv; -retry: - priv = kmalloc(sizeof(struct bio_private), GFP_NOFS); - if (!priv) { - cond_resched(); - goto retry; - } - - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - bio->bi_bdev = bdev; - bio->bi_private = priv; - return bio; -} - -static void do_submit_bio(struct f2fs_sb_info *sbi, - enum page_type type, bool sync) -{ - int rw = sync ? WRITE_SYNC : WRITE; - enum page_type btype = type > META ? META : type; - - if (type >= META_FLUSH) - rw = WRITE_FLUSH_FUA; - - if (sbi->bio[btype]) { - struct bio_private *p = sbi->bio[btype]->bi_private; - p->sbi = sbi; - sbi->bio[btype]->bi_end_io = f2fs_end_io_write; - if (type == META_FLUSH) { - DECLARE_COMPLETION_ONSTACK(wait); - p->is_sync = true; - p->wait = &wait; - submit_bio(rw, sbi->bio[btype]); - wait_for_completion(&wait); - } else { - p->is_sync = false; - submit_bio(rw, sbi->bio[btype]); - } - sbi->bio[btype] = NULL; - } -} - -void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync) -{ - down_write(&sbi->bio_sem); - do_submit_bio(sbi, type, sync); - up_write(&sbi->bio_sem); -} - -static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, enum page_type type) -{ - struct block_device *bdev = sbi->sb->s_bdev; - - verify_block_addr(sbi, blk_addr); - - down_write(&sbi->bio_sem); - - inc_page_count(sbi, F2FS_WRITEBACK); - - if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1) - do_submit_bio(sbi, type, false); -alloc_new: - if (sbi->bio[type] == NULL) { - sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev)); - sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); - /* - * The end_io will be assigned at the sumbission phase. - * Until then, let bio_add_page() merge consecutive IOs as much - * as possible. - */ - } - - if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { - do_submit_bio(sbi, type, false); - goto alloc_new; - } - - sbi->last_block_in_bio[type] = blk_addr; - - up_write(&sbi->bio_sem); -} - static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -756,7 +941,7 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; - else if (is_cold_data(page) || is_cold_file(inode)) + else if (is_cold_data(page) || file_is_cold(inode)) return CURSEG_COLD_DATA; else return CURSEG_WARM_DATA; @@ -779,20 +964,18 @@ static int __get_segment_type(struct page *page, enum page_type p_type) return __get_segment_type_4(page, p_type); } /* NR_CURSEG_TYPE(6) logs by default */ - BUG_ON(sbi->active_logs != NR_CURSEG_TYPE); + f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE); return __get_segment_type_6(page, p_type); } -static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, enum page_type p_type) +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int old_cursegno; - int type; - type = __get_segment_type(page, p_type); curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); @@ -805,68 +988,82 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, * because, this function updates a summary entry in the * current summary block. */ - __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + __add_sum_entry(sbi, type, sum); mutex_lock(&sit_i->sentry_lock); __refresh_next_blkoff(sbi, curseg); - sbi->block_count[curseg->alloc_type]++; + stat_inc_block_count(sbi, curseg); + + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + mutex_unlock(&sit_i->sentry_lock); - if (p_type == NODE) + if (page && IS_NODESEG(type)) fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); - /* writeout dirty page into bdev */ - submit_write_page(sbi, page, *new_blkaddr, p_type); - mutex_unlock(&curseg->curseg_mutex); } +static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, struct f2fs_io_info *fio) +{ + int type = __get_segment_type(page, fio->type); + + allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type); + + /* writeout dirty page into bdev */ + f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio); +} + void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { + struct f2fs_io_info fio = { + .type = META, + .rw = WRITE_SYNC | REQ_META | REQ_PRIO + }; + set_page_writeback(page); - submit_write_page(sbi, page, page->index, META); + f2fs_submit_page_mbio(sbi, page, page->index, &fio); } void write_node_page(struct f2fs_sb_info *sbi, struct page *page, + struct f2fs_io_info *fio, unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) { struct f2fs_summary sum; set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE); + do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio); } -void write_data_page(struct inode *inode, struct page *page, - struct dnode_of_data *dn, block_t old_blkaddr, - block_t *new_blkaddr) +void write_data_page(struct page *page, struct dnode_of_data *dn, + block_t *new_blkaddr, struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct f2fs_summary sum; struct node_info ni; - BUG_ON(old_blkaddr == NULL_ADDR); + f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - do_write_page(sbi, page, old_blkaddr, - new_blkaddr, &sum, DATA); + do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio); } -void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blk_addr) +void rewrite_data_page(struct page *page, block_t old_blkaddr, + struct f2fs_io_info *fio) { - submit_write_page(sbi, page, old_blk_addr, DATA); + struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio); } void recover_data_page(struct f2fs_sb_info *sbi, @@ -902,14 +1099,11 @@ void recover_data_page(struct f2fs_sb_info *sbi, change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); - __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); + __add_sum_entry(sbi, type, sum); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); @@ -925,6 +1119,10 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, unsigned int segno, old_cursegno; block_t next_blkaddr = next_blkaddr_of_node(page); unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); + struct f2fs_io_info fio = { + .type = NODE, + .rw = WRITE_SYNC, + }; curseg = CURSEG_I(sbi, type); @@ -939,31 +1137,62 @@ void rewrite_node_page(struct f2fs_sb_info *sbi, curseg->next_segno = segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & - (sbi->blocks_per_seg - 1); - __add_sum_entry(sbi, type, sum, curseg->next_blkoff); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); + __add_sum_entry(sbi, type, sum); /* change the current log to the next block addr in advance */ if (next_segno != segno) { curseg->next_segno = next_segno; change_curseg(sbi, type, true); } - curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & - (sbi->blocks_per_seg - 1); + curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr); /* rewrite node page */ set_page_writeback(page); - submit_write_page(sbi, page, new_blkaddr, NODE); - f2fs_submit_bio(sbi, NODE, true); + f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); + f2fs_submit_merged_bio(sbi, NODE, WRITE); refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); } +static inline bool is_merged_page(struct f2fs_sb_info *sbi, + struct page *page, enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + struct bio_vec *bvec; + int i; + + down_read(&io->io_rwsem); + if (!io->bio) + goto out; + + bio_for_each_segment_all(bvec, io->bio, i) { + if (page == bvec->bv_page) { + up_read(&io->io_rwsem); + return true; + } + } + +out: + up_read(&io->io_rwsem); + return false; +} + +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type) +{ + struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + if (PageWriteback(page)) { + if (is_merged_page(sbi, page, type)) + f2fs_submit_merged_bio(sbi, type, WRITE); + wait_on_page_writeback(page); + } +} + static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1068,9 +1297,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - if (restore_node_summary(sbi, segno, sum)) { + int err; + + err = restore_node_summary(sbi, segno, sum); + if (err) { f2fs_put_page(new, 1); - return -EINVAL; + return err; } } } @@ -1091,6 +1323,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { int type = CURSEG_HOT_DATA; + int err; if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { /* restore for compacted data summary */ @@ -1099,9 +1332,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) type = CURSEG_HOT_NODE; } - for (; type <= CURSEG_COLD_NODE; type++) - if (read_normal_summaries(sbi, type)) - return -EINVAL; + for (; type <= CURSEG_COLD_NODE; type++) { + err = read_normal_summaries(sbi, type); + if (err) + return err; + } + return 0; } @@ -1128,8 +1364,6 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; - set_page_dirty(page); - /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blkoff; @@ -1148,18 +1382,20 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) summary = (struct f2fs_summary *)(kaddr + written_size); *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; - set_page_dirty(page); if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) continue; + set_page_dirty(page); f2fs_put_page(page, 1); page = NULL; } } - if (page) + if (page) { + set_page_dirty(page); f2fs_put_page(page, 1); + } } static void write_normal_summaries(struct f2fs_sb_info *sbi, @@ -1191,7 +1427,6 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); - return; } int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, @@ -1246,7 +1481,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, /* get current sit block page without lock */ src_page = get_meta_page(sbi, src_off); dst_page = grab_meta_page(sbi, dst_off); - BUG_ON(PageDirty(src_page)); + f2fs_bug_on(PageDirty(src_page)); src_addr = page_address(src_page); dst_addr = page_address(dst_page); @@ -1278,9 +1513,9 @@ static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) __mark_sit_entry_dirty(sbi, segno); } update_sits_in_cursum(sum, -sits_in_cursum(sum)); - return 1; + return true; } - return 0; + return false; } /* @@ -1315,6 +1550,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi) sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + /* add discard candidates */ + if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) + add_discard_addrs(sbi, segno, se); + if (flushed) goto to_sit_page; @@ -1390,7 +1629,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) } if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(sbi->total_sections * + sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * sizeof(struct sec_entry)); if (!sit_i->sec_entries) return -ENOMEM; @@ -1403,10 +1642,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); if (!dst_bitmap) return -ENOMEM; - memcpy(dst_bitmap, src_bitmap, bitmap_size); /* init SIT information */ sit_i->s_ops = &default_salloc_ops; @@ -1442,7 +1680,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) if (!free_i->free_segmap) return -ENOMEM; - sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections); + sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -1487,36 +1725,48 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - unsigned int start; - - for (start = 0; start < TOTAL_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; - struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; - struct page *page; - int i; + int sit_blk_cnt = SIT_BLK_CNT(sbi); + unsigned int i, start, end; + unsigned int readed, start_blk = 0; + int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) == start) { - sit = sit_in_journal(sum, i); - mutex_unlock(&curseg->curseg_mutex); - goto got_it; + do { + readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); + + start = start_blk * sit_i->sents_per_block; + end = (start_blk + readed) * sit_i->sents_per_block; + + for (; start < end && start < TOTAL_SEGS(sbi); start++) { + struct seg_entry *se = &sit_i->sentries[start]; + struct f2fs_sit_block *sit_blk; + struct f2fs_sit_entry sit; + struct page *page; + + mutex_lock(&curseg->curseg_mutex); + for (i = 0; i < sits_in_cursum(sum); i++) { + if (le32_to_cpu(segno_in_journal(sum, i)) + == start) { + sit = sit_in_journal(sum, i); + mutex_unlock(&curseg->curseg_mutex); + goto got_it; + } } - } - mutex_unlock(&curseg->curseg_mutex); - page = get_current_sit_page(sbi, start); - sit_blk = (struct f2fs_sit_block *)page_address(page); - sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; - f2fs_put_page(page, 1); + mutex_unlock(&curseg->curseg_mutex); + + page = get_current_sit_page(sbi, start); + sit_blk = (struct f2fs_sit_block *)page_address(page); + sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; + f2fs_put_page(page, 1); got_it: - check_block_count(sbi, start, &sit); - seg_info_from_raw_sit(se, &sit); - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + if (sbi->segs_per_sec > 1) { + struct sec_entry *e = get_sec_entry(sbi, start); + e->valid_blocks += se->valid_blocks; + } } - } + start_blk += readed; + } while (start_blk < sit_blk_cnt); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -1541,13 +1791,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int segno = 0, offset = 0; + unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); unsigned short valid_blocks; - while (segno < TOTAL_SEGS(sbi)) { + while (1) { /* find dirty segment based on free segmap */ - segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset); - if (segno >= TOTAL_SEGS(sbi)) + segno = find_next_inuse(free_i, total_segs, offset); + if (segno >= total_segs) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, 0); @@ -1559,14 +1809,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) } } -static int init_victim_segmap(struct f2fs_sb_info *sbi) +static int init_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); - dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL); - dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL); - if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC]) + dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dirty_i->victim_secmap) return -ENOMEM; return 0; } @@ -1593,7 +1842,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) } init_dirty_segmap(sbi); - return init_victim_segmap(sbi); + return init_victim_secmap(sbi); } /* @@ -1637,8 +1886,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi) /* init sm info */ sbi->sm_info = sm_info; - INIT_LIST_HEAD(&sm_info->wblist_head); - spin_lock_init(&sm_info->wblist_lock); sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); @@ -1646,6 +1893,20 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + sm_info->rec_prefree_segments = sm_info->main_segments * + DEF_RECLAIM_PREFREE_SEGMENTS / 100; + sm_info->ipu_policy = F2FS_IPU_DISABLE; + sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; + + INIT_LIST_HEAD(&sm_info->discard_list); + sm_info->nr_discards = 0; + sm_info->max_discards = 0; + + if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + err = create_flush_cmd_control(sbi); + if (err) + return err; + } err = build_sit_info(sbi); if (err) @@ -1680,18 +1941,10 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, mutex_unlock(&dirty_i->seglist_lock); } -void reset_victim_segmap(struct f2fs_sb_info *sbi) -{ - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); - memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size); -} - -static void destroy_victim_segmap(struct f2fs_sb_info *sbi) +static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - - kfree(dirty_i->victim_segmap[FG_GC]); - kfree(dirty_i->victim_segmap[BG_GC]); + kfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) @@ -1706,7 +1959,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) for (i = 0; i < NR_DIRTY_TYPE; i++) discard_dirty_segmap(sbi, i); - destroy_victim_segmap(sbi); + destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; kfree(dirty_i); } @@ -1761,6 +2014,10 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) void destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); + + if (!sm_info) + return; + destroy_flush_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); @@ -1768,3 +2025,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) sbi->sm_info = NULL; kfree(sm_info); } + +int __init create_segment_manager_caches(void) +{ + discard_entry_slab = f2fs_kmem_cache_create("discard_entry", + sizeof(struct discard_entry)); + if (!discard_entry_slab) + return -ENOMEM; + return 0; +} + +void destroy_segment_manager_caches(void) +{ + kmem_cache_destroy(discard_entry_slab); +} diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 552dadbb232..7091204680f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -8,28 +8,28 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#include <linux/blkdev.h> + /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) +#define NULL_SECNO ((unsigned int)(~0)) + +#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ -/* V: Logical segment # in volume, R: Relative segment # in main area */ +/* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) -#define IS_DATASEG(t) \ - ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \ - (t == CURSEG_WARM_DATA)) +#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) -#define IS_NODESEG(t) \ - ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \ - (t == CURSEG_WARM_NODE)) - -#define IS_CURSEG(sbi, segno) \ - ((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) +#define IS_CURSEG(sbi, seg) \ + ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ @@ -57,6 +57,9 @@ ((blk_addr) - SM_I(sbi)->seg0_blkaddr) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) +#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + #define GET_SEGNO(sbi, blk_addr) \ (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ @@ -78,19 +81,19 @@ (segno / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(sit_i, segno) \ (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) +#define SIT_BLK_CNT(sbi) \ + ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) +#define TOTAL_SECS(sbi) (sbi->total_sections) #define SECTOR_FROM_BLOCK(sbi, blk_addr) \ - (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) - -/* during checkpoint, bio_private is used to synchronize the last bio */ -struct bio_private { - struct f2fs_sb_info *sbi; - bool is_sync; - void *wait; -}; + (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) +#define SECTOR_TO_BLOCK(sbi, sectors) \ + (sectors >> (sbi)->log_sectors_per_block) +#define MAX_BIO_BLOCKS(max_hw_blocks) \ + (min((int)max_hw_blocks, BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -136,6 +139,7 @@ struct victim_sel_policy { int alloc_mode; /* LFS or SSR */ int gc_mode; /* GC_CB or GC_GREEDY */ unsigned long *dirty_segmap; /* dirty segment bitmap */ + unsigned int max_search; /* maximum # of segments to search */ unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ @@ -213,7 +217,7 @@ struct dirty_seglist_info { unsigned long *dirty_segmap[NR_DIRTY_TYPE]; struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ - unsigned long *victim_segmap[2]; /* BG_GC, FG_GC */ + unsigned long *victim_secmap; /* background GC victims */ }; /* victim selection function for cleaning and SSR */ @@ -376,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, static inline block_t written_block_count(struct f2fs_sb_info *sbi) { - struct sit_info *sit_i = SIT_I(sbi); - block_t vblocks; - - mutex_lock(&sit_i->sentry_lock); - vblocks = sit_i->written_valid_blocks; - mutex_unlock(&sit_i->sentry_lock); - - return vblocks; + return SIT_I(sbi)->written_valid_blocks; } static inline unsigned int free_segments(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_segs; - - read_lock(&free_i->segmap_lock); - free_segs = free_i->free_segments; - read_unlock(&free_i->segmap_lock); - - return free_segs; + return FREE_I(sbi)->free_segments; } static inline int reserved_segments(struct f2fs_sb_info *sbi) @@ -405,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi) static inline unsigned int free_sections(struct f2fs_sb_info *sbi) { - struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int free_secs; - - read_lock(&free_i->segmap_lock); - free_secs = free_i->free_sections; - read_unlock(&free_i->segmap_lock); - - return free_secs; + return FREE_I(sbi)->free_sections; } static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) @@ -447,7 +430,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) static inline bool need_SSR(struct f2fs_sb_info *sbi) { - return (free_sections(sbi) < overprovision_sections(sbi)); + return (prefree_segments(sbi) / sbi->segs_per_sec) + + free_sections(sbi) < overprovision_sections(sbi); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) @@ -455,34 +439,71 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - if (sbi->por_doing) + if (unlikely(sbi->por_doing)) return false; - return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi))); + return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi)); +} + +static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) +{ + return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; } static inline int utilization(struct f2fs_sb_info *sbi) { - return (long int)valid_user_blocks(sbi) * 100 / - (long int)sbi->user_block_count; + return div_u64((u64)valid_user_blocks(sbi) * 100, + sbi->user_block_count); } /* * Sometimes f2fs may be better to drop out-of-place update policy. - * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write - * data in the original place likewise other traditional file systems. - * But, currently set 100 in percentage, which means it is disabled. - * See below need_inplace_update(). + * And, users can control the policy through sysfs entries. + * There are five policies with triggering conditions as follows. + * F2FS_IPU_FORCE - all the time, + * F2FS_IPU_SSR - if SSR mode is activated, + * F2FS_IPU_UTIL - if FS utilization is over threashold, + * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over + * threashold, + * F2FS_IPUT_DISABLE - disable IPU. (=default option) */ -#define MIN_IPU_UTIL 100 +#define DEF_MIN_IPU_UTIL 70 + +enum { + F2FS_IPU_FORCE, + F2FS_IPU_SSR, + F2FS_IPU_UTIL, + F2FS_IPU_SSR_UTIL, + F2FS_IPU_DISABLE, +}; + static inline bool need_inplace_update(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + + /* IPU can be done only for the user data */ if (S_ISDIR(inode->i_mode)) return false; - if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL) + + switch (SM_I(sbi)->ipu_policy) { + case F2FS_IPU_FORCE: return true; + case F2FS_IPU_SSR: + if (need_SSR(sbi)) + return true; + break; + case F2FS_IPU_UTIL: + if (utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + break; + case F2FS_IPU_SSR_UTIL: + if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + break; + case F2FS_IPU_DISABLE: + break; + } return false; } @@ -506,16 +527,13 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) return curseg->next_blkoff; } +#ifdef CONFIG_F2FS_CHECK_FS static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int end_segno = SM_I(sbi)->segment_count - 1; BUG_ON(segno > end_segno); } -/* - * This function is used for only debugging. - * NOTE: In future, we have to remove this function. - */ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { struct f2fs_sm_info *sm_info = SM_I(sbi); @@ -534,8 +552,9 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, { struct f2fs_sm_info *sm_info = SM_I(sbi); unsigned int end_segno = sm_info->segment_count - 1; + bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; - int i; + int cur_pos = 0, next_pos; /* check segment usage */ BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); @@ -544,11 +563,26 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, BUG_ON(segno > end_segno); /* check bitmap with valid block count */ - for (i = 0; i < sbi->blocks_per_seg; i++) - if (f2fs_test_bit(i, raw_sit->valid_map)) - valid_blocks++; + do { + if (is_valid) { + next_pos = find_next_zero_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + cur_pos); + valid_blocks += next_pos - cur_pos; + } else + next_pos = find_next_bit_le(&raw_sit->valid_map, + sbi->blocks_per_seg, + cur_pos); + cur_pos = next_pos; + is_valid = !is_valid; + } while (cur_pos < sbi->blocks_per_seg); BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); } +#else +#define check_seg_range(sbi, segno) +#define verify_block_addr(sbi, blk_addr) +#define check_block_count(sbi, segno, raw_sit) +#endif static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) @@ -616,3 +650,60 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) - (base + 1) + type; } + +static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) +{ + if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + return true; + return false; +} + +static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); +} + +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * By default, 512 pages for directory data, + * 512 pages (2MB) * 3 for three types of nodes, and + * max_bio_blocks for meta are set. + */ +static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) +{ + if (type == DATA) + return sbi->blocks_per_seg; + else if (type == NODE) + return 3 * sbi->blocks_per_seg; + else if (type == META) + return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + else + return 0; +} + +/* + * When writing pages, it'd better align nr_to_write for segment size. + */ +static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, + struct writeback_control *wbc) +{ + long nr_to_write, desired; + + if (wbc->sync_mode != WB_SYNC_NONE) + return 0; + + nr_to_write = wbc->nr_to_write; + + if (type == DATA) + desired = 4096; + else if (type == NODE) + desired = 3 * max_hw_blocks(sbi); + else + desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + + wbc->nr_to_write = desired; + return desired - nr_to_write; +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fea6e582a2e..8f96d9372ad 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -12,47 +12,213 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/statfs.h> -#include <linux/proc_fs.h> #include <linux/buffer_head.h> #include <linux/backing-dev.h> #include <linux/kthread.h> #include <linux/parser.h> #include <linux/mount.h> #include <linux/seq_file.h> +#include <linux/proc_fs.h> #include <linux/random.h> #include <linux/exportfs.h> +#include <linux/blkdev.h> #include <linux/f2fs_fs.h> +#include <linux/sysfs.h> #include "f2fs.h" #include "node.h" +#include "segment.h" #include "xattr.h" +#include "gc.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/f2fs.h> +static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; +static struct kset *f2fs_kset; enum { - Opt_gc_background_off, + Opt_gc_background, Opt_disable_roll_forward, Opt_discard, Opt_noheap, + Opt_user_xattr, Opt_nouser_xattr, + Opt_acl, Opt_noacl, Opt_active_logs, Opt_disable_ext_identify, + Opt_inline_xattr, + Opt_inline_data, + Opt_flush_merge, Opt_err, }; static match_table_t f2fs_tokens = { - {Opt_gc_background_off, "background_gc_off"}, + {Opt_gc_background, "background_gc=%s"}, {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_discard, "discard"}, {Opt_noheap, "no_heap"}, + {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, + {Opt_inline_xattr, "inline_xattr"}, + {Opt_inline_data, "inline_data"}, + {Opt_flush_merge, "flush_merge"}, {Opt_err, NULL}, }; +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI) + return (unsigned char *)sbi; + return NULL; +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; + *ui = t; + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -72,28 +238,170 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +static int parse_options(struct super_block *sb, char *options) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + int arg = 0; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = NULL; + token = match_token(p, f2fs_tokens, args); + + switch (token) { + case Opt_gc_background: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 2 && !strncmp(name, "on", 2)) + set_opt(sbi, BG_GC); + else if (strlen(name) == 3 && !strncmp(name, "off", 3)) + clear_opt(sbi, BG_GC); + else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_disable_roll_forward: + set_opt(sbi, DISABLE_ROLL_FORWARD); + break; + case Opt_discard: + set_opt(sbi, DISCARD); + break; + case Opt_noheap: + set_opt(sbi, NOHEAP); + break; +#ifdef CONFIG_F2FS_FS_XATTR + case Opt_user_xattr: + set_opt(sbi, XATTR_USER); + break; + case Opt_nouser_xattr: + clear_opt(sbi, XATTR_USER); + break; + case Opt_inline_xattr: + set_opt(sbi, INLINE_XATTR); + break; +#else + case Opt_user_xattr: + f2fs_msg(sb, KERN_INFO, + "user_xattr options not supported"); + break; + case Opt_nouser_xattr: + f2fs_msg(sb, KERN_INFO, + "nouser_xattr options not supported"); + break; + case Opt_inline_xattr: + f2fs_msg(sb, KERN_INFO, + "inline_xattr options not supported"); + break; +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + case Opt_acl: + set_opt(sbi, POSIX_ACL); + break; + case Opt_noacl: + clear_opt(sbi, POSIX_ACL); + break; +#else + case Opt_acl: + f2fs_msg(sb, KERN_INFO, "acl options not supported"); + break; + case Opt_noacl: + f2fs_msg(sb, KERN_INFO, "noacl options not supported"); + break; +#endif + case Opt_active_logs: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) + return -EINVAL; + sbi->active_logs = arg; + break; + case Opt_disable_ext_identify: + set_opt(sbi, DISABLE_EXT_IDENTIFY); + break; + case Opt_inline_data: + set_opt(sbi, INLINE_DATA); + break; + case Opt_flush_merge: + set_opt(sbi, FLUSH_MERGE); + break; + default: + f2fs_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" or missing value", + p); + return -EINVAL; + } + } + return 0; +} + static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO); + fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); if (!fi) return NULL; init_once((void *) fi); - /* Initilize f2fs-specific inode info */ + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; atomic_set(&fi->dirty_dents, 0); fi->i_current_depth = 1; fi->i_advise = 0; rwlock_init(&fi->ext.ext_lock); + init_rwsem(&fi->i_sem); set_inode_flag(fi, FI_NEW_INODE); + if (test_opt(F2FS_SB(sb), INLINE_XATTR)) + set_inode_flag(fi, FI_INLINE_XATTR); + + /* Will be used by directory only */ + fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } +static int f2fs_drop_inode(struct inode *inode) +{ + /* + * This is to avoid a deadlock condition like below. + * writeback_single_inode(inode) + * - f2fs_write_data_page + * - f2fs_gc -> iput -> evict + * - inode_wait_for_writeback(inode) + */ + if (!inode_unhashed(inode) && inode->i_state & I_SYNC) + return 0; + return generic_drop_inode(inode); +} + +/* + * f2fs_dirty_inode() is called from __mark_inode_dirty() + * + * We should call set_dirty_inode to write the dirty inode through write_inode. + */ +static void f2fs_dirty_inode(struct inode *inode, int flags) +{ + set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); +} + static void f2fs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -109,10 +417,18 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + kobject_del(&sbi->s_kobj); + f2fs_destroy_stats(sbi); stop_gc_thread(sbi); - write_checkpoint(sbi, true); + /* We don't need to do checkpoint when it's clean */ + if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES)) + write_checkpoint(sbi, true); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -122,6 +438,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; brelse(sbi->raw_super_buf); @@ -132,13 +450,18 @@ int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + trace_f2fs_sync_fs(sb, sync); + if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) return 0; - if (sync) + if (sync) { + mutex_lock(&sbi->gc_mutex); write_checkpoint(sbi, false); - else + mutex_unlock(&sbi->gc_mutex); + } else { f2fs_balance_fs(sbi); + } return 0; } @@ -147,7 +470,7 @@ static int f2fs_freeze(struct super_block *sb) { int err; - if (sb->s_flags & MS_RDONLY) + if (f2fs_readonly(sb)) return 0; err = f2fs_sync_fs(sb, 1); @@ -180,7 +503,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->total_node_count; buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); - buf->f_namelen = F2FS_MAX_NAME_LEN; + buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -191,10 +514,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (test_opt(sbi, BG_GC)) - seq_puts(seq, ",background_gc_on"); + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) + seq_printf(seq, ",background_gc=%s", "on"); else - seq_puts(seq, ",background_gc_off"); + seq_printf(seq, ",background_gc=%s", "off"); if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, DISCARD)) @@ -206,6 +529,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); + if (test_opt(sbi, INLINE_XATTR)) + seq_puts(seq, ",inline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -215,16 +540,139 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) #endif if (test_opt(sbi, DISABLE_EXT_IDENTIFY)) seq_puts(seq, ",disable_ext_identify"); - + if (test_opt(sbi, INLINE_DATA)) + seq_puts(seq, ",inline_data"); + if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) + seq_puts(seq, ",flush_merge"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; } +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-5d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, 1)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int segment_info_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, segment_info_seq_show, PDE_DATA(inode)); +} + +static const struct file_operations f2fs_seq_segment_info_fops = { + .owner = THIS_MODULE, + .open = segment_info_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int f2fs_remount(struct super_block *sb, int *flags, char *data) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_mount_info org_mount_opt; + int err, active_logs; + bool need_restart_gc = false; + bool need_stop_gc = false; + + sync_filesystem(sb); + + /* + * Save the old mount options in case we + * need to restore them. + */ + org_mount_opt = sbi->mount_opt; + active_logs = sbi->active_logs; + + /* parse mount options */ + err = parse_options(sb, data); + if (err) + goto restore_opts; + + /* + * Previous and new state of filesystem is RO, + * so skip checking GC and FLUSH_MERGE conditions. + */ + if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) + goto skip; + + /* + * We stop the GC thread if FS is mounted as RO + * or if background_gc = off is passed in mount + * option. Also sync the filesystem. + */ + if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { + if (sbi->gc_thread) { + stop_gc_thread(sbi); + f2fs_sync_fs(sb, 1); + need_restart_gc = true; + } + } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) { + err = start_gc_thread(sbi); + if (err) + goto restore_opts; + need_stop_gc = true; + } + + /* + * We stop issue flush thread if FS is mounted as RO + * or if flush_merge is not passed in mount option. + */ + if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + destroy_flush_cmd_control(sbi); + } else if (test_opt(sbi, FLUSH_MERGE) && + !sbi->sm_info->cmd_control_info) { + err = create_flush_cmd_control(sbi); + if (err) + goto restore_gc; + } +skip: + /* Update the POSIXACL Flag */ + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + return 0; +restore_gc: + if (need_restart_gc) { + if (start_gc_thread(sbi)) + f2fs_msg(sbi->sb, KERN_WARNING, + "background gc thread is stop"); + } else if (need_stop_gc) { + stop_gc_thread(sbi); + } +restore_opts: + sbi->mount_opt = org_mount_opt; + sbi->active_logs = active_logs; + return err; +} + static struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, + .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, + .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, @@ -232,6 +680,7 @@ static struct super_operations f2fs_sops = { .freeze_fs = f2fs_freeze, .unfreeze_fs = f2fs_unfreeze, .statfs = f2fs_statfs, + .remount_fs = f2fs_remount, }; static struct inode *f2fs_nfs_get_inode(struct super_block *sb, @@ -240,7 +689,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - if (ino < F2FS_ROOT_INO(sbi)) + if (check_nid_range(sbi, ino)) return ERR_PTR(-ESTALE); /* @@ -251,7 +700,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb, inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { + if (unlikely(generation && inode->i_generation != generation)) { /* we didn't find the right inode.. */ iput(inode); return ERR_PTR(-ESTALE); @@ -279,82 +728,9 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, - char *options) -{ - substring_t args[MAX_OPT_ARGS]; - char *p; - int arg = 0; - - if (!options) - return 0; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - /* - * Initialize args struct so we know whether arg was - * found; some options take optional arguments. - */ - args[0].to = args[0].from = NULL; - token = match_token(p, f2fs_tokens, args); - - switch (token) { - case Opt_gc_background_off: - clear_opt(sbi, BG_GC); - break; - case Opt_disable_roll_forward: - set_opt(sbi, DISABLE_ROLL_FORWARD); - break; - case Opt_discard: - set_opt(sbi, DISCARD); - break; - case Opt_noheap: - set_opt(sbi, NOHEAP); - break; -#ifdef CONFIG_F2FS_FS_XATTR - case Opt_nouser_xattr: - clear_opt(sbi, XATTR_USER); - break; -#else - case Opt_nouser_xattr: - f2fs_msg(sb, KERN_INFO, - "nouser_xattr options not supported"); - break; -#endif -#ifdef CONFIG_F2FS_FS_POSIX_ACL - case Opt_noacl: - clear_opt(sbi, POSIX_ACL); - break; -#else - case Opt_noacl: - f2fs_msg(sb, KERN_INFO, "noacl options not supported"); - break; -#endif - case Opt_active_logs: - if (args->from && match_int(args, &arg)) - return -EINVAL; - if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) - return -EINVAL; - sbi->active_logs = arg; - break; - case Opt_disable_ext_identify: - set_opt(sbi, DISABLE_EXT_IDENTIFY); - break; - default: - f2fs_msg(sb, KERN_ERR, - "Unrecognized mount option \"%s\" or missing value", - p); - return -EINVAL; - } - } - return 0; -} - static loff_t max_file_size(unsigned bits) { - loff_t result = ADDRS_PER_INODE; + loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; /* two direct node blocks */ @@ -427,10 +803,10 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); - if (fsmeta >= total) + if (unlikely(fsmeta >= total)) return 1; - if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; } @@ -457,35 +833,57 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + sbi->cur_victim_sec = NULL_SECNO; + sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + + sbi->dir_level = DEF_DIR_LEVEL; } -static int validate_superblock(struct super_block *sb, - struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, sector_t block) +/* + * Read f2fs raw super block. + * Because we have two copies of super block, so read the first one at first, + * if the first one is invalid, move to read the second one. + */ +static int read_raw_super_block(struct super_block *sb, + struct f2fs_super_block **raw_super, + struct buffer_head **raw_super_buf) { - const char *super = (block == 0 ? "first" : "second"); + int block = 0; - /* read f2fs raw super block */ +retry: *raw_super_buf = sb_bread(sb, block); if (!*raw_super_buf) { - f2fs_msg(sb, KERN_ERR, "unable to read %s superblock", - super); - return 1; + f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", + block + 1); + if (block == 0) { + block++; + goto retry; + } else { + return -EIO; + } } *raw_super = (struct f2fs_super_block *) ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET); /* sanity checking of raw super */ - if (!sanity_check_raw_super(sb, *raw_super)) - return 0; + if (sanity_check_raw_super(sb, *raw_super)) { + brelse(*raw_super_buf); + f2fs_msg(sb, KERN_ERR, + "Can't find valid F2FS filesystem in %dth superblock", + block + 1); + if (block == 0) { + block++; + goto retry; + } else { + return -EINVAL; + } + } - f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " - "in %s superblock", super); - return 1; + return 0; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) @@ -503,16 +901,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; /* set a block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { + if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } - if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) { - brelse(raw_super_buf); - if (validate_superblock(sb, &raw_super, &raw_super_buf, 1)) - goto free_sb_buf; - } + err = read_raw_super_block(sb, &raw_super, &raw_super_buf); + if (err) + goto free_sbi; + + sb->s_fs_info = sbi; /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; @@ -525,7 +923,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - if (parse_options(sb, sbi, (char *)data)) + err = parse_options(sb, (char *)data); + if (err) goto free_sb_buf; sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); @@ -536,7 +935,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; - sb->s_fs_info = sbi; sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); @@ -547,14 +945,23 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->raw_super = raw_super; sbi->raw_super_buf = raw_super_buf; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->write_inode); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - for (i = 0; i < NR_LOCK_TYPE; i++) - mutex_init(&sbi->fs_lock[i]); - sbi->por_doing = 0; + mutex_init(&sbi->node_write); + sbi->por_doing = false; spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->bio_sem); + + init_rwsem(&sbi->read_io.io_rwsem); + sbi->read_io.sbi = sbi; + sbi->read_io.bio = NULL; + for (i = 0; i < NR_PAGE_TYPE; i++) { + init_rwsem(&sbi->write_io[i].io_rwsem); + sbi->write_io[i].sbi = sbi; + sbi->write_io[i].bio = NULL; + } + + init_rwsem(&sbi->cp_rwsem); + init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); /* get an inode for meta space */ @@ -617,9 +1024,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* if there are nt orphan nodes free them */ - err = -EINVAL; - if (recover_orphan_inodes(sbi)) - goto free_node_inode; + recover_orphan_inodes(sbi); /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -628,8 +1033,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(root); goto free_node_inode; } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + err = -EINVAL; goto free_root_inode; + } sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { @@ -637,22 +1044,60 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) - recover_fsync_data(sbi); - - /* After POR, we can run background GC thread */ - err = start_gc_thread(sbi); + err = f2fs_build_stats(sbi); if (err) - goto fail; + goto free_root_inode; - err = f2fs_build_stats(sbi); + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + + if (test_opt(sbi, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + f2fs_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + + sbi->s_kobj.kset = f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, + "%s", sb->s_id); if (err) - goto fail; + goto free_proc; + /* recover fsynced data */ + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + err = recover_fsync_data(sbi); + if (err) + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%ld", err); + } + + /* + * If filesystem is not mounted as read-only then + * do start the gc_thread. + */ + if (!(sb->s_flags & MS_RDONLY)) { + /* After POR, we can run background GC thread.*/ + err = start_gc_thread(sbi); + if (err) + goto free_kobj; + } return 0; -fail: - stop_gc_thread(sbi); + +free_kobj: + kobject_del(&sbi->s_kobj); +free_proc: + if (sbi->s_proc) { + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry(sb->s_id, f2fs_proc_root); + } + f2fs_destroy_stats(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -692,8 +1137,8 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info), NULL); - if (f2fs_inode_cachep == NULL) + sizeof(struct f2fs_inode_info)); + if (!f2fs_inode_cachep) return -ENOMEM; return 0; } @@ -717,29 +1162,55 @@ static int __init init_f2fs_fs(void) goto fail; err = create_node_manager_caches(); if (err) - goto fail; + goto free_inodecache; + err = create_segment_manager_caches(); + if (err) + goto free_node_manager_caches; err = create_gc_caches(); if (err) - goto fail; + goto free_segment_manager_caches; err = create_checkpoint_caches(); if (err) - goto fail; + goto free_gc_caches; + f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); + if (!f2fs_kset) { + err = -ENOMEM; + goto free_checkpoint_caches; + } err = register_filesystem(&f2fs_fs_type); if (err) - goto fail; + goto free_kset; f2fs_create_root_stats(); + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return 0; + +free_kset: + kset_unregister(f2fs_kset); +free_checkpoint_caches: + destroy_checkpoint_caches(); +free_gc_caches: + destroy_gc_caches(); +free_segment_manager_caches: + destroy_segment_manager_caches(); +free_node_manager_caches: + destroy_node_manager_caches(); +free_inodecache: + destroy_inodecache(); fail: return err; } static void __exit exit_f2fs_fs(void) { + remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); + destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); + kset_unregister(f2fs_kset); } module_init(init_f2fs_fs) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8038c049650..8bea941ee30 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -20,11 +20,13 @@ */ #include <linux/rwsem.h> #include <linux/f2fs_fs.h> +#include <linux/security.h> +#include <linux/posix_acl_xattr.h> #include "f2fs.h" #include "xattr.h" static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) + size_t list_size, const char *name, size_t len, int type) { struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); int total_len, prefix_len = 0; @@ -43,15 +45,19 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list, prefix = XATTR_TRUSTED_PREFIX; prefix_len = XATTR_TRUSTED_PREFIX_LEN; break; + case F2FS_XATTR_INDEX_SECURITY: + prefix = XATTR_SECURITY_PREFIX; + prefix_len = XATTR_SECURITY_PREFIX_LEN; + break; default: return -EINVAL; } - total_len = prefix_len + name_len + 1; + total_len = prefix_len + len + 1; if (list && total_len <= list_size) { memcpy(list, prefix, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; + memcpy(list + prefix_len, name, len); + list[prefix_len + len] = '\0'; } return total_len; } @@ -70,13 +76,14 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, if (!capable(CAP_SYS_ADMIN)) return -EPERM; break; + case F2FS_XATTR_INDEX_SECURITY: + break; default: return -EINVAL; } if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_getxattr(dentry->d_inode, type, name, - buffer, size); + return f2fs_getxattr(dentry->d_inode, type, name, buffer, size); } static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, @@ -93,17 +100,20 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, if (!capable(CAP_SYS_ADMIN)) return -EPERM; break; + case F2FS_XATTR_INDEX_SECURITY: + break; default: return -EINVAL; } if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_setxattr(dentry->d_inode, type, name, value, size); + return f2fs_setxattr(dentry->d_inode, type, name, + value, size, NULL, flags); } static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, - size_t list_size, const char *name, size_t name_len, int type) + size_t list_size, const char *name, size_t len, int type) { const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; size_t size; @@ -145,6 +155,31 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, return 0; } +#ifdef CONFIG_F2FS_FS_SECURITY +static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *page) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, (struct page *)page, 0); + if (err < 0) + break; + } + return err; +} + +int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return security_inode_init_security(inode, dir, qstr, + &f2fs_initxattrs, ipage); +} +#endif + const struct xattr_handler f2fs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = F2FS_XATTR_INDEX_USER, @@ -169,106 +204,253 @@ const struct xattr_handler f2fs_xattr_advise_handler = { .set = f2fs_xattr_advise_set, }; +const struct xattr_handler f2fs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = F2FS_XATTR_INDEX_SECURITY, + .list = f2fs_xattr_generic_list, + .get = f2fs_xattr_generic_get, + .set = f2fs_xattr_generic_set, +}; + static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler, - [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + [F2FS_XATTR_INDEX_SECURITY] = &f2fs_xattr_security_handler, +#endif [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, }; const struct xattr_handler *f2fs_xattr_handlers[] = { &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL - &f2fs_xattr_acl_access_handler, - &f2fs_xattr_acl_default_handler, + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif &f2fs_xattr_trusted_handler, +#ifdef CONFIG_F2FS_FS_SECURITY + &f2fs_xattr_security_handler, +#endif &f2fs_xattr_advise_handler, NULL, }; -static inline const struct xattr_handler *f2fs_xattr_handler(int name_index) +static inline const struct xattr_handler *f2fs_xattr_handler(int index) { const struct xattr_handler *handler = NULL; - if (name_index > 0 && name_index < ARRAY_SIZE(f2fs_xattr_handler_map)) - handler = f2fs_xattr_handler_map[name_index]; + if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map)) + handler = f2fs_xattr_handler_map[index]; return handler; } -int f2fs_getxattr(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size) +static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + + list_for_each_xattr(entry, base_addr) { + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static void *read_all_xattrs(struct inode *inode, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_xattr_header *header; + size_t size = PAGE_SIZE, inline_size = 0; + void *txattr_addr; + + inline_size = inline_xattr_size(inode); + + txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + if (!txattr_addr) + return NULL; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + goto fail; + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + } + + /* read from xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xpage)) + goto fail; + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + f2fs_put_page(xpage, 1); + } + + header = XATTR_HDR(txattr_addr); + + /* never been allocated xattrs */ + if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { + header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); + header->h_refcount = cpu_to_le32(1); + } + return txattr_addr; +fail: + kzfree(txattr_addr); + return NULL; +} + +static inline int write_all_xattrs(struct inode *inode, __u32 hsize, + void *txattr_addr, struct page *ipage) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + size_t inline_size = 0; + void *xattr_addr; + struct page *xpage; + nid_t new_nid = 0; + int err; + + inline_size = inline_xattr_size(inode); + + if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) + if (!alloc_nid(sbi, &new_nid)) + return -ENOSPC; + + /* write to inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + f2fs_wait_on_page_writeback(ipage, NODE); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(page); + } + inline_addr = inline_xattr_addr(page); + f2fs_wait_on_page_writeback(page, NODE); + } + memcpy(inline_addr, txattr_addr, inline_size); + f2fs_put_page(page, 1); + + /* no need to use xattr node block */ + if (hsize <= inline_size) { + err = truncate_xattr_node(inode, ipage); + alloc_nid_failed(sbi, new_nid); + return err; + } + } + + /* write to xattr node block */ + if (F2FS_I(inode)->i_xattr_nid) { + xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(xpage); + } + f2fs_bug_on(new_nid); + f2fs_wait_on_page_writeback(xpage, NODE); + } else { + struct dnode_of_data dn; + set_new_dnode(&dn, inode, NULL, NULL, new_nid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_nid); + return PTR_ERR(xpage); + } + alloc_nid_done(sbi, new_nid); + } + + xattr_addr = page_address(xpage); + memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - + sizeof(struct node_footer)); + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); + + /* need to checkpoint during fsync */ + F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); + return 0; +} + +int f2fs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size) +{ struct f2fs_xattr_entry *entry; - struct page *page; void *base_addr; - int error = 0, found = 0; - size_t value_len, name_len; + int error = 0; + size_t size, len; if (name == NULL) return -EINVAL; - name_len = strlen(name); - if (!fi->i_xattr_nid) - return -ENODATA; + len = strlen(name); + if (len > F2FS_NAME_LEN) + return -ERANGE; - page = get_node_page(sbi, fi->i_xattr_nid); - base_addr = page_address(page); + base_addr = read_all_xattrs(inode, NULL); + if (!base_addr) + return -ENOMEM; - list_for_each_xattr(entry, base_addr) { - if (entry->e_name_index != name_index) - continue; - if (entry->e_name_len != name_len) - continue; - if (!memcmp(entry->e_name, name, name_len)) { - found = 1; - break; - } - } - if (!found) { + entry = __find_xattr(base_addr, index, len, name); + if (IS_XATTR_LAST_ENTRY(entry)) { error = -ENODATA; goto cleanup; } - value_len = le16_to_cpu(entry->e_value_size); + size = le16_to_cpu(entry->e_value_size); - if (buffer && value_len > buffer_size) { + if (buffer && size > buffer_size) { error = -ERANGE; goto cleanup; } if (buffer) { char *pval = entry->e_name + entry->e_name_len; - memcpy(buffer, pval, value_len); + memcpy(buffer, pval, size); } - error = value_len; + error = size; cleanup: - f2fs_put_page(page, 1); + kzfree(base_addr); return error; } ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = dentry->d_inode; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *entry; - struct page *page; void *base_addr; int error = 0; size_t rest = buffer_size; - if (!fi->i_xattr_nid) - return 0; - - page = get_node_page(sbi, fi->i_xattr_nid); - base_addr = page_address(page); + base_addr = read_all_xattrs(inode, NULL); + if (!base_addr) + return -ENOMEM; list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -291,116 +473,77 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - f2fs_put_page(page, 1); + kzfree(base_addr); return error; } -int f2fs_setxattr(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len) +static int __f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, + struct page *ipage, int flags) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_xattr_header *header = NULL; struct f2fs_xattr_entry *here, *last; - struct page *page; void *base_addr; - int error, found, free, newsize; - size_t name_len; - char *pval; + int found, newsize; + size_t len; + __u32 new_hsize; + int error = -ENOMEM; if (name == NULL) return -EINVAL; - name_len = strlen(name); if (value == NULL) - value_len = 0; + size = 0; - if (name_len > 255 || value_len > MAX_VALUE_LEN) - return -ERANGE; + len = strlen(name); - f2fs_balance_fs(sbi); - - mutex_lock_op(sbi, NODE_NEW); - if (!fi->i_xattr_nid) { - /* Allocate new attribute block */ - struct dnode_of_data dn; - - if (!alloc_nid(sbi, &fi->i_xattr_nid)) { - mutex_unlock_op(sbi, NODE_NEW); - return -ENOSPC; - } - set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); - mark_inode_dirty(inode); - - page = new_node_page(&dn, XATTR_NODE_OFFSET); - if (IS_ERR(page)) { - alloc_nid_failed(sbi, fi->i_xattr_nid); - fi->i_xattr_nid = 0; - mutex_unlock_op(sbi, NODE_NEW); - return PTR_ERR(page); - } + if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode)) + return -ERANGE; - alloc_nid_done(sbi, fi->i_xattr_nid); - base_addr = page_address(page); - header = XATTR_HDR(base_addr); - header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); - header->h_refcount = cpu_to_le32(1); - } else { - /* The inode already has an extended attribute block. */ - page = get_node_page(sbi, fi->i_xattr_nid); - if (IS_ERR(page)) { - mutex_unlock_op(sbi, NODE_NEW); - return PTR_ERR(page); - } + base_addr = read_all_xattrs(inode, ipage); + if (!base_addr) + goto exit; - base_addr = page_address(page); - header = XATTR_HDR(base_addr); - } + /* find entry with wanted name. */ + here = __find_xattr(base_addr, index, len, name); - if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) { - error = -EIO; - goto cleanup; - } + found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; - /* find entry with wanted name. */ - found = 0; - list_for_each_xattr(here, base_addr) { - if (here->e_name_index != name_index) - continue; - if (here->e_name_len != name_len) - continue; - if (!memcmp(here->e_name, name, name_len)) { - found = 1; - break; - } + if ((flags & XATTR_REPLACE) && !found) { + error = -ENODATA; + goto exit; + } else if ((flags & XATTR_CREATE) && found) { + error = -EEXIST; + goto exit; } last = here; - while (!IS_XATTR_LAST_ENTRY(last)) last = XATTR_NEXT_ENTRY(last); - newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + - name_len + value_len); + newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); /* 1. Check space */ if (value) { - /* If value is NULL, it is remove operation. + int free; + /* + * If value is NULL, it is remove operation. * In case of update operation, we caculate free. */ - free = MIN_OFFSET - ((char *)last - (char *)header); + free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); if (found) - free = free - ENTRY_SIZE(here); + free = free + ENTRY_SIZE(here); - if (free < newsize) { + if (unlikely(free < newsize)) { error = -ENOSPC; - goto cleanup; + goto exit; } } /* 2. Remove old entry */ if (found) { - /* If entry is found, remove old entry. + /* + * If entry is found, remove old entry. * If not found, remove operation is not needed. */ struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here); @@ -411,33 +554,63 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, memset(last, 0, oldsize); } + new_hsize = (char *)last - (char *)base_addr; + /* 3. Write new entry */ if (value) { - /* Before we come here, old entry is removed. - * We just write new entry. */ + char *pval; + /* + * Before we come here, old entry is removed. + * We just write new entry. + */ memset(last, 0, newsize); - last->e_name_index = name_index; - last->e_name_len = name_len; - memcpy(last->e_name, name, name_len); - pval = last->e_name + name_len; - memcpy(pval, value, value_len); - last->e_value_size = cpu_to_le16(value_len); + last->e_name_index = index; + last->e_name_len = len; + memcpy(last->e_name, name, len); + pval = last->e_name + len; + memcpy(pval, value, size); + last->e_value_size = cpu_to_le16(size); + new_hsize += newsize; } - set_page_dirty(page); - f2fs_put_page(page, 1); + error = write_all_xattrs(inode, new_hsize, base_addr, ipage); + if (error) + goto exit; if (is_inode_flag_set(fi, FI_ACL_MODE)) { inode->i_mode = fi->i_acl_mode; inode->i_ctime = CURRENT_TIME; clear_inode_flag(fi, FI_ACL_MODE); } - f2fs_write_inode(inode, NULL); - mutex_unlock_op(sbi, NODE_NEW); - return 0; -cleanup: - f2fs_put_page(page, 1); - mutex_unlock_op(sbi, NODE_NEW); + if (ipage) + update_inode(inode, ipage); + else + update_inode_page(inode); +exit: + kzfree(base_addr); return error; } + +int f2fs_setxattr(struct inode *inode, int index, const char *name, + const void *value, size_t size, + struct page *ipage, int flags) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int err; + + /* this case is only from init_inode_metadata */ + if (ipage) + return __f2fs_setxattr(inode, index, name, value, + size, ipage, flags); + f2fs_balance_fs(sbi); + + f2fs_lock_op(sbi); + /* protect xattr_ver */ + down_write(&F2FS_I(inode)->i_sem); + err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_sem); + f2fs_unlock_op(sbi); + + return err; +} diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 49c9558305e..34ab7dbcf5e 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -51,7 +51,7 @@ struct f2fs_xattr_entry { #define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) #define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr)) -#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr)+1)) +#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) #define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) @@ -69,17 +69,16 @@ struct f2fs_xattr_entry { !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ + sizeof(struct node_footer) - sizeof(__u32)) -#define MIN_OFFSET XATTR_ALIGN(PAGE_SIZE - \ - sizeof(struct node_footer) - \ - sizeof(__u32)) - -#define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \ - sizeof(struct f2fs_xattr_entry)) +#define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ + sizeof(struct f2fs_xattr_header) - \ + sizeof(struct f2fs_xattr_entry)) /* * On-disk structure of f2fs_xattr - * We use only 1 block for xattr. + * We use inline xattrs space + 1 block for xattr. * * +--------------------+ * | f2fs_xattr_header | @@ -109,28 +108,24 @@ struct f2fs_xattr_entry { #ifdef CONFIG_F2FS_FS_XATTR extern const struct xattr_handler f2fs_xattr_user_handler; extern const struct xattr_handler f2fs_xattr_trusted_handler; -extern const struct xattr_handler f2fs_xattr_acl_access_handler; -extern const struct xattr_handler f2fs_xattr_acl_default_handler; extern const struct xattr_handler f2fs_xattr_advise_handler; +extern const struct xattr_handler f2fs_xattr_security_handler; extern const struct xattr_handler *f2fs_xattr_handlers[]; -extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name, - const void *value, size_t value_len); -extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name, - void *buffer, size_t buffer_size); -extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, - size_t buffer_size); - +extern int f2fs_setxattr(struct inode *, int, const char *, + const void *, size_t, struct page *, int); +extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t); +extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #else #define f2fs_xattr_handlers NULL -static inline int f2fs_setxattr(struct inode *inode, int name_index, - const char *name, const void *value, size_t value_len) +static inline int f2fs_setxattr(struct inode *inode, int index, + const char *name, const void *value, size_t size, int flags) { return -EOPNOTSUPP; } -static inline int f2fs_getxattr(struct inode *inode, int name_index, +static inline int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { return -EOPNOTSUPP; @@ -142,4 +137,14 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, } #endif +#ifdef CONFIG_F2FS_FS_SECURITY +extern int f2fs_init_security(struct inode *, struct inode *, + const struct qstr *, struct page *); +#else +static inline int f2fs_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr, struct page *ipage) +{ + return 0; +} +#endif #endif /* __F2FS_XATTR_H__ */ |
